.. index:: pair: example; cpu_inference_int8.cpp .. _doxid-cpu_inference_int8_8cpp-example: cpu_inference_int8.cpp ====================== This is an example to demonstrate how to build an int8 graph with Graph API and run it on CPU. Annotated version: :ref:`Convolution int8 inference example with Graph API ` This is an example to demonstrate how to build an int8 graph with Graph API and run it on CPU. Annotated version: :ref:`Convolution int8 inference example with Graph API ` .. ref-code-block:: cpp /******************************************************************************* * Copyright 2023-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*******************************************************************************/ //[Headers and namespace] #include #include #include #include #include #include #include "oneapi/dnnl/dnnl_graph.hpp" #include "example_utils.hpp" #include "graph_example_utils.hpp" using namespace :ref:`dnnl::graph `; using :ref:`data_type ` = :ref:`logical_tensor::data_type `; using :ref:`layout_type ` = :ref:`logical_tensor::layout_type `; using :ref:`property_type ` = :ref:`logical_tensor::property_type `; using dim = :ref:`logical_tensor::dim `; using dims = :ref:`logical_tensor::dims `; //[Headers and namespace] void simple_pattern_int8() { dim N = 8, IC = 256, IH = 56, IW = 56, KH = 1, KW = 1, OC = 64; dims conv_input_dims {N, IH, IW, IC}; dims conv_weight_dims {KH, KW, IC, OC}; dims conv_bias_dims {OC}; //[Create dequant's logical tensor and the op] :ref:`logical_tensor ` dequant0_src_desc {0, data_type::u8}; :ref:`logical_tensor ` conv_src_desc {1, :ref:`data_type::f32 `}; :ref:`op ` dequant0(2, op::kind::Dequantize, {dequant0_src_desc}, {conv_src_desc}, "dequant0"); dequant0.:ref:`set_attr `(:ref:`op::attr::qtype `, "per_tensor"); dequant0.:ref:`set_attr `>(:ref:`op::attr::scales `, {0.1f}); dequant0.:ref:`set_attr `>(:ref:`op::attr::zps `, {10}); //[Create dequant's logical tensor and the op] //[Create dequant's logical tensor and the op.] :ref:`logical_tensor ` dequant1_src_desc {3, data_type::s8}; :ref:`logical_tensor ` conv_weight_desc { 4, :ref:`data_type::f32 `, 4, :ref:`layout_type::undef `, property_type::constant}; :ref:`op ` dequant1(5, op::kind::Dequantize, {dequant1_src_desc}, {conv_weight_desc}, "dequant1"); dequant1.:ref:`set_attr `(:ref:`op::attr::qtype `, "per_channel"); // the memory format of weight is XIO, which indicates channel equals // to 64 for the convolution. 
std::vector wei_scales(64, 0.1f); dims wei_zps(64, 0); dequant1.set_attr>(:ref:`op::attr::scales `, wei_scales); dequant1.set_attr>(:ref:`op::attr::zps `, wei_zps); dequant1.set_attr(:ref:`op::attr::axis `, 1); //[Create dequant's logical tensor and the op.] //[Create conv's logical tensor and the op] :ref:`logical_tensor ` conv_bias_desc { 6, :ref:`data_type::f32 `, 1, :ref:`layout_type::undef `, property_type::constant}; :ref:`logical_tensor ` conv_dst_desc {7, :ref:`data_type::f32 `, :ref:`layout_type::undef `}; // create the convolution op :ref:`op ` conv(8, op::kind::Convolution, {conv_src_desc, conv_weight_desc, conv_bias_desc}, {conv_dst_desc}, "conv"); conv.set_attr(:ref:`op::attr::strides `, {1, 1}); conv.set_attr(:ref:`op::attr::pads_begin `, {0, 0}); conv.set_attr(:ref:`op::attr::pads_end `, {0, 0}); conv.set_attr(:ref:`op::attr::dilations `, {1, 1}); conv.set_attr(:ref:`op::attr::data_format `, "NXC"); conv.set_attr(:ref:`op::attr::weights_format `, "XIO"); conv.set_attr(:ref:`op::attr::groups `, 1); //[Create conv's logical tensor and the op] //[Create ReLu's logical tensor and the op] :ref:`logical_tensor ` relu_dst_desc {9, :ref:`data_type::f32 `, :ref:`layout_type::undef `}; :ref:`op ` relu(10, op::kind::ReLU, {conv_dst_desc}, {relu_dst_desc}, "relu"); //[Create ReLu's logical tensor and the op] //[Create Quantize's logical tensor and the op] :ref:`logical_tensor ` quant_dst_desc {11, data_type::u8, :ref:`layout_type::undef `}; :ref:`op ` quant( 12, op::kind::Quantize, {relu_dst_desc}, {quant_dst_desc}, "quant"); quant.:ref:`set_attr `(:ref:`op::attr::qtype `, "per_tensor"); quant.:ref:`set_attr `>(:ref:`op::attr::scales `, {0.1f}); quant.:ref:`set_attr `>(:ref:`op::attr::zps `, {10}); //[Create Quantize's logical tensor and the op] //[Create graph and add ops] :ref:`graph ` g(:ref:`dnnl::engine::kind::cpu `); g.add_op(dequant0); g.add_op(dequant1); g.add_op(conv); g.add_op(relu); g.add_op(quant); //[Create graph and add ops] g.finalize(); //[Get 
partition] auto partitions = g.get_partitions(); //[Get partition] // Check partitioning results to ensure the example works. Users do // not need to follow this step. assert(partitions.size() == 1); //[Create engine] :ref:`allocator ` alloc {}; :ref:`dnnl::engine ` eng = :ref:`make_engine_with_allocator `(:ref:`dnnl::engine::kind::cpu `, 0, alloc); :ref:`dnnl::stream ` strm {eng}; //[Create engine] // Mapping from logical tensor id to output tensors // used to record the connection relationship between partitions (e.g. partition 0's // output tensor is fed into partition 1) std::unordered_map global_outputs_ts_map; // Memory buffers bound to the partition input/output tensors // that help manage the lifetime of these tensors std::vector> data_buffer; // Mapping from id to queried logical tensor from compiled partition // used to record the logical tensors that are previously enabled with // ANY layout std::unordered_map id_to_queried_logical_tensors; // This is a helper function which helps decide which logical tensor is // needed to be set with `dnnl::graph::logical_tensor::layout_type::any` // layout. // This function is not a part of Graph API, but similar logic is // essential for Graph API integration to achieve best performance. // Typically, users need to implement similar logic in their code. std::unordered_set ids_with_any_layout; set_any_layout(partitions, ids_with_any_layout); // Mapping from logical tensor id to the concrete shapes. // In practical usage, concrete shapes and layouts are not given // until compilation stage, hence this mapping is needed to mock the step. std::unordered_map concrete_shapes { {0, conv_input_dims}, {3, conv_weight_dims}, {6, conv_bias_dims}}; // Compile and execute the partitions, including the following steps: // // 1. Update the input/output logical tensors with concrete shape and layout // 2. Compile the partition // 3. Update the output logical tensors with queried ones after compilation // 4. 
Allocate memory and bind the data buffer for the partition // 5. Execute the partition // // Although they are not part of the APIs, these steps are essential for // the integration of Graph API, hence users need to implement similar // logic. for (const auto &:ref:`partition ` : partitions) { if (!:ref:`partition `.:ref:`is_supported `()) { std::cout << "cpu_inference_int8: Got unsupported partition, users " "need handle the operators by themselves." << std::endl; continue; } std::vector inputs = :ref:`partition `.:ref:`get_input_ports `(); std::vector outputs = :ref:`partition `.:ref:`get_output_ports `(); // Update input logical tensors with concrete shape and layout for (auto &input : inputs) { const auto id = input.get_id(); // If the tensor is an output of another partition, // use the cached logical tensor if (id_to_queried_logical_tensors.find(id) != id_to_queried_logical_tensors.end()) input = id_to_queried_logical_tensors[id]; else // Create logical tensor with strided layout input = :ref:`logical_tensor ` {id, input.:ref:`get_data_type `(), concrete_shapes[id], layout_type::strided}; } // Update output logical tensors with concrete shape and layout for (auto &output : outputs) { const auto id = output.get_id(); output = :ref:`logical_tensor ` {id, output.:ref:`get_data_type `(), :ref:`DNNL_GRAPH_UNKNOWN_NDIMS `, // set output dims to unknown ids_with_any_layout.count(id) ? 
:ref:`layout_type::any ` : layout_type::strided}; } //[Compile partition] :ref:`compiled_partition ` cp = :ref:`partition `.:ref:`compile `(inputs, outputs, eng); //[Compile partition] // Update output logical tensors with queried one for (auto &output : outputs) { const auto id = output.get_id(); output = cp.:ref:`query_logical_tensor `(id); id_to_queried_logical_tensors[id] = output; } // Allocate memory for the partition, and bind the data buffers with // input and output logical tensors std::vector inputs_ts, outputs_ts; allocate_graph_mem(inputs_ts, inputs, data_buffer, global_outputs_ts_map, eng, /*is partition input=*/true); allocate_graph_mem(outputs_ts, outputs, data_buffer, global_outputs_ts_map, eng, /*is partition input=*/false); //[Execute compiled partition] cp.:ref:`execute `(strm, inputs_ts, outputs_ts); //[Execute compiled partition] } // wait for all compiled partition's execution finished strm.:ref:`wait `(); std::cout << "Graph:" << std::endl << " [dq0_src] [dq1_src]" << std::endl << " | |" << std::endl << " dequant0 dequant1" << std::endl << " \\ /" << std::endl << " conv" << std::endl << " |" << std::endl << " relu" << std::endl << " |" << std::endl << " quant" << std::endl << " |" << std::endl << " [quant_dst]" << std::endl << "Note:" << std::endl << " '[]' represents a logical tensor, which refers to " "inputs/outputs of the graph. " << std::endl; } int main(int argc, char **argv) { return handle_example_errors({:ref:`engine::kind::cpu `}, simple_pattern_int8); }