.. index:: pair: page; Matrix Multiplication Performance Example .. _doxid-matmul_perf_cpp: Matrix Multiplication Performance Example ========================================= This C++ example runs a simple matrix multiplication (matmul) performance test using oneDNN. The workflow includes the following steps: * Set up and execute a matmul operation with the specified engine kind and matrix dimensions, using f32, f16, bf16 and s8 data types. * Measure the execution time and print the achieved performance in GFlop/s or GOp/s, depending on the data type. To execute the example, compile it with oneDNN and run it the following way: .. ref-code-block:: cpp ./matmul_perf <engine> <m> [<n> <k>] Input parameters: * ``<engine>`` : The kind of oneDNN engine to use (e.g., CPU, GPU). * ``<m>`` : (Required) The number of rows in the first matrix. * ``<n>`` : (Optional) The number of columns in the second matrix. If not specified, ``n = m``. * ``<k>`` : (Optional) The number of columns in the first matrix. If not specified, ``k = m``. .. ref-code-block:: cpp /******************************************************************************* * Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*******************************************************************************/ #include <chrono> #include <cmath> #include <cstring> #include <iomanip> #include <iostream> #include <random> #include <string> #include <vector> #include "example_utils.hpp" #include "oneapi/dnnl/dnnl.hpp" using namespace :ref:`dnnl `; struct gemm_dims_t { :ref:`memory::dim ` m, n, k; }; static const int min_runs = 4; const char *get_type_string(:ref:`memory::data_type ` type) { const char *type_string = "unknown"; #define TYPE_CASE(T) \ if (type == memory::data_type::T) type_string = #T; TYPE_CASE(f16); TYPE_CASE(f32); TYPE_CASE(f64); TYPE_CASE(bf16); TYPE_CASE(s8); TYPE_CASE(u8); #undef TYPE_CASE return type_string; } void print_test_case(:ref:`memory::data_type ` type, gemm_dims_t dims) { std::cout << '[' << std::setw(4) << get_type_string(type); if (dims.m == dims.n && dims.m == dims.k) std::cout << " m = n = k = " << dims.m; else std::cout << " m = " << dims.m << ", n = " << dims.n << ", k = " << dims.k; std::cout << "] " << std::flush; } void fill_random(std::vector<float> &out, bool is_integer) { static std::vector<float> random_data_i, random_data_f; constexpr size_t nrand = 1037; if (random_data_i.empty() || random_data_f.empty()) { std::mt19937 generator; std::uniform_int_distribution<int> dist_i(-16, 15); std::uniform_real_distribution<float> dist_f(-1.0f, 1.0f); random_data_i.resize(nrand); for (auto &d : random_data_i) d = static_cast<float>(dist_i(generator)); random_data_f.resize(nrand); for (auto &d : random_data_f) d = dist_f(generator); } auto &rd = is_integer ? random_data_i : random_data_f; for (size_t i = 0; i < out.size(); i += nrand) { size_t chunk = std::min(nrand, out.size() - i); std::memcpy(&out[i], rd.data(), chunk * sizeof(float)); } } double run_case(:ref:`engine::kind ` engine_kind, :ref:`memory::data_type ` type, gemm_dims_t dims, double time_limit = 0.) { bool is_integer = (type == :ref:`memory::data_type::s8 ` || type == :ref:`memory::data_type::u8 `); bool quick_test = (time_limit == 0.); // Create execution dnnl::engine. 
:ref:`dnnl::engine ` :ref:`engine `(engine_kind, 0); // Create dnnl::stream. :ref:`dnnl::stream ` engine_stream(:ref:`engine `); // Source (A), weights (B), and destination (C) matrix dimensions. :ref:`memory::dims ` a_dims = {dims.m, dims.k}; :ref:`memory::dims ` b_dims = {dims.k, dims.n}; :ref:`memory::dims ` c_dims = {dims.m, dims.n}; // Allocate buffers and random-initialize A/B std::vector<float> a_data(product(a_dims)); std::vector<float> b_data(product(b_dims)); std::vector<float> c_data(product(c_dims)); fill_random(a_data, is_integer); fill_random(b_data, is_integer); // Create memory descriptors and memory objects for src, weights, bias, and // dst. auto a_md = :ref:`memory::desc `(a_dims, type, :ref:`memory::format_tag::any `); auto b_md = :ref:`memory::desc `(b_dims, type, :ref:`memory::format_tag::any `); auto c_md = :ref:`memory::desc `(c_dims, type, :ref:`memory::format_tag::any `); auto a_in_md = :ref:`memory::desc `( a_dims, :ref:`memory::data_type::f32 `, :ref:`memory::format_tag::ab `); auto b_in_md = :ref:`memory::desc `( b_dims, :ref:`memory::data_type::f32 `, :ref:`memory::format_tag::ab `); auto a_in_mem = :ref:`memory `(a_in_md, :ref:`engine `); auto b_in_mem = :ref:`memory `(b_in_md, :ref:`engine `); // Write data to memory object's handles. write_to_dnnl_memory(a_data.data(), a_in_mem); write_to_dnnl_memory(b_data.data(), b_in_mem); // Create primitive descriptor. auto matmul_pd = :ref:`matmul::primitive_desc `(:ref:`engine `, a_md, b_md, c_md); // Repack and convert input data. auto a_mem = :ref:`memory `(matmul_pd.src_desc(), :ref:`engine `); :ref:`reorder `(a_in_mem, a_mem).:ref:`execute `(engine_stream, a_in_mem, a_mem); auto b_mem = :ref:`memory `(matmul_pd.weights_desc(), :ref:`engine `); :ref:`reorder `(b_in_mem, b_mem).:ref:`execute `(engine_stream, b_in_mem, b_mem); auto c_mem = :ref:`memory `(matmul_pd.dst_desc(), :ref:`engine `); // Create the primitive. auto matmul_prim = :ref:`matmul `(matmul_pd); // Start output. 
if (!quick_test) print_test_case(type, dims); // Primitive arguments. std::unordered_map<int, memory> matmul_args; matmul_args.insert({:ref:`DNNL_ARG_SRC `, a_mem}); matmul_args.insert({:ref:`DNNL_ARG_WEIGHTS `, b_mem}); matmul_args.insert({:ref:`DNNL_ARG_DST `, c_mem}); // Warmup executions. matmul_prim.execute(engine_stream, matmul_args); engine_stream.wait(); auto start_first = std::chrono::steady_clock::now(); matmul_prim.execute(engine_stream, matmul_args); engine_stream.wait(); auto end_first = std::chrono::steady_clock::now(); std::chrono::duration<double> dur_first = end_first - start_first; if (quick_test) return dur_first.count(); int runs = std::max(min_runs, int(time_limit / dur_first.count())); // Timing runs. auto start = std::chrono::steady_clock::now(); for (int i = 0; i <= runs; i++) matmul_prim.execute(engine_stream, matmul_args); engine_stream.wait(); auto end = std::chrono::steady_clock::now(); std::chrono::duration<double> duration = end - start; // Display the result. double avg_time = (duration.count() - dur_first.count()) / runs; double total_ops = double(dims.m) * double(dims.n) * double(dims.k) * 2; double perf = (total_ops / avg_time) * 1e-9; auto scale_string = "G"; auto unit_string = is_integer ? "Op/s" : "Flop/s"; if (perf >= 1000) { perf /= 1000; scale_string = "T"; } std::cout << perf << ' ' << scale_string << unit_string << std::endl; return avg_time; } void run(:ref:`engine::kind ` engine_kind, :ref:`memory::data_type ` type, gemm_dims_t dims, double time_limit) { try { if (dims.m * dims.n != 0) { // Dimensions manually specified by user. run_case(engine_kind, type, dims, time_limit); } else { // Automatically choose dimensions to fit time limit. 
int mnk = 128; const int max_mnk = 8192; while (mnk < max_mnk) { dims.m = dims.n = dims.k = mnk; double time1 = run_case(engine_kind, type, dims); double nruns_est = std::max(1., time_limit / time1); double mnk_expand = std::exp2( std::round(std::log2(nruns_est / min_runs) / 3.)); if (mnk_expand <= 1) break; mnk = static_cast<int>( std::min<double>(max_mnk, mnk * mnk_expand)); } dims.m = dims.n = dims.k = mnk; run_case(engine_kind, type, dims, time_limit); } } catch (:ref:`dnnl::error ` &e) { // Catch and report unimplemented cases. if (e.status == :ref:`dnnl_unimplemented `) { print_test_case(type, dims); std::cout << "unsupported" << std::endl; } else throw; } } void bad_args() { std::cerr << "Usage: matmul-perf-cpp [cpu|gpu]\n" " matmul-perf-cpp [cpu|gpu] <size>\n" " matmul-perf-cpp [cpu|gpu] <m> <n> <k>\n" "If a single <size> is specified, it is used for all three " "dimensions (m/n/k).\n"; throw std::invalid_argument("Incorrect input arguments."); } void matmul_perf(:ref:`engine::kind ` engine_kind, int argc, char **argv) { gemm_dims_t dims = {0, 0, 0}; if (argc > 2) { if (argc == 3) dims.m = dims.n = dims.k = std::atoi(argv[2]); else if (argc == 5) { dims.m = std::atoi(argv[2]); dims.n = std::atoi(argv[3]); dims.k = std::atoi(argv[4]); } else bad_args(); if (dims.m <= 0 || dims.n <= 0 || dims.k <= 0) bad_args(); } run(engine_kind, :ref:`memory::data_type::f32 `, dims, 2.0); run(engine_kind, :ref:`memory::data_type::f16 `, dims, 2.0); run(engine_kind, :ref:`memory::data_type::bf16 `, dims, 2.0); run(engine_kind, :ref:`memory::data_type::s8 `, dims, 2.0); } int main(int argc, char **argv) { return handle_example_errors( matmul_perf, parse_engine_kind(argc, argv, 3), argc, argv); }