//============================================================== // Copyright © 2020 Intel Corporation // // SPDX-License-Identifier: MIT // Copyright © 2022 Martin, Linköping University // ============================================================= #include #include #include #include #include #include #include "dpc_common.hpp" #if FPGA || FPGA_EMULATOR #include #endif using namespace std; using namespace sycl; class a_initialization; class b_initialization; class c_calculation; constexpr int m_size = 4096 * 8; // Multiplication of 8. constexpr int M = m_size / 8; constexpr int N = m_size / 8; constexpr int P = m_size / 8; //verify the results int VerifyResult(float (*c_back)[P]); int main() { // Host memory buffer that device will write data back before destruction. float(*c_back)[P] = new float[M][P]; // Intialize c_back for (int i = 0; i < M; i++) for (int j = 0; j < P; j++) c_back[i][j] = 0.0f; #if FPGA_EMULATOR // FPGA emu selector ext::intel::fpga_emulator_selector DeviceSelector; #elif FPGA // FPGA hardware selector ext::intel::fpga_selector DeviceSelector; #else // selecting default the device default_selector DeviceSelector; #endif queue q(DeviceSelector, dpc_common::exception_handler); cout << "Device: " << q.get_device().get_info() << "\n"; // Host memory buffer that device will write data back before destruction. // device write data back to buffer of host memory before be destroyed //auto data = malloc_host(M*P, q); //auto c = malloc_device(M*N, q); //auto b = malloc_device(N*P, q); // buffers for matrices, buffer c is bound with host memory data buffer a_buf(range(M, N)); buffer b_buf(range(N, P)); buffer c_buf(reinterpret_cast(c_back),range(M, P)); cout << "matrices size: c(" << M << "," << P << ") = a(" << M << "," << N << ") * b(" << N << "," << P << ")\n"; #if FPGA || FPGA_EMULATOR dpc_common::TimeInterval kernel_execution_a_runtime; auto execution_a = q.submit([&](handler& h) { accessor a(a_buf, h, write_only); h.single_task([=]() [[intel::kernel_args_restrict]] { // every element of A matrix is set to 1. for (int iway = 0; iway < M; iway++) { #pragma unroll for (int jway = 0; jway < N; jway++) { a[iway][jway] = 1.0f; } } }); }); double elapsed_execution_a_time = kernel_execution_a_runtime.Elapsed(); dpc_common::TimeInterval kernel_execution_b_runtime; // Initializing B matrix by submitting command group to queue auto execution_b = q.submit([&](handler& h) { // access (write) to buffer on the device accessor b(b_buf, h, write_only); h.single_task([=]() [[intel::kernel_args_restrict]] { // Every column of B (sequence 1,2,...,N) for (int iway = 0; iway < N; iway++) { #pragma unroll for (int jway = 0; jway < P; jway++) { b[iway][jway] = iway + 1.0f; } } }); }); double elapsed_execution_b_time = kernel_execution_b_runtime.Elapsed(); dpc_common::TimeInterval kernel_execution_c_runtime; // Multiplying matrices: c = a * b by submitting command group to queue auto execution_c = q.submit([&](handler& h) { accessor a(a_buf, h, read_only); accessor b(b_buf, h, read_only); accessor c(c_buf, h, write_only); h.single_task([=]() [[intel::kernel_args_restrict]] { for (int iway = 0; iway < M; iway++) { #pragma unroll for (int jway = 0; jway < P; jway++) { // Getting y direction global position. int row = jway; // Getting x direction global position. int col = iway; int width_a = M; float sum = 0.0f; // Calculating result of an element of c #pragma unroll for (int iway = 0; iway < width_a; iway++) { sum += a[row][iway] * b[iway][col]; } c[iway][jway] = sum; } } }); }); #else dpc_common::TimeInterval kernel_execution_a_runtime; auto execution_a = q.submit([&](handler& h) { accessor a(a_buf, h, write_only); h.parallel_for(range(M, N), [=](auto index) { // every element of A matrix is set to 1. a[index] = 1.0f; }); }); double elapsed_execution_a_time = kernel_execution_a_runtime.Elapsed(); dpc_common::TimeInterval kernel_execution_b_runtime; // Initializing B matrix by submitting command group to queue auto execution_b = q.submit([&](handler& h) { // access (write) to buffer on the device accessor b(b_buf, h, write_only); h.parallel_for(range(N, P), [=](auto index) { // Every column of B (sequence 1,2,...,N) b[index] = index[0] + 1.0f; }); }); double elapsed_execution_b_time = kernel_execution_b_runtime.Elapsed(); dpc_common::TimeInterval kernel_execution_c_runtime; // Multiplying matrices: c = a * b by submitting command group to queue auto execution_c = q.submit([&](handler& h) { accessor a(a_buf, h, read_only); accessor b(b_buf, h, read_only); accessor c(c_buf, h, write_only); h.parallel_for(range(M, P), [=](auto index) { // Getting y direction global position. int row = index[0]; // Getting x direction global position. int col = index[1]; int width_a = M; float sum = 0.0f; // Calculating result of an element of c #pragma unroll for (int iway = 0; iway < width_a; iway++) { sum += a[row][iway] * b[iway][col]; } c[index] = sum; }); }); #endif double elapsed_execution_c_time = kernel_execution_c_runtime.Elapsed(); double input_size_kb = (2 * N) * sizeof(float) / (1024); host_accessor A{c_buf}; std::cout << "Initializing kernel A throughput: " << (input_size_kb / elapsed_execution_a_time) << " KB/s \n"; std::cout << "Initializing kernel B throughput: " << (input_size_kb / elapsed_execution_b_time) << " KB/s \n"; std::cout << "Initializing kernel C throughput: " << (input_size_kb / elapsed_execution_c_time) << " KB/s \n"; cout << "Total execution time:" << elapsed_execution_a_time + elapsed_execution_b_time + elapsed_execution_c_time << "sec\n"; int result; cout << "Result of GEMM using DPC++: "; result = VerifyResult(c_back); //delete[] c_back; return result; //return 1; } bool ValueSame(float a, float b) { return fabs(a - b) < numeric_limits::epsilon(); } int VerifyResult(float (*c_back)[P]) { // Check that the results are correct by comparing with host computing. int i, j, k; // 2D arrays on host side. float(*a_host)[N] = new float[M][N]; float(*b_host)[P] = new float[N][P]; float(*c_host)[P] = new float[M][P]; // Each element of matrix a is 1. for (i = 0; i < M; i++) for (j = 0; j < N; j++) a_host[i][j] = 1.0f; // Each column of b_host is the sequence 1,2,...,N for (i = 0; i < N; i++) for (j = 0; j < P; j++) b_host[i][j] = i + 1.0f; // c_host is initialized to zero. for (i = 0; i < M; i++) for (j = 0; j < P; j++) c_host[i][j] = 0.0f; for (i = 0; i < M; i++) { for (k = 0; k < N; k++) { // Each element of the product is just the sum 1+2+...+n for (j = 0; j < P; j++) { c_host[i][j] += a_host[i][k] * b_host[k][j]; } } } bool mismatch_found = false; // Compare host side results with the result buffer from device side: print // mismatched data 5 times only. int print_count = 0; for (i = 0; i < M; i++) { for (j = 0; j < P; j++) { if (!ValueSame(c_back[i][j], c_host[i][j])) { cout << "Fail - The result is incorrect for element: [" << i << ", " << j << "], expected: " << c_host[i][j] << ", but found: " << c_back[i][j] << "\n"; mismatch_found = true; print_count++; if (print_count == 5) break; } } if (print_count == 5) break; } delete[] a_host; delete[] b_host; delete[] c_host; if (!mismatch_found) { cout << "Success - The results are correct!\n"; return 0; } else { cout << "Fail - The results mismatch!\n"; return -1; } }