#include #include #include #include const char *kernel_source = "\n\ #define BLOCK_SIZE 24\n\ #define C_WIDTH 24\n\ #define AB_COMMON 72\n\ \n\ __kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, BLOCK_SIZE, 1)))\n\ void mx_mul(__global const float *A,\n\ __global const float *B,\n\ __global float *C) {\n\ \n\ #ifdef USE_2D\n\ __local float AS[BLOCK_SIZE][BLOCK_SIZE];\n\ __local float BS[BLOCK_SIZE][BLOCK_SIZE];\n\ #else\n\ __local float AS[BLOCK_SIZE * BLOCK_SIZE];\n\ __local float BS[BLOCK_SIZE * BLOCK_SIZE];\n\ #endif\n\ \n\ int bx = get_group_id(0);\n\ int by = get_group_id(1);\n\ \n\ int tx = get_local_id(0);\n\ int ty = get_local_id(1);\n\ \n\ int a_offs = (by * BLOCK_SIZE + ty) * AB_COMMON + tx;\n\ int b_offs = (bx * BLOCK_SIZE + ty) * AB_COMMON + tx;\n\ \n\ float sum = 0;\n\ for (int i = 0; i < AB_COMMON / BLOCK_SIZE; i++, a_offs += BLOCK_SIZE, b_offs += BLOCK_SIZE) {\n\ #ifdef USE_2D\n\ AS[ty][tx] = A[a_offs];\n\ BS[ty][tx] = B[b_offs];\n\ #else\n\ AS[ty * BLOCK_SIZE + tx] = A[a_offs];\n\ BS[ty * BLOCK_SIZE + tx] = B[b_offs];\n\ #endif\n\ \n\ barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ #pragma unroll\n\ for (int k = 0; k < BLOCK_SIZE; k++) {\n\ #ifdef USE_2D\n\ sum += AS[ty][k] * BS[tx][k];\n\ #else\n\ sum += AS[ty * BLOCK_SIZE + k] * BS[tx * BLOCK_SIZE + k];\n\ #endif\n\ }\n\ \n\ barrier(CLK_LOCAL_MEM_FENCE);\n\ }\n\ \n\ C[get_global_id(1) * C_WIDTH + get_global_id(0)] = sum;\n\ }\n\ \n"; float A[24][72], B[24][72], C[24][24]; void mx_fill() { int i, j, k = 0; for (i = 0; i < 24; i++) { for (j = 0; j < 72; j++, k++) { A[i][j] = sinf((float)k); B[i][j] = cosf((float)k); } } for (i = 0; i < 24; i++) { for (j = 0; j < 24; j++) { float sum = 0; for (k = 0; k < 72; k++) { sum += A[i][k] * B[j][k]; } C[i][j] = sum; } } } int main(int argc, char **argv) { mx_fill(); cl_uint n_platforms = 0; if (clGetPlatformIDs(0, NULL, &n_platforms)) { return 1; } cl_platform_id *platforms = malloc(n_platforms * sizeof(*platforms)); if (clGetPlatformIDs(n_platforms, platforms, NULL)) { return 2; } cl_uint i_platform; char nme[256]; size_t nn; for (i_platform = 0; i_platform < n_platforms; i_platform++) { if (clGetPlatformInfo(platforms[i_platform], CL_PLATFORM_VENDOR, sizeof(nme), nme, &nn)) { return 3; } if (strcasestr(nme, "Intel")) { break; } } if (i_platform >= n_platforms) { printf("No Intel ocl platforms found\n"); return 4; } printf("Selected ocl platform: %s\n", nme); cl_uint n_devices; if (clGetDeviceIDs(platforms[i_platform], CL_DEVICE_TYPE_CPU, 0, 0, &n_devices)) { return 5; } cl_device_id *devices = malloc(n_devices * sizeof(*devices)); if (clGetDeviceIDs(platforms[i_platform], CL_DEVICE_TYPE_CPU, n_devices, devices, NULL)) { return 6; } if (clGetDeviceInfo(devices[0], CL_DEVICE_NAME, sizeof(nme), nme, &nn)) { return 7; } printf("Selected ocl device: %s\n", nme); cl_context ctx = clCreateContext(NULL, 1, devices, NULL, NULL, NULL); if (!ctx) { return 8; } cl_command_queue queue = clCreateCommandQueue(ctx, devices[0], 0, NULL); if (!queue) { return 9; } const char *srcs[2] = {"\n", kernel_source}; cl_program prgs[2]; prgs[0] = clCreateProgramWithSource(ctx, 2, srcs, NULL, NULL); if (!prgs[0]) { return 10; } srcs[0] = "#define USE_2D 1\n"; prgs[1] = clCreateProgramWithSource(ctx, 2, srcs, NULL, NULL); if (!prgs[1]) { return 11; } cl_kernel krns[2]; char log[2048]; int i; for (i = 0; i < 2; i++) { if (clBuildProgram(prgs[i], 1, devices, "", NULL, NULL)) { return 12; } if (clGetProgramBuildInfo(prgs[i], devices[0], CL_PROGRAM_BUILD_LOG, sizeof(log), log, &nn)) { return 13; } printf("\nBuild log for program %d:\n%s\n", i, log); krns[i] = clCreateKernel(prgs[i], "mx_mul", NULL); if (!krns[i]) { return 14; } } cl_mem a, b, c; a = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(A), A, NULL); b = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(B), B, NULL); c = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(C), NULL, NULL); if ((!a) || (!b) || (!c)) { return 15; } float CC[24][24]; for (i = 0; i < 2; i++) { printf("\nKernel %d\n", i); if (clSetKernelArg(krns[i], 0, sizeof(a), &a) || clSetKernelArg(krns[i], 1, sizeof(b), &b) || clSetKernelArg(krns[i], 2, sizeof(c), &c)) { return 16; } size_t global_size[2] = {24, 24}, local_size[2] = {24, 24}; cl_int err = clEnqueueNDRangeKernel(queue, krns[i], 2, NULL, global_size, local_size, 0, NULL, NULL); if (err) { printf("clEnqueueNDRangeKernel() failed with code %d\n", err); continue; } if ((err = clEnqueueReadBuffer(queue, c, CL_TRUE, 0, sizeof(CC), CC, 0, NULL, NULL))) { printf("clEnqueueReadBuffer() failed with code %d\n", err); continue; } float max_diff = 0; int j, k; for (j = 0; j < 24; j++) { for (k = 0; k < 24; k++) { float diff = fabs(CC[j][k] - C[j][k]); if (diff > max_diff) { max_diff = diff; } } } printf("max_diff = %.6f\n", max_diff); } printf("\nEnd of job\n"); return 0; }