Intel® Quartus® Prime Software
Intel® Quartus® Prime Design Software, Design Entry, Synthesis, Simulation, Verification, Timing Analysis, System Design (Platform Designer, formerly Qsys)
16597 Discussions

clEnqueueNDRangeKernel returns -59 on HARPv2

whan01
Beginner
748 Views

I write a short code in OpenCL. The host fuction is below

float h_gaus[3][3] = {{0.0625, 0.125, 0.0625}, {0.1250, 0.250, 0.1250}, {0.0625, 0.125, 0.0625}}; cl_mem d_gaus = clCreateBuffer(ocl.clContext, CL_MEM_READ_WRITE, 3 * 3 * sizeof(float), NULL, &clStatus); clStatus = clEnqueueWriteBuffer(ocl.clCommandQueue, d_gaus, CL_TRUE, 0, 3 * 3 * sizeof(float), h_gaus, 0, NULL, NULL); int rowsc, colsc, in_size; #ifdef TEST_ROWS rowsc = TEST_ROWS; #endif   #ifdef TEST_COLS colsc = TEST_COLS; #endif in_size = rowsc * colsc * sizeof(unsigned char); // define input and output(local buffer) unsigned char *test_frame; test_frame = (unsigned char*) alignedMalloc(in_size);   // initialize input frame for (int i = 0; i < rowsc; i++) for (int j = 0; j < colsc; j++) { test_frame[i * rowsc + j] = (unsigned char)100; }   unsigned char *h_test; unsigned char *h_out; h_test = (unsigned char *)clSVMAllocAltera(ocl.clContext, 0, in_size, 1024);h_out = (unsigned char *)clSVMAllocAltera(ocl.clContext, 0, in_size, 1024); memcpy(h_test, test_frame, in_size); size_t ls[2] = {(size_t)16, (size_t)16}; size_t gs[2] = {(size_t)rowsc,(size_t)colsc}; size_t *offset = NULL; clSetKernelArgSVMPointerAltera(ocl.clKernel_gauss, 0, (void*)h_test); clSetKernelArgSVMPointerAltera(ocl.clKernel_gauss, 1, (void*)h_out); clSetKernelArg(ocl.clKernel_gauss, 2, sizeof(int), &rowsc); clSetKernelArg(ocl.clKernel_gauss, 3, sizeof(int), &colsc); clSetKernelArg(ocl.clKernel_gauss, 4, (L_SIZE + 2) * (L_SIZE + 2) * sizeof(int), NULL); clSetKernelArg(ocl.clKernel_gauss, 5, sizeof(cl_mem), &d_gaus); clStatus = clEnqueueSVMMap(ocl.clCommandQueue, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, (void *)h_test, in_size, 0, NULL, NULL); CL_ERR(); clStatus = clEnqueueSVMMap(ocl.clCommandQueue, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, (void *)h_out, in_size, 0, NULL, NULL); CL_ERR(); clStatus = clEnqueueNDRangeKernel(ocl.clCommandQueue, ocl.clKernel_gauss, 2, offset, gs, ls, 0, NULL, NULL);

Here I define rowsc and colsc are both 64. The kernel function is below 

__kernel void gaussian_kernel(__global unsigned char *data, __global unsigned char *out, int rows, int cols, __local int *l_data, __global float *gaus) { int sum = 0; int g_row = get_global_id(0); int g_col = get_global_id(1); int l_row = get_local_id(0) + 1; int l_col = get_local_id(1) + 1;   int pos = g_row * cols + g_col; out[pos] = data[pos]; // copy to local, the position itself l_data[l_row * (L_SIZE + 2) + l_col] = data[pos];   // top most row if (l_row == 1) { // top most global row, fill it with 0 l_data[0 * (L_SIZE + 2) + l_col] = (g_row==0 ? 0 : data[pos - cols]);   // top left if (l_col == 1) l_data[0 * (L_SIZE + 2) + 0] = (g_row==0 ? 0 : data[pos - cols - 1]);   // top right else if (l_col == L_SIZE) l_data[0 * (L_SIZE + 2) + L_SIZE + 1] = (g_row==0 ? 0 : data[pos - cols + 1]); }   // bottom most row else if (l_row == L_SIZE) { l_data[(L_SIZE + 1) * (L_SIZE + 2) + l_col] = (g_row==rows-1 ? 0 : data[pos + cols]);   // bottom left if (l_col == 1) l_data[(L_SIZE + 1) * (L_SIZE + 2) + 0] = (g_row==rows-1 ? 0 : data[pos + cols - 1]);   // bottom right else if (l_col == L_SIZE) l_data[(L_SIZE + 1) * (L_SIZE + 2) + L_SIZE + 1] = (g_row == rows-1 ? 0 : data[pos + cols + 1]); }   // left most col if (l_col == 1) l_data[l_row * (L_SIZE + 2) + 0] = (g_col == 0 ? 0 : data[pos - 1]); // right most col else if (l_col == L_SIZE) l_data[l_row * (L_SIZE + 2) + L_SIZE + 1] = (g_col == cols-1 ? 0 : data[pos + 1]);   barrier(CLK_LOCAL_MEM_FENCE);   // compute convolution for (int i = 0; i < 3; i++) { for (int j = 0; j < 3; j++) { sum += gaus[i * 3 + j] * l_data[(i + l_row - 1) * (L_SIZE + 2) + j + l_col - 1]; } }   out[pos] = min(255, max(0, sum)); }

I didn't add any restriction to neither max work group size nor max work items. I didn't assign the kernel a "task" attribute, either. But when I try to enqueue a 2D NDRange kernel, it returns -59. I didn't find any introduction about this condition on khronos' group website. So I wonder where is wrong in my code? Or any point I didn't get about HARPv2?

0 Kudos
3 Replies
MEIYAN_L_Intel
Employee
603 Views

Hi,

May I know do you have add attribute header eg: __attribute__((reqd_work_group_size(256, 2, 1))) in the kernel?

Thanks

0 Kudos
whan01
Beginner
603 Views
0 Kudos
MEIYAN_L_Intel
Employee
603 Views

Hi,

Can you provide the host code and kernel code attached as file here, so that I can run it on my side for further investigation.

Thanks.

 

0 Kudos
Reply