Intel® Quartus® Prime Software
Intel® Quartus® Prime Design Software, Design Entry, Synthesis, Simulation, Verification, Timing Analysis, System Design (Platform Designer, formerly Qsys)
Announcements
Intel Support hours are Monday-Friday, 8am-5pm PST, except holidays. Thanks to our community members who provide support during our downtime or before we get to your questions. We appreciate you!

Need Forum Guidance? Click here
Search our FPGA Knowledge Articles here.
15463 Discussions

clEnqueueNDRangeKernel returns -59 on HARPv2

whan01
Beginner
274 Views

I wrote a short program in OpenCL. The host function is below:

float h_gaus[3][3] = {{0.0625, 0.125, 0.0625}, {0.1250, 0.250, 0.1250}, {0.0625, 0.125, 0.0625}}; cl_mem d_gaus = clCreateBuffer(ocl.clContext, CL_MEM_READ_WRITE, 3 * 3 * sizeof(float), NULL, &clStatus); clStatus = clEnqueueWriteBuffer(ocl.clCommandQueue, d_gaus, CL_TRUE, 0, 3 * 3 * sizeof(float), h_gaus, 0, NULL, NULL); int rowsc, colsc, in_size; #ifdef TEST_ROWS rowsc = TEST_ROWS; #endif   #ifdef TEST_COLS colsc = TEST_COLS; #endif in_size = rowsc * colsc * sizeof(unsigned char); // define input and output(local buffer) unsigned char *test_frame; test_frame = (unsigned char*) alignedMalloc(in_size);   // initialize input frame for (int i = 0; i < rowsc; i++) for (int j = 0; j < colsc; j++) { test_frame[i * rowsc + j] = (unsigned char)100; }   unsigned char *h_test; unsigned char *h_out; h_test = (unsigned char *)clSVMAllocAltera(ocl.clContext, 0, in_size, 1024);h_out = (unsigned char *)clSVMAllocAltera(ocl.clContext, 0, in_size, 1024); memcpy(h_test, test_frame, in_size); size_t ls[2] = {(size_t)16, (size_t)16}; size_t gs[2] = {(size_t)rowsc,(size_t)colsc}; size_t *offset = NULL; clSetKernelArgSVMPointerAltera(ocl.clKernel_gauss, 0, (void*)h_test); clSetKernelArgSVMPointerAltera(ocl.clKernel_gauss, 1, (void*)h_out); clSetKernelArg(ocl.clKernel_gauss, 2, sizeof(int), &rowsc); clSetKernelArg(ocl.clKernel_gauss, 3, sizeof(int), &colsc); clSetKernelArg(ocl.clKernel_gauss, 4, (L_SIZE + 2) * (L_SIZE + 2) * sizeof(int), NULL); clSetKernelArg(ocl.clKernel_gauss, 5, sizeof(cl_mem), &d_gaus); clStatus = clEnqueueSVMMap(ocl.clCommandQueue, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, (void *)h_test, in_size, 0, NULL, NULL); CL_ERR(); clStatus = clEnqueueSVMMap(ocl.clCommandQueue, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, (void *)h_out, in_size, 0, NULL, NULL); CL_ERR(); clStatus = clEnqueueNDRangeKernel(ocl.clCommandQueue, ocl.clKernel_gauss, 2, offset, gs, ls, 0, NULL, NULL);

Here rowsc and colsc are both defined as 64. The kernel function is below:

/*
 * Returns the pixel at (r, c) of a row-major rows x cols image, or 0 when
 * (r, c) lies outside the image (zero padding).
 */
int gaussian_fetch_padded(__global const unsigned char *data,
                          int r, int c, int rows, int cols)
{
    if (r < 0 || r >= rows || c < 0 || c >= cols) {
        return 0;
    }
    return data[r * cols + c];
}

/*
 * 3x3 Gaussian blur of an 8-bit, row-major rows x cols image.
 *
 * data   - input pixels
 * out    - output pixels, same layout as data
 * rows   - image height; global dimension 0 indexes rows
 * cols   - image width; global dimension 1 indexes columns
 * l_data - local scratch of (L_SIZE + 2) * (L_SIZE + 2) ints: the
 *          work-group's tile plus a 1-pixel halo
 * gaus   - 9 filter weights, row-major 3x3
 *
 * Must be launched with L_SIZE x L_SIZE work-groups: the halo is filled by
 * the work-items whose local id is 0 or L_SIZE - 1 in each dimension.
 * Pixels outside the image are treated as 0.
 */
__kernel void gaussian_kernel(__global unsigned char *data,
                              __global unsigned char *out,
                              int rows, int cols,
                              __local int *l_data,
                              __global float *gaus)
{
    const int stride = L_SIZE + 2;           /* width of one local tile row */
    const int g_row = get_global_id(0);
    const int g_col = get_global_id(1);
    const int l_row = get_local_id(0) + 1;   /* +1 skips the halo border */
    const int l_col = get_local_id(1) + 1;

    /* Which tile edges this work-item sits on. */
    const int on_top = (l_row == 1);
    const int on_bottom = (l_row == L_SIZE);
    const int on_left = (l_col == 1);
    const int on_right = (l_col == L_SIZE);

    /* Every work-item stages its own pixel. */
    l_data[l_row * stride + l_col] =
        gaussian_fetch_padded(data, g_row, g_col, rows, cols);

    /* Edge work-items additionally stage the adjacent halo pixel. */
    if (on_top) {
        l_data[0 * stride + l_col] =
            gaussian_fetch_padded(data, g_row - 1, g_col, rows, cols);
    }
    if (on_bottom) {
        l_data[(L_SIZE + 1) * stride + l_col] =
            gaussian_fetch_padded(data, g_row + 1, g_col, rows, cols);
    }
    if (on_left) {
        l_data[l_row * stride + 0] =
            gaussian_fetch_padded(data, g_row, g_col - 1, rows, cols);
    }
    if (on_right) {
        l_data[l_row * stride + L_SIZE + 1] =
            gaussian_fetch_padded(data, g_row, g_col + 1, rows, cols);
    }

    /* Corner work-items stage the diagonal halo pixel. Both the row and the
     * column are bounds-checked, so a corner on the image's left or right
     * edge yields 0 instead of a pixel wrapped from the neighbouring row. */
    if (on_top && on_left) {
        l_data[0 * stride + 0] =
            gaussian_fetch_padded(data, g_row - 1, g_col - 1, rows, cols);
    }
    if (on_top && on_right) {
        l_data[0 * stride + L_SIZE + 1] =
            gaussian_fetch_padded(data, g_row - 1, g_col + 1, rows, cols);
    }
    if (on_bottom && on_left) {
        l_data[(L_SIZE + 1) * stride + 0] =
            gaussian_fetch_padded(data, g_row + 1, g_col - 1, rows, cols);
    }
    if (on_bottom && on_right) {
        l_data[(L_SIZE + 1) * stride + L_SIZE + 1] =
            gaussian_fetch_padded(data, g_row + 1, g_col + 1, rows, cols);
    }

    /* All work-items reach this barrier; none has returned early. */
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Accumulate in float: adding each weighted tap into an int would
     * truncate the fractional part after every tap. */
    float acc = 0.0f;
    for (int i = 0; i < 3; i++) {
        for (int j = 0; j < 3; j++) {
            acc += gaus[i * 3 + j] *
                   (float)l_data[(l_row - 1 + i) * stride + (l_col - 1 + j)];
        }
    }

    /* Work-items outside the image take part in the barrier above but must
     * not write. The result is rounded to nearest and saturated to [0, 255]. */
    if (g_row < rows && g_col < cols) {
        out[g_row * cols + g_col] = convert_uchar_sat_rte(acc);
    }
}

I didn't add any restriction on either the max work-group size or the max work-items, and I didn't give the kernel a "task" attribute either. But when I try to enqueue a 2D NDRange kernel, it returns -59. I couldn't find any explanation of this condition on the Khronos Group website. So I wonder what is wrong in my code? Or is there something I have missed about HARPv2?

0 Kudos
3 Replies
MEIYAN_L_Intel
Employee
129 Views

Hi,

May I know whether you have added a work-group size attribute to the kernel, e.g. __attribute__((reqd_work_group_size(256, 2, 1)))?

Thanks

whan01
Beginner
129 Views
MEIYAN_L_Intel
Employee
129 Views

Hi,

Could you attach the host code and kernel code as files here, so that I can run them on my side for further investigation?

Thanks.

 

Reply