clEnqueueNDRangeKernel returns -59 on HARPv2

whan01 · ‎10-05-2019

I write a short code in OpenCL. The host fuction is below

    float   h_gaus[3][3] = {{0.0625, 0.125, 0.0625}, {0.1250, 0.250, 0.1250}, {0.0625, 0.125, 0.0625}};
    cl_mem  d_gaus = clCreateBuffer(ocl.clContext, CL_MEM_READ_WRITE, 3 * 3 * sizeof(float), NULL, &clStatus);
    clStatus = clEnqueueWriteBuffer(ocl.clCommandQueue, d_gaus, CL_TRUE, 0, 3 * 3 * sizeof(float), h_gaus, 0, NULL, NULL);
    int rowsc, colsc, in_size;
#ifdef TEST_ROWS
    rowsc = TEST_ROWS;
#endif
 
#ifdef TEST_COLS
    colsc = TEST_COLS;
#endif
        in_size = rowsc * colsc * sizeof(unsigned char);
        // define input and output(local buffer)
        unsigned char *test_frame;
        test_frame = (unsigned char*) alignedMalloc(in_size);
 
        // initialize input frame
        for (int i = 0; i < rowsc; i++)
            for (int j = 0; j < colsc; j++)
            {
                test_frame[i * rowsc + j] = (unsigned char)100;
            }
 
        unsigned char *h_test;
        unsigned char *h_out;
        h_test = (unsigned char *)clSVMAllocAltera(ocl.clContext, 0, in_size, 1024);h_out = (unsigned char *)clSVMAllocAltera(ocl.clContext, 0, in_size, 1024); 
    memcpy(h_test, test_frame, in_size);
    size_t ls[2] = {(size_t)16, (size_t)16};
    size_t gs[2] = {(size_t)rowsc,(size_t)colsc};
    size_t *offset = NULL;
    clSetKernelArgSVMPointerAltera(ocl.clKernel_gauss, 0, (void*)h_test);
    clSetKernelArgSVMPointerAltera(ocl.clKernel_gauss, 1, (void*)h_out);
    clSetKernelArg(ocl.clKernel_gauss, 2, sizeof(int), &rowsc);
    clSetKernelArg(ocl.clKernel_gauss, 3, sizeof(int), &colsc);
    clSetKernelArg(ocl.clKernel_gauss, 4, (L_SIZE + 2) * (L_SIZE + 2) * sizeof(int), NULL);
    clSetKernelArg(ocl.clKernel_gauss, 5, sizeof(cl_mem), &d_gaus);
    clStatus = clEnqueueSVMMap(ocl.clCommandQueue, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, (void *)h_test, in_size, 0, NULL, NULL);
                    CL_ERR();
    clStatus = clEnqueueSVMMap(ocl.clCommandQueue, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, (void *)h_out, in_size, 0, NULL, NULL);
                    CL_ERR();
    clStatus = clEnqueueNDRangeKernel(ocl.clCommandQueue, ocl.clKernel_gauss, 2, offset, gs, ls, 0, NULL, NULL);

Here I define rowsc and colsc are both 64. The kernel function is below

__kernel void gaussian_kernel(__global unsigned char *data, __global unsigned char *out, int rows, int cols,
                              __local int *l_data, __global float *gaus)
{
    int sum = 0;
    int g_row = get_global_id(0);
    int g_col = get_global_id(1);
    int l_row = get_local_id(0) + 1;
    int l_col = get_local_id(1) + 1;
 
    int pos = g_row * cols + g_col;
    out[pos] = data[pos];
    // copy to local, the position itself
    l_data[l_row * (L_SIZE + 2) + l_col] = data[pos];
 
    // top most row
    if (l_row == 1)
    {
        // top most global row, fill it with 0
        l_data[0 * (L_SIZE + 2) + l_col] = (g_row==0 ? 0 : data[pos - cols]);
 
        // top left
        if (l_col == 1)
            l_data[0 * (L_SIZE + 2) + 0] = (g_row==0 ? 0 : data[pos - cols - 1]);
 
        // top right
        else if (l_col == L_SIZE)
            l_data[0 * (L_SIZE + 2) + L_SIZE + 1] = (g_row==0 ? 0 : data[pos - cols + 1]);
    }
 
    // bottom most row
    else if (l_row == L_SIZE)
    {
        l_data[(L_SIZE + 1) * (L_SIZE + 2) + l_col] = (g_row==rows-1 ? 0 : data[pos + cols]);
 
        // bottom left
        if (l_col == 1)
            l_data[(L_SIZE + 1) * (L_SIZE + 2) + 0] = (g_row==rows-1 ? 0 : data[pos + cols - 1]);
 
        // bottom right
        else if (l_col == L_SIZE)
            l_data[(L_SIZE + 1) * (L_SIZE + 2) + L_SIZE + 1] = (g_row == rows-1 ? 0 : data[pos + cols + 1]);
    }
 
    // left most col
    if (l_col == 1)
        l_data[l_row * (L_SIZE + 2) + 0] = (g_col == 0 ? 0 : data[pos - 1]);
    // right most col
    else if (l_col == L_SIZE)
        l_data[l_row * (L_SIZE + 2) + L_SIZE + 1] = (g_col == cols-1 ? 0 : data[pos + 1]);
 
    barrier(CLK_LOCAL_MEM_FENCE);
 
    // compute convolution
    for (int i = 0; i < 3; i++)
    {
        for (int j = 0; j < 3; j++)
        {
            sum += gaus[i * 3 + j] * l_data[(i + l_row - 1) * (L_SIZE + 2) + j + l_col - 1];
        }
    }
 
    out[pos] = min(255, max(0, sum));
}

I didn't add any restriction to neither max work group size nor max work items. I didn't assign the kernel a "task" attribute, either. But when I try to enqueue a 2D NDRange kernel, it returns -59. I didn't find any introduction about this condition on khronos' group website. So I wonder where is wrong in my code? Or any point I didn't get about HARPv2?

MEIYAN_L_Intel · ‎10-08-2019

Hi,

May I know do you have add attribute header eg: __attribute__((reqd_work_group_size(256, 2, 1))) in the kernel?

Thanks

whan01 · ‎10-08-2019

No I didn't add any attribute.

MEIYAN_L_Intel · ‎10-09-2019

Hi,

Can you provide the host code and kernel code attached as file here, so that I can run it on my side for further investigation.

Thanks.