/* Bug description =============== A video processing filter was reported to produce garbage on Intel OpenCL. After narrowing down the issue, I demonstrated that the chosen 32x8 work size was the culprit. This demo tries the X-Y combinations up to 32x32 and shows that not only 32x8 but other combinations also don't work. Bug summary: At (X-Y) 32x16, 32x8, and 32x4, only the first row is processed from each block. Occurs on - Intel integrated GPUs (both UHD 750 and 770) - My environment is Windows 11 Pro, i7-11700 Community reports come from different systems. - Intel OpenCL drivers 32.0.101.6078 (2024), 32.0.101.6632 (2025) and on older ones (unknown since when) NVIDIA OpenCL seems to be unaffected, not tested for other GPUs. The problem occurs at specific work sizes: 32x4, 32x8, 32x16 Other notes: The queue is verified as in-order The issue persists across GPU generations The issue persists across driver versions No synchronization helps (events, clFinish(), etc.) No global barriers in kernel code help. pinterf */ #include #include #include #include #include #include #include // Utility function to check OpenCL errors int check_cl_error(cl_int err, const char* msg) { if (err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error %d at %s\n", err, msg); return 1; } return 0; } #define CHECK_CL_ERROR(res, msg) \ "retval = cleck_cl_error(res, msg); if (retval) return 1;" // Kernel source without memory fences const char* kernel_source = "\n\ //__constant sampler_t nne = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n\ //__constant sampler_t clm = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n\ __constant sampler_t clm = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n\ \n\ __kernel __attribute__((reqd_work_group_size(BLOCK_X,BLOCK_Y,1)))\n\ __kernel void fillA(__read_only image2d_t input, __write_only image2d_t outputA) {\n\ size_t global_id_x = get_global_id(0);\n\ size_t global_id_y = get_global_id(1);\n\ uint pixel = read_imageui(input, clm, (int2)(global_id_x, global_id_y)).x;\n\ pixel = pixel + 1;\n\ // Process the pixel\n\ write_imageui(outputA, (int2)(global_id_x, global_id_y), pixel);\n\ }\n\ \n\ __kernel __attribute__((reqd_work_group_size(BLOCK_X,BLOCK_Y,1)))\n\ __kernel void processB(__read_only image2d_t inputA, __write_only image2d_t outputB) {\n\ size_t global_id_x = get_global_id(0);\n\ size_t global_id_y = get_global_id(1);\n\ uint pixel = read_imageui(inputA, clm, (int2)(global_id_x, global_id_y)).x;\n\ pixel = pixel + 2;\n\ // Process the pixel\n\ write_imageui(outputB, (int2)(global_id_x, global_id_y), pixel);\n\ }\n\ \n\ __kernel __attribute__((reqd_work_group_size(BLOCK_X,BLOCK_Y,1)))\n\ __kernel void processC(__read_only image2d_t inputB, __write_only image2d_t outputA) {\n\ size_t global_id_x = get_global_id(0);\n\ size_t global_id_y = get_global_id(1);\n\ uint pixel = read_imageui(inputB, clm, (int2)(global_id_x, global_id_y)).x;\n\ pixel = pixel + 3;\n\ pixel = global_id_y*640 + global_id_x + 6;\n\ // Process the pixel\n\ write_imageui(outputA, (int2)(global_id_x, global_id_y), pixel);\n\ }\n\ \n\ __kernel __attribute__((reqd_work_group_size(BLOCK_X,BLOCK_Y,1)))\n\ __kernel void verify(__read_only image2d_t inputA, __write_only image2d_t output) {\n\ size_t global_id_x = get_global_id(0);\n\ size_t global_id_y = get_global_id(1);\n\ uint pixel = read_imageui(inputA, clm, (int2)(global_id_x, global_id_y)).x;\n\ write_imageui(output, (int2)(global_id_x, global_id_y), pixel);\n\ }\n"; cl_device_id getIntelUHDDevice() { cl_uint num_platforms; clGetPlatformIDs(0, nullptr, &num_platforms); std::vector platforms(num_platforms); clGetPlatformIDs(num_platforms, platforms.data(), nullptr); for (auto platform : platforms) { cl_uint num_devices; clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, nullptr, &num_devices); std::vector devices(num_devices); clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices.data(), nullptr); for (auto device : devices) { char device_name[128]; clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, nullptr); if (std::string(device_name).find("Intel(R) UHD") != std::string::npos) { std::cout << "Found Intel UHD GPU: " << device_name << std::endl; return device; } } } std::cout << "Intel UHD GPU not found." << std::endl; return nullptr; } static void print_driver_info(cl_device_id deviceID) { // Print device info char device_name[256]; cl_int err = clGetDeviceInfo(deviceID, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); CHECK_CL_ERROR(err, "clGetDeviceInfo CL_DEVICE_NAME"); printf("Using device: %s\n", device_name); // Print driver version char driver_version[256]; err = clGetDeviceInfo(deviceID, CL_DRIVER_VERSION, sizeof(driver_version), driver_version, NULL); CHECK_CL_ERROR(err, "clGetDeviceInfo CL_DRIVER_VERSION"); printf("Driver version: %s\n", driver_version); } static int main_reproduce_error(cl_device_id deviceID, const size_t BLOCK_X, const size_t BLOCK_Y) { cl_int err; // 4,4: OK // 8,4: OK // 16,4: OK // 32,4: bug // 8,8: OK // 16,8: OK // 16,16: OK // 16,32: OK // 32,1 32,2 : OK // 32,4: 32,8, 32,16: bug // 32,32: too much, exceeds limit const size_t DATA_SIZE_X = 640; const size_t DATA_SIZE_Y = 480; const size_t DATA_SIZE = DATA_SIZE_X * DATA_SIZE_Y; // Initialize input data uint32_t* input_data = (uint32_t*)malloc(DATA_SIZE * sizeof(uint32_t)); uint32_t* result_data = (uint32_t*)malloc(DATA_SIZE * sizeof(uint32_t)); for (uint32_t i = 0; i < DATA_SIZE; i++) { input_data[i] = i; } size_t max_work_group_size; cl_int ret = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group_size, NULL); CHECK_CL_ERROR(err, "clGetDeviceInfo CL_DEVICE_MAX_WORK_GROUP_SIZE"); printf("Local work size: [%zu, %zu] Max workgroup size: %zu\n", BLOCK_X, BLOCK_Y, max_work_group_size); if (max_work_group_size < BLOCK_X * BLOCK_Y) return 3; // x, y exceeds work group size limit // Create context and command queue cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &err); CHECK_CL_ERROR(err, "clCreateContext"); cl_command_queue_properties properties[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0 }; cl_command_queue queue = clCreateCommandQueueWithProperties(context, deviceID, properties, &err); /* OpenCL 2.0 // Create in-order queue cl_command_queue queue = clCreateCommandQueue(context, deviceID, 0, &err); CHECK_CL_ERROR(err, "clCreateCommandQueue"); */ // Create and build program cl_program program = clCreateProgramWithSource(context, 1, &kernel_source, NULL, &err); CHECK_CL_ERROR(err, "clCreateProgramWithSource"); // dimensions injected into kernel as defines char options[2048]; snprintf(options, 2048, "-D VI_DIM_X=%zu -D VI_DIM_Y=%zu -D BLOCK_X=%zu -D BLOCK_Y=%zu", DATA_SIZE_X, DATA_SIZE_Y, BLOCK_X, BLOCK_Y); err = clBuildProgram(program, 1, &deviceID, options, NULL, NULL); if (err != CL_SUCCESS) { size_t log_size; clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); char* log = (char*)malloc(log_size); clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); fprintf(stderr, "Build Error:\n%s\n", log); free(log); exit(1); } // Create kernels cl_kernel kernel_fillA = clCreateKernel(program, "fillA", &err); CHECK_CL_ERROR(err, "clCreateKernel fillA"); cl_kernel kernel_processB = clCreateKernel(program, "processB", &err); CHECK_CL_ERROR(err, "clCreateKernel processB"); cl_kernel kernel_processC = clCreateKernel(program, "processC", &err); CHECK_CL_ERROR(err, "clCreateKernel processC"); cl_kernel kernel_verify = clCreateKernel(program, "verify", &err); CHECK_CL_ERROR(err, "clCreateKernel verify"); cl_image_format format; format.image_channel_order = CL_R; format.image_channel_data_type = CL_UNSIGNED_INT32; cl_image_desc desc; desc.image_type = CL_MEM_OBJECT_IMAGE2D; desc.image_width = DATA_SIZE_X; desc.image_height = DATA_SIZE_Y; desc.image_depth = 0; desc.image_array_size = 1; desc.image_row_pitch = 0; desc.image_slice_pitch = 0; desc.num_mip_levels = 0; desc.num_samples = 0; desc.buffer = NULL; cl_mem input_image = clCreateImage(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &format, &desc, input_data, &err); CHECK_CL_ERROR(err, "clCreateImage input"); cl_mem imageA = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err); CHECK_CL_ERROR(err, "clCreateImage A"); cl_mem imageB = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err); CHECK_CL_ERROR(err, "clCreateImage B"); cl_mem output_image = clCreateImage(context, CL_MEM_WRITE_ONLY, &format, &desc, NULL, &err); CHECK_CL_ERROR(err, "clCreateImage output"); err = clSetKernelArg(kernel_processC, 0, sizeof(cl_mem), &imageB); err |= clSetKernelArg(kernel_processC, 1, sizeof(cl_mem), &imageA); CHECK_CL_ERROR(err, "setKernelArg processC"); err = clSetKernelArg(kernel_verify, 0, sizeof(cl_mem), &imageA); err |= clSetKernelArg(kernel_verify, 1, sizeof(cl_mem), &output_image); CHECK_CL_ERROR(err, "setKernelArg verify"); // Execute kernels. global: whole image. size_t global_size[] = { DATA_SIZE_X, DATA_SIZE_Y }; // Using smaller work size, comes from parameter size_t work_size_2d[] = { BLOCK_X, BLOCK_Y }; size_t origin[3] = { 0, 0, 0 }; size_t region[3] = { DATA_SIZE_X, DATA_SIZE_Y, 1 }; err = clEnqueueNDRangeKernel(queue, kernel_processC, 2, NULL, global_size, work_size_2d, 0, NULL, NULL); CHECK_CL_ERROR(err, "enqueueNDRange processC"); err = clEnqueueNDRangeKernel(queue, kernel_verify, 2, NULL, global_size, work_size_2d, 0, NULL, NULL); CHECK_CL_ERROR(err, "enqueueNDRange verify"); // Read back results err = clEnqueueReadImage(queue, output_image, CL_TRUE, origin, region, DATA_SIZE_X * sizeof(uint32_t), 0, result_data, 0, NULL, NULL); CHECK_CL_ERROR(err, "enqueueReadImage"); // Verify results int errors = 0; for (size_t i = 0; i < DATA_SIZE; i++) { uint32_t expected = input_data[i] + 1 + 2 + 3; // Expected transformation chain if (result_data[i] != expected) { if (errors < 4) { // Only print first 4 errors printf("Error at index %zu: expected %u, got %u\n", i, expected, result_data[i]); } errors++; } } if (errors) printf("Total errors: %d out of %zu elements\n", errors, DATA_SIZE); else printf("O.K.\n"); // Cleanup clReleaseMemObject(input_image); clReleaseMemObject(imageA); clReleaseMemObject(imageB); clReleaseMemObject(output_image); clReleaseKernel(kernel_fillA); clReleaseKernel(kernel_processB); clReleaseKernel(kernel_processC); clReleaseKernel(kernel_verify); clReleaseProgram(program); clReleaseCommandQueue(queue); clReleaseContext(context); free(input_data); free(result_data); if (errors > 0) return 2; return 0; } void demonstrate_error() { cl_device_id intel_device_id = getIntelUHDDevice(); if (intel_device_id == nullptr) exit(0); print_driver_info(intel_device_id); int res; // Try block sizes 4x1,2,4 to 32x1,2,..32 for (int i = 4; i <= 32; i *= 2) { for (int j = 1; j <= i; j *= 2) { res = main_reproduce_error(intel_device_id, i, j); if (res == 1) printf("parameter failure\n\n"); else if (res == 2) printf("Bug. Data mismatch.\n\n"); else if (res == 3) printf("X-Y size too much, not supported\n\n"); } } printf("Ready.\n"); } int main() { demonstrate_error(); exit(0); }