I am running into an issue on my Intel GPU where the high 32 bits of a 64-bit product are all 0's. In particular it happens when:
1. One of the operands is an immediate value
and
2. The other operand is a 64-bit long less than 2^32-1, but large enough that the product should overflow into the higher 32 bits of the 64-bit result.
I have provided code to demonstrate the bug. It copies 0xfefefefe to the device (as a 64-bit long), and multiplies it by 0x1234 and copies back the result.
It produces this output on my machine:
Intel(R) Core(TM) i7-8850H CPU @ 2.60GHz Expected: 1221b9b9a798 Actual: 1221b9b9a798 Intel(R) UHD Graphics 630 Expected: 1221b9b9a798 Actual: b9b9a798 AMD Radeon Pro 560X Compute Engine Expected: 1221b9b9a798 Actual: 1221b9b9a798
#include <iostream>
#include <vector>
#ifdef __APPLE__
#define CL_SILENCE_DEPRECATION
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
const char *_kernelSource =
"__kernel void testKernel(__global long *ptr)\
{\
if(get_global_id(0) == 0) {\
ptr[0] *= 0x1234;\
}\
}";
void run_test(cl_device_id device_id)
{
cl_context ctx;
cl_command_queue cmd;
cl_program prog;
cl_kernel kernel;
cl_int err = 0;
ctx = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!ctx)
{
std::cout << "Error: Failed to create a compute context!" << std::endl;
return;
}
cmd = clCreateCommandQueue(ctx, device_id, 0, &err);
if (!cmd)
{
std::cout << "Error: Failed to create a command commands!" << std::endl;
return;
}
size_t len = strlen(_kernelSource);
prog = clCreateProgramWithSource(ctx, 1, &_kernelSource, &len, &err);
if (!prog)
{
std::cout << "Error: Failed to create compute program!" << std::endl;
return;
}
err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2048] = {0};
std::cout << "Error: Failed to build program executable! error code " << err << std::endl;
clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
std::cout << "error: " << buffer << std::endl;
return;
}
kernel = clCreateKernel(prog, "testKernel", &err);
if (!kernel || err != CL_SUCCESS)
{
std::cout << "Error: Failed to create compute kernel!" << std::endl;
return;
}
size_t global = 1;
size_t local = 0;
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if (err != CL_SUCCESS)
{
std::cout << "Error: Failed to retrieve kernel work group info!" << std::endl;
return;
}
global = local;
// Copy test value over to device
uint64_t test_value = 0xfefefefe;
cl_mem dev_ptr = clCreateBuffer(ctx, 0, sizeof(uint64_t), NULL, &err);
clEnqueueWriteBuffer(cmd, dev_ptr, CL_TRUE, 0, sizeof(uint64_t), &test_value, 0, NULL, NULL);
// Call the kernel
clSetKernelArg(kernel, 0, sizeof(cl_mem), &dev_ptr);
err = clEnqueueNDRangeKernel(cmd, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
if (err)
{
std::cout << "Error: Failed to execute kernel! Error code " << err << std::endl;
return;
}
clFinish(cmd);
// Copy result back from device
std::cout << " Expected: " << std::hex << (test_value * 0x1234) << std::endl;
uint64_t result;
clEnqueueReadBuffer(cmd, dev_ptr, CL_TRUE, 0, sizeof(uint64_t), &result, 0, NULL, NULL);
std::cout << " Actual: " << std::hex << result << std::endl;
clFinish(cmd);
// Cleanup
clReleaseMemObject(dev_ptr);
clReleaseProgram(prog);
clReleaseKernel(kernel);
clReleaseCommandQueue(cmd);
clReleaseContext(ctx);
}
int main(int argc, char** argv)
{
int err = 0;
unsigned int num_platforms = 0;
cl_platform_id platform_ids[10];
std::vector<cl_device_id> device_ids;
std::vector<std::string> device_names;
err = clGetPlatformIDs(0, NULL, &num_platforms);
if(num_platforms == 0) {
std::cout << "No OpenCL platforms found" << std::endl;
return 0;
}
clGetPlatformIDs(num_platforms, platform_ids, NULL);
// Get device ID's and names
for(int i = 0; i < num_platforms; i++) {
unsigned int num_devices = 0;
cl_device_id ids[10];
clGetDeviceIDs(platform_ids, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
if(num_devices == 0) {
continue;
}
clGetDeviceIDs(platform_ids, CL_DEVICE_TYPE_ALL, num_devices, ids, NULL);
for(int j = 0; j < num_devices; j++) {
char buf[256] = {0};
size_t name_size;
clGetDeviceInfo(ids, CL_DEVICE_NAME, sizeof(buf), buf, &name_size);
device_ids.push_back(ids);
device_names.push_back(std::string(buf));
}
}
// Run the test on each device
for(int i = 0; i < device_ids.size(); i++) {
std::cout << device_names << std::endl;
run_test(device_ids);
}
return 0;
}
Thanks
連結已複製
Hi BenR,
Thanks for the clear example and the interest.
Can you confirm some platform details?
Are you on Linux* OS or Windows* OS?
For Intel Graphics Technology, which OpenCL runtime are you using? Which version?
If you're on a recent NEO Linux Debian/Ubuntu build, can you provide the versions of intel-gmmlib_X.X.X.X, intel-igc-core_X.X.X intel-igc-opencl_X.X.X, and intel-opencl_X.X.X packages? Or perhaps at least the Device Driver Version as reported from clGetDeviceInfo(...) or a platform interrogation tool like clinfo?
If it's Windows* OS can you provide the Device Driver Version as reported from clGetDeviceInfo(...) or a platform interrogation tool like clinfo? The driver revision for the package can be good as well. Reference for Windows* OS: https://downloadcenter.intel.com/product/80939/Graphics-Drivers.
Any deployment details that can be provided for enabling Intel Graphics Technology/Intel(R) UHD Graphics 630 are useful... thanks.
-MichaelC
Hi Michael,
I originally discovered it on MacOS, but I have reproduced it on a Windows 10 laptop with an Intel HD Graphics 530.
This is the version information from clinfo.exe on the Windows machine:
Device Name Intel(R) HD Graphics 530 Device Vendor Intel(R) Corporation Device Vendor ID 0x8086 Device Version OpenCL 2.0 Driver Version 21.20.16.4664 Device OpenCL C Version OpenCL C 2.0
Thanks