Solved: OnenCL: Why get_global_id(0) returns 32 bit value on a 64 bit device?

TrueY · ‎05-10-2022

I'm new to OpenCL and I found a strange behavior. Maybe I did something wrong.

In short: I have created a simple app and it works nicely. Until a given limit. After (1<<32) get_global_id(0) returns value less then (1<<32). I checked size_t and (cl_)ulong is 64 bit!

Info got from the device:

CL_DEVICE_NAME [25]: Intel(R) HD Graphics 520
CL_DEVICE_VERSION [16]: OpenCL 3.0 NEO
CL_DRIVER_VERSION [14]: 30.0.101.1660
CL_DEVICE_ADDRESS_BITS: 64

I tried to reduce the code as much I could. My code is shown bellow (kernel embedded in the code). I use Windows 10 Enterprise (21H2) with Visual Studio 2019.

#include <cstdio>
#include <cassert>
#include <iostream>
using namespace std;

#include <CL/opencl.h>

int main() {
    cl_int err = 0;
    cl_uint num_platforms;
    cl_platform_id platforms[16]; // Can be on stack!
    err = clGetPlatformIDs(16, platforms, &num_platforms);
    assert(err == 0 && num_platforms);

    cl_uint num_devices;
    cl_device_id devices[16]; // Can be on stack!
    err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 16, devices, &num_devices);
    assert(err == 0 && num_devices);

#define PR_DEV_INFO(name, type) invoke([devices]()->type { type wrk; \
    cl_uint err = clGetDeviceInfo(devices[0], name, sizeof(wrk), (void*)&wrk, NULL);\
    assert(err == 0); cout << #name << ": " << wrk << endl; return wrk;})
#define PR_DEV_INFO_CHAR(name) invoke([devices]()->string { size_t size; \
    cl_uint err = clGetDeviceInfo(devices[0], name, 0, NULL, &size);\
    assert(err == 0); char* wrk = new char[size];\
    err = clGetDeviceInfo(devices[0], name, size, (void*)wrk, NULL);\
    assert(err == 0); string s(wrk); delete[] wrk;\
    cout << #name << " [" << size << "]: " << s << endl; return s;})
#define PR_DEV_INFO_ARR(name, type, len) invoke([devices](size_t arr_len)->void { \
    type *wrk = new type[arr_len]; \
    cl_uint err = clGetDeviceInfo(devices[0], name, sizeof(type)*arr_len, (void*)wrk, NULL);\
    assert(err == 0); cout << #name << ":";\
    for(int i=0; i<arr_len;++i) cout << ' ' << wrk[i]; cout << endl; delete[] wrk;}, len)

    PR_DEV_INFO_CHAR(CL_DEVICE_NAME);
    PR_DEV_INFO_CHAR(CL_DEVICE_VERSION);
    PR_DEV_INFO_CHAR(CL_DRIVER_VERSION);
    PR_DEV_INFO_CHAR(CL_DEVICE_EXTENSIONS);
    PR_DEV_INFO(CL_DEVICE_ADDRESS_BITS, cl_uint);
    PR_DEV_INFO(CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint);
    const size_t max_item_dim = 
        PR_DEV_INFO(CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t);
    cl_uint dims = PR_DEV_INFO(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint);
    PR_DEV_INFO_ARR(CL_DEVICE_MAX_WORK_ITEM_SIZES, size_t, dims);

    cl_context context = clCreateContext(NULL, num_devices, devices, NULL, NULL, &err);
    assert(err == 0);

    string kernel_txt(
        "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"
        "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n"
        "#pragma OPENCL EXTENSION cles_khr_int64 : enable\n"
        "__kernel void render(__global ulong * out) {\n"
        "  size_t gid = get_global_id(0);\n"
        "  size_t lid = get_local_id(0);\n"
        "  //size_t gid = lid+get_local_size(0)*get_group_id(0);\n"
        "  ulong val_new, val_org = out[lid];\n"
        "  do {\n"
        "    val_new = val_org > gid ? val_org : gid;\n"
        "  } while (!atomic_compare_exchange_strong(out + lid, &val_org, val_new));\n"
        "  if (lid == 0) out[lid] = sizeof(size_t) * 1000 + sizeof(ulong);\n"
        "}\n");
    const char* kernel_mem = kernel_txt.c_str();
    // kernel_mem cannot be on stack
    cl_program program = clCreateProgramWithSource(context, 1, &kernel_mem, NULL, &err);
    assert(err == 0);

    const char* options = "-w -Werror -cl-std=CL3.0";
    err = clBuildProgram(program, num_devices, devices, options, NULL, NULL);
    if (err) { // Dump build error log
        cerr << "Build error: " << err << endl;
        size_t size = 0;
        // Just get log size first, then read it again to the proper log
        cl_int err2 = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &size);
        char* plog = new char[size];
        err2 = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, size, plog, &size);
        cerr << "Build log (size: " << size << "): '" << plog << "' [err:" << err2 << "d]" << endl;
        delete[] plog;
        exit(1);
    }
    cl_kernel kernel = clCreateKernel(program, "render", &err);
    assert(err == 0);

    cl_ulong* host_image = new cl_ulong[max_item_dim](); // cannot be on stack!
    size_t buffer_size = sizeof(cl_ulong) * max_item_dim;

    cl_mem image = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, &err);
    assert(err == 0);

    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &image);
    assert(err == 0);

    cl_command_queue cmd_queue = clCreateCommandQueueWithProperties(context, devices[0], NULL, &err);
    assert(err == 0);

    size_t dev_wrk_size[1] = { 8 };
    size_t dev_wrk_offs[1] = { (1ULL<<32) - 4 };
    size_t loc_wrk_size[1] = { 8 };
    err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, dev_wrk_offs, dev_wrk_size, loc_wrk_size, 0, NULL, NULL);
    assert(err == 0);

    // blocking read
    err = clEnqueueReadBuffer(cmd_queue, image, CL_TRUE, 0, buffer_size, host_image, 0, NULL, NULL);
    assert(err == 0);

    err = clFinish(cmd_queue);
    assert(err == 0);

    // I use cout and printf to be sure that not this is the problem
    for (int i = 0; i < 8; ++i) cout << '[' << i << ':' << host_image[i] << "]";
    cout << endl;
    for (int i = 0; i < 8; ++i) printf("[%d:%zd]", i, host_image[i]);
    printf("\nsize_t:%zd, cl_ulong:%zd\n", sizeof(size_t), sizeof(cl_ulong));

    clReleaseMemObject(image);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(cmd_queue);
    clReleaseContext(context);
    delete[] host_image;

    return CL_SUCCESS;
}

The output is:

CL_DEVICE_NAME [25]: Intel(R) HD Graphics 520
CL_DEVICE_VERSION [16]: OpenCL 3.0 NEO
CL_DRIVER_VERSION [14]: 30.0.101.1660
CL_DEVICE_EXTENSIONS [1654]: cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_command_queue_families cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_driver_diagnostics cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_intel_subgroups_char cl_intel_subgroups_long cl_khr_il_program cl_intel_mem_force_host_memory cl_khr_subgroup_extended_types cl_khr_subgroup_non_uniform_vote cl_khr_subgroup_ballot cl_khr_subgroup_non_uniform_arithmetic cl_khr_subgroup_shuffle cl_khr_subgroup_shuffle_relative cl_khr_subgroup_clustered_reduce cl_intel_device_attribute_query cl_khr_suggested_local_work_size cl_khr_fp64 cl_khr_subgroups cl_intel_spirv_device_side_avc_motion_estimation cl_intel_spirv_media_block_io cl_intel_spirv_subgroups cl_khr_spirv_no_integer_wrap_decoration cl_intel_unified_shared_memory_preview cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_advanced_motion_estimation cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_image2d_from_buffer cl_khr_depth_images cl_khr_3d_image_writes cl_intel_media_block_io cl_khr_gl_sharing cl_khr_gl_depth_images cl_khr_gl_event cl_khr_gl_msaa_sharing cl_intel_dx9_media_sharing cl_khr_dx9_media_sharing cl_khr_d3d10_sharing cl_khr_d3d11_sharing cl_intel_d3d11_nv12_media_sharing cl_intel_sharing_format_query cl_khr_pci_bus_info cl_intel_simultaneous_sharing
CL_DEVICE_ADDRESS_BITS: 64
CL_DEVICE_MAX_COMPUTE_UNITS: 24
CL_DEVICE_MAX_WORK_GROUP_SIZE: 256
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: 3
CL_DEVICE_MAX_WORK_ITEM_SIZES: 256 256 256
[0:8008][1:4294967293][2:4294967294][3:4294967295][4:0][5:1][6:2][7:3]
[0:8008][1:4294967293][2:4294967294][3:4294967295][4:0][5:1][6:2][7:3]
size_t:8, cl_ulong:8

I found a workaround in kernel:

size_t glob_id = get_local_id(0) + get_local_size(0)*get_group_id(0);

But this cannot handle if offset parameter in clEnqueueNDRangeKernel (it can be passed as a kernel parameter).

I asked this on Stackoverflow (SO) and it seems that on other Intel GPU and on nVidia this code works properly.

Do I something wrong or there is a bug in the new OpenCL dll(s)?

Thanks in advance! TrueY

Ben_A_Intel · ‎05-24-2022

@TrueY wrote:

Thanks for the workaround, but IMHO it is not quite right!

The problem is that get_local_size(0) changes and I think it should not!

Yes, good observation! The snippet I posted above will only work for uniform work-groups. It will not work for non-uniform work-groups. For non-uniform work-groups, get_local_size returns the size of this work-group, which isn't what you want if you are computing a local ID. Instead you'll want to use get_enqueued_local_size. Something like:

// If you need the global offset:
size_t global_id_0 = get_group_id(0) * get_enqueued_local_size(0) + get_global_offset(0) + get_local_id(0);

// If you do not need the global offset:
size_t global_id_0 = get_group_id(0) * get_enqueued_local_size(0) + get_local_id(0);

Note that get_enqueued_local_size is an OpenCL C 2.0 built-in, so it requires OpenCL C 2.0 or newer (e.g. -cl-std=CL2.0, or -cl-std=CL3.0). This shouldn't be an issue because non-uniform work-groups also require OpenCL C 2.0 or newer, but worth mentioning for completeness.

View solution in original post

NoorjahanSk_Intel · ‎05-12-2022

Hi,

Thanks for reaching out to us.

Could you please provide us with the steps you have followed to reproduce the issue as we are not able to build your program at our end?if possible please provide a visual studio project file.

Also please let us know, how did you confirm that gloabl_id is giving 32-bit value.

Thanks & Regards,

Noorjahan.

TrueY · ‎05-21-2022

Dear Noorjahan,

Sorry, I did not realize that there is a comment on my question!

I have a Visual Studio 2019 Enterprise installed on my Windows 10
I installed the Intel SDK package and OpenCL related dll files.
In Visual Studio I created a new project with a single file (which is in the reported text)
Set to: Debug / x64
In menu Project / Properties:
1. C / C++ / General / Additional Include Directories
  Add : "C:\Program Files (x86)\IntelSWTools\system_studio_2020\OpenCL\sdk\include\"
2. Linker / Input / Additional Dependencies
  Add: "C:\Program Files (x86)\IntelSWTools\system_studio_2020\OpenCL\sdk\lib\x64\OpenCL.lib"
Then do Ctrl+B
Then I add two breakpoints:
1. At line 79 (exit(1);) to be able to see if the kernel compilation fails.
2. At line 121 (return CL_SUCCESS;) to be able to see the result.
Then run the built code pressing "Debug / Start Debugging (F5)"

The output is also in the report.

In the 10th output line it can be seen that the get_global_id(0) is written into the out[] global array, which is passed to the kernel. In the 0th element "8008" is returned which is "sizeof(size_t) * 1000 + sizeof(ulong)" on the kernel side. So, both types are 8 bytes wide. From the 1th element it is the value of get_global_id(0) from "(1ULL<<32) - 3" to "(1ULL<<32) + 3". Also on the PC side code the size of cl_ulong and size_t types are printed and it gives 8 bytes for both. So, it looks ok!

But the expected result is:

[1:4294967293][2:4294967294][3:4294967295][4:4294967296][5:4294967297][6:4294967298][7:4294967299]

But it is

[1:4294967293][2:4294967294][3:4294967295][4:0][5:1][6:2][7:3]

instead (see 10th and 11th lines of output). It definitely shows an overflow exactly at the 4th item, which is (1ULL<<32).

I installed a new Intel Graphics Device Driver, but same result:

CL_DRIVER_VERSION [14]: 30.0.101.1960

Project file attached.

I also add the list of dll files loaded during the run (OpenCL related dll-s are bolded):
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Users\tajttam\source\repos\opencl_bug\x64\Debug\Exercise OpenCL.exe'. Symbols loaded.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\ntdll.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\kernel32.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\KernelBase.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\OpenCL.dll'. Module was built without symbols.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\vcruntime140_1d.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\vcruntime140d.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\cfgmgr32.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\msvcp140d.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\ucrtbase.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\ucrtbased.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\ucrtbased.dll'.
'Exercise OpenCL.exe' (Win32): Unloaded 'C:\Windows\System32\ucrtbased.dll'
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\combase.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\rpcrt4.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\ole32.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\gdi32.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\win32u.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\gdi32full.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\msvcp_win.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\user32.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\advapi32.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\msvcrt.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\sechost.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\imm32.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\QIPCAP64.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\shell32.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\dnsapi.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\IPHLPAPI.DLL'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\nsi.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\kernel.appcore.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\bcryptprimitives.dll'.
'Exercise OpenCL.exe' (Win32): Unloaded 'C:\Windows\System32\bcryptprimitives.dll'
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\bcryptprimitives.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\clbcatq.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\AppXDeploymentClient.dll'.
'Exercise OpenCL.exe' (Win32): Unloaded 'C:\Windows\System32\AppXDeploymentClient.dll'
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\DriverStore\FileRepository\iigd_dch.inf_amd64_51826ddb7b10b441\igdrcl64.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\ws2_32.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\dxgi.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\DXCore.dll'.
'Exercise OpenCL.exe' (Win32): Unloaded 'C:\Windows\System32\DXCore.dll'
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\DriverStore\FileRepository\iigd_dch.inf_amd64_51826ddb7b10b441\igdgmm64.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\DXCore.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\DriverStore\FileRepository\iigd_dch.inf_amd64_51826ddb7b10b441\igdfcl64.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\DriverStore\FileRepository\iigd_dch.inf_amd64_51826ddb7b10b441\igc64.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\opengl32.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\glu32.dll'.
'Exercise OpenCL.exe' (Win32): Unloaded 'C:\Windows\System32\glu32.dll'
'Exercise OpenCL.exe' (Win32): Unloaded 'C:\Windows\System32\opengl32.dll'
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Program Files (x86)\Common Files\Intel\Shared Libraries\intel64\intelocl64.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Program Files (x86)\Common Files\Intel\Shared Libraries\intel64\task_executor64.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Program Files (x86)\Common Files\Intel\Shared Libraries\intel64\cpu_device64.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\DriverStore\FileRepository\iigd_dch.inf_amd64_51826ddb7b10b441\opencl-clang64.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\oleaut32.dll'.
'Exercise OpenCL.exe' (Win32): Unloaded 'C:\Windows\System32\oleaut32.dll'
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\version.dll'.
'Exercise OpenCL.exe' (Win32): Loaded 'C:\Windows\System32\oleaut32.dll'.

I hope this helps!

NoorjahanSk_Intel · ‎05-19-2022

Hi,

We haven't heard back from you. Could you please provide an update on your issue?

Thanks & Regards,

Noorjahan.

TrueY · ‎05-22-2022

Dear Noorjahan,

I have sent a reply yesterday evening. With project file and description. I cannot see it now. Did you get it?

TIA && TT

Ben_A_Intel · ‎05-23-2022

Thank you for the interesting question!

There are actually two things going on here which are causing the behavior you are seeing.

First, some of our hardware counters that feed into the global ID calculation are limited to 32 bits, specifically the work-group ID. This isn't a limit specific to our GPUs and other GPUs also have similar restrictions (other non-GPU accelerators also). This doesn't necessarily mean that the global ID is limited to 32-bits, but it does mean there is a maximum global ID, it will be less than 64 bits, and the maximum global ID will be based on the size of the work-group.

Second, our GPU compiler currently compute the global ID with these 32-bit values using 32-bit math:

https://github.com/intel/intel-graphics-compiler/blob/bc1f4c233eba9530ea35531367e6b4f1a4986177/IGC/BiFModule/Implementation/workitem.cl#L95

This is usually fine, and the 32-bit math is faster than 64-bit math, but it breaks when there is an overflow.

Here is what we're looking to do:

First, generate an error when the global work size exceeds one of the hardware limits, to give applications an opportunity to detect and react to this condition vs. silently producing incorrect results.

Second, either compute the global ID with 64-bit math unconditionally, or provide a mechanism to opt-in to 64-bit global IDs if there is a significant performance benefit to computing the global ID with 32-bit math.

In the meantime, a good workaround is to compute the global ID with 64-bit math in your kernel source, as you've done:

// If you need the global offset:
size_t global_id_0 = get_group_id(0) * get_local_size(0) + get_global_offset(0) + get_local_id(0);

// If you do not need the global offset:
size_t global_id_0 = get_group_id(0) * get_local_size(0) + get_local_id(0);

This will work so long as the number of work-groups (and hence the work-group ID) is representable in 32 bits.

Hope this helps!

TrueY · ‎05-24-2022

Dear Ben_A_Intel,

Thank you for your kind effort to puzzle out the issue!

Actually 32 bit vs 64 bit computing might be set using #pragma-s in the kernel code.

Thanks for the workaround, but IMHO it is not quite right!

The problem is that get_local_size(0) changes and I think it should not!

Let's say I have 256 ulong elements in global variable and the global work size is 258. In clEnqueueNDRangeKernel the local_work_size is set to {256}.

In this case there will be two calls of the work units. During the first the get_local_size(0) returns 256, meanwhile during the second call it is just 2.

So this workaround returns a wrong value during the second call:

size_t global_id_0 = get_group_id(0) * get_local_size(0) + get_local_id(0);

get_group_id(0) is 1

get_local_size(0) is 2

get_local_id(0) is {0, 1}

global_id_0 will be 1*2 + {0,1} instead of 1*256 + {0,1}.

I assume the get_local_size(0) should be always 256 to work this properly.

TIA! TrueY

Ben_A_Intel · ‎05-24-2022

@TrueY wrote:

Thanks for the workaround, but IMHO it is not quite right!

The problem is that get_local_size(0) changes and I think it should not!

Yes, good observation! The snippet I posted above will only work for uniform work-groups. It will not work for non-uniform work-groups. For non-uniform work-groups, get_local_size returns the size of this work-group, which isn't what you want if you are computing a local ID. Instead you'll want to use get_enqueued_local_size. Something like:

// If you need the global offset:
size_t global_id_0 = get_group_id(0) * get_enqueued_local_size(0) + get_global_offset(0) + get_local_id(0);

// If you do not need the global offset:
size_t global_id_0 = get_group_id(0) * get_enqueued_local_size(0) + get_local_id(0);

Note that get_enqueued_local_size is an OpenCL C 2.0 built-in, so it requires OpenCL C 2.0 or newer (e.g. -cl-std=CL2.0, or -cl-std=CL3.0). This shouldn't be an issue because non-uniform work-groups also require OpenCL C 2.0 or newer, but worth mentioning for completeness.

Castle · ‎11-29-2023

I have the same question, but I just want to know where to determine if the current device only supports 32-bit global_id, such as a certain value of the clinfo command.

NoorjahanSk_Intel · ‎05-25-2022

Hi,

Glad to know that your issue is resolved. If you need any additional information, please post a new question as this thread will no longer be monitored by Intel.

Thanks & Regards,

Noorjahan.

OnenCL: Why get_global_id(0) returns 32 bit value on a 64 bit device?

OpenCL* for GPU