Hi,
I am on Windows 11 using oneAPI 2024.2, and I am trying to run an OpenCL kernel on the NPU. The same code works on the CPU and GPU, but on the NPU it crashes inside OpenCLOn12.dll and D3D12Core.dll. If you have any leads, I would really appreciate them. Thank you!
Regards,
-yanny
#include <CL/cl.h>
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>

const char* kernel_source =
    "__kernel void vector_add(__global const int* A, __global const int* B, __global int* C, const int N) {\n"
    "    int id = get_global_id(0);\n"
    "    if (id < N) {\n"
    "        C[id] = A[id] + B[id];\n"
    "    }\n"
    "}\n";

int main() {
    // It only crashes on the NPU ("Intel(R) AI Boost"); it doesn't crash on CPU or GPU, i.e.:
    // constexpr const char* target_device_name = "Intel(R) Core(TM) Ultra 7 155H"; // CPU
    // constexpr const char* target_device_name = "Intel(R) Arc(TM) Graphics";      // GPU
    // constexpr const char* target_device_name = "Intel(R) FPGA Emulation Device"; // FPGA
    constexpr const char* target_device_name = "Intel(R) AI Boost"; // NPU

    constexpr int N = 1;
    int* A = (int*)malloc(sizeof(int) * N);
    int* B = (int*)malloc(sizeof(int) * N);
    int* C = (int*)malloc(sizeof(int) * N);
    std::fill(A, A + N, 1);
    std::fill(B, B + N, 2);
    std::fill(C, C + N, 0);

    cl_platform_id platform = nullptr;
    cl_device_id device = nullptr;
    cl_context context;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;
    cl_mem buffer_A, buffer_B, buffer_C;
    cl_int err;

    // Enumerate all platforms and devices, then pick the device whose name
    // matches target_device_name.
    constexpr cl_uint num_entries = 10;
    cl_platform_id platforms[num_entries] = {};
    cl_uint num_platforms = 0;
    err = clGetPlatformIDs(num_entries, platforms, &num_platforms);
    assert(err == CL_SUCCESS);

    cl_device_id devices[num_entries] = {};
    cl_uint num_devices = 0;
    for (cl_uint platform_idx = 0; platform_idx < num_platforms; ++platform_idx)
    {
        auto curr_platform = platforms[platform_idx];
        err = clGetDeviceIDs(curr_platform, CL_DEVICE_TYPE_ALL, num_entries, devices, &num_devices);
        if (err != CL_SUCCESS)
        {
            continue;
        }
        constexpr int buffer_size = 128;
        char device_name[buffer_size];
        char vendor_name[buffer_size];
        for (cl_uint device_idx = 0; device_idx < num_devices; ++device_idx)
        {
            auto curr_device = devices[device_idx];
            err = clGetDeviceInfo(curr_device, CL_DEVICE_NAME, sizeof(device_name), device_name, nullptr);
            assert(err == CL_SUCCESS);
            err = clGetDeviceInfo(curr_device, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, nullptr);
            assert(err == CL_SUCCESS);
            cl_device_type device_type;
            err = clGetDeviceInfo(curr_device, CL_DEVICE_TYPE, sizeof(device_type), &device_type, nullptr);
            assert(err == CL_SUCCESS);
            if (std::strcmp(target_device_name, device_name) == 0)
            {
                //std::cout << "  Name: " << device_name << "\n  Vendor: " << vendor_name << std::endl;
                device = curr_device;
                platform = curr_platform;
                break;
            }
        }
        if (platform != nullptr)
        {
            break;
        }
    }
    assert(platform != nullptr);

    // Build the program and run the kernel on the selected device.
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    queue = clCreateCommandQueue(context, device, 0, &err);
    program = clCreateProgramWithSource(context, 1, &kernel_source, NULL, &err);
    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    kernel = clCreateKernel(program, "vector_add", &err);

    buffer_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * N, A, &err);
    buffer_B = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * N, B, &err);
    buffer_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * N, NULL, &err);
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer_A);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &buffer_B);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &buffer_C);
    clSetKernelArg(kernel, 3, sizeof(int), &N);

    size_t global_work_size = N;
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
    assert(err == CL_SUCCESS);
    err = clEnqueueReadBuffer(queue, buffer_C, CL_TRUE, 0, sizeof(int) * N, C, 0, NULL, NULL);
    assert(err == CL_SUCCESS);

    for (int i = 0; i < N; ++i) {
        std::cout << C[i] << " ";
    }
    std::cout << std::endl;

    clReleaseMemObject(buffer_A);
    clReleaseMemObject(buffer_B);
    clReleaseMemObject(buffer_C);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    free(A);
    free(B);
    free(C);
    return 0;
}
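
Note that the code above never checks err after clBuildProgram, so a kernel-compilation failure would only surface at the later asserts. A small helper along these lines (check_build is a hypothetical name; it uses only the standard clGetProgramBuildInfo call) could print the build log first:

#include <CL/cl.h>
#include <iostream>
#include <vector>

// Hypothetical helper: print the OpenCL build log when clBuildProgram fails.
static void check_build(cl_int build_err, cl_program program, cl_device_id device) {
    if (build_err == CL_SUCCESS) {
        return;
    }
    size_t log_size = 0;
    // First query the size of the log, then fetch its contents.
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, nullptr, &log_size);
    std::vector<char> log(log_size + 1, '\0');
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log.data(), nullptr);
    std::cerr << "clBuildProgram failed (" << build_err << "):\n" << log.data() << std::endl;
}

// Usage: err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
//        check_build(err, program, device);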
Hi Yanny,
We provide AI-framework-level support, so I am not completely sure, but to the best of my knowledge the OpenCL runtime does not support the NPU.
For running models on the NPU, you can try OpenVINO: https://docs.openvino.ai/2024/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.html
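If it helps, here is a rough sketch of what NPU inference looks like with the OpenVINO C++ API ("model.xml" is just a placeholder for an OpenVINO IR file; error handling omitted):

#include <openvino/openvino.hpp>
#include <iostream>

int main() {
    ov::Core core;
    // List the devices OpenVINO can see; "NPU" should appear on systems
    // with the NPU driver installed.
    for (const auto& dev : core.get_available_devices()) {
        std::cout << dev << std::endl;
    }
    // "model.xml" is a placeholder for an OpenVINO IR model.
    auto model = core.read_model("model.xml");
    auto compiled = core.compile_model(model, "NPU");
    ov::InferRequest request = compiled.create_infer_request();
    // ... set input tensors and call request.infer() ...
    return 0;
}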
OpenVINO uses Level Zero for NPU support.
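To illustrate, a minimal Level Zero enumeration sketch like the one below should list the NPU as a device even though no OpenCL platform exposes it (assuming the Level Zero loader and NPU driver are installed):

#include <ze_api.h>
#include <cstdio>

int main() {
    // Initialize the Level Zero loader (0 = no restriction on driver type).
    if (zeInit(0) != ZE_RESULT_SUCCESS) return 1;

    uint32_t driver_count = 0;
    zeDriverGet(&driver_count, nullptr);
    ze_driver_handle_t drivers[8];
    if (driver_count > 8) driver_count = 8;
    zeDriverGet(&driver_count, drivers);

    for (uint32_t i = 0; i < driver_count; ++i) {
        uint32_t device_count = 0;
        zeDeviceGet(drivers[i], &device_count, nullptr);
        ze_device_handle_t devices[8];
        if (device_count > 8) device_count = 8;
        zeDeviceGet(drivers[i], &device_count, devices);
        for (uint32_t j = 0; j < device_count; ++j) {
            // Query and print each device's name and type.
            ze_device_properties_t props = {};
            props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
            zeDeviceGetProperties(devices[j], &props);
            std::printf("%s (type %d)\n", props.name, (int)props.type);
        }
    }
    return 0;
}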
Thanks,
Zaili