- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I have a number of classes that use the same `cl::sycl::context` to allocate device-side memory; let's call these `WorkerClass`s. I then have a number of kernels -- in a separate source file -- that need to access the device data from the `WorkerClass`s. The device data are typically `struct`s and/or pointers allocated on the device via USM.
What I've noticed is that if I remove the `WorkerClass`s from the code -- that is, I allocate all device memory in a `main` function -- the code executes as expected. For example,
// Compile:
// clang++ \
// -g -fsycl \
// -o kernel_args kernel_args.cc
#include <CL/sycl.hpp>
#include <iostream>
static const unsigned int MAX_FLOATS = 10;
struct MyStructA {
int* a;
float* a_floats;
};
struct MyStructB {
int* b;
float* b_floats;
MyStructA* ms_A;
};
class TestKernel {
public:
TestKernel() = delete;
TestKernel(MyStructB* ms)
: b_(ms->b), b_floats_(ms->b_floats), ms_A_(ms->ms_A) {}
void operator()(cl::sycl::id<1> idx) {
unsigned int id = (int)idx[0];
b_floats_[id] += 0.5;
*ms_A_->a = 100;
}
private:
int* b_;
float* b_floats_;
MyStructA* ms_A_;
};
int main() {
// Catch asynchronous exceptions
auto exception_handler = [](cl::sycl::exception_list exceptions) {
for (std::exception_ptr const& e : exceptions) {
try {
std::rethrow_exception(e);
} catch (cl::sycl::exception const& e) {
std::cout << "Caught asynchronous SYCL exception during generation:\n"
<< e.what() << std::endl;
}
}
};
// Initialize device, queue and context
cl::sycl::device dev = cl::sycl::device(cl::sycl::default_selector());
cl::sycl::queue queue = cl::sycl::queue(dev, exception_handler);
cl::sycl::context ctx = queue.get_context();
// Name of the device to run on
std::string dev_name =
queue.get_device().get_info<cl::sycl::info::device::name>();
std::cout << "Using device \"" << dev_name << "\"" << std::endl;
// Ensure device can handle USM device allocations.
if (!queue.get_device()
.get_info<cl::sycl::info::device::usm_device_allocations>()) {
std::cout << "ERROR :: device \"" << dev_name
<< "\" does not support usm_device_allocations!" << std::endl;
return -1;
}
// Instantiate a MyStructA
MyStructA ms_A{0, nullptr};
int* a_int = (int*)malloc_device(sizeof(int), dev, ctx);
float* a_floats = (float*)malloc_device(MAX_FLOATS * sizeof(float), dev, ctx);
// Host-side float data to copy
int* host_a = (int*)malloc(sizeof(int));
float* host_array = new float[MAX_FLOATS];
for (unsigned int i = 0; i < MAX_FLOATS; i++) {
host_array[i] = i * 1.0;
}
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(a_floats, &host_array[0], MAX_FLOATS * sizeof(float));
})
.wait_and_throw();
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(a_int, &host_a[0], sizeof(int));
})
.wait_and_throw();
// Assign MyStructA pointers
ms_A.a = a_int;
ms_A.a_floats = a_floats;
MyStructA* ms_A_dev = (MyStructA*)malloc_device(sizeof(ms_A), dev, ctx);
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(ms_A_dev, &ms_A, sizeof(ms_A));
})
.wait_and_throw();
// Instantiate a MyStructB
MyStructB ms_B{0, nullptr, nullptr};
int* b_int = (int*)malloc_device(sizeof(int), dev, ctx);
float* b_floats = (float*)malloc_device(MAX_FLOATS * sizeof(float), dev, ctx);
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(b_floats, &host_array[0], MAX_FLOATS * sizeof(float));
})
.wait_and_throw();
// Assign MyStructB pointers
ms_B.b = b_int;
ms_B.b_floats = b_floats;
ms_B.ms_A = ms_A_dev;
queue.submit([&](cl::sycl::handler& h) {
TestKernel kernel(&ms_B);
h.parallel_for<class foo>(cl::sycl::range<1>{MAX_FLOATS}, kernel);
});
queue.wait();
// Copy back to host
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(host_array, &ms_B.b_floats[0], MAX_FLOATS * sizeof(float));
})
.wait_and_throw();
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(host_a, &ms_B.ms_A->a[0], sizeof(float));
})
.wait_and_throw();
free(host_array);
free(host_a);
cl::sycl::free(a_int, ctx);
cl::sycl::free(a_floats, ctx);
cl::sycl::free(ms_A_dev, ctx);
cl::sycl::free(b_int, ctx);
cl::sycl::free(b_floats, ctx);
return 0;
}
On the other hand, with the `WorkerClass`s taking care of the memory allocations, e.g.,
// Compile:
// clang++ \
// -g -fsycl \
// -o kernel_args kernel_args.cc
#include <CL/sycl.hpp>
#include <iostream>
static const unsigned int MAX_FLOATS = 10;
struct MyStructA {
int* a;
float* a_floats;
};
struct MyStructB {
int* b;
float* b_floats;
MyStructA* ms_A;
};
class WorkerClass {
public:
WorkerClass() {}
~WorkerClass() {}
bool Init() {
// Initialize device, queue and context
cl::sycl::device dev = cl::sycl::device(cl::sycl::default_selector());
cl::sycl::queue queue = cl::sycl::queue(dev);
ctx_ = new cl::sycl::context(queue.get_context());
// Name of the device to run on
std::string dev_name =
queue.get_device().get_info<cl::sycl::info::device::name>();
std::cout << "Using device \"" << dev_name << "\"" << std::endl;
// Ensure device can handle USM device allocations.
if (!queue.get_device()
.get_info<cl::sycl::info::device::usm_device_allocations>()) {
std::cout << "ERROR :: device \"" << dev_name
<< "\" does not support usm_device_allocations!" << std::endl;
return false;
}
// Instantiate a MyStructA
sA_ = {0, nullptr};
int* a_int = (int*)malloc_device(sizeof(int), dev, *ctx_);
float* a_floats =
(float*)malloc_device(MAX_FLOATS * sizeof(float), dev, *ctx_);
// Host-side float data to copy
int* host_a = (int*)malloc(sizeof(int));
float* host_array = new float[MAX_FLOATS];
for (unsigned int i = 0; i < MAX_FLOATS; i++) {
host_array[i] = i * 1.0;
}
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(a_floats, &host_array[0], MAX_FLOATS * sizeof(float));
})
.wait_and_throw();
queue
.submit([&](cl::sycl::handler& h) {
// Copy host data to device
h.memcpy(a_int, &host_a[0], sizeof(int));
})
.wait_and_throw();
// Assign MyStructA pointers
sA_.a = a_int;
sA_.a_floats = a_floats;
MyStructA* sA_dev = (MyStructA*)malloc_device(sizeof(sA_), dev, *ctx_);
queue
.submit(
[&](cl::sycl::handler& h) { h.memcpy(sA_dev, &sA_, sizeof(sA_)); })
.wait_and_throw();
// Instantiate a MyStructB
MyStructB sB_{0, nullptr, nullptr};
int* b_host = (int*)malloc(sizeof(int));
int* b_int = (int*)malloc_device(sizeof(int), dev, *ctx_);
float* b_floats =
(float*)malloc_device(MAX_FLOATS * sizeof(float), dev, *ctx_);
queue
.submit([&](cl::sycl::handler& h) {
h.memcpy(b_floats, &host_array[0], MAX_FLOATS * sizeof(float));
})
.wait_and_throw();
// Assign MyStructB pointers
sB_.b = b_int;
sB_.b_floats = b_floats;
sB_.ms_A = sA_dev;
return true;
}
cl::sycl::context* GetContext() { return ctx_; }
MyStructB* sB() { return &sB_; }
private:
MyStructA sA_;
MyStructB sB_;
cl::sycl::context* ctx_;
};
class TestKernel {
public:
TestKernel() = delete;
TestKernel(MyStructB* sB) {
b_floats_ = sB->b_floats;
sA_ = sB->ms_A;
}
void operator()(cl::sycl::id<1> idx) {
unsigned int id = (int)idx[0];
// b_floats_[id] += 0.5;
// *sA_->a = 100;
}
private:
float* b_floats_;
MyStructA* sA_;
};
int main() {
// Catch asynchronous exceptions
auto exception_handler = [](cl::sycl::exception_list exceptions) {
for (std::exception_ptr const& e : exceptions) {
try {
std::rethrow_exception(e);
} catch (cl::sycl::exception const& e) {
std::cout << "Caught asynchronous SYCL exception during generation:\n"
<< e.what() << std::endl;
}
}
};
WorkerClass* wc = new WorkerClass();
wc->Init();
// Get context from WorkerClass
cl::sycl::context* ctx = wc->GetContext();
cl::sycl::device dev = ctx->get_devices()[0];
cl::sycl::queue queue = cl::sycl::queue(dev);
MyStructB* sB = wc->sB();
queue.submit([&](cl::sycl::handler& h) {
TestKernel kernel(sB);
h.parallel_for<class foo>(cl::sycl::range<1>{MAX_FLOATS}, kernel);
});
queue.wait();
return 0;
}
I get,
terminate called after throwing an instance of 'cl::sycl::runtime_error'
what(): OpenCL API failed. OpenCL API returns: -50 (CL_INVALID_ARG_VALUE) -50 (CL_INVALID_ARG_VALUE)
Aborted
I've seen a related post to access device-side memory [1] but does not use kernel function objects as I would like to. I'd like to note that I can access device memory with local variables but not member variables.
Thanks in advance.
Link Copied
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page