Access iGPU device data allocated across classes with USM

Pascuzzi__Vincent · ‎08-17-2020

I have a number of classes that use the same `cl::sycl::context` to allocate device-side memory; let's call these `WorkerClass`s. I then have a number of kernels -- in a separate source file -- that need to access the device data from the `WorkerClass`s. The device data are typically `struct`s and/or pointers allocated on the device via USM.

What I've noticed is that if I remove the `WorkerClass`s from the code -- that is, I allocate all device memory in a `main` function -- the code executes as expected. For example,

// Compile:
// clang++ \
// -g -fsycl \
// -o kernel_args kernel_args.cc

#include <CL/sycl.hpp>
#include <iostream>

static const unsigned int MAX_FLOATS = 10;

struct MyStructA {
  int* a;
  float* a_floats;
};

struct MyStructB {
  int* b;
  float* b_floats;
  MyStructA* ms_A;
};

class TestKernel {
 public:
  TestKernel() = delete;
  TestKernel(MyStructB* ms)
      : b_(ms->b), b_floats_(ms->b_floats), ms_A_(ms->ms_A) {}

  void operator()(cl::sycl::id<1> idx) {
    unsigned int id = (int)idx[0];
    b_floats_[id] += 0.5;
    *ms_A_->a = 100;
  }

 private:
  int* b_;
  float* b_floats_;
  MyStructA* ms_A_;
};

int main() {
  // Catch asynchronous exceptions
  auto exception_handler = [](cl::sycl::exception_list exceptions) {
    for (std::exception_ptr const& e : exceptions) {
      try {
        std::rethrow_exception(e);
      } catch (cl::sycl::exception const& e) {
        std::cout << "Caught asynchronous SYCL exception during generation:\n"
                  << e.what() << std::endl;
      }
    }
  };
  // Initialize device, queue and context
  cl::sycl::device dev = cl::sycl::device(cl::sycl::default_selector());
  cl::sycl::queue queue = cl::sycl::queue(dev, exception_handler);
  cl::sycl::context ctx = queue.get_context();

  // Name of the device to run on
  std::string dev_name =
      queue.get_device().get_info<cl::sycl::info::device::name>();
  std::cout << "Using device \"" << dev_name << "\"" << std::endl;
  // Ensure device can handle USM device allocations.
  if (!queue.get_device()
           .get_info<cl::sycl::info::device::usm_device_allocations>()) {
    std::cout << "ERROR :: device \"" << dev_name
              << "\" does not support usm_device_allocations!" << std::endl;
    return -1;
  }

  // Instantiate a MyStructA
  MyStructA ms_A{0, nullptr};
  int* a_int = (int*)malloc_device(sizeof(int), dev, ctx);
  float* a_floats = (float*)malloc_device(MAX_FLOATS * sizeof(float), dev, ctx);
  // Host-side float data to copy
  int* host_a = (int*)malloc(sizeof(int));
  float* host_array = new float[MAX_FLOATS];
  for (unsigned int i = 0; i < MAX_FLOATS; i++) {
    host_array[i] = i * 1.0;
  }

  queue
      .submit([&](cl::sycl::handler& h) {
        // Copy host data to device
        h.memcpy(a_floats, &host_array[0], MAX_FLOATS * sizeof(float));
      })
      .wait_and_throw();
  queue
      .submit([&](cl::sycl::handler& h) {
        // Copy host data to device
        h.memcpy(a_int, &host_a[0], sizeof(int));
      })
      .wait_and_throw();
  // Assign MyStructA pointers
  ms_A.a = a_int;
  ms_A.a_floats = a_floats;
  MyStructA* ms_A_dev = (MyStructA*)malloc_device(sizeof(ms_A), dev, ctx);
  queue
      .submit([&](cl::sycl::handler& h) {
        // Copy host data to device
        h.memcpy(ms_A_dev, &ms_A, sizeof(ms_A));
      })
      .wait_and_throw();

  // Instantiate a MyStructB
  MyStructB ms_B{0, nullptr, nullptr};
  int* b_int = (int*)malloc_device(sizeof(int), dev, ctx);
  float* b_floats = (float*)malloc_device(MAX_FLOATS * sizeof(float), dev, ctx);
  queue
      .submit([&](cl::sycl::handler& h) {
        // Copy host data to device
        h.memcpy(b_floats, &host_array[0], MAX_FLOATS * sizeof(float));
      })
      .wait_and_throw();
  // Assign MyStructB pointers
  ms_B.b = b_int;
  ms_B.b_floats = b_floats;
  ms_B.ms_A = ms_A_dev;

  queue.submit([&](cl::sycl::handler& h) {
    TestKernel kernel(&ms_B);
    h.parallel_for<class foo>(cl::sycl::range<1>{MAX_FLOATS}, kernel);
  });
  queue.wait();

  // Copy back to host
  queue
      .submit([&](cl::sycl::handler& h) {
        // Copy host data to device
        h.memcpy(host_array, &ms_B.b_floats[0], MAX_FLOATS * sizeof(float));
      })
      .wait_and_throw();
  queue
      .submit([&](cl::sycl::handler& h) {
        // Copy host data to device
        h.memcpy(host_a, &ms_B.ms_A->a[0], sizeof(float));
      })
      .wait_and_throw();

  free(host_array);
  free(host_a);
  cl::sycl::free(a_int, ctx);
  cl::sycl::free(a_floats, ctx);
  cl::sycl::free(ms_A_dev, ctx);
  cl::sycl::free(b_int, ctx);
  cl::sycl::free(b_floats, ctx);
  return 0;
}

On the other hand, with the `WorkerClass`s taking care of the memory allocations, e.g.,

// Compile:
// clang++ \
// -g -fsycl \
// -o kernel_args kernel_args.cc

#include <CL/sycl.hpp>
#include <iostream>

static const unsigned int MAX_FLOATS = 10;

struct MyStructA {
  int* a;
  float* a_floats;
};

struct MyStructB {
  int* b;
  float* b_floats;
  MyStructA* ms_A;
};

class WorkerClass {
 public:
  WorkerClass() {}
  ~WorkerClass() {}

  bool Init() {
    // Initialize device, queue and context
    cl::sycl::device dev = cl::sycl::device(cl::sycl::default_selector());
    cl::sycl::queue queue = cl::sycl::queue(dev);
    ctx_ = new cl::sycl::context(queue.get_context());

    // Name of the device to run on
    std::string dev_name =
        queue.get_device().get_info<cl::sycl::info::device::name>();
    std::cout << "Using device \"" << dev_name << "\"" << std::endl;
    // Ensure device can handle USM device allocations.
    if (!queue.get_device()
             .get_info<cl::sycl::info::device::usm_device_allocations>()) {
      std::cout << "ERROR :: device \"" << dev_name
                << "\" does not support usm_device_allocations!" << std::endl;
      return false;
    }

    // Instantiate a MyStructA
    sA_ = {0, nullptr};
    int* a_int = (int*)malloc_device(sizeof(int), dev, *ctx_);
    float* a_floats =
        (float*)malloc_device(MAX_FLOATS * sizeof(float), dev, *ctx_);
    // Host-side float data to copy
    int* host_a = (int*)malloc(sizeof(int));
    float* host_array = new float[MAX_FLOATS];
    for (unsigned int i = 0; i < MAX_FLOATS; i++) {
      host_array[i] = i * 1.0;
    }

    queue
        .submit([&](cl::sycl::handler& h) {
          // Copy host data to device
          h.memcpy(a_floats, &host_array[0], MAX_FLOATS * sizeof(float));
        })
        .wait_and_throw();
    queue
        .submit([&](cl::sycl::handler& h) {
          // Copy host data to device
          h.memcpy(a_int, &host_a[0], sizeof(int));
        })
        .wait_and_throw();
    // Assign MyStructA pointers
    sA_.a = a_int;
    sA_.a_floats = a_floats;
    MyStructA* sA_dev = (MyStructA*)malloc_device(sizeof(sA_), dev, *ctx_);
    queue
        .submit(
            [&](cl::sycl::handler& h) { h.memcpy(sA_dev, &sA_, sizeof(sA_)); })
        .wait_and_throw();

    // Instantiate a MyStructB
    MyStructB sB_{0, nullptr, nullptr};
    int* b_host = (int*)malloc(sizeof(int));
    int* b_int = (int*)malloc_device(sizeof(int), dev, *ctx_);
    float* b_floats =
        (float*)malloc_device(MAX_FLOATS * sizeof(float), dev, *ctx_);
    queue
        .submit([&](cl::sycl::handler& h) {
          h.memcpy(b_floats, &host_array[0], MAX_FLOATS * sizeof(float));
        })
        .wait_and_throw();
    // Assign MyStructB pointers
    sB_.b = b_int;
    sB_.b_floats = b_floats;
    sB_.ms_A = sA_dev;

    return true;
  }

  cl::sycl::context* GetContext() { return ctx_; }

  MyStructB* sB() { return &sB_; }

 private:
  MyStructA sA_;
  MyStructB sB_;
  cl::sycl::context* ctx_;
};

class TestKernel {
 public:
  TestKernel() = delete;
  TestKernel(MyStructB* sB) {
    b_floats_ = sB->b_floats;
    sA_ = sB->ms_A;
  }

  void operator()(cl::sycl::id<1> idx) {
    unsigned int id = (int)idx[0];
    // b_floats_[id] += 0.5;
    // *sA_->a = 100;
  }

 private:
  float* b_floats_;
  MyStructA* sA_;
};

int main() {
  // Catch asynchronous exceptions
  auto exception_handler = [](cl::sycl::exception_list exceptions) {
    for (std::exception_ptr const& e : exceptions) {
      try {
        std::rethrow_exception(e);
      } catch (cl::sycl::exception const& e) {
        std::cout << "Caught asynchronous SYCL exception during generation:\n"
                  << e.what() << std::endl;
      }
    }
  };
  WorkerClass* wc = new WorkerClass();
  wc->Init();

  // Get context from WorkerClass
  cl::sycl::context* ctx = wc->GetContext();
  cl::sycl::device dev = ctx->get_devices()[0];
  cl::sycl::queue queue = cl::sycl::queue(dev);

  MyStructB* sB = wc->sB();

  queue.submit([&](cl::sycl::handler& h) {
    TestKernel kernel(sB);
    h.parallel_for<class foo>(cl::sycl::range<1>{MAX_FLOATS}, kernel);
  });
  queue.wait();

  return 0;
}

I get,

terminate called after throwing an instance of 'cl::sycl::runtime_error'
  what():  OpenCL API failed. OpenCL API returns: -50 (CL_INVALID_ARG_VALUE) -50 (CL_INVALID_ARG_VALUE)
Aborted

I've seen a related post to access device-side memory [1] but does not use kernel function objects as I would like to. I'd like to note that I can access device memory with local variables but not member variables.

Thanks in advance.

[1] https://community.intel.com/t5/Intel-oneAPI-Data-Parallel-C/Error-50-CL-INVALID-ARG-VALUE-for-Intel-iGPU/td-p/1158406