// Reduced SYCL reproducer built around Grid-style accelerator_for macros.
#include <CL/sycl.hpp>
#include <cstdint>
#include <utility>
#include <vector>

#ifdef __SYCL_DEVICE_ONLY__
#define GRID_SIMT
#endif

cl::sycl::queue *theGridAccelerator;

uint32_t accelerator_threads=2;
uint32_t acceleratorThreads(void) {return accelerator_threads;};

// Launch a 2-D kernel (iter1 x iter2) with an inner "SIMD lane" dimension,
// without waiting for completion.
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )    \
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {             \
      unsigned long nt=acceleratorThreads();                           \
      unsigned long unum1 = num1;                                      \
      unsigned long unum2 = num2;                                      \
      cl::sycl::range<3> local {nt,1,nsimd};                           \
      cl::sycl::range<3> global{unum1,unum2,nsimd};                    \
      cgh.parallel_for(                                                \
        cl::sycl::nd_range<3>(global,local),                           \
        [=] (cl::sycl::nd_item<3> item) /*mutable*/ {                  \
          auto iter1 = item.get_global_id(0);                          \
          auto iter2 = item.get_global_id(1);                          \
          auto lane  = item.get_global_id(2);                          \
          { __VA_ARGS__ };                                             \
        });                                                            \
    });

inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
inline void  acceleratorFreeShared(void *ptr){ free(ptr,*theGridAccelerator);};

#define accelerator_barrier(dummy) theGridAccelerator->wait();

#define accelerator_forNB( iter1, num1, nsimd, ... ) \
  accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );

#define accelerator_for( iter, num, nsimd, ... )         \
  accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \
  accelerator_barrier(dummy);

int main(void){
  cl::sycl::gpu_selector selector;
  cl::sycl::device selectedDevice { selector };
  theGridAccelerator = new cl::sycl::queue(selectedDevice);

  double *data = (double*)acceleratorAllocShared(100*sizeof(double));

  std::pair<double, std::vector<double>> the_pair;
  the_pair.first = 3.14;
  the_pair.second.resize(10, 99.0);

  // Naming the_pair.first inside the [=] kernel lambda captures the whole
  // pair, including the std::vector, which cannot be copied to the device.
  // Binding a reference to the scalar member means the [=] capture copies
  // only the double into the kernel closure.
  double &first_elem_ref = the_pair.first;

#ifdef USE_BROKEN
  accelerator_for(i, 100, 1, { data[i] = the_pair.first;   }); // captures the entire std::pair: broken
#else
  accelerator_for(i, 100, 1, { data[i] = first_elem_ref;   }); // captures only a double: works
#endif

  acceleratorFreeShared(data);
  return 0;
}
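
// --------------------------------------------------------------------------
// A minimal alternative sketch (an assumption, not part of the reproducer
// above, and not called from main): a C++14 init-capture copies just the
// scalar member into the kernel closure, avoiding both the whole-pair capture
// and the reference workaround. The helper name fillWithInitCapture and the
// direct cgh.parallel_for over a 1-D range are illustrative choices only;
// usage would look like fillWithInitCapture(data, 100, the_pair).
// --------------------------------------------------------------------------
inline void fillWithInitCapture(double *out, size_t n,
                                const std::pair<double, std::vector<double>> &p)
{
  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
      cgh.parallel_for(cl::sycl::range<1>(n),
                       [out, first = p.first](cl::sycl::id<1> i) {
                         // The closure holds only a double and a raw pointer,
                         // both of which are trivially copyable to the device.
                         out[i] = first;
                       });
    });
  theGridAccelerator->wait();
}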