<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Unable to Simultaneously Execute Multiple Kernels on Device Using Out-of-Order Queue in Intel® oneAPI DPC++/C++ Compiler</title>
    <link>https://community.intel.com/t5/Intel-oneAPI-DPC-C-Compiler/Unable-to-Simultaneously-Execute-Multiple-Kernels-on-Device/m-p/1607229#M3869</link>
    <description>&lt;P&gt;&lt;STRONG&gt;The test code is as follows:&lt;/STRONG&gt;&lt;BR /&gt;&lt;BR /&gt;#include &amp;lt;CL/sycl.hpp&amp;gt;&lt;BR /&gt;#include &amp;lt;chrono&amp;gt;&lt;BR /&gt;#include &amp;lt;iostream&amp;gt;&lt;/P&gt;&lt;P&gt;using namespace sycl;&lt;/P&gt;&lt;P&gt;// Assuming IntArray is a custom type defined elsewhere&lt;BR /&gt;using IntArray = std::vector&amp;lt;int&amp;gt;;&lt;/P&gt;&lt;P&gt;const size_t array_size = 1024; // Define your array size&lt;BR /&gt;const int iter = 10; // Define number of iterations&lt;/P&gt;&lt;P&gt;int multi_queue(sycl::queue&amp;amp; q, const IntArray&amp;amp; a, const IntArray&amp;amp; b) {&lt;BR /&gt;IntArray s1(array_size), s2(array_size), s3(array_size);&lt;/P&gt;&lt;P&gt;//buffer&amp;lt;int, 1&amp;gt; a_buf(a.data(), range&amp;lt;1&amp;gt;(array_size));&lt;BR /&gt;//buffer&amp;lt;int, 1&amp;gt; b_buf(b.data(), range&amp;lt;1&amp;gt;(array_size));&lt;BR /&gt;buffer&amp;lt;int, 1&amp;gt; sum_buf1(s1.data(), range&amp;lt;1&amp;gt;(array_size));&lt;BR /&gt;buffer&amp;lt;int, 1&amp;gt; sum_buf2(s2.data(), range&amp;lt;1&amp;gt;(array_size));&lt;BR /&gt;buffer&amp;lt;int, 1&amp;gt; sum_buf3(s3.data(), range&amp;lt;1&amp;gt;(array_size));&lt;/P&gt;&lt;P&gt;size_t num_groups = 1;&lt;BR /&gt;size_t wg_size = 256;&lt;BR /&gt;auto start = std::chrono::steady_clock::now();&lt;BR /&gt;for (int i = 0; i &amp;lt; iter; i++) {&lt;BR /&gt;q.submit([&amp;amp;](sycl::handler&amp;amp; h) {&lt;BR /&gt;//sycl::accessor a_acc(a_buf, h, sycl::read_only);&lt;BR /&gt;//sycl::accessor b_acc(b_buf, h, sycl::read_only);&lt;BR /&gt;sycl::accessor sum_acc(sum_buf1, h, sycl::write_only, sycl::no_init);&lt;/P&gt;&lt;P&gt;h.parallel_for(sycl::nd_range&amp;lt;1&amp;gt;(num_groups * wg_size, wg_size),&lt;BR /&gt;[=](sycl::nd_item&amp;lt;1&amp;gt; index) {&lt;BR /&gt;size_t loc_id = index.get_local_id();&lt;BR /&gt;sum_acc[loc_id] = 0;&lt;BR /&gt;for (int j = 0; j &amp;lt; 1000; j++)&lt;BR /&gt;for (size_t i = loc_id; i &amp;lt; 
array_size; i += wg_size) {&lt;BR /&gt;sum_acc[loc_id] += /*a_acc[i] + b_acc[i]*/1;&lt;BR /&gt;}&lt;BR /&gt;});&lt;BR /&gt;});&lt;BR /&gt;q.submit([&amp;amp;](sycl::handler&amp;amp; h) {&lt;BR /&gt;//sycl::accessor a_acc(a_buf, h, sycl::read_only);&lt;BR /&gt;//sycl::accessor b_acc(b_buf, h, sycl::read_only);&lt;BR /&gt;sycl::accessor sum_acc(sum_buf2, h, sycl::write_only, sycl::no_init);&lt;/P&gt;&lt;P&gt;h.parallel_for(sycl::nd_range&amp;lt;1&amp;gt;(num_groups * wg_size, wg_size),&lt;BR /&gt;[=](sycl::nd_item&amp;lt;1&amp;gt; index) {&lt;BR /&gt;size_t loc_id = index.get_local_id();&lt;BR /&gt;sum_acc[loc_id] = 0;&lt;BR /&gt;for (int j = 0; j &amp;lt; 1000; j++)&lt;BR /&gt;for (size_t i = loc_id; i &amp;lt; array_size; i += wg_size) {&lt;BR /&gt;sum_acc[loc_id] += /*a_acc[i] + b_acc[i]*/2;&lt;BR /&gt;}&lt;BR /&gt;});&lt;BR /&gt;});&lt;BR /&gt;q.submit([&amp;amp;](sycl::handler&amp;amp; h) {&lt;BR /&gt;//sycl::accessor a_acc(a_buf, h, sycl::read_only);&lt;BR /&gt;//sycl::accessor b_acc(b_buf, h, sycl::read_only);&lt;BR /&gt;sycl::accessor sum_acc(sum_buf3, h, sycl::write_only, sycl::no_init);&lt;/P&gt;&lt;P&gt;h.parallel_for(sycl::nd_range&amp;lt;1&amp;gt;(num_groups * wg_size, wg_size),&lt;BR /&gt;[=](sycl::nd_item&amp;lt;1&amp;gt; index) {&lt;BR /&gt;size_t loc_id = index.get_local_id();&lt;BR /&gt;sum_acc[loc_id] = 0;&lt;BR /&gt;for (int j = 0; j &amp;lt; 1000; j++)&lt;BR /&gt;for (size_t i = loc_id; i &amp;lt; array_size; i += wg_size) {&lt;BR /&gt;sum_acc[loc_id] += /*a_acc[i] + b_acc[i]*/3;&lt;BR /&gt;}&lt;BR /&gt;});&lt;BR /&gt;});&lt;BR /&gt;}&lt;BR /&gt;q.wait();&lt;BR /&gt;auto end = std::chrono::steady_clock::now();&lt;BR /&gt;auto duration = std::chrono::duration_cast&amp;lt;std::chrono::microseconds&amp;gt;(end - start);&lt;BR /&gt;std::cout &amp;lt;&amp;lt; "multi_queue completed on device - took "&lt;BR /&gt;&amp;lt;&amp;lt; duration.count() &amp;lt;&amp;lt; " u-secs\n";&lt;/P&gt;&lt;P&gt;host_accessor result1(sum_buf1, read_only);&lt;BR 
/&gt;host_accessor result2(sum_buf2, read_only);&lt;BR /&gt;host_accessor result3(sum_buf3, read_only);&lt;/P&gt;&lt;P&gt;//for (size_t i = 0; i &amp;lt; 2; ++i) {&lt;BR /&gt;// std::cout &amp;lt;&amp;lt; "s1[" &amp;lt;&amp;lt; i &amp;lt;&amp;lt; "] = " &amp;lt;&amp;lt; result1[i] &amp;lt;&amp;lt; ",";&lt;BR /&gt;// std::cout &amp;lt;&amp;lt; "s2[" &amp;lt;&amp;lt; i &amp;lt;&amp;lt; "] = " &amp;lt;&amp;lt; result2[i] &amp;lt;&amp;lt; ",";&lt;BR /&gt;// std::cout &amp;lt;&amp;lt; "s3[" &amp;lt;&amp;lt; i &amp;lt;&amp;lt; "] = " &amp;lt;&amp;lt; result3[i] &amp;lt;&amp;lt; "\n";&lt;BR /&gt;//}&lt;/P&gt;&lt;P&gt;// check results&lt;BR /&gt;return ((end - start).count());&lt;BR /&gt;} // end multi_queue&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;int main() {&lt;BR /&gt;// Define your device selector&lt;BR /&gt;default_selector d_selector;&lt;/P&gt;&lt;P&gt;// Define arrays a and b of type IntArray and populate them&lt;BR /&gt;IntArray a(array_size, 1), b(array_size, 1);&lt;/P&gt;&lt;P&gt;// Create in-order queue with queue properties&lt;BR /&gt;property_list q_prop{ property::queue::in_order() };&lt;/P&gt;&lt;P&gt;std::cout &amp;lt;&amp;lt; "In order queue: Jitting+Execution time\n";&lt;BR /&gt;queue q1(d_selector, q_prop);&lt;BR /&gt;multi_queue(q1, a, b);&lt;BR /&gt;std::this_thread::sleep_for(std::chrono::milliseconds(500));&lt;BR /&gt;//usleep(500 * 1000);&lt;BR /&gt;std::cout &amp;lt;&amp;lt; "In order queue: Execution time\n";&lt;BR /&gt;multi_queue(q1, a, b);&lt;/P&gt;&lt;P&gt;// Create out-of-order queue without queue properties&lt;BR /&gt;queue q2(d_selector);&lt;BR /&gt;std::cout &amp;lt;&amp;lt; "Out of order queue: Jitting+Execution time\n";&lt;BR /&gt;multi_queue(q2, a, b);&lt;BR /&gt;std::this_thread::sleep_for(std::chrono::milliseconds(500));&lt;BR /&gt;std::cout &amp;lt;&amp;lt; "Out of order queue: Execution time\n";&lt;BR /&gt;multi_queue(q2, a, b);&lt;/P&gt;&lt;P&gt;return 0;&lt;BR /&gt;}&lt;/P&gt;</description>
    <pubDate>Mon, 17 Jun 2024 05:01:03 GMT</pubDate>
    <dc:creator>-Light-</dc:creator>
    <dc:date>2024-06-17T05:01:03Z</dc:date>
    <item>
      <title>Unable to Simultaneously Execute Multiple Kernels on Device Using Out-of-Order Queue</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-DPC-C-Compiler/Unable-to-Simultaneously-Execute-Multiple-Kernels-on-Device/m-p/1607228#M3868</link>
      <description>&lt;P&gt;Hello,&lt;BR /&gt;I have been using SYCL programming to perform parallel computations, but I have noticed that kernel functions are not executing in parallel. I found a relevant document (Intel Optimization Guide for GPU) and conducted tests based on the example program provided there. I don't know why the code can't be attached; please see the comments.&lt;BR /&gt;&lt;BR /&gt;I performed tests on two different system environments and found that the execution time using both in-order and out-of-order queues was nearly identical. Upon inspecting the timeline view in VTune Profiler, I noticed that none of the GPU Computing Tasks executed in parallel, regardless of whether in-order or out-of-order queues were used.&lt;BR /&gt;&lt;BR /&gt;System Environment 1:&lt;BR /&gt;12th Gen Intel(R) Core(TM) i7-12700H @ 2.30 GHz&lt;BR /&gt;Intel(R) Arc(TM) A370M Graphics&lt;BR /&gt;Windows 10 IoT Enterprise 22H2&lt;BR /&gt;System Environment 2:&lt;BR /&gt;13th Gen Intel(R) Core(TM) i7-13700 @ 2.10 GHz&lt;BR /&gt;Intel(R) UHD Graphics 770&lt;BR /&gt;Windows 11 Home Chinese Edition&lt;BR /&gt;&lt;BR /&gt;I would appreciate any insights into the reasons behind this issue and possible solutions. Thank you.&lt;/P&gt;</description>
      <pubDate>Mon, 17 Jun 2024 05:00:11 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-DPC-C-Compiler/Unable-to-Simultaneously-Execute-Multiple-Kernels-on-Device/m-p/1607228#M3868</guid>
      <dc:creator>-Light-</dc:creator>
      <dc:date>2024-06-17T05:00:11Z</dc:date>
    </item>
    <item>
      <title>Re: Unable to Simultaneously Execute Multiple Kernels on Device Using Out-of-Order Queue</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-DPC-C-Compiler/Unable-to-Simultaneously-Execute-Multiple-Kernels-on-Device/m-p/1607229#M3869</link>
      <description>&lt;P&gt;&lt;STRONG&gt;The test code is as follows:&lt;/STRONG&gt;&lt;BR /&gt;&lt;BR /&gt;#include &amp;lt;CL/sycl.hpp&amp;gt;&lt;BR /&gt;#include &amp;lt;chrono&amp;gt;&lt;BR /&gt;#include &amp;lt;iostream&amp;gt;&lt;/P&gt;&lt;P&gt;using namespace sycl;&lt;/P&gt;&lt;P&gt;// Assuming IntArray is a custom type defined elsewhere&lt;BR /&gt;using IntArray = std::vector&amp;lt;int&amp;gt;;&lt;/P&gt;&lt;P&gt;const size_t array_size = 1024; // Define your array size&lt;BR /&gt;const int iter = 10; // Define number of iterations&lt;/P&gt;&lt;P&gt;int multi_queue(sycl::queue&amp;amp; q, const IntArray&amp;amp; a, const IntArray&amp;amp; b) {&lt;BR /&gt;IntArray s1(array_size), s2(array_size), s3(array_size);&lt;/P&gt;&lt;P&gt;//buffer&amp;lt;int, 1&amp;gt; a_buf(a.data(), range&amp;lt;1&amp;gt;(array_size));&lt;BR /&gt;//buffer&amp;lt;int, 1&amp;gt; b_buf(b.data(), range&amp;lt;1&amp;gt;(array_size));&lt;BR /&gt;buffer&amp;lt;int, 1&amp;gt; sum_buf1(s1.data(), range&amp;lt;1&amp;gt;(array_size));&lt;BR /&gt;buffer&amp;lt;int, 1&amp;gt; sum_buf2(s2.data(), range&amp;lt;1&amp;gt;(array_size));&lt;BR /&gt;buffer&amp;lt;int, 1&amp;gt; sum_buf3(s3.data(), range&amp;lt;1&amp;gt;(array_size));&lt;/P&gt;&lt;P&gt;size_t num_groups = 1;&lt;BR /&gt;size_t wg_size = 256;&lt;BR /&gt;auto start = std::chrono::steady_clock::now();&lt;BR /&gt;for (int i = 0; i &amp;lt; iter; i++) {&lt;BR /&gt;q.submit([&amp;amp;](sycl::handler&amp;amp; h) {&lt;BR /&gt;//sycl::accessor a_acc(a_buf, h, sycl::read_only);&lt;BR /&gt;//sycl::accessor b_acc(b_buf, h, sycl::read_only);&lt;BR /&gt;sycl::accessor sum_acc(sum_buf1, h, sycl::write_only, sycl::no_init);&lt;/P&gt;&lt;P&gt;h.parallel_for(sycl::nd_range&amp;lt;1&amp;gt;(num_groups * wg_size, wg_size),&lt;BR /&gt;[=](sycl::nd_item&amp;lt;1&amp;gt; index) {&lt;BR /&gt;size_t loc_id = index.get_local_id();&lt;BR /&gt;sum_acc[loc_id] = 0;&lt;BR /&gt;for (int j = 0; j &amp;lt; 1000; j++)&lt;BR /&gt;for (size_t i = loc_id; i &amp;lt; 
array_size; i += wg_size) {&lt;BR /&gt;sum_acc[loc_id] += /*a_acc[i] + b_acc[i]*/1;&lt;BR /&gt;}&lt;BR /&gt;});&lt;BR /&gt;});&lt;BR /&gt;q.submit([&amp;amp;](sycl::handler&amp;amp; h) {&lt;BR /&gt;//sycl::accessor a_acc(a_buf, h, sycl::read_only);&lt;BR /&gt;//sycl::accessor b_acc(b_buf, h, sycl::read_only);&lt;BR /&gt;sycl::accessor sum_acc(sum_buf2, h, sycl::write_only, sycl::no_init);&lt;/P&gt;&lt;P&gt;h.parallel_for(sycl::nd_range&amp;lt;1&amp;gt;(num_groups * wg_size, wg_size),&lt;BR /&gt;[=](sycl::nd_item&amp;lt;1&amp;gt; index) {&lt;BR /&gt;size_t loc_id = index.get_local_id();&lt;BR /&gt;sum_acc[loc_id] = 0;&lt;BR /&gt;for (int j = 0; j &amp;lt; 1000; j++)&lt;BR /&gt;for (size_t i = loc_id; i &amp;lt; array_size; i += wg_size) {&lt;BR /&gt;sum_acc[loc_id] += /*a_acc[i] + b_acc[i]*/2;&lt;BR /&gt;}&lt;BR /&gt;});&lt;BR /&gt;});&lt;BR /&gt;q.submit([&amp;amp;](sycl::handler&amp;amp; h) {&lt;BR /&gt;//sycl::accessor a_acc(a_buf, h, sycl::read_only);&lt;BR /&gt;//sycl::accessor b_acc(b_buf, h, sycl::read_only);&lt;BR /&gt;sycl::accessor sum_acc(sum_buf3, h, sycl::write_only, sycl::no_init);&lt;/P&gt;&lt;P&gt;h.parallel_for(sycl::nd_range&amp;lt;1&amp;gt;(num_groups * wg_size, wg_size),&lt;BR /&gt;[=](sycl::nd_item&amp;lt;1&amp;gt; index) {&lt;BR /&gt;size_t loc_id = index.get_local_id();&lt;BR /&gt;sum_acc[loc_id] = 0;&lt;BR /&gt;for (int j = 0; j &amp;lt; 1000; j++)&lt;BR /&gt;for (size_t i = loc_id; i &amp;lt; array_size; i += wg_size) {&lt;BR /&gt;sum_acc[loc_id] += /*a_acc[i] + b_acc[i]*/3;&lt;BR /&gt;}&lt;BR /&gt;});&lt;BR /&gt;});&lt;BR /&gt;}&lt;BR /&gt;q.wait();&lt;BR /&gt;auto end = std::chrono::steady_clock::now();&lt;BR /&gt;auto duration = std::chrono::duration_cast&amp;lt;std::chrono::microseconds&amp;gt;(end - start);&lt;BR /&gt;std::cout &amp;lt;&amp;lt; "multi_queue completed on device - took "&lt;BR /&gt;&amp;lt;&amp;lt; duration.count() &amp;lt;&amp;lt; " u-secs\n";&lt;/P&gt;&lt;P&gt;host_accessor result1(sum_buf1, read_only);&lt;BR 
/&gt;host_accessor result2(sum_buf2, read_only);&lt;BR /&gt;host_accessor result3(sum_buf3, read_only);&lt;/P&gt;&lt;P&gt;//for (size_t i = 0; i &amp;lt; 2; ++i) {&lt;BR /&gt;// std::cout &amp;lt;&amp;lt; "s1[" &amp;lt;&amp;lt; i &amp;lt;&amp;lt; "] = " &amp;lt;&amp;lt; result1[i] &amp;lt;&amp;lt; ",";&lt;BR /&gt;// std::cout &amp;lt;&amp;lt; "s2[" &amp;lt;&amp;lt; i &amp;lt;&amp;lt; "] = " &amp;lt;&amp;lt; result2[i] &amp;lt;&amp;lt; ",";&lt;BR /&gt;// std::cout &amp;lt;&amp;lt; "s3[" &amp;lt;&amp;lt; i &amp;lt;&amp;lt; "] = " &amp;lt;&amp;lt; result3[i] &amp;lt;&amp;lt; "\n";&lt;BR /&gt;//}&lt;/P&gt;&lt;P&gt;// check results&lt;BR /&gt;return ((end - start).count());&lt;BR /&gt;} // end multi_queue&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;int main() {&lt;BR /&gt;// Define your device selector&lt;BR /&gt;default_selector d_selector;&lt;/P&gt;&lt;P&gt;// Define arrays a and b of type IntArray and populate them&lt;BR /&gt;IntArray a(array_size, 1), b(array_size, 1);&lt;/P&gt;&lt;P&gt;// Create in-order queue with queue properties&lt;BR /&gt;property_list q_prop{ property::queue::in_order() };&lt;/P&gt;&lt;P&gt;std::cout &amp;lt;&amp;lt; "In order queue: Jitting+Execution time\n";&lt;BR /&gt;queue q1(d_selector, q_prop);&lt;BR /&gt;multi_queue(q1, a, b);&lt;BR /&gt;std::this_thread::sleep_for(std::chrono::milliseconds(500));&lt;BR /&gt;//usleep(500 * 1000);&lt;BR /&gt;std::cout &amp;lt;&amp;lt; "In order queue: Execution time\n";&lt;BR /&gt;multi_queue(q1, a, b);&lt;/P&gt;&lt;P&gt;// Create out-of-order queue without queue properties&lt;BR /&gt;queue q2(d_selector);&lt;BR /&gt;std::cout &amp;lt;&amp;lt; "Out of order queue: Jitting+Execution time\n";&lt;BR /&gt;multi_queue(q2, a, b);&lt;BR /&gt;std::this_thread::sleep_for(std::chrono::milliseconds(500));&lt;BR /&gt;std::cout &amp;lt;&amp;lt; "Out of order queue: Execution time\n";&lt;BR /&gt;multi_queue(q2, a, b);&lt;/P&gt;&lt;P&gt;return 0;&lt;BR /&gt;}&lt;/P&gt;</description>
      <pubDate>Mon, 17 Jun 2024 05:01:03 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-DPC-C-Compiler/Unable-to-Simultaneously-Execute-Multiple-Kernels-on-Device/m-p/1607229#M3869</guid>
      <dc:creator>-Light-</dc:creator>
      <dc:date>2024-06-17T05:01:03Z</dc:date>
    </item>
    <item>
      <title>Re: Unable to Simultaneously Execute Multiple Kernels on Device Using Out-of-Order Queue</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-DPC-C-Compiler/Unable-to-Simultaneously-Execute-Multiple-Kernels-on-Device/m-p/1607491#M3871</link>
      <description>&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Refer to the relevant documentation&lt;SPAN&gt; (Intel Optimization Guide for GPU):&lt;/SPAN&gt;&amp;nbsp;&lt;A href="https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2024-1/multiple-kernel-execution.html" target="_blank"&gt;Executing Multiple Kernels on the Device at the Same Time (intel.com)&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 18 Jun 2024 01:06:42 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-DPC-C-Compiler/Unable-to-Simultaneously-Execute-Multiple-Kernels-on-Device/m-p/1607491#M3871</guid>
      <dc:creator>-Light-</dc:creator>
      <dc:date>2024-06-18T01:06:42Z</dc:date>
    </item>
  </channel>
</rss>

