<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re:Poor CPU OpenCL backend performance of SYCL kernels comparing to ISPC in OpenCL* for CPU</title>
    <link>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1541189#M7232</link>
    <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;​&lt;/P&gt;&lt;P&gt;Can you provide us the complete code? Including the main function and how to measure the performance.&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;Thanks&lt;/P&gt;&lt;BR /&gt;</description>
    <pubDate>Tue, 07 Nov 2023 01:53:26 GMT</pubDate>
    <dc:creator>cw_intel</dc:creator>
    <dc:date>2023-11-07T01:53:26Z</dc:date>
    <item>
      <title>Poor CPU OpenCL backend performance of SYCL kernels comparing to ISPC</title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1540697#M7231</link>
      <description>&lt;P&gt;Hi. In my experiments, oneAPI SYCL kernels always work slower at cpu device if comparing to ISPC.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;For example, this code:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="cpp"&gt;const size_t resolution = 2048;
size_t vector_size = resolution * resolution;

void oneapi_slopemap(queue&amp;amp; q, buffer&amp;lt;float, 1&amp;gt;&amp;amp; b_data, buffer&amp;lt;float, 1&amp;gt;&amp;amp; b_result)
{
	float slope_coeff = 0.1f * 2800.0f / (5000.0f / resolution);
	range&amp;lt;2&amp;gt; num_items2D { resolution, resolution };
	q.submit([&amp;amp;](handler&amp;amp; h) {
		auto data = b_data.get_access&amp;lt;access::mode::read&amp;gt;(h);
		auto result = b_result.get_access&amp;lt;access::mode::write&amp;gt;(h);
		h.parallel_for(num_items2D, [=](auto i)
		{
			int x = i[1];
			int y = i[0];
			int x0 = sycl::max&amp;lt;int&amp;gt;(x - 1, 0);
			int x1 = sycl::min&amp;lt;int&amp;gt;(x + 1, resolution - 1);
			int y0 = sycl::max&amp;lt;int&amp;gt;(y - 1, 0);
			int y1 = sycl::min&amp;lt;int&amp;gt;(y + 1, resolution - 1);
			float h7 = data[y0 * resolution + x0];
			float h8 = data[y0 * resolution + x];
			float h9 = data[y0 * resolution + x1];
			float h4 = data[y * resolution + x0];
			float h6 = data[y * resolution + x1];
			float h1 = data[y1 * resolution + x0];
			float h2 = data[y1 * resolution + x];
			float h3 = data[y1 * resolution + x1];
			float gradient_x = h9 + 3.0f * (h6 - h4) + h3 - h7 - h1;
			float gradient_y = h7 + 3.0f * (h8 - h2) + h9 - h1 - h3;
			float slope = slope_coeff * sycl::sqrt(gradient_x * gradient_x + gradient_y * gradient_y);
			slope = sycl::atan(slope) * 57.2957795130823f;
			result[y * resolution + x] = slope;
		});
	});
	q.wait();
}&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;works 1.9 times slower than this ispc code:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="cpp"&gt;export void slopemap(uniform float data[], uniform float result[], uniform int resolution, uniform int y)
{
    uniform float slope_coeff = 0.1f * 2800.0f / (5000.0f / resolution);
    uniform int y0 = max(y - 1, 0);
    uniform int y1 = min(y + 1, resolution - 1);
    foreach (x = 0 ... resolution)
    {
        int x0 = max(x - 1, 0);
        int x1 = min(x + 1, resolution - 1);
        float h7 = data[y0 * resolution + x0];
        float h8 = data[y0 * resolution + x];
        float h9 = data[y0 * resolution + x1];
        float h4 = data[y * resolution + x0];
        float h6 = data[y * resolution + x1];
        float h1 = data[y1 * resolution + x0];
        float h2 = data[y1 * resolution + x];
        float h3 = data[y1 * resolution + x1];
        float gradient_x = h9 + 3.0f * (h6 - h4) + h3 - h7 - h1;
        float gradient_y = h7 + 3.0f * (h8 - h2) + h9 - h1 - h3;
        float slope = slope_coeff * sqrt(gradient_x * gradient_x + gradient_y * gradient_y);
        slope = atan(slope) * 57.2957795130823f;
        result[y * resolution + x] = slope;
    }
}​&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;which is called this way:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="cpp"&gt;float slope_coeff = 0.1f * 2800.0f / (5000.0f / resolution);
concurrency::parallel_for(0, resolution, [&amp;amp;](auto y)
{
    ispc::slopemap(data.data(), result.data(), resolution, y);
});&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I use Windows version of oneAPI, integrated with Visual Studio 2022. This is the compilation command used:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;/O2 /fsycl-targets=spir64_x86_64 /fsycl-early-optimizations /Zi /D "NDEBUG" /D "_WINDLL" /D "_UNICODE" /D "UNICODE" /WX- /MD /std:c++20 /EHsc /W3 /nologo /Fo"x64\Release\" &lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;And this is the ISPC compilation command:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;ispc -O2 "%(Filename).ispc" -o "$(IntDir)%(Filename).obj" -h "$(ProjectDir)%(Filename).h" --target=sse4,avx2,avx512skx-x8 --opt=fast-math&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Can the performance of CPU backend be improved and be closer to ISPC-performance? Thank you.&lt;/P&gt;</description>
      <pubDate>Sun, 05 Nov 2023 13:50:14 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1540697#M7231</guid>
      <dc:creator>blinkfrog</dc:creator>
      <dc:date>2023-11-05T13:50:14Z</dc:date>
    </item>
    <item>
      <title>Re:Poor CPU OpenCL backend performance of SYCL kernels comparing to ISPC</title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1541189#M7232</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;​&lt;/P&gt;&lt;P&gt;Can you provide us the complete code? Including the main function and how to measure the performance.&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;Thanks&lt;/P&gt;&lt;BR /&gt;</description>
      <pubDate>Tue, 07 Nov 2023 01:53:26 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1541189#M7232</guid>
      <dc:creator>cw_intel</dc:creator>
      <dc:date>2023-11-07T01:53:26Z</dc:date>
    </item>
    <item>
      <title>Re: Re:Poor CPU OpenCL backend performance of SYCL kernels comparing to ISPC</title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1541448#M7235</link>
      <description>&lt;P&gt;Sure!&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I've put all the SYCL code and benchmark code into a single file.&amp;nbsp; Also I've included the VS project and solution in case this helps.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;ISPC compile command:&lt;/P&gt;&lt;P&gt;ispc -O2 ispc_slopemap.ispc -o ispc_slopemap.obj -h ispc_slopemap.h --target=avx2 --opt=fast-math&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;(Usually I compile ISPC using Custom Build Tool in Visual Studio for multiple targets such as sse42, avx2 and avx512, but, when I switched to DPC++ compiler, I got link errors for some reason (there are no such errors with Intel C++ compiler) until I left only a single target and added the .obj file manually to the project. If your CPU supports the avx512 instruction set, you can change the compile command accordingly)&lt;/P&gt;</description>
      <pubDate>Tue, 07 Nov 2023 16:14:44 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1541448#M7235</guid>
      <dc:creator>blinkfrog</dc:creator>
      <dc:date>2023-11-07T16:14:44Z</dc:date>
    </item>
    <item>
      <title>Re:Poor CPU OpenCL backend performance of SYCL kernels comparing to ISPC</title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1541677#M7238</link>
      <description>&lt;P&gt;Thanks for providing the details. We will test it on our side. &lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;Thanks.&lt;/P&gt;&lt;BR /&gt;</description>
      <pubDate>Wed, 08 Nov 2023 01:47:30 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1541677#M7238</guid>
      <dc:creator>cw_intel</dc:creator>
      <dc:date>2023-11-08T01:47:30Z</dc:date>
    </item>
    <item>
      <title>Re: Re:Poor CPU OpenCL backend performance of SYCL kernels comparing to ISPC</title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1541896#M7239</link>
      <description>Hey,&lt;BR /&gt;&lt;BR /&gt;Just a thought on this topic. You compile your SYCL code for 64-bit. I assume the kernel will use 64-bit arithmetic for pointers in this case. By default, ISPC uses 32-bit pointer arithmetic. As you have a lot of reads, this might explain the difference in performance.</description>
      <pubDate>Wed, 08 Nov 2023 14:38:52 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1541896#M7239</guid>
      <dc:creator>Spooner</dc:creator>
      <dc:date>2023-11-08T14:38:52Z</dc:date>
    </item>
    <item>
      <title>Re: Re:Poor CPU OpenCL backend performance of SYCL kernels comparing to ISPC</title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1542000#M7240</link>
      <description>&lt;P&gt;Thank you very much Spooner,&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;To check your assumption I've compiled ispc-file with `--addressing=64` parameter. And the numbers changed indeed.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;So, these are benchmark results:&lt;/P&gt;&lt;LI-CODE lang="none"&gt;    Method        Min     Med     Avg
ISPC 32-bit addr 1.109   1.282   1.297
ISPC 64-bit addr 1.622   1.826   1.840
oneAPI@OpenCL    2.334   2.678   2.690&lt;/LI-CODE&gt;&lt;P&gt;Even in this case, OpenCL is still noticeably slower than ISPC.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;However, is there a way to make OpenCL use 32-bit addressing?&lt;/P&gt;</description>
      <pubDate>Wed, 08 Nov 2023 20:05:26 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1542000#M7240</guid>
      <dc:creator>blinkfrog</dc:creator>
      <dc:date>2023-11-08T20:05:26Z</dc:date>
    </item>
    <item>
      <title>Re:Poor CPU OpenCL backend performance of SYCL kernels comparing to ISPC</title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1543172#M7241</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;​&lt;/P&gt;&lt;P&gt;Can you provide a separate ISPC code and compilation command? Based on the code and compilation command you provided, I can't compile it correctly.&lt;/P&gt;&lt;P&gt;​&lt;/P&gt;&lt;P&gt;​Thanks.&lt;/P&gt;&lt;BR /&gt;</description>
      <pubDate>Mon, 13 Nov 2023 03:50:47 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1543172#M7241</guid>
      <dc:creator>cw_intel</dc:creator>
      <dc:date>2023-11-13T03:50:47Z</dc:date>
    </item>
    <item>
      <title>Re: Re:Poor CPU OpenCL backend performance of SYCL kernels comparing to ISPC</title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1543240#M7242</link>
      <description>&lt;P&gt;Sure!&lt;/P&gt;&lt;P&gt;&amp;nbsp;This is the command line to compile&amp;nbsp;ispc_normalmap.ispc:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;ispc ispc_normalmap.ispc -o ispc_normalmap.obj -h ispc_normalmap.h -O2 --opt=fast-math&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;(it is assumed that the directory with ispc.exe is in your PATH variable, if you use Windows)&lt;/P&gt;&lt;P&gt;it generates header file and .obj file, which you need to add to your project in Visual Studio manually (since DPC++ doesn't support Custom Build Tool for some reason, unlike Intel C++ Compiler), for example, by drag-n-drop this file to your project name in Solution Explorer.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;DIV&gt;&lt;DIV&gt;(There is also error in original cpp file that I provided, line #102 is this:&lt;/DIV&gt;&lt;/DIV&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="cpp"&gt;ispc_slopemap(ispc_data.data(), sycl_result.data());&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;DIV&gt;but should be this:&lt;/DIV&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="cpp"&gt;ispc_slopemap(ispc_data.data(), ispc_result.data());&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;DIV&gt;Same for line #94.&lt;/DIV&gt;&lt;DIV&gt;But this error doesn't affect the performance, only affects availability of results in case if you may want to compare them during debugging.&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;And also some error in the algorithm itself which is irrelevant to performance and test itself. Also, the names of files and functions can be confusing (files are `normalmap`, while functions are `slopemap`), this is because in the very test I measured performance of calculating normal maps but then decided to switch to slope maps)&lt;/DIV&gt;</description>
      <pubDate>Mon, 13 Nov 2023 11:44:47 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1543240#M7242</guid>
      <dc:creator>blinkfrog</dc:creator>
      <dc:date>2023-11-13T11:44:47Z</dc:date>
    </item>
    <item>
      <title>Re:Poor CPU OpenCL backend performance of SYCL kernels comparing to ISPC</title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1544006#M7246</link>
      <description>&lt;P&gt;Thank you for the details. And we can build the ISPC code now. We are investigating the issue. Will let you know if there is any update. &lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;Thanks. &lt;/P&gt;&lt;BR /&gt;</description>
      <pubDate>Wed, 15 Nov 2023 02:32:46 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1544006#M7246</guid>
      <dc:creator>cw_intel</dc:creator>
      <dc:date>2023-11-15T02:32:46Z</dc:date>
    </item>
    <item>
      <title>Re: Poor CPU OpenCL backend performance of SYCL kernels comparing to ISPC</title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1601808#M7335</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;I'd like to let you know that we will improve the performance of sycl kernel in future compilers.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;And now we have a workaround for you. The workaround &lt;SPAN&gt;is to modify the test code to avoid using "max/min" on x indices, just by padding the "data" buffer with 2 extra elements in dim x.&amp;nbsp; I have attached the code as sycl_performance_padding.cpp.&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;I tested the code on my side, the performance is improved, and it's better than ISPC. &lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Method &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; Min &amp;nbsp; &amp;nbsp; Med &amp;nbsp; &amp;nbsp; Avg&lt;/P&gt;
&lt;P&gt;ISPC &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; 1.539 &amp;nbsp; 1.728 &amp;nbsp; 2.560&lt;/P&gt;
&lt;P&gt;oneAPI@OpenCL &amp;nbsp; &amp;nbsp; 2.409 &amp;nbsp; 8.910 &amp;nbsp; 8.501&lt;/P&gt;
&lt;P&gt;syclwithpadding &amp;nbsp; &amp;nbsp;1.180 &amp;nbsp; 1.531 &amp;nbsp; 1.797&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Can you test the code on your side?&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Thanks,&lt;/P&gt;
&lt;P&gt;Chen&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 29 May 2024 04:02:00 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1601808#M7335</guid>
      <dc:creator>cw_intel</dc:creator>
      <dc:date>2024-05-29T04:02:00Z</dc:date>
    </item>
    <item>
      <title>Re: Poor CPU OpenCL backend performance of SYCL kernels comparing to ISPC</title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1601918#M7336</link>
      <description>&lt;P&gt;Hello,&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;thank you for the update and for all your efforts. This is exciting news that performance of SYCL kernels will be improved!&lt;BR /&gt;&lt;BR /&gt;Yes, padding helps, of course. It makes vectorization of code much easier, resulting in a plain stencil pattern which can be vectorized without using of gather instructions.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;However, padding of data isn't always possible. Also, there is a lot of other algorithms, where gather and scatter can't be eliminated (and where ISPC still is ~70% faster than OpenCL CPU backend of oneAPI).&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Also, wanted to ask, is your code correct? Shouldn't `resolution` be `resolution + 2` in the next lines within the kernel? Our data rows now are 2 elements wider than `resolution`.&lt;BR /&gt;```&lt;BR /&gt;float h7 = data[y0 * resolution + x0];&lt;BR /&gt;float h8 = data[y0 * resolution + x];&lt;BR /&gt;float h9 = data[y0 * resolution + x1];&lt;BR /&gt;float h4 = data[y * resolution + x0];&lt;BR /&gt;float h6 = data[y * resolution + x1];&lt;BR /&gt;float h1 = data[y1 * resolution + x0];&lt;BR /&gt;float h2 = data[y1 * resolution + x];&lt;BR /&gt;float h3 = data[y1 * resolution + x1];&lt;BR /&gt;```&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Nonetheless, here are results from my computer:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;```&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;&lt;FONT face="courier new,courier"&gt;&amp;nbsp; &amp;nbsp; Method&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; Min&amp;nbsp; &amp;nbsp;Med&amp;nbsp; &amp;nbsp;Avg&lt;/FONT&gt;&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;&lt;FONT face="courier new,courier"&gt;ISPC&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;1.273 1.528 
1.567&lt;/FONT&gt;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&lt;FONT face="courier new,courier"&gt;&lt;A href="mailto:oneAPI@OpenCL" target="_blank"&gt;oneAPI@OpenCL&lt;/A&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; 2.331 2.707 2.737&lt;/FONT&gt;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&lt;FONT face="courier new,courier"&gt;ISPC [with padding]&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; 0.911 1.167 1.232&lt;/FONT&gt;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&lt;FONT face="courier new,courier"&gt;&lt;A href="mailto:oneAPI@OpenCL[padded]" target="_blank"&gt;oneAPI@OpenCL [with padding]&lt;/A&gt;&amp;nbsp; &amp;nbsp;2.072 2.518 2.586&lt;/FONT&gt;&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;```&lt;/P&gt;</description>
      <pubDate>Wed, 29 May 2024 09:51:37 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1601918#M7336</guid>
      <dc:creator>blinkfrog</dc:creator>
      <dc:date>2024-05-29T09:51:37Z</dc:date>
    </item>
    <item>
      <title>Re: Poor CPU OpenCL backend performance of SYCL kernels comparing to ISPC</title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1601947#M7337</link>
      <description>&lt;P&gt;Yes, you are right,&amp;nbsp;&lt;SPAN&gt;`resolution` should be `resolution + 2` in the kernel.&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;But from your results, there is no&amp;nbsp;significant performance improvement. The SYCL performance with padding is still worse than the performance of ISPC, right?&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;Can you&amp;nbsp;please tell us the HW details, the version of compiler and the compilation command?&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 29 May 2024 11:54:59 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1601947#M7337</guid>
      <dc:creator>cw_intel</dc:creator>
      <dc:date>2024-05-29T11:54:59Z</dc:date>
    </item>
    <item>
      <title>Re: Poor CPU OpenCL backend performance of SYCL kernels comparing to ISPC</title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1601980#M7338</link>
      <description>&lt;P&gt;Yes, ISPC still is noticeably faster.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;After reboot, execution times of both ISPC code and oneAPI kernel code have improved somewhat and are `0.785 ms` and `1.764 ms ` (minimum time) respectively (versions with padding).&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I use Windows 11, Visual Studio&amp;nbsp;17.8.3, oneAPI 2024.1 and Intel(R) oneAPI DPC++ Compiler 2024 shipped with it.&lt;/P&gt;&lt;P&gt;I compile SYCL code using VS and VC project, compile options are `/O2 /Zi /D "NDEBUG" /D "_UNICODE" /D "UNICODE" /WX- /MD /std:c++20 /EHsc /W3 /nologo /Fo"x64\Release\"`&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Hardware: Intel Core i7-11800H&amp;nbsp;@ 2.30GHz, 32 GB RAM.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Thank you.&lt;/P&gt;</description>
      <pubDate>Wed, 29 May 2024 13:33:39 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1601980#M7338</guid>
      <dc:creator>blinkfrog</dc:creator>
      <dc:date>2024-05-29T13:33:39Z</dc:date>
    </item>
    <item>
      <title>Re: Poor CPU OpenCL backend performance of SYCL kernels comparing to ISPC</title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1601987#M7339</link>
      <description>&lt;P&gt;Can you provide us with the ISPC code with padding?&lt;/P&gt;</description>
      <pubDate>Wed, 29 May 2024 14:02:16 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1601987#M7339</guid>
      <dc:creator>cw_intel</dc:creator>
      <dc:date>2024-05-29T14:02:16Z</dc:date>
    </item>
    <item>
      <title>Re: Poor CPU OpenCL backend performance of SYCL kernels comparing to ISPC</title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1602006#M7340</link>
      <description>&lt;P&gt;Sure, here it is:&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;export void normalmap(uniform float data[], uniform float result[], uniform int resolution, uniform size_t y)
{
    uniform float slope_coeff = 0.1f * 2800.0f / (5000.0f / resolution);
    uniform size_t y0 = y &amp;gt; 0 ? y - 1 : 0;
    uniform size_t y1 = y &amp;lt; resolution - 1 ? y + 1 : resolution - 1;	
    foreach (x = 1 ... resolution - 1)
    {
        size_t x0 = x - 1;
        size_t x1 = x + 1;
        float h7 = data[y0 * resolution + x0];
        float h8 = data[y0 * resolution + x];
        float h9 = data[y0 * resolution + x1];
        float h4 = data[y * resolution + x0];
        float h6 = data[y * resolution + x1];
        float h1 = data[y1 * resolution + x0];
        float h2 = data[y1 * resolution + x];
        float h3 = data[y1 * resolution + x1];
        float gradient_x = h9 + 3.0f * (h6 - h4) + h3 - h7 - h1;
        float gradient_y = h7 + 3.0f * (h8 - h2) + h9 - h1 - h3;
        float slope = slope_coeff * sqrt(gradient_x * gradient_x + gradient_y * gradient_y);
        slope = atan(slope) * 57.2957795130823f;
        result[y * resolution + x - 1] = slope;
    }
}&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;Calling code is the same as before, but `resolution` parameter should be new, padded width, so, `resolution + 2` :&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;void ispc_slopemap(float* data, float* result)
{
	concurrency::parallel_for(0, static_cast&amp;lt;int&amp;gt;(resolution), [&amp;amp;](int y)
	{
		ispc::normalmap(data, result, resolution + 2, y);
	});
}&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 29 May 2024 14:46:19 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1602006#M7340</guid>
      <dc:creator>blinkfrog</dc:creator>
      <dc:date>2024-05-29T14:46:19Z</dc:date>
    </item>
    <item>
      <title>Re:Poor CPU OpenCL backend performance of SYCL kernels comparing to ISPC</title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1606045#M7341</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;Could you test the sycl code without /Zi flag? And then tell us the performance of the sycl code with padding without this flag?&lt;SPAN style="font-size: 14px; font-family: -apple-system, BlinkMacSystemFont, &amp;quot;Segoe UI&amp;quot;, Roboto, Oxygen, Ubuntu, &amp;quot;Fira Sans&amp;quot;, &amp;quot;Droid Sans&amp;quot;, &amp;quot;Helvetica Neue&amp;quot;, sans-serif;"&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;/P&gt;&lt;BR /&gt;</description>
      <pubDate>Wed, 12 Jun 2024 08:49:17 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1606045#M7341</guid>
      <dc:creator>cw_intel</dc:creator>
      <dc:date>2024-06-12T08:49:17Z</dc:date>
    </item>
    <item>
      <title>Re: Re:Poor CPU OpenCL backend performance of SYCL kernels comparing to ISPC</title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1607357#M7343</link>
      <description>&lt;P&gt;Sorry for a delay in my reply.&lt;/P&gt;&lt;P&gt;I've tried to compile it without /Zi flag. Wow! Execution became MUCH faster! Didn't expect such effect.&lt;/P&gt;&lt;P&gt;With padding it is now even faster than ISPC:&lt;/P&gt;&lt;P&gt;```&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;&lt;FONT face="courier new,courier"&gt;&lt;FONT&gt;ISPC [with padding]&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;&amp;nbsp;0.911 1.167 1.232&lt;/FONT&gt;&lt;/FONT&gt;&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;&lt;FONT face="courier new,courier"&gt;&lt;FONT&gt;&lt;A class="" href="mailto:oneAPI@OpenCL[padded]" target="_blank" rel="nofollow noopener noreferrer"&gt;oneAPI@OpenCL [with padding]&lt;/A&gt;&amp;nbsp; &amp;nbsp;&lt;/FONT&gt;0.909 1.034 1.068&lt;/FONT&gt;&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;```&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;(If I recompile ISPC with using AVX512, which is supported by my CPU too, it still is a bit faster than OpenCL:&amp;nbsp;&amp;nbsp;0.891 0.996 1.059, but this isn't a significant difference)&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;In example without padding, it still is a bit slower than ISPC, but this is fine too:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;```&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;&lt;FONT face="courier new,courier"&gt;ISPC&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp;1.305 1.669 1.731&lt;/FONT&gt;&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;&lt;FONT face="courier new,courier"&gt;&lt;A href="mailto:oneAPI@OpenCL" target="_blank" rel="noopener"&gt;oneAPI@OpenCL&amp;nbsp;&lt;/A&gt; 1.495 1.799 1.863&lt;/FONT&gt;&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;```&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Thank you very much for your assistance!&lt;/P&gt;</description>
      <pubDate>Mon, 17 Jun 2024 13:53:22 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1607357#M7343</guid>
      <dc:creator>blinkfrog</dc:creator>
      <dc:date>2024-06-17T13:53:22Z</dc:date>
    </item>
    <item>
      <title>Re:Poor CPU OpenCL backend performance of SYCL kernels comparing to ISPC</title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1608302#M7344</link>
      <description>&lt;P&gt;Thanks for accepting our solution. If you need any additional information, please post a new question as this thread will no longer be monitored by Intel.&lt;/P&gt;&lt;BR /&gt;</description>
      <pubDate>Thu, 20 Jun 2024 02:09:18 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/Poor-CPU-OpenCL-backend-performance-of-SYCL-kernels-comparing-to/m-p/1608302#M7344</guid>
      <dc:creator>cw_intel</dc:creator>
      <dc:date>2024-06-20T02:09:18Z</dc:date>
    </item>
  </channel>
</rss>

