<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Parallel for is very slow compared to iterative solution in Intel® oneAPI DPC++/C++ Compiler</title>
    <link>https://community.intel.com/t5/Intel-oneAPI-DPC-C-Compiler/Parallel-for-is-very-slow-compared-to-iterative-solution/m-p/1387066#M2202</link>
    <description>&lt;P&gt;This should be moved to &lt;A href="https://community.intel.com/t5/Intel-oneAPI-Data-Parallel-C/bd-p/oneapi-data-parallel-c" target="_blank"&gt;Intel® oneAPI Data Parallel C++ - Intel Communities&lt;/A&gt;&lt;/P&gt;</description>
    <pubDate>Tue, 24 May 2022 17:55:25 GMT</pubDate>
    <dc:creator>Steve_Lionel</dc:creator>
    <dc:date>2022-05-24T17:55:25Z</dc:date>
    <item>
      <title>Parallel for is very slow compared to iterative solution</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-DPC-C-Compiler/Parallel-for-is-very-slow-compared-to-iterative-solution/m-p/1387064#M2201</link>
      <description>&lt;P&gt;I am trying to accelerate an algorithm using DPC++. What happens is that the normal calculation runs 1.5 times faster than the parallel kernel execution. The following code shows both calculations.&lt;/P&gt;
&lt;P&gt;The&amp;nbsp;num_items&amp;nbsp;value currently equals 16,000. I tried smaller values like 500, but the result is the same: the CPU is way faster than the kernel.&lt;/P&gt;
&lt;P&gt;I am using Visual Studio 2022, which runs the oneAPI DPC++ compiler, and I am trying to run an emulation on an FPGA, but I don't know how to find the details of the FPGA emulator, such as what frequency it is running at. The full code is:&amp;nbsp;&lt;A href="https://ideone.com/iEHQHa" rel="nofollow noreferrer" target="_blank"&gt;https://ideone.com/iEHQHa&lt;/A&gt;&lt;/P&gt;
&lt;PRE&gt;    // This is the normal iterative code.
    std::vector&amp;lt;double&amp;gt; distance_calculation(std::vector&amp;lt;std::vector&amp;lt;double&amp;gt;&amp;gt;&amp;amp; dataset, 
    std::vector&amp;lt;double&amp;gt;&amp;amp; curr_test) {
    auto start = std::chrono::high_resolution_clock::now();
    std::vector&amp;lt;double&amp;gt;res;
    for (int i = 0; i &amp;lt; dataset.size(); ++i) {
        double dis = 0;
        for (int j = 0; j &amp;lt; dataset[i].size(); ++j) {
            dis += (curr_test[j] - dataset[i][j]) * (curr_test[j] - dataset[i][j]);
        }
        res.push_back(dis);
    }
    auto finish = std::chrono::high_resolution_clock::now();
    std::chrono::duration&amp;lt;double&amp;gt; elapsed = finish - start;
    std::cout &amp;lt;&amp;lt; "Elapsed time: " &amp;lt;&amp;lt; elapsed.count() &amp;lt;&amp;lt; " s\n";
    return res;
}
&lt;/PRE&gt;
&lt;PRE&gt;    // This is FPGA emulation code
    std::vector&amp;lt;double&amp;gt; distance_calculation_FPGA(queue&amp;amp; q, const  
    std::vector&amp;lt;std::vector&amp;lt;double&amp;gt;&amp;gt;&amp;amp; dataset, const std::vector&amp;lt;double&amp;gt;&amp;amp; curr_test) {
    std::vector&amp;lt;double&amp;gt;linear_dataset;
    for (int i = 0; i &amp;lt; dataset.size(); ++i) {
        for (int j = 0; j &amp;lt; dataset[i].size(); ++j) {
            linear_dataset.push_back(dataset[i][j]);
        }
    }
    range&amp;lt;1&amp;gt; num_items{dataset.size()};
    std::vector&amp;lt;double&amp;gt;res;
    //std::cout &amp;lt;&amp;lt; "im in" &amp;lt;&amp;lt; std::endl;

    res.resize(dataset.size());
    buffer dataset_buf(linear_dataset);
    buffer curr_test_buf(curr_test);
    buffer res_buf(res.data(), num_items);
    {
        auto start = std::chrono::high_resolution_clock::now();
        q.submit([&amp;amp;](handler&amp;amp; h) {
            accessor a(dataset_buf, h, read_only);
            accessor b(curr_test_buf, h, read_only);

            accessor dif(res_buf, h, read_write, no_init);
            h.parallel_for(range&amp;lt;1&amp;gt;(num_items), [=](id&amp;lt;1&amp;gt; i) {
                //  dif[i] = a[i].size() * 1.0;// a[i];
                for (int j = 0; j &amp;lt; 5; ++j) {
                    dif[i] += (b[j] - a[i * 5 + j]) * (b[j] - a[i * 5 + j]);

                }
                });
            });
            q.wait();
            auto finish = std::chrono::high_resolution_clock::now();
            std::chrono::duration&amp;lt;double&amp;gt; elapsed = finish - start;
            std::cout &amp;lt;&amp;lt; "Elapsed time: " &amp;lt;&amp;lt; elapsed.count() &amp;lt;&amp;lt; " s\n";

    }
    /*
        for (int i = 0; i &amp;lt; dataset.size(); ++i) {
            double dis = 0;
            for (int j = 0; j &amp;lt; dataset[i].size(); ++j) {
                dis += (curr_test[j] - dataset[i][j]) * (curr_test[j] - dataset[i][j]);
            }
            res.push_back(dis);
        }
        */
    return res;
}&lt;/PRE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 24 May 2022 17:42:06 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-DPC-C-Compiler/Parallel-for-is-very-slow-compared-to-iterative-solution/m-p/1387064#M2201</guid>
      <dc:creator>sidrakiyani</dc:creator>
      <dc:date>2022-05-24T17:42:06Z</dc:date>
    </item>
    <item>
      <title>Re: Parallel for is very slow compared to iterative solution</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-DPC-C-Compiler/Parallel-for-is-very-slow-compared-to-iterative-solution/m-p/1387066#M2202</link>
      <description>&lt;P&gt;This should be moved to &lt;A href="https://community.intel.com/t5/Intel-oneAPI-Data-Parallel-C/bd-p/oneapi-data-parallel-c" target="_blank"&gt;Intel® oneAPI Data Parallel C++ - Intel Communities&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 24 May 2022 17:55:25 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-DPC-C-Compiler/Parallel-for-is-very-slow-compared-to-iterative-solution/m-p/1387066#M2202</guid>
      <dc:creator>Steve_Lionel</dc:creator>
      <dc:date>2022-05-24T17:55:25Z</dc:date>
    </item>
    <item>
      <title>Re: Parallel for is very slow compared to iterative solution</title>
      <link>https://community.intel.com/t5/Intel-oneAPI-DPC-C-Compiler/Parallel-for-is-very-slow-compared-to-iterative-solution/m-p/1387074#M2203</link>
      <description>&lt;P&gt;Definitely on the wrong forum. Moving to DPC++ Forum.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 24 May 2022 18:08:02 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-oneAPI-DPC-C-Compiler/Parallel-for-is-very-slow-compared-to-iterative-solution/m-p/1387074#M2203</guid>
      <dc:creator>Barbara_P_Intel</dc:creator>
      <dc:date>2022-05-24T18:08:02Z</dc:date>
    </item>
  </channel>
</rss>

