<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: With OMP is slower than without OMP, why? in Intel® Moderncode for Parallel Architectures</title>
    <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/With-OMP-is-slower-than-without-OMP-why/m-p/888589#M3663</link>
    <description>&lt;DIV style="margin:0px;"&gt;
&lt;DIV id="quote_reply" style="width: 100%; margin-top: 5px;"&gt;
&lt;DIV style="margin-left:2px;margin-right:2px;"&gt;Quoting - &lt;A href="https://community.intel.com/en-us/profile/409218"&gt;bigknife&lt;/A&gt;&lt;/DIV&gt;
&lt;DIV style="background-color:#E5E5E5; padding:5px;border: 1px; border-style: inset;margin-left:2px;margin-right:2px;"&gt;&lt;EM&gt;
&lt;DIV style="margin:0px;"&gt;&lt;/DIV&gt;
&lt;P&gt;&lt;/P&gt;
&lt;/EM&gt;&lt;/DIV&gt;
&lt;/DIV&gt;
&lt;/DIV&gt;
&lt;P&gt;Your post is unreadable, please repost it.&lt;/P&gt;
&lt;P&gt;&lt;/P&gt;</description>
    <pubDate>Sat, 06 Dec 2008 10:52:20 GMT</pubDate>
    <dc:creator>Dmitry_Vyukov</dc:creator>
    <dc:date>2008-12-06T10:52:20Z</dc:date>
    <item>
      <title>With OMP is slower than without OMP, why?</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/With-OMP-is-slower-than-without-OMP-why/m-p/888584#M3658</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;
&lt;P&gt;On my computer, a Dell PowerEdge 2900 (dual Xeon E5430 CPUs),&lt;BR /&gt;Code2 without OMP is 3 times as fast as code1 with OMP. Why?&lt;/P&gt;
&lt;P&gt;Thanks.&lt;/P&gt;
&lt;P&gt;Peter&lt;/P&gt;
&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;code1 (with OMP) &lt;BR /&gt;static vector&amp;lt;double&amp;gt; ompDblValues(8); &lt;BR /&gt;&lt;BR /&gt;double MinOutputOMP(const vector&amp;lt;double&amp;gt; &amp;amp;outputs) &lt;BR /&gt;{ &lt;BR /&gt;    ompDblValues.assign(8, FLT_MAX); &lt;BR /&gt;    const long n = (long)outputs.size(); //n=220000 &lt;BR /&gt;#pragma omp parallel for &lt;BR /&gt;    for (long i=0; i &amp;lt; n; i++) &lt;BR /&gt;    { &lt;BR /&gt;        long nThread = omp_get_thread_num(); &lt;BR /&gt;        ompDblValues[nThread] = (ompDblValues[nThread] &amp;lt; outputs[i]) ? ompDblValues[nThread] : outputs[i]; &lt;BR /&gt;    } &lt;BR /&gt;    double minPositive= FLT_MAX; &lt;BR /&gt;    for (long i=0; i &amp;lt; omp_get_num_procs(); i++) &lt;BR /&gt;    { &lt;BR /&gt;        minPositive = (minPositive &amp;lt; ompDblValues[i]) ? minPositive : ompDblValues[i]; &lt;BR /&gt;    } &lt;BR /&gt;    return minPositive; &lt;BR /&gt;} &lt;BR /&gt;&lt;BR /&gt;code2 (without OMP) &lt;BR /&gt;double MinOutput(const vector&amp;lt;double&amp;gt; &amp;amp;outputs) &lt;BR /&gt;{ &lt;BR /&gt;double minPositive= FLT_MAX; &lt;BR /&gt;const long n = (long)outputs.size(); //n=220000 &lt;BR /&gt;for (int i=0; i &amp;lt; n; i++) &lt;BR /&gt;{ &lt;BR /&gt;minPositive = (minPositive &amp;lt; outputs[i]) ? minPositive : outputs[i]; &lt;BR /&gt;} &lt;BR /&gt;return minPositive; &lt;BR /&gt;}&lt;/P&gt;</description>
      <pubDate>Fri, 05 Dec 2008 18:13:46 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/With-OMP-is-slower-than-without-OMP-why/m-p/888584#M3658</guid>
      <dc:creator>bigknife</dc:creator>
      <dc:date>2008-12-05T18:13:46Z</dc:date>
    </item>
    <item>
      <title>Re: With OMP is slower than without OMP, why?</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/With-OMP-is-slower-than-without-OMP-why/m-p/888585#M3659</link>
      <description>&lt;DIV style="margin:0px;"&gt;
&lt;DIV id="quote_reply" style="width: 100%; margin-top: 5px;"&gt;
&lt;DIV style="margin-left:2px;margin-right:2px;"&gt;Quoting - &lt;A href="https://community.intel.com/en-us/profile/409218"&gt;bigknife&lt;/A&gt;&lt;/DIV&gt;
&lt;DIV style="background-color:#E5E5E5; padding:5px;border: 1px; border-style: inset;margin-left:2px;margin-right:2px;"&gt;&lt;EM&gt;
&lt;P&gt;Hi,&lt;/P&gt;
&lt;P&gt;In my computer, Dell Power Edge 2900 (dual Xeon E5430 CPUs),&lt;BR /&gt;Code2 without OMP is as 3 time fast as code1 with OMP. Why?&lt;/P&gt;
&lt;P&gt;Thanks.&lt;/P&gt;
&lt;P&gt;Peter&lt;/P&gt;
&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;code1 (with OMP) &lt;BR /&gt;static vector &lt;DOUBLE&gt; ompDblValues(8); &lt;BR /&gt;&lt;BR /&gt;double MinOutputOMP(const vector &lt;DOUBLE&gt; &amp;amp;outputs) &lt;BR /&gt;{ &lt;BR /&gt; ompDblValues.assign(8, FLT_MAX); &lt;BR /&gt; const long n = (long)outputs.size(); //n=220000 &lt;BR /&gt;#pragma omp parallel for &lt;BR /&gt; for (long i=0; i &lt;N&gt;&lt;/N&gt; { &lt;BR /&gt; long nThread = omp_get_thread_num(); &lt;BR /&gt; ompDblValues[nThread] = (ompDblValues[nThread] &amp;lt; outputs&lt;I&gt;) ? ompDblValues[nThread] : outputs&lt;I&gt;; &lt;BR /&gt; } &lt;BR /&gt; double minPositive= FLT_MAX; &lt;BR /&gt; for (long i=0; i &lt;OMP_GET_NUM_PROCS&gt;&lt;/OMP_GET_NUM_PROCS&gt; { &lt;BR /&gt; minPositive = (minPositive &amp;lt; ompDblValues&lt;I&gt;) ? minPositive : ompDblValues&lt;I&gt;; &lt;BR /&gt; } &lt;BR /&gt; return minPositive; &lt;BR /&gt;} &lt;BR /&gt;&lt;BR /&gt;code2 (without OMP) &lt;BR /&gt;double MinOutput(const vector &lt;DOUBLE&gt; &amp;amp;outputs) &lt;BR /&gt;{ &lt;BR /&gt;double minPositive= FLT_MAX; &lt;BR /&gt;const long n = (long)outputs.size(); //n=220000 &lt;BR /&gt;for (int i=0; i &lt;N&gt;&lt;/N&gt;{ &lt;BR /&gt;minPositive = (minPositive &amp;lt; outputs&lt;I&gt;) ? minPositive : outputs&lt;I&gt;; &lt;BR /&gt;} &lt;BR /&gt;return minPositive; &lt;BR /&gt;}&lt;/I&gt;&lt;/I&gt;&lt;/DOUBLE&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/I&gt;&lt;/DOUBLE&gt;&lt;/DOUBLE&gt;&lt;/P&gt;
&lt;/EM&gt;&lt;/DIV&gt;
&lt;/DIV&gt;
&lt;/DIV&gt;
&lt;P&gt;A possible reason is that the compiler has chosen a schedule with a granularity of a single for-loop iteration. To fix this you must add a schedule directive with a specified granularity:&lt;/P&gt;
&lt;P&gt;&lt;EM&gt;&lt;EM&gt;#pragma omp parallel for &lt;STRONG&gt;schedule(dynamic, 10000)&lt;/STRONG&gt;&lt;BR /&gt;&lt;/EM&gt;&lt;/EM&gt;&lt;/P&gt;
&lt;P&gt;The second reason is false sharing in the &lt;EM&gt;&lt;EM&gt;ompDblValues &lt;/EM&gt;&lt;/EM&gt;array. If you want to do the reduction manually, then you must use something like this:&lt;/P&gt;
&lt;P&gt;size_t const cache_line_size = 128;&lt;/P&gt;
&lt;P&gt;struct X&lt;/P&gt;
&lt;P&gt;{&lt;/P&gt;
&lt;P&gt;double value;&lt;/P&gt;
&lt;P&gt;char pad [cache_line_size];&lt;BR /&gt;};&lt;/P&gt;
&lt;P&gt;static vector&amp;lt;X&amp;gt; ompDblValues;&lt;/P&gt;
&lt;P&gt;&lt;EM&gt;&lt;EM&gt;&lt;BR /&gt;&lt;/EM&gt;&lt;/EM&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 05 Dec 2008 18:40:43 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/With-OMP-is-slower-than-without-OMP-why/m-p/888585#M3659</guid>
      <dc:creator>Dmitry_Vyukov</dc:creator>
      <dc:date>2008-12-05T18:40:43Z</dc:date>
    </item>
    <item>
      <title>Re: With OMP is slower than without OMP, why?</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/With-OMP-is-slower-than-without-OMP-why/m-p/888586#M3660</link>
      <description>&lt;DIV style="margin:0px;"&gt;
&lt;DIV id="quote_reply" style="width: 100%; margin-top: 5px;"&gt;
&lt;DIV style="margin-left:2px;margin-right:2px;"&gt;Quoting - &lt;A href="https://community.intel.com/en-us/profile/409218"&gt;bigknife&lt;/A&gt;&lt;/DIV&gt;
&lt;DIV style="background-color:#E5E5E5; padding:5px;border: 1px; border-style: inset;margin-left:2px;margin-right:2px;"&gt;&lt;EM&gt;
&lt;P&gt;Code2 without OMP is as 3 time fast as code1 with OMP. Why?&lt;/P&gt;
&lt;/EM&gt;&lt;/DIV&gt;
&lt;/DIV&gt;
&lt;/DIV&gt;
&lt;P&gt;This is Ok ;)&lt;/P&gt;
&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 05 Dec 2008 18:45:23 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/With-OMP-is-slower-than-without-OMP-why/m-p/888586#M3660</guid>
      <dc:creator>Dmitry_Vyukov</dc:creator>
      <dc:date>2008-12-05T18:45:23Z</dc:date>
    </item>
    <item>
      <title>Re: With OMP is slower than without OMP, why?</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/With-OMP-is-slower-than-without-OMP-why/m-p/888587#M3661</link>
      <description>&lt;DIV style="margin:0px;"&gt;
&lt;DIV id="quote_reply" style="margin-top: 5px; width: 100%;"&gt;
&lt;DIV style="margin-left:2px;margin-right:2px;"&gt;Quoting - &lt;A href="https://community.intel.com/en-us/profile/409218"&gt;bigknife&lt;/A&gt;&lt;/DIV&gt;
&lt;DIV style="background-color:#E5E5E5; padding:5px;border: 1px; border-style: inset;margin-left:2px;margin-right:2px;"&gt;&lt;EM&gt;
&lt;P&gt;code2 (without OMP) &lt;BR /&gt;double MinOutput(const vector &lt;DOUBLE&gt; &amp;amp;outputs) &lt;BR /&gt;{ &lt;BR /&gt;double minPositive= FLT_MAX; &lt;BR /&gt;const long n = (long)outputs.size(); //n=220000 &lt;BR /&gt;for (int i=0; i &lt;N&gt;&lt;/N&gt;{ &lt;BR /&gt;minPositive = (minPositive &amp;lt; outputs&lt;I&gt;) ? minPositive : outputs&lt;I&gt;; &lt;BR /&gt;} &lt;BR /&gt;return minPositive; &lt;BR /&gt;}&lt;/I&gt;&lt;/I&gt;&lt;/DOUBLE&gt;&lt;/P&gt;
&lt;/EM&gt;&lt;/DIV&gt;
&lt;/DIV&gt;
&lt;/DIV&gt;
&lt;P&gt;Published code which is successful at parallelizing such an operation gives each thread multiple batches of sufficient length, with the &lt;STRONG&gt;private&lt;/STRONG&gt; results from individual batches combined in a &lt;STRONG&gt;critical&lt;/STRONG&gt; region. This may not be the only way, but I suspect you will need to consider the OpenMP syntax I mentioned.&lt;/P&gt;
&lt;P&gt;OpenMP Fortran includes a somewhat suitable reduction operator, but you shouldn't let the choice of C handicap you.&lt;/P&gt;
&lt;P&gt;The following code finds the position of a maximum element in a float array, which is batched into groups of size aa_dim1, in a direct translation of a Fortran double subscripted array. Since C is in use, &lt;STRONG&gt;private&lt;/STRONG&gt; is implicit in the definition of variables inside the parallel region. In your case, not saving the position should allow the inner loop to vectorize, and atomic may work in place of critical.&lt;/P&gt;
&lt;P&gt;
&lt;PRE&gt;[cpp]      max__ = aa[aa_dim1 + 1];
      xindex = 1;
      yindex = 1;
      i__2 = *n;
      i__3 = *n;
#pragma omp parallel for if(i__2 &amp;gt; 103)
      for (j = 1; j &amp;lt;= i__2; ++j) {
          int indxj=0;
          float maxj=max__;
          for (int i__ = 1; i__ &amp;lt;= i__3; ++i__)
              if (aa[i__ + j * aa_dim1] &amp;gt; maxj){
                  maxj = aa[i__ + j * aa_dim1];
                  indxj = i__;
                  }
#pragma omp critical
            if(maxj &amp;gt; max__) {
                max__= maxj;
                xindex=indxj;
                yindex=j;
                }
        }
[/cpp]&lt;/PRE&gt;
&lt;/P&gt;</description>
      <pubDate>Fri, 05 Dec 2008 22:00:54 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/With-OMP-is-slower-than-without-OMP-why/m-p/888587#M3661</guid>
      <dc:creator>TimP</dc:creator>
      <dc:date>2008-12-05T22:00:54Z</dc:date>
    </item>
    <item>
      <title>Re: With OMP is slower than without OMP, why?</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/With-OMP-is-slower-than-without-OMP-why/m-p/888588#M3662</link>
      <description>&lt;DIV style="margin:0px;"&gt;
&lt;DIV id="quote_reply" style="margin-top: 5px; width: 100%;"&gt;
&lt;DIV style="margin-left:2px;margin-right:2px;"&gt;Quoting - &lt;A href="https://community.intel.com/en-us/profile/347331"&gt;Dmitriy Vyukov&lt;/A&gt;&lt;/DIV&gt;
&lt;DIV style="background-color:#E5E5E5; padding:5px;border: 1px; border-style: inset;margin-left:2px;margin-right:2px;"&gt;&lt;EM&gt;
&lt;DIV style="margin:0px;"&gt;
&lt;DIV id="quote_reply" style="margin-top: 5px; width: 100%; height: 0px;"&gt;
&lt;P&gt;I did add a schedule directive like "schedule(dynamic)" or "schedule(guided)", but it did not work. But I have not tried "schedule(dynamic, 10000)". I'll try it.&lt;/P&gt;
&lt;P&gt;What's the meaning of "false-sharing in &lt;EM&gt;&lt;EM&gt;ompDblValues &lt;/EM&gt;&lt;/EM&gt;array"? I cannot figure it out.&lt;BR /&gt;Why should I try something like that:&lt;/P&gt;
&lt;P&gt;size_t const cache_line_size = 128;&lt;/P&gt;
&lt;P&gt;struct X&lt;/P&gt;
&lt;P&gt;{&lt;/P&gt;
&lt;P&gt;double value;&lt;/P&gt;
&lt;P&gt;char pad [cache_line_size];&lt;BR /&gt;};&lt;/P&gt;
&lt;P&gt;static vector &lt;X&gt; ompDblValues;&lt;/X&gt;&lt;/P&gt;
&lt;P&gt;Thanks!&lt;BR /&gt;&lt;BR /&gt;Peter&lt;/P&gt;
&lt;/DIV&gt;
&lt;/DIV&gt;
&lt;P&gt;Possible reason is that compiler has chosen schedule with granularity of single for-loop iteration. To fix this you must add schedule directive with specified granularity:&lt;/P&gt;
&lt;P&gt;&lt;EM&gt;&lt;EM&gt;#pragma omp parallel for &lt;STRONG&gt;schedule(dymanic, 10000)&lt;/STRONG&gt;&lt;BR /&gt;&lt;/EM&gt;&lt;/EM&gt;&lt;/P&gt;
&lt;P&gt;Second reason is false-sharing in &lt;EM&gt;&lt;EM&gt;ompDblValues &lt;/EM&gt;&lt;/EM&gt;array. If want to do reduction manually then you must use something like this:&lt;/P&gt;
&lt;P&gt;size_t const cache_line_size = 128;&lt;/P&gt;
&lt;P&gt;struct X&lt;/P&gt;
&lt;P&gt;{&lt;/P&gt;
&lt;P&gt;double value;&lt;/P&gt;
&lt;P&gt;char pad [cache_line_size];&lt;BR /&gt;};&lt;/P&gt;
&lt;P&gt;static vector &lt;X&gt; ompDblValues;&lt;/X&gt;&lt;/P&gt;
&lt;P&gt;&lt;EM&gt;&lt;EM&gt;&lt;BR /&gt;&lt;/EM&gt;&lt;/EM&gt;&lt;/P&gt;
&lt;/EM&gt;&lt;/DIV&gt;
&lt;/DIV&gt;
&lt;/DIV&gt;
&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Sat, 06 Dec 2008 09:54:23 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/With-OMP-is-slower-than-without-OMP-why/m-p/888588#M3662</guid>
      <dc:creator>bigknife</dc:creator>
      <dc:date>2008-12-06T09:54:23Z</dc:date>
    </item>
    <item>
      <title>Re: With OMP is slower than without OMP, why?</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/With-OMP-is-slower-than-without-OMP-why/m-p/888589#M3663</link>
      <description>&lt;DIV style="margin:0px;"&gt;
&lt;DIV id="quote_reply" style="width: 100%; margin-top: 5px;"&gt;
&lt;DIV style="margin-left:2px;margin-right:2px;"&gt;Quoting - &lt;A href="https://community.intel.com/en-us/profile/409218"&gt;bigknife&lt;/A&gt;&lt;/DIV&gt;
&lt;DIV style="background-color:#E5E5E5; padding:5px;border: 1px; border-style: inset;margin-left:2px;margin-right:2px;"&gt;&lt;EM&gt;
&lt;DIV style="margin:0px;"&gt;&lt;/DIV&gt;
&lt;P&gt;&lt;/P&gt;
&lt;/EM&gt;&lt;/DIV&gt;
&lt;/DIV&gt;
&lt;/DIV&gt;
&lt;P&gt;Your post is unreadable, please repost it.&lt;/P&gt;
&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Sat, 06 Dec 2008 10:52:20 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/With-OMP-is-slower-than-without-OMP-why/m-p/888589#M3663</guid>
      <dc:creator>Dmitry_Vyukov</dc:creator>
      <dc:date>2008-12-06T10:52:20Z</dc:date>
    </item>
    <item>
      <title>Re: With OMP is slower than without OMP, why?</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/With-OMP-is-slower-than-without-OMP-why/m-p/888590#M3664</link>
      <description>&lt;DIV style="margin:0px;"&gt;
&lt;DIV id="quote_reply" style="margin-top: 5px; width: 100%;"&gt;
&lt;DIV style="margin-left:2px;margin-right:2px;"&gt;Quoting - &lt;A href="https://community.intel.com/en-us/profile/347331"&gt;Dmitriy Vyukov&lt;/A&gt;&lt;/DIV&gt;
&lt;DIV style="background-color:#E5E5E5; padding:5px;border: 1px; border-style: inset;margin-left:2px;margin-right:2px;"&gt;&lt;EM&gt;
&lt;DIV style="margin:0px;"&gt;
&lt;DIV id="quote_reply" style="margin-top: 5px; width: 100%; height: 0px;"&gt;
&lt;P&gt;Really? But I can read it very well.&lt;/P&gt;
&lt;P&gt;The repost is like that:&lt;/P&gt;
&lt;P&gt;I did add schedule directive like "schedule(dymanic)"&lt;/P&gt;
&lt;P&gt;or "schedule(guide)", it didnot work. But I have not&lt;/P&gt;
&lt;P&gt;tried "schedule(dymatic, 10000). I'll try it.&lt;/P&gt;
&lt;P&gt;What's the meaning of "false-sharing in ompDblValues&lt;/P&gt;
&lt;P&gt;array"? I cannot figure it out.&lt;BR /&gt;Why should I try something like that:&lt;/P&gt;
&lt;P&gt;size_t const cache_line_size = 128;&lt;/P&gt;
&lt;P&gt;struct X&lt;/P&gt;
&lt;P&gt;{&lt;/P&gt;
&lt;P&gt;double value;&lt;/P&gt;
&lt;P&gt;char pad [cache_line_size];&lt;BR /&gt;};&lt;/P&gt;
&lt;P&gt;static vector &lt;X&gt; ompDblValues;&lt;/X&gt;&lt;/P&gt;
&lt;P&gt;Thanks!&lt;/P&gt;
&lt;P&gt;Peter&lt;/P&gt;
&lt;P&gt;Possible reason is that compiler has chosen schedule&lt;/P&gt;
&lt;P&gt;with granularity of single for-loop iteration. To fix&lt;/P&gt;
&lt;P&gt;this you must add schedule directive with specified&lt;/P&gt;
&lt;P&gt;granularity:&lt;/P&gt;
&lt;P&gt;#pragma omp parallel for schedule(dymanic, 10000)&lt;/P&gt;
&lt;P&gt;&lt;BR /&gt;Second reason is false-sharing in ompDblValues array.&lt;/P&gt;
&lt;P&gt;If want to do reduction manually then you must use&lt;/P&gt;
&lt;P&gt;something like this:&lt;/P&gt;
&lt;P&gt;size_t const cache_line_size = 128;&lt;/P&gt;
&lt;P&gt;struct X&lt;/P&gt;
&lt;P&gt;{&lt;/P&gt;
&lt;P&gt;double value;&lt;/P&gt;
&lt;P&gt;char pad [cache_line_size];&lt;BR /&gt;};&lt;/P&gt;
&lt;P&gt;static vector &lt;X&gt; ompDblValues;&lt;/X&gt;&lt;/P&gt;
&lt;/DIV&gt;
&lt;/DIV&gt;
&lt;P&gt;Your post is unreadable, please repost it.&lt;/P&gt;
&lt;P&gt;&lt;/P&gt;
&lt;/EM&gt;&lt;/DIV&gt;
&lt;/DIV&gt;
&lt;/DIV&gt;
&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Mon, 08 Dec 2008 07:42:22 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/With-OMP-is-slower-than-without-OMP-why/m-p/888590#M3664</guid>
      <dc:creator>bigknife</dc:creator>
      <dc:date>2008-12-08T07:42:22Z</dc:date>
    </item>
    <item>
      <title>Re: With OMP is slower than without OMP, why?</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/With-OMP-is-slower-than-without-OMP-why/m-p/888591#M3665</link>
      <description>&lt;P&gt;When different cores/processors write data to memory locations which are situated in one cache line, this imposes huge performance overheads (hundreds of cycles).&lt;/P&gt;
&lt;P&gt;When different cores/processors write to single memory location, it is called [just] sharing.&lt;/P&gt;
&lt;P&gt;When different cores/processors write to different memory locations (but still situated in one cache line), it is called false-sharing.&lt;/P&gt;
&lt;P&gt;Both things totally destroy performance and scalability on multi-core/multi-processor systems.&lt;/P&gt;</description>
      <pubDate>Wed, 24 Dec 2008 16:12:22 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/With-OMP-is-slower-than-without-OMP-why/m-p/888591#M3665</guid>
      <dc:creator>Dmitry_Vyukov</dc:creator>
      <dc:date>2008-12-24T16:12:22Z</dc:date>
    </item>
  </channel>
</rss>

