<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic here are the results on in Intel® Moderncode for Parallel Architectures</title>
    <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006573#M6433</link>
    <description>&lt;P&gt;here are the results on threads&lt;/P&gt;

&lt;P&gt;
	&lt;STYLE type="text/css"&gt;&amp;lt;!--td {border: 1px solid #ccc;}br {mso-data-placement:same-cell;}--&amp;gt;
	&lt;/STYLE&gt;
&lt;/P&gt;

&lt;TABLE dir="ltr" style="table-layout:fixed;font-size:13px;font-family:Calibri;border-collapse:collapse;border:1px solid #ccc" border="1" cellpadding="0" cellspacing="0"&gt;
	&lt;COLGROUP&gt;
		&lt;COL width="149" /&gt;
		&lt;COL width="132" /&gt;&lt;/COLGROUP&gt;
	&lt;TBODY&gt;
		&lt;TR style="height:20px;"&gt;
			&lt;TD data-sheets-value="[null,2,&amp;quot;Xeon&amp;quot;]" style="padding:0px 3px 0px 3px;vertical-align:bottom;padding-top:0px;padding-left:3px;padding-bottom:0px;padding-right:3px;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;"&gt;Xeon&lt;/TD&gt;
			&lt;TD data-sheets-value="[null,2,&amp;quot;timings&amp;quot;]" style="padding:0px 3px 0px 3px;vertical-align:bottom;padding-top:0px;padding-left:3px;padding-bottom:0px;padding-right:3px;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;"&gt;timings&lt;/TD&gt;
		&lt;/TR&gt;
		&lt;TR style="height:20px;"&gt;
			&lt;TD data-sheets-value="[null,3,null,1]" style="padding:0px 3px 0px 3px;vertical-align:bottom;padding-top:0px;padding-left:3px;padding-bottom:0px;padding-right:3px;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;text-align:right;"&gt;1&lt;/TD&gt;
			&lt;TD data-sheets-numberformat="[null,2,&amp;quot;0.00E+00&amp;quot;,1]" data-sheets-value="[null,3,null,0.0109815597534179]" style="padding:0px 3px 0px 3px;vertical-align:bottom;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;text-align:right;"&gt;1.10E-02&lt;/TD&gt;
		&lt;/TR&gt;
		&lt;TR style="height:20px;"&gt;
			&lt;TD data-sheets-value="[null,3,null,2]" style="padding:0px 3px 0px 3px;vertical-align:bottom;padding-top:0px;padding-left:3px;padding-bottom:0px;padding-right:3px;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;text-align:right;"&gt;2&lt;/TD&gt;
			&lt;TD data-sheets-numberformat="[null,2,&amp;quot;0.00E+00&amp;quot;,1]" data-sheets-value="[null,3,null,0.00826859474182128]" style="padding:0px 3px 0px 3px;vertical-align:bottom;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;text-align:right;"&gt;8.27E-03&lt;/TD&gt;
		&lt;/TR&gt;
		&lt;TR style="height:20px;"&gt;
			&lt;TD data-sheets-value="[null,3,null,4]" style="padding:0px 3px 0px 3px;vertical-align:bottom;padding-top:0px;padding-left:3px;padding-bottom:0px;padding-right:3px;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;text-align:right;"&gt;4&lt;/TD&gt;
			&lt;TD data-sheets-numberformat="[null,2,&amp;quot;0.00E+00&amp;quot;,1]" data-sheets-value="[null,3,null,0.00637030601501464]" style="padding:0px 3px 0px 3px;vertical-align:bottom;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;text-align:right;"&gt;6.37E-03&lt;/TD&gt;
		&lt;/TR&gt;
		&lt;TR style="height:20px;"&gt;
			&lt;TD data-sheets-value="[null,3,null,8]" style="padding:0px 3px 0px 3px;vertical-align:bottom;padding-top:0px;padding-left:3px;padding-bottom:0px;padding-right:3px;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;text-align:right;"&gt;8&lt;/TD&gt;
			&lt;TD data-sheets-numberformat="[null,2,&amp;quot;0.00E+00&amp;quot;,1]" data-sheets-value="[null,3,null,0.00366783142089843]" style="padding:0px 3px 0px 3px;vertical-align:bottom;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;text-align:right;"&gt;3.67E-03&lt;/TD&gt;
		&lt;/TR&gt;
		&lt;TR style="height:20px;"&gt;
			&lt;TD data-sheets-value="[null,3,null,16]" style="padding:0px 3px 0px 3px;vertical-align:bottom;padding-top:0px;padding-left:3px;padding-bottom:0px;padding-right:3px;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;text-align:right;"&gt;16&lt;/TD&gt;
			&lt;TD data-sheets-numberformat="[null,2,&amp;quot;0.00E+00&amp;quot;,1]" data-sheets-value="[null,3,null,0.00264596939086914]" style="padding:0px 3px 0px 3px;vertical-align:bottom;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;text-align:right;"&gt;2.65E-03&lt;/TD&gt;
		&lt;/TR&gt;
	&lt;/TBODY&gt;
&lt;/TABLE&gt;</description>
    <pubDate>Mon, 31 Aug 2015 09:37:45 GMT</pubDate>
    <dc:creator>aketh_t_</dc:creator>
    <dc:date>2015-08-31T09:37:45Z</dc:date>
    <item>
      <title>problem in scaling the code</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006572#M6432</link>
      <description>&lt;P&gt;Hi all,&lt;/P&gt;

&lt;P&gt;Last time I posted a query and the problem was solved by fusing all the omp regions.&lt;/P&gt;

&lt;P&gt;However this time the problem seems to be with scalability on openmp.&lt;/P&gt;

&lt;P&gt;The code doesn't scale well.&lt;/P&gt;

&lt;P&gt;Its about 3X on 16 Xeon cores(Intel(R) Xeon(R)&amp;nbsp; E5-2650 v2)&lt;/P&gt;

&lt;P&gt;How to improve scalability??&lt;/P&gt;

&lt;P&gt;here is the code&lt;/P&gt;

&lt;PRE class="brush:fortran;"&gt;do k=1,km-1

        do kk=1,2

          starttime = omp_get_wtime()

          !$OMP PARALLEL PRIVATE(I)DEFAULT(SHARED)
 
          !$omp do  
          do j=1,ny_block
           do i=1,nx_block
 
          LMASK(i,j) = TLT%K_LEVEL(i,j,bid) == k  .and.            &amp;amp;
                       TLT%K_LEVEL(i,j,bid) &amp;lt; KMT(i,j,bid)  .and.  &amp;amp;
                       TLT%ZTW(i,j,bid) == 1


            if ( LMASK(i,j) ) then 

             WORK1(i,j,kk) =  KAPPA_THIC(i,j,kbt,k,bid)  &amp;amp;
                           * SLX(i,j,kk,kbt,k,bid) * dz(k)

             WORK2(i,j,kk) = c2 * dzwr(k) * ( WORK1(i,j,kk)            &amp;amp;
              - KAPPA_THIC(i,j,ktp,k+1,bid) * SLX(i,j,kk,ktp,k+1,bid) &amp;amp;
                                            * dz(k+1) )

             WORK2_NEXT(i,j) = c2 * ( &amp;amp;
              KAPPA_THIC(i,j,ktp,k+1,bid) * SLX(i,j,kk,ktp,k+1,bid) - &amp;amp;
              KAPPA_THIC(i,j,kbt,k+1,bid) * SLX(i,j,kk,kbt,k+1,bid) )

             WORK3(i,j,kk) =  KAPPA_THIC(i,j,kbt,k,bid)  &amp;amp;
                           * SLY(i,j,kk,kbt,k,bid) * dz(k)

             WORK4(i,j,kk) = c2 * dzwr(k) * ( WORK3(i,j,kk)            &amp;amp;
              - KAPPA_THIC(i,j,ktp,k+1,bid) * SLY(i,j,kk,ktp,k+1,bid) &amp;amp;
                                            * dz(k+1) )

             WORK4_NEXT(i,j) = c2 * ( &amp;amp;
              KAPPA_THIC(i,j,ktp,k+1,bid) * SLY(i,j,kk,ktp,k+1,bid) - &amp;amp;
              KAPPA_THIC(i,j,kbt,k+1,bid) * SLY(i,j,kk,kbt,k+1,bid) )

            endif

            if( LMASK(i,j) .and. abs( WORK2_NEXT(i,j) ) &amp;lt; abs( WORK2(i,j,kk) ) ) then 

             WORK2(i,j,kk) = WORK2_NEXT(i,j)

            endif

           if ( LMASK(i,j) .and. abs( WORK4_NEXT(i,j) ) &amp;lt; abs( WORK4(i,j,kk ) ) ) then 
             WORK4(i,j,kk) = WORK4_NEXT(i,j)
           endif

          LMASK(i,j) = TLT%K_LEVEL(i,j,bid) == k  .and.           &amp;amp;
                       TLT%K_LEVEL(i,j,bid) &amp;lt; KMT(i,j,bid)  .and. &amp;amp;
                       TLT%ZTW(i,j,bid) == 2

          if ( LMASK(i,j) ) then

            WORK1(i,j,kk) =  KAPPA_THIC(i,j,ktp,k+1,bid)     &amp;amp; 
                           * SLX(i,j,kk,ktp,k+1,bid)

            WORK2(i,j,kk) =  c2 * ( WORK1(i,j,kk)                 &amp;amp;
                           - ( KAPPA_THIC(i,j,kbt,k+1,bid)        &amp;amp;
                              * SLX(i,j,kk,kbt,k+1,bid) ) )

            WORK1(i,j,kk) = WORK1(i,j,kk) * dz(k+1)

            WORK3(i,j,kk) =  KAPPA_THIC(i,j,ktp,k+1,bid)     &amp;amp;
                           * SLY(i,j,kk,ktp,k+1,bid)

            WORK4(i,j,kk) =  c2 * ( WORK3(i,j,kk)                 &amp;amp;
                           - ( KAPPA_THIC(i,j,kbt,k+1,bid)        &amp;amp;
                              * SLY(i,j,kk,kbt,k+1,bid) ) )

            WORK3(i,j,kk) = WORK3(i,j,kk) * dz(k+1)

            endif
 
          LMASK(i,j) = LMASK(i,j) .and. TLT%K_LEVEL(i,j,bid) + 1 &amp;lt; KMT(i,j,bid)

          if (k.lt.km-1) then ! added to avoid out of bounds access

            if( LMASK(i,j) ) then

              WORK2_NEXT(i,j) = c2 * dzwr(k+1) * ( &amp;amp;
                KAPPA_THIC(i,j,kbt,k+1,bid) * SLX(i,j,kk,kbt,k+1,bid) * dz(k+1) - &amp;amp;
                KAPPA_THIC(i,j,ktp,k+2,bid) * SLX(i,j,kk,ktp,k+2,bid) * dz(k+2))

              WORK4_NEXT(i,j) = c2 * dzwr(k+1) * ( &amp;amp;
                KAPPA_THIC(i,j,kbt,k+1,bid) * SLY(i,j,kk,kbt,k+1,bid) * dz(k+1) - &amp;amp;
                KAPPA_THIC(i,j,ktp,k+2,bid) * SLY(i,j,kk,ktp,k+2,bid) * dz(k+2))

              endif 

          end if
             
          if( LMASK(i,j) .and. abs( WORK2_NEXT(i,j) ) &amp;lt; abs( WORK2(i,j,kk) ) ) &amp;amp;
            WORK2(i,j,kk) = WORK2_NEXT(i,j)

          if( LMASK(i,j) .and. abs(WORK4_NEXT(i,j)) &amp;lt; abs(WORK4(i,j,kk)) ) &amp;amp;
            WORK4(i,j,kk) = WORK4_NEXT(i,j)

             enddo
          enddo
          !$omp end do

          !$OMP END PARALLEL

          endtime = omp_get_wtime()

          total = total + (endtime - starttime)

        enddo
      enddo&lt;/PRE&gt;

&lt;P&gt;Also attached is the standalone working code I have created, so that you guys could run it on your machines if you wish.&lt;/P&gt;

&lt;P&gt;The attached code is a standalone version created from a larger code piece.&lt;/P&gt;</description>
      <pubDate>Mon, 31 Aug 2015 09:36:06 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006572#M6432</guid>
      <dc:creator>aketh_t_</dc:creator>
      <dc:date>2015-08-31T09:36:06Z</dc:date>
    </item>
    <item>
      <title>here are the results on</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006573#M6433</link>
      <description>&lt;P&gt;here are the results on threads&lt;/P&gt;

&lt;P&gt;
	&lt;STYLE type="text/css"&gt;&amp;lt;!--td {border: 1px solid #ccc;}br {mso-data-placement:same-cell;}--&amp;gt;
	&lt;/STYLE&gt;
&lt;/P&gt;

&lt;TABLE dir="ltr" style="table-layout:fixed;font-size:13px;font-family:Calibri;border-collapse:collapse;border:1px solid #ccc" border="1" cellpadding="0" cellspacing="0"&gt;
	&lt;COLGROUP&gt;
		&lt;COL width="149" /&gt;
		&lt;COL width="132" /&gt;&lt;/COLGROUP&gt;
	&lt;TBODY&gt;
		&lt;TR style="height:20px;"&gt;
			&lt;TD data-sheets-value="[null,2,&amp;quot;Xeon&amp;quot;]" style="padding:0px 3px 0px 3px;vertical-align:bottom;padding-top:0px;padding-left:3px;padding-bottom:0px;padding-right:3px;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;"&gt;Xeon&lt;/TD&gt;
			&lt;TD data-sheets-value="[null,2,&amp;quot;timings&amp;quot;]" style="padding:0px 3px 0px 3px;vertical-align:bottom;padding-top:0px;padding-left:3px;padding-bottom:0px;padding-right:3px;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;"&gt;timings&lt;/TD&gt;
		&lt;/TR&gt;
		&lt;TR style="height:20px;"&gt;
			&lt;TD data-sheets-value="[null,3,null,1]" style="padding:0px 3px 0px 3px;vertical-align:bottom;padding-top:0px;padding-left:3px;padding-bottom:0px;padding-right:3px;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;text-align:right;"&gt;1&lt;/TD&gt;
			&lt;TD data-sheets-numberformat="[null,2,&amp;quot;0.00E+00&amp;quot;,1]" data-sheets-value="[null,3,null,0.0109815597534179]" style="padding:0px 3px 0px 3px;vertical-align:bottom;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;text-align:right;"&gt;1.10E-02&lt;/TD&gt;
		&lt;/TR&gt;
		&lt;TR style="height:20px;"&gt;
			&lt;TD data-sheets-value="[null,3,null,2]" style="padding:0px 3px 0px 3px;vertical-align:bottom;padding-top:0px;padding-left:3px;padding-bottom:0px;padding-right:3px;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;text-align:right;"&gt;2&lt;/TD&gt;
			&lt;TD data-sheets-numberformat="[null,2,&amp;quot;0.00E+00&amp;quot;,1]" data-sheets-value="[null,3,null,0.00826859474182128]" style="padding:0px 3px 0px 3px;vertical-align:bottom;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;text-align:right;"&gt;8.27E-03&lt;/TD&gt;
		&lt;/TR&gt;
		&lt;TR style="height:20px;"&gt;
			&lt;TD data-sheets-value="[null,3,null,4]" style="padding:0px 3px 0px 3px;vertical-align:bottom;padding-top:0px;padding-left:3px;padding-bottom:0px;padding-right:3px;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;text-align:right;"&gt;4&lt;/TD&gt;
			&lt;TD data-sheets-numberformat="[null,2,&amp;quot;0.00E+00&amp;quot;,1]" data-sheets-value="[null,3,null,0.00637030601501464]" style="padding:0px 3px 0px 3px;vertical-align:bottom;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;text-align:right;"&gt;6.37E-03&lt;/TD&gt;
		&lt;/TR&gt;
		&lt;TR style="height:20px;"&gt;
			&lt;TD data-sheets-value="[null,3,null,8]" style="padding:0px 3px 0px 3px;vertical-align:bottom;padding-top:0px;padding-left:3px;padding-bottom:0px;padding-right:3px;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;text-align:right;"&gt;8&lt;/TD&gt;
			&lt;TD data-sheets-numberformat="[null,2,&amp;quot;0.00E+00&amp;quot;,1]" data-sheets-value="[null,3,null,0.00366783142089843]" style="padding:0px 3px 0px 3px;vertical-align:bottom;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;text-align:right;"&gt;3.67E-03&lt;/TD&gt;
		&lt;/TR&gt;
		&lt;TR style="height:20px;"&gt;
			&lt;TD data-sheets-value="[null,3,null,16]" style="padding:0px 3px 0px 3px;vertical-align:bottom;padding-top:0px;padding-left:3px;padding-bottom:0px;padding-right:3px;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;text-align:right;"&gt;16&lt;/TD&gt;
			&lt;TD data-sheets-numberformat="[null,2,&amp;quot;0.00E+00&amp;quot;,1]" data-sheets-value="[null,3,null,0.00264596939086914]" style="padding:0px 3px 0px 3px;vertical-align:bottom;font-family:Calibri;font-size:110%;color:#000000;wrap-strategy:0;white-space:nowrap;text-align:right;"&gt;2.65E-03&lt;/TD&gt;
		&lt;/TR&gt;
	&lt;/TBODY&gt;
&lt;/TABLE&gt;</description>
      <pubDate>Mon, 31 Aug 2015 09:37:45 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006573#M6433</guid>
      <dc:creator>aketh_t_</dc:creator>
      <dc:date>2015-08-31T09:37:45Z</dc:date>
    </item>
    <item>
      <title>This looks like code posted</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006574#M6434</link>
      <description>&lt;P&gt;This looks like code posted elsewhere.&lt;/P&gt;

&lt;P&gt;The runtime of the parallel region is too short to make effective use of parallelization. Your parallel region is inside your do k=1,km-1 and do kk=1,2 loops&amp;nbsp;or (60-1)*2 = 118 iterations. The time you display is for the sum of the runtimes of the enclosed parallel region. Yielding parallel region run times of 1/118'th that listed:&lt;/P&gt;

&lt;P&gt;1&amp;nbsp; 9.32203E-05&lt;BR /&gt;
	2&amp;nbsp; 7.00847E-05&lt;BR /&gt;
	4&amp;nbsp; 5.39831E-05&lt;BR /&gt;
	8&amp;nbsp; 3.11017E-05&lt;BR /&gt;
	16 2.16949E-05&lt;/P&gt;

&lt;P&gt;Starting and ending a parallel region has some expense that has to be amortized. Restructure your code to locate the parallel region further out (if possible at the do k= level).&lt;/P&gt;

&lt;P&gt;Your code is not incrementing the variable bid.&lt;BR /&gt;
	Your LMASK array can be replaced with a scalar.&lt;BR /&gt;
	Your outputs are being overwritten (missing code or bug).&lt;/P&gt;

&lt;P&gt;Jim Dempsey&lt;/P&gt;</description>
      <pubDate>Mon, 31 Aug 2015 12:45:30 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006574#M6434</guid>
      <dc:creator>jimdempseyatthecove</dc:creator>
      <dc:date>2015-08-31T12:45:30Z</dc:date>
    </item>
    <item>
      <title>As I told you the changed.F90</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006575#M6435</link>
      <description>&lt;P&gt;As I told you the changed.F90 is part of a huge file which you will not be able to run.&lt;/P&gt;

&lt;P&gt;Its just a subroutine of CESM.&lt;/P&gt;

&lt;P&gt;at K level it isn't parallel.&lt;/P&gt;

&lt;P&gt;So at kk as well.&lt;/P&gt;

&lt;P&gt;Look at the updates at WORK1 you will notice why.&lt;/P&gt;

&lt;P&gt;variables are overwritten??&lt;/P&gt;

&lt;P&gt;if you mean I am changing work1(1,1,1) from current iteration to next iteration that is true. it works like that itself.&lt;/P&gt;</description>
      <pubDate>Mon, 31 Aug 2015 12:51:37 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006575#M6435</guid>
      <dc:creator>aketh_t_</dc:creator>
      <dc:date>2015-08-31T12:51:37Z</dc:date>
    </item>
    <item>
      <title>Lasttime the issue was with</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006576#M6436</link>
      <description>&lt;P&gt;Lasttime the issue was with speedup.&lt;/P&gt;

&lt;P&gt;Now the issue is with scaling.&lt;/P&gt;

&lt;P&gt;you can increase the nx_block and ny_block to a huge value and increase the code runtime and demonstrate speedup as well.&lt;/P&gt;

&lt;P&gt;I was more concerned with false-sharing. Is it not a huge overhead in my code considering all the shared variables?&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Mon, 31 Aug 2015 12:54:00 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006576#M6436</guid>
      <dc:creator>aketh_t_</dc:creator>
      <dc:date>2015-08-31T12:54:00Z</dc:date>
    </item>
    <item>
      <title>bid is a constant per entry</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006577#M6437</link>
      <description>&lt;P&gt;bid is a constant per entry to the subroutine of CESM. so it wont change.&lt;/P&gt;

&lt;P&gt;My code looks strange, but I can't explain why. that is how it is.&lt;/P&gt;
      <pubDate>Mon, 31 Aug 2015 12:55:54 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006577#M6437</guid>
      <dc:creator>aketh_t_</dc:creator>
      <dc:date>2015-08-31T12:55:54Z</dc:date>
    </item>
    <item>
      <title>If you replace the LMASK</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006578#M6438</link>
      <description>&lt;P&gt;If you replace the LMASK array with LMASK scalar (because each thread references only one cell per iteration)&amp;nbsp;then the LMASK setting won't write to memory and won't false share on the ~ first 8 and last 8 cells of the thread's zone of LMASK as array. Even without the false sharing of LMASK as an array, this will save the writing to RAM and to some lesser extent the Read Modify Write of the cache line containing the cell for LMASK(i,j)&lt;/P&gt;

&lt;P&gt;To reduce the number of entry/exit of parallel regions (assuming missing code permits this) consider:&lt;/P&gt;

&lt;PRE class="brush:fortran;"&gt;starttime = omp_get_wtime()
!$OMP PARALLEL PRIVATE(I, k, kk) DEFAULT(SHARED)
! note, all threads redundantly executing these two loops
do k=1,km-1
&amp;nbsp; do kk=1,2
&amp;nbsp;&amp;nbsp;&amp;nbsp; ! slice the inner range&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 
&amp;nbsp;&amp;nbsp;&amp;nbsp; !$omp do&amp;nbsp; 
&amp;nbsp;&amp;nbsp;&amp;nbsp; do j=1,ny_block
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; do i=1,nx_block
... 
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; enddo
&amp;nbsp;&amp;nbsp;&amp;nbsp; enddo
&amp;nbsp;&amp;nbsp;&amp;nbsp; !$omp end do
&amp;nbsp;&amp;nbsp;&amp;nbsp; ! The above !$OMP has an implied barrier
&amp;nbsp;&amp;nbsp;&amp;nbsp; ! this will keep the k and kk loops in sync
&amp;nbsp; enddo
enddo
!$OMP END PARALLEL
endtime = omp_get_wtime()
total = total + (endtime - starttime)
&lt;/PRE&gt;

&lt;P&gt;Jim Dempsey&lt;/P&gt;</description>
      <pubDate>Mon, 31 Aug 2015 14:28:52 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006578#M6438</guid>
      <dc:creator>jimdempseyatthecove</dc:creator>
      <dc:date>2015-08-31T14:28:52Z</dc:date>
    </item>
    <item>
      <title>i did try the LMASK. I have</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006579#M6439</link>
      <description>&lt;P&gt;i did try the LMASK. I have forgotten to add to make it part of the standalone code I have provided.&lt;/P&gt;

&lt;P&gt;It offered a small benefit.&lt;/P&gt;

&lt;P&gt;code was about 1.2-1.5X over the existing speedup we got.&lt;/P&gt;

&lt;P&gt;I will try this new method mentioned above. Seems good in theory. must check if it works well.&lt;/P&gt;</description>
      <pubDate>Mon, 31 Aug 2015 15:49:16 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006579#M6439</guid>
      <dc:creator>aketh_t_</dc:creator>
      <dc:date>2015-08-31T15:49:16Z</dc:date>
    </item>
    <item>
      <title>Although the compiler should</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006580#M6440</link>
      <description>&lt;P&gt;Although the compiler should optimize this:&lt;/P&gt;

&lt;PRE class="brush:fortran;"&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; WORK1(i,j,kk) =&amp;nbsp; KAPPA_THIC(i,j,ktp,k+1,bid)&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;amp; 
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; * SLX(i,j,kk,ktp,k+1,bid)

&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; WORK2(i,j,kk) =&amp;nbsp; c2 * ( WORK1(i,j,kk)&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;amp;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; - ( KAPPA_THIC(i,j,kbt,k+1,bid)&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;amp;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; * SLX(i,j,kk,kbt,k+1,bid) ) )

&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; WORK1(i,j,kk) = WORK1(i,j,kk) * dz(k+1)
&lt;/PRE&gt;

&lt;P&gt;consider using this instead:&lt;/P&gt;

&lt;PRE class="brush:fortran;"&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; WORK1temp =&amp;nbsp; KAPPA_THIC(i,j,ktp,k+1,bid)&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;amp; 
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; * SLX(i,j,kk,ktp,k+1,bid)

&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; WORK2(i,j,kk) =&amp;nbsp; c2 * ( WORK1temp&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;amp;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; - ( KAPPA_THIC(i,j,kbt,k+1,bid)&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;amp;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; * SLX(i,j,kk,kbt,k+1,bid) ) )

&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; WORK1(i,j,kk) = WORK1temp * dz(k+1)
&lt;/PRE&gt;

&lt;P&gt;And make similar use of temporaries where applicable.&lt;/P&gt;

&lt;P&gt;Jim Dempsey&lt;/P&gt;</description>
      <pubDate>Mon, 31 Aug 2015 16:37:41 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006580#M6440</guid>
      <dc:creator>jimdempseyatthecove</dc:creator>
      <dc:date>2015-08-31T16:37:41Z</dc:date>
    </item>
    <item>
      <title>You might want to experiment</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006581#M6441</link>
      <description>&lt;P&gt;You might want to experiment with changing:&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; if (k.lt.km-1) then ! added to avoid out of bounds access&lt;BR /&gt;
	&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; if( LMASK(i,j) ) then&lt;BR /&gt;
	&lt;BR /&gt;
	to&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; if (k.eq.km-1)&amp;nbsp;LMASK(i,j) = .false.&amp;nbsp;! added to avoid out of bounds access (remove endif)&lt;BR /&gt;
	&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;if( LMASK(i,j) ) then&lt;BR /&gt;
	&lt;BR /&gt;
	The reasoning for this is whereas the former code might not vectorize, the latter may have a better chance at vectorization.&lt;/P&gt;

&lt;P&gt;Jim Dempsey&lt;/P&gt;</description>
      <pubDate>Mon, 31 Aug 2015 16:51:16 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006581#M6441</guid>
      <dc:creator>jimdempseyatthecove</dc:creator>
      <dc:date>2015-08-31T16:51:16Z</dc:date>
    </item>
    <item>
      <title>i tried combining the</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006582#M6442</link>
      <description>&lt;P&gt;i tried combining the parallel region of K level as suggested.&lt;/P&gt;

&lt;P&gt;interestingly there isn't great improvement as such.&lt;/P&gt;

&lt;P&gt;its still approx 2.7E-03.&lt;/P&gt;

&lt;P&gt;Checking if using local variables help.&lt;/P&gt;</description>
      <pubDate>Tue, 01 Sep 2015 10:18:01 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006582#M6442</guid>
      <dc:creator>aketh_t_</dc:creator>
      <dc:date>2015-09-01T10:18:01Z</dc:date>
    </item>
    <item>
      <title>using local variable like</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006583#M6443</link>
      <description>&lt;P&gt;using local variable like worktemp seems to be a bad idea.&lt;/P&gt;

&lt;P&gt;must be because we are not vectorizing then?&lt;/P&gt;

&lt;P&gt;Performance has degraded.&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 01 Sep 2015 10:47:40 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006583#M6443</guid>
      <dc:creator>aketh_t_</dc:creator>
      <dc:date>2015-09-01T10:47:40Z</dc:date>
    </item>
    <item>
      <title>If WORKtemp knocked out</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006584#M6444</link>
      <description>&lt;P&gt;If WORKtemp knocked out vectorization (over use of array) then this is a lost optimization opportunity by the compiler. It may be a case that the major loop unde optimization has too many things to keep track of.&lt;/P&gt;

&lt;P&gt;Try the following (although I think it is redundant to do this):&lt;/P&gt;

&lt;PRE class="brush:fortran;"&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; ! slice the inner range&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 
&amp;nbsp;&amp;nbsp;&amp;nbsp; !$omp do&amp;nbsp; 
&amp;nbsp;&amp;nbsp;&amp;nbsp; do j=1,ny_block
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; !$omp simd
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; do i=1,nx_block
&lt;/PRE&gt;

&lt;P&gt;Jim Dempsey&lt;/P&gt;</description>
      <pubDate>Tue, 01 Sep 2015 12:38:58 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006584#M6444</guid>
      <dc:creator>jimdempseyatthecove</dc:creator>
      <dc:date>2015-09-01T12:38:58Z</dc:date>
    </item>
    <item>
      <title>Or</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006585#M6445</link>
      <description>&lt;P&gt;Or&lt;/P&gt;

&lt;PRE class="brush:fortran;"&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp; ! slice the inner range&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 
&amp;nbsp;&amp;nbsp;&amp;nbsp; !$omp do simd collapse(2)
&amp;nbsp;&amp;nbsp;&amp;nbsp; do j=1,ny_block
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; do i=1,nx_block
&lt;/PRE&gt;

&lt;P&gt;Jim Dempsey&lt;/P&gt;</description>
      <pubDate>Tue, 01 Sep 2015 12:42:08 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006585#M6445</guid>
      <dc:creator>jimdempseyatthecove</dc:creator>
      <dc:date>2015-09-01T12:42:08Z</dc:date>
    </item>
    <item>
      <title>I meant we must replace</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006586#M6446</link>
      <description>&lt;P&gt;I meant we must replace worktemp by worktemp(1:10) to work on vector lengths per iteration.&lt;/P&gt;</description>
      <pubDate>Tue, 01 Sep 2015 12:42:52 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006586#M6446</guid>
      <dc:creator>aketh_t_</dc:creator>
      <dc:date>2015-09-01T12:42:52Z</dc:date>
    </item>
    <item>
      <title>Try the simd clause with the</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006587#M6447</link>
      <description>&lt;P&gt;Try the simd clause with the scalar temp, then if necessary go back to the original array format using simd clause (both ways)&lt;/P&gt;

&lt;P&gt;Jim Dempsey&lt;/P&gt;</description>
      <pubDate>Tue, 01 Sep 2015 12:50:32 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006587#M6447</guid>
      <dc:creator>jimdempseyatthecove</dc:creator>
      <dc:date>2015-09-01T12:50:32Z</dc:date>
    </item>
    <item>
      <title>not helping either.</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006588#M6448</link>
      <description>&lt;P&gt;not helping either.&lt;/P&gt;

&lt;P&gt;seems like KAPPA and SLX have unaligned access. we may have to copy them to temporaries and work. that must help.&lt;/P&gt;

&lt;P&gt;also if you notice work1,work2 work2_next go hand in hand.&lt;/P&gt;

&lt;P&gt;I think breaking the loop to accommodate these dependencies will help.&lt;/P&gt;

&lt;P&gt;Anyway, I was following your QuickThread programming site.&lt;/P&gt;

&lt;P&gt;Do you publish papers or have you published any papers with these models?&lt;/P&gt;</description>
      <pubDate>Tue, 01 Sep 2015 13:02:22 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006588#M6448</guid>
      <dc:creator>aketh_t_</dc:creator>
      <dc:date>2015-09-01T13:02:22Z</dc:date>
    </item>
    <item>
      <title>I haven't done much with the</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006589#M6449</link>
      <description>&lt;P&gt;I haven't done much with the QuickThread threading toolkit since about 2009. I couldn't get enough followers to support continued development. Which is surprising seeing that it resolves(ed) virtually all of the threading issues from small single CPU multi-core&amp;nbsp;systems to large multi CPU many core (and NUMA) systems. I've even suggested to Intel&amp;nbsp;having me&amp;nbsp;incorporate the threading pool design concept into OpenMP (both C++ and Fortran) as an experimental extension. This then could be used as a test bed to either prove or disprove the merits of the extension. I am a strong proponent of making the pudding (proof of the pudding) and then letting the reviewers have a taste. Intel has experimented with non-standard extensions to OpenMP (e.g. CREW). And I am well familiar with the OpenMP limitations with regard to thread teaming.&lt;/P&gt;

&lt;P&gt;Consider a problem similar to your problem, only much larger run on a multi-node NUMA system. And where you know you could gain (significant) performance provided you could control the cache locality. This is extremely hard to do using current OpenMP features. This is especially true when one portion of your code favors one organization of thread teams and a different portion of your code favors a different thread teaming arrangement and considering that you do not want to oversubscribe threads or undersubscribe threads. There is no way to do this using current OpenMP features.&lt;/P&gt;

&lt;P&gt;**** HYPOTHETICAL OpenMP extension ****&lt;/P&gt;

&lt;PRE class="brush:fortran;"&gt;! Outer loop distributed by NUMA node
!$OMP PARALLEL DO TEAM(OneEachM0$) collapse(2) PRIVATE(I, k, kk) DEFAULT(SHARED)
do k=1,km-1
&amp;nbsp; do kk=1,2
&amp;nbsp;&amp;nbsp;&amp;nbsp; ! slice the inner range&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 
&amp;nbsp;&amp;nbsp;&amp;nbsp; !$omp parallel do TEAM(OneEachL2WithinM0$) ! slice by core within NUMA node
&amp;nbsp;&amp;nbsp;&amp;nbsp; do j=1,ny_block
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; !$omp parallel do TEAM(L2$) ! slice by threads within core
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; do i=1,nx_block
... 
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; enddo
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; !$omp end parallel do
&amp;nbsp;&amp;nbsp;&amp;nbsp; enddo
&amp;nbsp;&amp;nbsp;&amp;nbsp; !$omp end parallel do
&amp;nbsp; enddo
enddo
!$OMP END PARALLEL do

&lt;/PRE&gt;

&lt;P&gt;**** end&amp;nbsp;HYPOTHETICAL OpenMP extension ****&lt;/P&gt;

&lt;P&gt;Notes:&lt;/P&gt;

&lt;P&gt;There is no requirement for any convoluted KMP_AFFINITY or OMP_PLACE_THREADS (which may not be ideal for all portions of the application).&lt;/P&gt;

&lt;P&gt;There is no requirement for the application to know anything about the topology. On a small 1 CPU system the outer loop has a team of 1.&lt;/P&gt;

&lt;P&gt;Unlike standard OpenMP nested parallelism, the proposed system constitutes a single thread pool, affinity pinned for compute threads, and non-pinned higher priority threads for I/O. The single pool is organizable in any manner you choose at point of call (!$OMP...TEAM(...)) and it does so without creating a new pool (thus avoids oversubscription). This organization is not constricted by an arrangement setup by way of environment variables (other than those used to constrict the application to a subset of the available logical processors).&lt;/P&gt;

&lt;P&gt;As a further extension, assume that at some level of the above&amp;nbsp;nested loop structure you need to output results data. With standard OpenMP how would you do this without mucking up the running of the nested loop? About the only avenues you have are a) create an independent thread using pthreads (or other such thread), b) Use the OpenMP TASK but your nested loop is fully subscribed so this will either oversubscribe or most likely perform the task immediately thus effectively blocking the current thread through the duration of any I/O wait, or c) perform the I/O directly in the thread, possibly within a critical section, thus blocking not only all compute threads&amp;nbsp;wishing I/O while another thread is&amp;nbsp;pending I/O. None of those three "solutions" are particularly appealing.&lt;/P&gt;

&lt;P&gt;A better way would be to borrow the QuickThread feature of having an I/O thread or pool of threads.&lt;/P&gt;

&lt;P&gt;**** HYPOTHETICAL OpenMP extension ****&lt;/P&gt;

&lt;PRE class="brush:fortran;"&gt;! Outer loop distributed by NUMA node
!$OMP PARALLEL DO TEAM(OneEachM0$) PRIVATE(I, k, kk) DEFAULT(SHARED)
do k=1,km-1
&amp;nbsp; do kk=1,2
&amp;nbsp;&amp;nbsp;&amp;nbsp; ! slice the inner range&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; 
&amp;nbsp;&amp;nbsp;&amp;nbsp; !$omp parallel do TEAM(OneEachL2$) ! slice by core 
&amp;nbsp;&amp;nbsp;&amp;nbsp; do j=1,ny_block
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; !$omp parallel do TEAM(L2$) ! slice by threads within core
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; do i=1,nx_block
... 
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; enddo
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; !$omp end parallel do
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; !$omp task team(IO$) private(args) ! enqueue to I/O thread
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; call yourIOroutine(args)
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; !$omp end task
&amp;nbsp;&amp;nbsp;&amp;nbsp; enddo
&amp;nbsp;&amp;nbsp;&amp;nbsp; !$omp end parallel do
&amp;nbsp; enddo
enddo
!$OMP END PARALLEL do

&lt;/PRE&gt;

&lt;P&gt;**** end&amp;nbsp;HYPOTHETICAL OpenMP extension ****&lt;/P&gt;

&lt;P&gt;The above new statement would enqueue the task into a higher priority thread queue, whose threads are typically not affinity pinned (i.e. will run on any logical processor, preferably a waiting logical processor), and most importantly, while the higher priority thread is blocked performing I/O, the lesser priority compute thread that was preempted is permitted to resume. You can also optionally constrict the I/O threads to one or more logical processors (e.g. those of the last core on a Xeon Phi which are otherwise not used, or on the socket that is directly connected to an I/O device, or a thread that "owns" the GPU).&lt;/P&gt;

&lt;P&gt;Any comments by the community as to the desirability of features like this would be welcome.&lt;/P&gt;

&lt;P&gt;Jim Dempsey&lt;/P&gt;</description>
      <pubDate>Tue, 01 Sep 2015 16:54:22 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006589#M6449</guid>
      <dc:creator>jimdempseyatthecove</dc:creator>
      <dc:date>2015-09-01T16:54:22Z</dc:date>
    </item>
    <item>
      <title>I think we've said it before:</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006590#M6450</link>
      <description>&lt;P&gt;I think we've said it before: the alignments which are important are of the arrays which are stored. If all the work arrays are set up under -align array32byte and the leading dimension is a multiple of 32 bytes, you can assert those arrays aligned.&amp;nbsp;It does look as if your scaling would be limited by memory bandwidth.&lt;/P&gt;</description>
      <pubDate>Tue, 01 Sep 2015 18:28:04 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006590#M6450</guid>
      <dc:creator>TimP</dc:creator>
      <dc:date>2015-09-01T18:28:04Z</dc:date>
    </item>
    <item>
      <title>Experiment with associate.</title>
      <link>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006591#M6451</link>
      <description>&lt;P&gt;Experiment with associate. Something like this (be careful):&lt;/P&gt;

&lt;PRE class="brush:fortran;"&gt;
associate (TLT_K_LEVELa =&amp;gt; TLT%K_LEVEL(:,:,bid), &amp;amp;
&amp;amp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; KMTa =&amp;gt; KMT(:,:,bid), &amp;amp;
&amp;amp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; TLT_ZTWa =&amp;gt; TLT%ZTW(:,:,bid), &amp;amp;
&amp;amp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; ...) ! end of outer level associate
do k=1,km-1
&amp;nbsp; associate (dza =&amp;gt; dz(k), &amp;amp;
&amp;amp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; dza1 =&amp;gt; dz(k+1), &amp;amp;
&amp;amp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; ...) ! end of k level associate
&amp;nbsp; do kk=1,2
&amp;nbsp;&amp;nbsp;&amp;nbsp; starttime = omp_get_wtime()
&amp;nbsp;&amp;nbsp;&amp;nbsp; ! be careful of the kbt, ktp, k+1, ... associates
&amp;nbsp;&amp;nbsp;&amp;nbsp; associate (WORK1a =&amp;gt; WORK1(:,:,kk), &amp;amp;
&amp;amp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; WORK2a =&amp;gt; WORK2(:,:,kk), &amp;amp;
&amp;amp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; WORK3a =&amp;gt; WORK3(:,:,kk), &amp;amp;
&amp;amp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; WORK4a =&amp;gt; WORK4(:,:,kk), &amp;amp;
&amp;amp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; KAPPA_THICa =&amp;gt; KAPPA_THIC(:,:,kbt,k,bid), &amp;amp;
&amp;amp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; SLXa =&amp;gt; SLX(:,:,kk,kbt,k,bid), &amp;amp;
&amp;amp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; SLXa1 =&amp;gt; SLX(i,j,kk,ktp,k+1,bid) &amp;amp; 
&amp;amp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; ...) ! end of kk level associate
&amp;nbsp;&amp;nbsp;&amp;nbsp; !$OMP PARALLEL
&amp;nbsp;&amp;nbsp;&amp;nbsp; !$omp do&amp;nbsp; 
&amp;nbsp;&amp;nbsp;&amp;nbsp; do j=1,ny_block
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; do i=1,nx_block
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; LMASK(i,j) = TLT_K_LEVELa(i,j) == k&amp;nbsp; .and.&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;amp;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; TLT_K_LEVELa(i,j) &amp;lt; KMTa(i,j)&amp;nbsp; .and.&amp;nbsp; &amp;amp;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; TLT_ZTWa(i,j) == 1

&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; if ( LMASK(i,j) ) then 
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; WORK1a(i,j) =&amp;nbsp; KAPPA_THICa(i,j)&amp;nbsp; &amp;amp;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; * SLXa(i,j) * dza
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; WORK2a(i,j) = c2 * dzwra * ( WORK1a(i,j)&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; &amp;amp;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; - KAPPA_THICa1(i,j) * SLXa1(i,j) &amp;amp;
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; * dza1 )
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; ...
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; endif
&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp; enddo ! i
&amp;nbsp;&amp;nbsp;&amp;nbsp; enddo ! j
&amp;nbsp;&amp;nbsp;&amp;nbsp; !$omp end do
&amp;nbsp;&amp;nbsp;&amp;nbsp; !$OMP END PARALLEL
&amp;nbsp;&amp;nbsp;&amp;nbsp; end associate ! kk level
&amp;nbsp;&amp;nbsp;&amp;nbsp; endtime = omp_get_wtime()
&amp;nbsp;&amp;nbsp;&amp;nbsp; total = total + (endtime - starttime)
&amp;nbsp; enddo
&amp;nbsp; end associate ! k level
enddo
end associate ! outer level

&lt;/PRE&gt;

&lt;P&gt;Jim Dempsey&lt;/P&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 01 Sep 2015 21:58:34 GMT</pubDate>
      <guid>https://community.intel.com/t5/Intel-Moderncode-for-Parallel/problem-in-scaling-the-code/m-p/1006591#M6451</guid>
      <dc:creator>jimdempseyatthecove</dc:creator>
      <dc:date>2015-09-01T21:58:34Z</dc:date>
    </item>
  </channel>
</rss>

