topic >> Without using LMASK?? in Intel® Moderncode for Parallel Architectures

OpenMP performance lower than expected

aketh_t_ — Wed, 22 Jul 2015 09:56:51 GMT

Hi all,

below is the code I have been trying to parallelize with OpenMP

do k = 1, km-1
  do kk = 1, 2

    starttime = omp_get_wtime()

    ! One parallel region per (k,kk) pair.  It contains ten separate
    ! work-shared j-loops; each "!$omp end do" has no nowait clause.
    !$omp parallel default(shared) private(i)

    ! Loop 1: build the first mask (K_LEVEL == k, K_LEVEL < KMT, ZTW == 1).
    !$omp do
    do j = 1, ny_block
      do i = 1, nx_block
        LMASK(i,j) = TLT%K_LEVEL(i,j,bid) == k .and.             &
                     TLT%K_LEVEL(i,j,bid) < KMT(i,j,bid) .and.   &
                     TLT%ZTW(i,j,bid) == 1
      end do
    end do
    !$omp end do

    ! Loop 2: fill WORK1..WORK4 and the *_NEXT candidates on masked points.
    !$omp do
    do j = 1, ny_block
      do i = 1, nx_block
        if ( LMASK(i,j) ) then

          ! SLX terms
          WORK1(i,j,kk) = KAPPA_THIC(i,j,kbt,k,bid)              &
                        * SLX(i,j,kk,kbt,k,bid) * dz(k)

          WORK2(i,j,kk) = c2 * dzwr(k) * ( WORK1(i,j,kk)         &
                        - KAPPA_THIC(i,j,ktp,k+1,bid)            &
                        * SLX(i,j,kk,ktp,k+1,bid)                &
                        * dz(k+1) )

          WORK2_NEXT(i,j) = c2 * (                               &
            KAPPA_THIC(i,j,ktp,k+1,bid) * SLX(i,j,kk,ktp,k+1,bid) - &
            KAPPA_THIC(i,j,kbt,k+1,bid) * SLX(i,j,kk,kbt,k+1,bid) )

          ! SLY terms
          WORK3(i,j,kk) = KAPPA_THIC(i,j,kbt,k,bid)              &
                        * SLY(i,j,kk,kbt,k,bid) * dz(k)

          WORK4(i,j,kk) = c2 * dzwr(k) * ( WORK3(i,j,kk)         &
                        - KAPPA_THIC(i,j,ktp,k+1,bid)            &
                        * SLY(i,j,kk,ktp,k+1,bid)                &
                        * dz(k+1) )

          WORK4_NEXT(i,j) = c2 * (                               &
            KAPPA_THIC(i,j,ktp,k+1,bid) * SLY(i,j,kk,ktp,k+1,bid) - &
            KAPPA_THIC(i,j,kbt,k+1,bid) * SLY(i,j,kk,kbt,k+1,bid) )

        end if
      end do
    end do
    !$omp end do

    ! Loop 3: keep whichever of WORK2 / WORK2_NEXT is smaller in magnitude.
    !$omp do
    do j = 1, ny_block
      do i = 1, nx_block
        if ( LMASK(i,j) .and. abs( WORK2_NEXT(i,j) ) < abs( WORK2(i,j,kk) ) ) &
          WORK2(i,j,kk) = WORK2_NEXT(i,j)
      end do
    end do
    !$omp end do

    ! Loop 4: same selection for WORK4 / WORK4_NEXT.
    !$omp do
    do j = 1, ny_block
      do i = 1, nx_block
        if ( LMASK(i,j) .and. abs( WORK4_NEXT(i,j) ) < abs( WORK4(i,j,kk ) ) ) &
          WORK4(i,j,kk) = WORK4_NEXT(i,j)
      end do
    end do
    !$omp end do

    ! Loop 5: rebuild the mask, this time for ZTW == 2.
    !$omp do
    do j = 1, ny_block
      do i = 1, nx_block
        LMASK(i,j) = TLT%K_LEVEL(i,j,bid) == k .and.             &
                     TLT%K_LEVEL(i,j,bid) < KMT(i,j,bid) .and.   &
                     TLT%ZTW(i,j,bid) == 2
      end do
    end do
    !$omp end do

    ! Loop 6: fill WORK1..WORK4 on the ZTW == 2 points.  WORK1 and WORK3
    ! are scaled by dz(k+1) only after WORK2 and WORK4 have used them.
    !$omp do
    do j = 1, ny_block
      do i = 1, nx_block
        if ( LMASK(i,j) ) then

          WORK1(i,j,kk) = KAPPA_THIC(i,j,ktp,k+1,bid)            &
                        * SLX(i,j,kk,ktp,k+1,bid)

          WORK2(i,j,kk) = c2 * ( WORK1(i,j,kk)                   &
                        - ( KAPPA_THIC(i,j,kbt,k+1,bid)          &
                          * SLX(i,j,kk,kbt,k+1,bid) ) )

          WORK1(i,j,kk) = WORK1(i,j,kk) * dz(k+1)

          WORK3(i,j,kk) = KAPPA_THIC(i,j,ktp,k+1,bid)            &
                        * SLY(i,j,kk,ktp,k+1,bid)

          WORK4(i,j,kk) = c2 * ( WORK3(i,j,kk)                   &
                        - ( KAPPA_THIC(i,j,kbt,k+1,bid)          &
                          * SLY(i,j,kk,kbt,k+1,bid) ) )

          WORK3(i,j,kk) = WORK3(i,j,kk) * dz(k+1)

        end if
      end do
    end do
    !$omp end do

    ! Loop 7: narrow the mask to points with K_LEVEL + 1 < KMT.
    !$omp do
    do j = 1, ny_block
      do i = 1, nx_block
        LMASK(i,j) = LMASK(i,j) .and. TLT%K_LEVEL(i,j,bid) + 1 < KMT(i,j,bid)
      end do
    end do
    !$omp end do

    ! Loop 8: only when level k+2 can be referenced (avoids out-of-bounds
    ! access).  The test depends on k alone, so every thread takes the
    ! same branch and either all or none reach the work-shared loop.
    if ( k < km-1 ) then

      !$omp do
      do j = 1, ny_block
        do i = 1, nx_block
          if ( LMASK(i,j) ) then

            WORK2_NEXT(i,j) = c2 * dzwr(k+1) * (                                  &
              KAPPA_THIC(i,j,kbt,k+1,bid) * SLX(i,j,kk,kbt,k+1,bid) * dz(k+1) -   &
              KAPPA_THIC(i,j,ktp,k+2,bid) * SLX(i,j,kk,ktp,k+2,bid) * dz(k+2))

            WORK4_NEXT(i,j) = c2 * dzwr(k+1) * (                                  &
              KAPPA_THIC(i,j,kbt,k+1,bid) * SLY(i,j,kk,kbt,k+1,bid) * dz(k+1) -   &
              KAPPA_THIC(i,j,ktp,k+2,bid) * SLY(i,j,kk,ktp,k+2,bid) * dz(k+2))

          end if
        end do
      end do
      !$omp end do

    end if

    ! Loop 9: smaller-magnitude selection for WORK2 under the narrowed mask.
    !$omp do
    do j = 1, ny_block
      do i = 1, nx_block
        if ( LMASK(i,j) .and. abs( WORK2_NEXT(i,j) ) < abs( WORK2(i,j,kk) ) ) then
          WORK2(i,j,kk) = WORK2_NEXT(i,j)
        end if
      end do
    end do
    !$omp end do

    ! Loop 10: smaller-magnitude selection for WORK4 under the narrowed mask.
    !$omp do
    do j = 1, ny_block
      do i = 1, nx_block
        if ( LMASK(i,j) .and. abs(WORK4_NEXT(i,j)) < abs(WORK4(i,j,kk)) ) then
          WORK4(i,j,kk) = WORK4_NEXT(i,j)
        end if
      end do
    end do
    !$omp end do

    !$omp end parallel

    endtime = omp_get_wtime()

    ! Accumulate the wall-clock time spent in the parallel region.
    total = total + (endtime - starttime)

  end do
end do

The performance however is not as expected.

With 4 threads there is only a marginal improvement, from 1.3E-02 to 1.0E-02.

I tried more threads (16) and the time was only 6.06E-03, i.e. only 4X (so it is not scaling well either).

Each of those i,j loops are 196*84.

Also, schedule(dynamic) was not of much help either, and performance was the same with a chunk size of 20.

Collapse with dynamic scheduling and a chunk size of 20 made the code slower, with a time of 8E-02.

Any help for Improvements??

This has been posted and

TimP — Wed, 22 Jul 2015 10:37:19 GMT

This has been posted and commented upon elsewhere.

Yes I agree.

aketh_t_ — Wed, 22 Jul 2015 10:41:44 GMT

Yes I agree.

1)I was told it was not the right place to post. So I reposted here.

The problem you have is you

jimdempseyatthecove — Wed, 22 Jul 2015 16:00:27 GMT

The problem you have is you have 10 !$omp do loops, together with the accompanying implicit barrier at end of each loop.

Rewrite your code to use 1 !$omp do loop without use of LMASK.

Jim Dempsey

Without using LMASK???

aketh_t_ — Thu, 23 Jul 2015 02:16:00 GMT

Without using LMASK???

If you like, start by fusing

TimP — Thu, 23 Jul 2015 04:58:36 GMT

If you like, start by fusing those parallel regions so that at least your openmp version is consistent with the automatic fusion implemented by ifort in your single thread baseline. I've forgotten how long ago this was suggested to you. I suggested this be included in the parallelism training but I guess others are correct in advising that no one will be influenced by such suggestions.

Could you guys let me know

aketh_t_ — Thu, 23 Jul 2015 05:33:38 GMT

Could you guys let me know the rationale for why fusing loops in general makes for better performance (even when we do not use OpenMP)?

With OpenMP the issue is straightforward: the threads have to wait at a barrier after every loop. But even when there is no OpenMP, I have found that loop fusion helps.

Any material/explanation of why that works??

Has it got anything to do with cache??

thank you all.

aketh_t_ — Thu, 23 Jul 2015 08:29:00 GMT

thank you all.

My code seems to be doing well now with fusing of loops. the time taken is approx 2.63E-03.

An improvement over original of 1.34E-02.

The speedup is approximately 5x for 16 threads.

But it still isn't able to scale as well as expected.

I tried omp collapse and schedule dynamic with a chunk size of 294, but the performance has only been lower compared to the code without any schedule clause: approx. 4.3E-03.

Any suggestions??

With fusion, the repeated

TimP — Thu, 23 Jul 2015 12:07:23 GMT

With fusion, the repeated references to lmask should be satisfied in register without even depending on cache.

In the simpler cases, the non-fused loops are likely to need unroll by 4 to optimize performance, but this will lead to excessive code expansion, so the compiler will not do it unless requested. Either way, fusion is a better solution.

For several years, Intel CPUs have tended to prefer to have 6 to 8 array sections stored per loop, in part so as to optimize use of fill buffers. HyperThreading of course will impact this, as fill buffers are shared among logical siblings.

The explanation about additional overhead for starting new openmp regions is not simple, as the thread pool will remain active if you don't zero out KMP_BLOCKTIME. Still, the overhead for starting a new parallel region is clearly larger than for starting a new single thread nested loop, and it becomes at least proportionally larger with increased numbers of threads, as more than Amdahl's law is at play.

At -O3, ifort will perform some automatic fusion or distribution so as to optimize the loops, but this optimization is not possible across OpenMP barriers. So you were penalizing OpenMP by preventing optimizations for that case.

You could analyze your code by profiling with VTune if you wished to verify to what extent memory bandwidth might account for non-linear scaling. If you are trying to analyze penalties for not setting optimum affinity, I expect that to be difficult, in part due to non-repeatability.

>> Without using LMASK??

jimdempseyatthecove — Thu, 23 Jul 2015 15:25:00 GMT

>> Without using LMASK??

do k=1,km-1
  do kk=1,2

    starttime = omp_get_wtime()

    ! Fused version: a single work-shared loop per (k,kk) pair.  The shared
    ! LMASK array is replaced by the thread-private scalar LdoWork, so every
    ! (i,j) point is processed start to finish by one thread and only one
    ! end-of-loop barrier remains.
    ! NOTE(review): LMASK is no longer written here; confirm that no code
    ! after this loop nest reads it.
    !$OMP PARALLEL PRIVATE(I,LdoWork)DEFAULT(SHARED)

    !$omp do
    do j=1,ny_block
      do i=1,nx_block

        ! ---- Case ZTW == 1 ------------------------------------------------
        LdoWork = TLT%K_LEVEL(i,j,bid) == k  .and.            &
                  TLT%K_LEVEL(i,j,bid) < KMT(i,j,bid)  .and.  &
                  TLT%ZTW(i,j,bid) == 1
        if(LdoWork) then
          WORK1(i,j,kk) =  KAPPA_THIC(i,j,kbt,k,bid)  &
                         * SLX(i,j,kk,kbt,k,bid) * dz(k)

          WORK2(i,j,kk) = c2 * dzwr(k) * ( WORK1(i,j,kk)            &
                        - KAPPA_THIC(i,j,ktp,k+1,bid) * SLX(i,j,kk,ktp,k+1,bid) &
                        * dz(k+1) )

          WORK2_NEXT(i,j) = c2 * ( &
                            KAPPA_THIC(i,j,ktp,k+1,bid) * SLX(i,j,kk,ktp,k+1,bid) - &
                            KAPPA_THIC(i,j,kbt,k+1,bid) * SLX(i,j,kk,kbt,k+1,bid) )

          WORK3(i,j,kk) =  KAPPA_THIC(i,j,kbt,k,bid)  &
                         * SLY(i,j,kk,kbt,k,bid) * dz(k)

          WORK4(i,j,kk) = c2 * dzwr(k) * ( WORK3(i,j,kk)            &
                        - KAPPA_THIC(i,j,ktp,k+1,bid) * SLY(i,j,kk,ktp,k+1,bid) &
                        * dz(k+1) )

          WORK4_NEXT(i,j) = c2 * ( &
                            KAPPA_THIC(i,j,ktp,k+1,bid) * SLY(i,j,kk,ktp,k+1,bid) - &
                            KAPPA_THIC(i,j,kbt,k+1,bid) * SLY(i,j,kk,kbt,k+1,bid) )

          ! Keep whichever candidate is smaller in magnitude.
          if( abs( WORK2_NEXT(i,j) ) < abs( WORK2(i,j,kk) ) ) then
            WORK2(i,j,kk) = WORK2_NEXT(i,j)
          endif
          if ( abs( WORK4_NEXT(i,j) ) < abs( WORK4(i,j,kk ) ) ) then
            WORK4(i,j,kk) = WORK4_NEXT(i,j)
          endif
        endif

        ! ---- Case ZTW == 2 ------------------------------------------------
        LdoWork = TLT%K_LEVEL(i,j,bid) == k  .and.           &
                  TLT%K_LEVEL(i,j,bid) < KMT(i,j,bid)  .and. &
                  TLT%ZTW(i,j,bid) == 2
        if(LdoWork) then
          WORK1(i,j,kk) =  KAPPA_THIC(i,j,ktp,k+1,bid)     &
                         * SLX(i,j,kk,ktp,k+1,bid)

          WORK2(i,j,kk) =  c2 * ( WORK1(i,j,kk)                 &
                         - ( KAPPA_THIC(i,j,kbt,k+1,bid)        &
                         * SLX(i,j,kk,kbt,k+1,bid) ) )

          ! WORK1 is scaled only after WORK2 has consumed the unscaled value.
          WORK1(i,j,kk) = WORK1(i,j,kk) * dz(k+1)

          WORK3(i,j,kk) =  KAPPA_THIC(i,j,ktp,k+1,bid)     &
                         * SLY(i,j,kk,ktp,k+1,bid)

          WORK4(i,j,kk) =  c2 * ( WORK3(i,j,kk)                 &
                         - ( KAPPA_THIC(i,j,kbt,k+1,bid)        &
                         * SLY(i,j,kk,kbt,k+1,bid) ) )

          ! Same ordering for WORK3 / WORK4.
          WORK3(i,j,kk) = WORK3(i,j,kk) * dz(k+1)

          ! Narrowed mask.  LdoWork is reused: the enclosing IF has already
          ! been evaluated, so overwriting it here is safe.
          LdoWork = TLT%K_LEVEL(i,j,bid) + 1 < KMT(i,j,bid)

          ! The multi-loop code applied the final selections only where the
          ! narrowed mask holds (ZTW == 2 mask .and. K_LEVEL + 1 < KMT), so
          ! the comparisons belong inside this guard.  Leaving them outside
          ! would let stale WORK2_NEXT / WORK4_NEXT values overwrite
          ! WORK2 / WORK4 at points the multi-loop code never touched.
          if(LdoWork) then
            if (k.lt.km-1) then ! added to avoid out of bounds access
              WORK2_NEXT(i,j) = c2 * dzwr(k+1) * ( &
                                KAPPA_THIC(i,j,kbt,k+1,bid) * SLX(i,j,kk,kbt,k+1,bid) * dz(k+1) - &
                                KAPPA_THIC(i,j,ktp,k+2,bid) * SLX(i,j,kk,ktp,k+2,bid) * dz(k+2))

              WORK4_NEXT(i,j) = c2 * dzwr(k+1) * ( &
                                KAPPA_THIC(i,j,kbt,k+1,bid) * SLY(i,j,kk,kbt,k+1,bid) * dz(k+1) - &
                                KAPPA_THIC(i,j,ktp,k+2,bid) * SLY(i,j,kk,ktp,k+2,bid) * dz(k+2))
            endif

            if( abs( WORK2_NEXT(i,j) ) < abs( WORK2(i,j,kk) ) ) &
              WORK2(i,j,kk) = WORK2_NEXT(i,j)

            if( abs(WORK4_NEXT(i,j)) < abs(WORK4(i,j,kk)) ) &
              WORK4(i,j,kk) = WORK4_NEXT(i,j)
          endif
        endif
      enddo
    enddo
    !$omp end do

    !$OMP END PARALLEL

    endtime = omp_get_wtime()

    ! Accumulate the wall-clock time spent in the parallel region.
    total = total + (endtime - starttime)

  enddo
enddo

Jim Dempsey