OpenMP performance lower than expected

aketh_t_ · ‎07-22-2015

Hi all,

below is the code I have been trying to parallelize with OpenMP

! Sweep k = 1..km-1 and kk = 1..2. Each (k,kk) pass opens one parallel region
! containing ten separate worksharing loops over the (i,j) block, and the
! wall-clock time of every pass is accumulated into "total".
do k=1,km-1

        do kk=1,2

          ! Start of the timed section for this (k,kk) pass.
          starttime = omp_get_wtime()

          ! i is listed as private; everything else (LMASK, WORK*, ...) is shared.
          !$OMP PARALLEL PRIVATE(I)DEFAULT(SHARED)
 
          ! Loop 1: build the shared mask for points with K_LEVEL == k,
          ! K_LEVEL < KMT and ZTW == 1.
          !$omp do  
          do j=1,ny_block
           do i=1,nx_block
 
          LMASK(i,j) = TLT%K_LEVEL(i,j,bid) == k  .and.            &
                       TLT%K_LEVEL(i,j,bid) < KMT(i,j,bid)  .and.  &
                       TLT%ZTW(i,j,bid) == 1

           enddo
          enddo
          !$omp end do

          ! Loop 2: at masked points fill WORK1..WORK4 (slice kk) and the
          ! WORK2_NEXT / WORK4_NEXT candidates from levels k and k+1.
          !$omp do 
          do j=1,ny_block
           do i=1,nx_block

            if ( LMASK(i,j) ) then 

            WORK1(i,j,kk) =  KAPPA_THIC(i,j,kbt,k,bid)  &
                           * SLX(i,j,kk,kbt,k,bid) * dz(k)

            WORK2(i,j,kk) = c2 * dzwr(k) * ( WORK1(i,j,kk)            &
              - KAPPA_THIC(i,j,ktp,k+1,bid) * SLX(i,j,kk,ktp,k+1,bid) &
                                            * dz(k+1) )

            WORK2_NEXT(i,j) = c2 * ( &
              KAPPA_THIC(i,j,ktp,k+1,bid) * SLX(i,j,kk,ktp,k+1,bid) - &
              KAPPA_THIC(i,j,kbt,k+1,bid) * SLX(i,j,kk,kbt,k+1,bid) )

            WORK3(i,j,kk) =  KAPPA_THIC(i,j,kbt,k,bid)  &
                           * SLY(i,j,kk,kbt,k,bid) * dz(k)

            WORK4(i,j,kk) = c2 * dzwr(k) * ( WORK3(i,j,kk)            &
              - KAPPA_THIC(i,j,ktp,k+1,bid) * SLY(i,j,kk,ktp,k+1,bid) &
                                            * dz(k+1) )

            WORK4_NEXT(i,j) = c2 * ( &
              KAPPA_THIC(i,j,ktp,k+1,bid) * SLY(i,j,kk,ktp,k+1,bid) - &
              KAPPA_THIC(i,j,kbt,k+1,bid) * SLY(i,j,kk,kbt,k+1,bid) )

            endif

            enddo
          enddo 
          !$omp end do 

           ! Loop 3: at masked points replace WORK2 by WORK2_NEXT when the
           ! latter has strictly smaller magnitude.
           !$omp do 
           do j=1,ny_block
           do i=1,nx_block

              if( LMASK(i,j) .and. abs( WORK2_NEXT(i,j) ) < abs( WORK2(i,j,kk) ) ) then 

              WORK2(i,j,kk) = WORK2_NEXT(i,j)

              endif

           enddo
          enddo
          !$omp end do

          ! Loop 4: same selection for WORK4 / WORK4_NEXT.
          !$omp do 
          do j=1,ny_block
           do i=1,nx_block

           if ( LMASK(i,j) .and. abs( WORK4_NEXT(i,j) ) < abs( WORK4(i,j,kk ) ) ) then 
              WORK4(i,j,kk) = WORK4_NEXT(i,j)
           endif

           enddo
          enddo 
          !$omp end do 

          ! Loop 5: rebuild the mask, now selecting ZTW == 2 instead of ZTW == 1.
          !$omp do 
          do j=1,ny_block
           do i=1,nx_block  

          LMASK(i,j) = TLT%K_LEVEL(i,j,bid) == k  .and.           &
                       TLT%K_LEVEL(i,j,bid) < KMT(i,j,bid)  .and. &
                       TLT%ZTW(i,j,bid) == 2

            enddo
          enddo
          !$omp end do

         ! Loop 6: at masked points compute WORK2 / WORK4 from the level k+1
         ! terms; WORK1 / WORK3 are scaled by dz(k+1) only after being used.
         !$omp do 
         do j=1,ny_block
           do i=1,nx_block


            if ( LMASK(i,j) ) then

            WORK1(i,j,kk) =  KAPPA_THIC(i,j,ktp,k+1,bid)     & 
                           * SLX(i,j,kk,ktp,k+1,bid)

            WORK2(i,j,kk) =  c2 * ( WORK1(i,j,kk)                 &
                           - ( KAPPA_THIC(i,j,kbt,k+1,bid)        &
                              * SLX(i,j,kk,kbt,k+1,bid) ) )

            WORK1(i,j,kk) = WORK1(i,j,kk) * dz(k+1)

            WORK3(i,j,kk) =  KAPPA_THIC(i,j,ktp,k+1,bid)     &
                           * SLY(i,j,kk,ktp,k+1,bid)

            WORK4(i,j,kk) =  c2 * ( WORK3(i,j,kk)                 &
                           - ( KAPPA_THIC(i,j,kbt,k+1,bid)        &
                              * SLY(i,j,kk,kbt,k+1,bid) ) )

            WORK3(i,j,kk) = WORK3(i,j,kk) * dz(k+1)

            endif
 
            enddo
          enddo
          !$omp end do   

          ! Loop 7: narrow the mask to points where K_LEVEL + 1 is still
          ! below KMT; loops 8-10 act on this narrowed mask only.
          !$omp do 
          do j=1,ny_block
           do i=1,nx_block


          LMASK(i,j) = LMASK(i,j) .and. TLT%K_LEVEL(i,j,bid) + 1 < KMT(i,j,bid)

           enddo
          enddo
          !$omp end do

          if (k.lt.km-1) then ! added to avoid out of bounds access

           ! Loop 8: recompute the *_NEXT candidates from levels k+1 and k+2.
           ! Skipped on the last k, so loops 9-10 then see the previous values.
           !$omp do 
           do j=1,ny_block
            do i=1,nx_block
 
            if( LMASK(i,j) ) then

              WORK2_NEXT(i,j) = c2 * dzwr(k+1) * ( &
                KAPPA_THIC(i,j,kbt,k+1,bid) * SLX(i,j,kk,kbt,k+1,bid) * dz(k+1) - &
                KAPPA_THIC(i,j,ktp,k+2,bid) * SLX(i,j,kk,ktp,k+2,bid) * dz(k+2))

              WORK4_NEXT(i,j) = c2 * dzwr(k+1) * ( &
                KAPPA_THIC(i,j,kbt,k+1,bid) * SLY(i,j,kk,kbt,k+1,bid) * dz(k+1) - &
                KAPPA_THIC(i,j,ktp,k+2,bid) * SLY(i,j,kk,ktp,k+2,bid) * dz(k+2))

              endif 

              enddo
            enddo
            !$omp end do
          end if
             
          ! Loop 9: smaller-magnitude selection for WORK2 under the narrowed mask.
          !$omp do 
          do j=1,ny_block
            do i=1,nx_block


            if( LMASK(i,j) .and. abs( WORK2_NEXT(i,j) ) < abs( WORK2(i,j,kk) ) ) &
            WORK2(i,j,kk) = WORK2_NEXT(i,j)

            enddo
          enddo
          !$omp end do     

          ! Loop 10: smaller-magnitude selection for WORK4 under the narrowed mask.
          !$omp do 
         do j=1,ny_block
            do i=1,nx_block


            if( LMASK(i,j) .and. abs(WORK4_NEXT(i,j)) < abs(WORK4(i,j,kk)) ) &
            WORK4(i,j,kk) = WORK4_NEXT(i,j)

             enddo
          enddo
          !$omp end do 

          !$OMP END PARALLEL

          ! End of the timed section; add this pass to the running total.
          endtime = omp_get_wtime()

          total = total + (endtime - starttime)

        enddo
      enddo

The performance however is not as expected.

With 4 threads there are only marginal improvements, from 1.3E-02 to 1.0E-02.

I tried more threads (16) and the time was only 6.06E-03, i.e. 4X only (not scaling well either).

Each of those i,j loops are 196*84.

Also, schedule(dynamic) was not of much help either, and performance was the same with a chunk size of 20.

Collapse with dynamic scheduling and a chunk size of 20 made the code slower, with a time of 8E-02.

Any help for Improvements??

TimP · ‎07-22-2015

This has been posted and commented upon elsewhere.

aketh_t_ · ‎07-22-2015

Yes I agree.

1)I was told it was not the right place to post. So I reposted here.

jimdempseyatthecove · ‎07-22-2015

The problem you have is you have 10 !$omp do loops, together with the accompanying implicit barrier at end of each loop.

Rewrite your code to use 1 !$omp do loop without use of LMASK.

Jim Dempsey

aketh_t_ · ‎07-22-2015

Without using LMASK???

TimP · ‎07-22-2015

If you like, start by fusing those parallel regions so that at least your openmp version is consistent with the automatic fusion implemented by ifort in your single thread baseline. I've forgotten how long ago this was suggested to you. I suggested this be included in the parallelism training but I guess others are correct in advising that no one will be influenced by such suggestions.

aketh_t_ · ‎07-22-2015

Could you guys let me know the rationale for why fusing loops in general makes for better performance (even when we do not use OpenMP)?

With OpenMP the issue is straightforward: the threads have to get past a barrier after every loop. But even when there is no OpenMP I have found that loop fusion helps.

Any material/explanation of why that works??

Has it got anything to do with cache??

aketh_t_ · ‎07-23-2015

thank you all.

My code seems to be doing well now with fusing of loops. The time taken is approx. 2.63E-03.

An improvement over original of 1.34E-02.

The speedup is approximately 5x for 16 threads.

But it still isn't able to scale as well as expected.

I tried omp collapse and schedule dynamic with chunk size 294, but that was only slower (approx. 4.3E-03) compared to the code without any schedule clause.

Any suggestions??

TimP · ‎07-23-2015

With fusion, the repeated references to lmask should be satisfied in register without even depending on cache.

In the simpler cases, the non-fused loops are likely to need unroll by 4 to optimize performance, but this will lead to excessive code expansion, so the compiler will not do it unless requested. Either way, fusion is a better solution.

For several years, Intel CPUs have tended to prefer to have 6 to 8 array sections stored per loop, in part so as to optimize use of fill buffers. HyperThreading of course will impact this, as fill buffers are shared among logical siblings.

The explanation about additional overhead for starting new openmp regions is not simple, as the thread pool will remain active if you don't zero out KMP_BLOCKTIME. Still, the overhead for starting a new parallel region is clearly larger than for starting a new single thread nested loop, and it becomes at least proportionally larger with increased numbers of threads, as more than Amdahl's law is at play.

At -O3, ifort will perform some automatic fusion or distribution so as to optimize the loops, but this optimization is not possible across OpenMP barriers. So you were penalizing OpenMP by preventing optimizations for that case.

You could analyze your code by profiling with VTune if you wished to verify to what extent memory bandwidth might account for non-linear scaling. If you are trying to analyze penalties for not setting optimum affinity, I expect that to be difficult, in part due to non-repeatability.

jimdempseyatthecove · ‎07-23-2015

>> Without using LMASK??

! Fused version of the k/kk sweep: one worksharing loop per (k,kk) pass instead
! of ten, so there is a single implicit barrier per pass.  The shared LMASK
! array is replaced by the thread-private scalar LdoWork, which is legal because
! every statement only touches element (i,j) of the arrays it reads and writes.
do k=1,km-1
  do kk=1,2

starttime = omp_get_wtime()

!$OMP PARALLEL PRIVATE(I,LdoWork)DEFAULT(SHARED)
 
!$omp do  
do j=1,ny_block
  do i=1,nx_block

    ! --- Case ZTW == 1 ---------------------------------------------------
    LdoWork = TLT%K_LEVEL(i,j,bid) == k  .and.            &
              TLT%K_LEVEL(i,j,bid) < KMT(i,j,bid)  .and.  &
              TLT%ZTW(i,j,bid) == 1
    if (LdoWork) then
      WORK1(i,j,kk) =  KAPPA_THIC(i,j,kbt,k,bid)  &
                     * SLX(i,j,kk,kbt,k,bid) * dz(k)

      WORK2(i,j,kk) = c2 * dzwr(k) * ( WORK1(i,j,kk)            &
                    - KAPPA_THIC(i,j,ktp,k+1,bid) * SLX(i,j,kk,ktp,k+1,bid) &
                    * dz(k+1) )

      WORK2_NEXT(i,j) = c2 * ( &
                        KAPPA_THIC(i,j,ktp,k+1,bid) * SLX(i,j,kk,ktp,k+1,bid) - &
                        KAPPA_THIC(i,j,kbt,k+1,bid) * SLX(i,j,kk,kbt,k+1,bid) )

      WORK3(i,j,kk) =  KAPPA_THIC(i,j,kbt,k,bid)  &
                     * SLY(i,j,kk,kbt,k,bid) * dz(k)

      WORK4(i,j,kk) = c2 * dzwr(k) * ( WORK3(i,j,kk)            &
                    - KAPPA_THIC(i,j,ktp,k+1,bid) * SLY(i,j,kk,ktp,k+1,bid) &
                    * dz(k+1) )

      WORK4_NEXT(i,j) = c2 * ( &
                        KAPPA_THIC(i,j,ktp,k+1,bid) * SLY(i,j,kk,ktp,k+1,bid) - &
                        KAPPA_THIC(i,j,kbt,k+1,bid) * SLY(i,j,kk,kbt,k+1,bid) )

      ! Keep whichever candidate has the strictly smaller magnitude.
      if ( abs( WORK2_NEXT(i,j) ) < abs( WORK2(i,j,kk) ) ) then
        WORK2(i,j,kk) = WORK2_NEXT(i,j)
      endif
      if ( abs( WORK4_NEXT(i,j) ) < abs( WORK4(i,j,kk) ) ) then
        WORK4(i,j,kk) = WORK4_NEXT(i,j)
      endif
    endif

    ! --- Case ZTW == 2 ---------------------------------------------------
    LdoWork = TLT%K_LEVEL(i,j,bid) == k  .and.           &
              TLT%K_LEVEL(i,j,bid) < KMT(i,j,bid)  .and. &
              TLT%ZTW(i,j,bid) == 2
    if (LdoWork) then
      WORK1(i,j,kk) =  KAPPA_THIC(i,j,ktp,k+1,bid)     &
                     * SLX(i,j,kk,ktp,k+1,bid)

      WORK2(i,j,kk) =  c2 * ( WORK1(i,j,kk)                 &
                     - ( KAPPA_THIC(i,j,kbt,k+1,bid)        &
                     * SLX(i,j,kk,kbt,k+1,bid) ) )

      ! WORK1 is scaled only after it has been used to form WORK2.
      WORK1(i,j,kk) = WORK1(i,j,kk) * dz(k+1)

      WORK3(i,j,kk) =  KAPPA_THIC(i,j,ktp,k+1,bid)     &
                     * SLY(i,j,kk,ktp,k+1,bid)

      WORK4(i,j,kk) =  c2 * ( WORK3(i,j,kk)                 &
                     - ( KAPPA_THIC(i,j,kbt,k+1,bid)        &
                     * SLY(i,j,kk,kbt,k+1,bid) ) )

      WORK3(i,j,kk) = WORK3(i,j,kk) * dz(k+1)

      ! Narrow the mask exactly as the unfused code narrows LMASK.  Both the
      ! *_NEXT refresh and the final selection must sit under this narrowed
      ! mask; otherwise points with K_LEVEL+1 >= KMT would be compared against
      ! stale WORK2_NEXT / WORK4_NEXT values and could be overwritten.
      LdoWork = TLT%K_LEVEL(i,j,bid) + 1 < KMT(i,j,bid)
      if (LdoWork) then
        if (k.lt.km-1) then ! added to avoid out of bounds access
          WORK2_NEXT(i,j) = c2 * dzwr(k+1) * ( &
                            KAPPA_THIC(i,j,kbt,k+1,bid) * SLX(i,j,kk,kbt,k+1,bid) * dz(k+1) - &
                            KAPPA_THIC(i,j,ktp,k+2,bid) * SLX(i,j,kk,ktp,k+2,bid) * dz(k+2))

          WORK4_NEXT(i,j) = c2 * dzwr(k+1) * ( &
                            KAPPA_THIC(i,j,kbt,k+1,bid) * SLY(i,j,kk,kbt,k+1,bid) * dz(k+1) - &
                            KAPPA_THIC(i,j,ktp,k+2,bid) * SLY(i,j,kk,ktp,k+2,bid) * dz(k+2))
        endif

        ! As in the unfused code, the selection runs even on the last k,
        ! where the *_NEXT values were not refreshed above.
        if ( abs( WORK2_NEXT(i,j) ) < abs( WORK2(i,j,kk) ) ) &
          WORK2(i,j,kk) = WORK2_NEXT(i,j)

        if ( abs( WORK4_NEXT(i,j) ) < abs( WORK4(i,j,kk) ) ) &
          WORK4(i,j,kk) = WORK4_NEXT(i,j)
      endif
    endif
  enddo
enddo
!$omp end do

!$OMP END PARALLEL

endtime = omp_get_wtime()

total = total + (endtime - starttime)

enddo
enddo

Jim Dempsey