Intel® Fortran Compiler
Build applications that can scale for the future with optimized code designed for Intel® Xeon® and compatible processors.
28914 Discussions

OpenMP with DGEMM, how to program properly?

AlphaF20
New Contributor I
4,406 Views

I have the following Fortran 90 code modified from https://community.intel.com/t5/Intel-Fortran-Compiler/OpenMP-do-loop-efficiency/m-p/1258532

 

Program reshape_for_blas

  Use, Intrinsic :: iso_fortran_env, Only :  wp => real64, li => int64
  Use :: omp_lib
  
  Implicit None

  Real( wp ), Dimension( :, :    ), Allocatable :: a
  Real( wp ), Dimension( :, :, : ), Allocatable :: b
  Real( wp ), Dimension( :, :, : ), Allocatable :: c1, c2, c3, c4, c5
  Real( wp ), Dimension( :, :    ), Allocatable :: d
  Real( wp ), Dimension( :, :    ), Allocatable :: e

  Integer :: na, nb, nc, nd, ne
  Integer :: la, lb, lc, ld  
  Integer( li ) :: start, finish, rate
  Integer :: numthreads
  
  
  numthreads = 6

  call omp_set_num_threads(numthreads)  
  
  
  Write( *, * ) 'na, nb, nc, nd ?'
  Read( *, * ) na, nb, nc, nd
  ne = nc * nd
  Allocate( a ( 1:na, 1:nb ) ) 
  Allocate( b ( 1:nb, 1:nc, 1:nd ) ) 
  Allocate( c2( 1:na, 1:nc, 1:nd ) )  
  Allocate( c3( 1:na, 1:nc, 1:nd ) )
  Allocate( c4( 1:na, 1:nc, 1:nd ) )  
  Allocate( c5( 1:na, 1:nc, 1:nd ) )  
  Allocate( d ( 1:nb, 1:ne ) ) 
  Allocate( e ( 1:na, 1:ne ) ) 

  ! Set up some data
  Call Random_number( a )
  Call Random_number( b )
  c2(:,:,:) = 0.0_wp
  c3(:,:,:) = 0.0_wp  
  c4(:,:,:) = 0.0_wp
  c5(:,:,:) = 0.0_wp

  !naive
 ! Call System_clock( start, rate )

  
 ! do la = 1, na 
 !   do lb = 1, nb
 !     do lc = 1, nc
 !       do ld = 1, nd
 !         c3(la,lc,ld) = c3(la,lc,ld)  + a(la,lb) * b(lb, lc, ld)
 !       enddo  
 !     enddo
 !   enddo
 ! enddo  
 ! Call System_clock( finish, rate )
 ! Write( *, * ) 'Time for naive loop abcd ', Real( finish - start, wp ) / rate  
 
  c3(:,:,:) = 0.0_wp
  !naive
  Call System_clock( start, rate )
  do ld = 1, nd 
    do lc = 1, nc
      do lb = 1, nb
        do la = 1, na
          c3(la,lc,ld) = c3(la,lc,ld)  + a(la,lb) * b(lb, lc, ld)
        enddo  
      enddo
    enddo
  enddo  
  Call System_clock( finish, rate )
  Write( *, * ) 'Time for naive loop dcba', Real( finish - start, wp ) / rate  
    





  c3(:,:,:) = 0.0_wp
  !naive
  Call System_clock( start, rate )
  do ld = 1, nd
    do lc = 1, nc
      do la = 1, na
        do lb = 1, nb
          c3(la,lc,ld) = c3(la,lc,ld)  + a(la,lb) * b(lb, lc, ld)
        enddo
      enddo
    enddo
  enddo
  Call System_clock( finish, rate )
  Write( *, * ) 'Time for naive loop dcab', Real( finish - start, wp ) / rate

   
   
   
   
   
   
  ! Direct
  Call System_clock( start, rate )
  Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
                                            b , Size( b , Dim = 1 ), &
                                            0.0_wp, c2, Size( c2, Dim = 1 ) )
  Call System_clock( finish, rate )
  Write( *, * ) 'Time for straight dgemm ', Real( finish - start, wp ) / rate
    
          
   
   
   
   
   
   
   
   
   
   
  c5(:,:,:) = 0.0_wp
  Call System_clock( start, rate )  
  !$omp parallel
  !$omp do private(la, lb, lc, ld) schedule(dynamic) reduction(+: c5)    
  do la = 1, na 
      do lc = 1, nc
        do ld = 1, nd
            do lb = 1, nb
          c5(la,lc,ld) = c5(la,lc,ld)  + a(la,lb) * b(lb, lc, ld)
        enddo  
      enddo
    enddo
  enddo  
  !$omp end do  
  !$omp end parallel
  Call System_clock( finish, rate )
  Write( *, * ) 'Time for naive loop omp dcba', Real( finish - start, wp ) / rate  
 

  
   
   
  c5(:,:,:) = 0.0_wp   
  Call System_clock( start, rate )  
  !$omp parallel
  !$omp do private(la, lb, lc, ld) schedule(dynamic) reduction(+: c5)    
  do ld = 1, nd 
    do lc = 1, nc
      do lb = 1, nb
        do la = 1, na
          c5(la,lc,ld) = c5(la,lc,ld)  + a(la,lb) * b(lb, lc, ld)
        enddo  
      enddo
    enddo
  enddo  
  !$omp end do  
  !$omp end parallel
  Call System_clock( finish, rate )
  Write( *, * ) 'Time for naive loop omp dcba', Real( finish - start, wp ) / rate    
  
  
  c5(:,:,:) = 0.0_wp
  Call System_clock( start, rate )
  !$omp parallel
  !$omp do private(la, lb, lc, ld) schedule(dynamic) reduction(+: c5)
  do ld = 1, nd
    do lc = 1, nc
      do la = 1, na
        do lb = 1, nb
          c5(la,lc,ld) = c5(la,lc,ld)  + a(la,lb) * b(lb, lc, ld)
        enddo
      enddo
    enddo
  enddo
  !$omp end do
  !$omp end parallel
  Call System_clock( finish, rate )
  Write( *, * ) 'Time for naive loop omp dcab', Real( finish - start, wp ) / rate

  
  !dgemm omp 
  Call System_clock( start, rate )
  
  !$omp parallel
  ! Direct
  Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
                                            b , Size( b , Dim = 1 ), &
                                            0.0_wp, c4, Size( c4, Dim = 1 ) )
  !$omp end parallel
  Call System_clock( finish, rate )  
  Write( *, * ) 'Time for straight dgemm method omp', Real( finish - start, wp ) / rate
    
     
     






  
  
  do la = 1, na 
    do lc = 1, nc
      do ld = 1, nd
         if ( dabs(c3(la,lc,ld) - c5(la,lc,ld))  > 1.e-6 ) then 
           write (*,*) '!!! c5', la,lc,ld, c3(la,lc,ld) - c5(la,lc,ld)
         endif 
         
         if ( dabs(c4(la,lc,ld) - c3(la,lc,ld))  > 1.e-6 ) then 
           write (*,*) '!!! c4', la,lc,ld, c4(la,lc,ld) - c5(la,lc,ld)
         endif      
         
         
         if ( dabs(c2(la,lc,ld) - c3(la,lc,ld))  > 1.e-6 ) then 
           write (*,*) '!!! c2', la,lc,ld, c2(la,lc,ld) - c5(la,lc,ld)
         endif               
         
         
      enddo
    enddo
  enddo  
  
End Program reshape_for_blas



 

 

I use ifort loop-v2.f90 -mkl=parallel -qopenmp to compile it. The input of na-nd

300 300 300 300 lead to

 

Time for naive loop dcba   2.18543500000000
 Time for naive loop dcab   2.25414400000000
 Time for straight dgemm   7.108299999999999E-002
 Time for naive loop omp dcba   4.25462000000000
 Time for naive loop omp dcba  0.548002000000000
 Time for naive loop omp dcab  0.546911000000000
 Time for straight dgemm method omp  0.234727000000000

 

 not much speed up for OpenMP DGEMM. If I use 400 400 400 400 as input dimension, it leads to

e.g., !!! c4 1 364 262 261.365626269193,

inconsistent results.

May I know how to use OpenMP with DGEMM properly?

ifort version ifort (IFORT) 19.0.3.199 20190206

OS CentOS Linux release 7.8.2003 (Core)
Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz

Labels (2)
0 Kudos
1 Solution
jimdempseyatthecove
Honored Contributor III
4,383 Views
  !dgemm omp 
  Call System_clock( start, rate )
  
  !$omp parallel
  ! Direct
  Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
                                            b , Size( b , Dim = 1 ), &
                                            0.0_wp, c4, Size( c4, Dim = 1 ) )
  !$omp end parallel
  Call System_clock( finish, rate )

Assuming you are linking with the MKL parallel library (as opposed to the sequential library).
Also assume your system has 8 hardware threads...

The !$omp parallel creates a thread team of 8 threads
Each thread calling dgemm (with the parallel MKL library) enters MKL and each instance of MKL creates an OpenMP thread team of 8 threads. Now you have 64 threads running *** all 8 teams writing to same output array.

.OR. assume MKL sequential library loaded...
The !$omp parallel creates a thread team of 8 threads
Each thread calling dgemm (with the parallel MKL library) enters MKL and each instance of MKL uses the calling thread. Now you have the original 8 threads running *** all 8 threads writing to same output array.

The solution is to have the serial thread call the MKL threaded library dgemm
.OR.
Have each OpenMP parallel thread call the MKL sequential library using different output arrays (or different slices of the same output array).

(there are more advanced scenarios that you can use after you master this)

Jim Dempsey

 

View solution in original post

6 Replies
jimdempseyatthecove
Honored Contributor III
4,384 Views
  !dgemm omp 
  Call System_clock( start, rate )
  
  !$omp parallel
  ! Direct
  Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
                                            b , Size( b , Dim = 1 ), &
                                            0.0_wp, c4, Size( c4, Dim = 1 ) )
  !$omp end parallel
  Call System_clock( finish, rate )

Assuming you are linking with the MKL parallel library (as opposed to the sequential library).
Also assume your system has 8 hardware threads...

The !$omp parallel creates a thread team of 8 threads
Each thread calling dgemm (with the parallel MKL library) enters MKL and each instance of MKL creates an OpenMP thread team of 8 threads. Now you have 64 threads running *** all 8 teams writing to same output array.

.OR. assume MKL sequential library loaded...
The !$omp parallel creates a thread team of 8 threads
Each thread calling dgemm (with the parallel MKL library) enters MKL and each instance of MKL uses the calling thread. Now you have the original 8 threads running *** all 8 threads writing to same output array.

The solution is to have the serial thread call the MKL threaded library dgemm
.OR.
Have each OpenMP parallel thread call the MKL sequential library using different output arrays (or different slices of the same output array).

(there are more advanced scenarios that you can use after you master this)

Jim Dempsey

 

AlphaF20
New Contributor I
4,356 Views


Thank you so much for your answer! I am sorry for my late reply.

Looks like, there is an option in mkl,
(a)

 

 

 

 ifort mkl=parallel -qopenmp 

 

 

 


(b)

 

 

 

ifort mkl -qopenmp

 

 

 

is =parallel "Have each OpenMP parallel thread call the MKL sequential library..."?


In the code, there seems to be three approaches:
1. In the question post

 

 

 

!$omp parallel
! Direct
Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
b , Size( b , Dim = 1 ), &
0.0_wp, c4, Size( c4, Dim = 1 ) )
!$omp end parallel

 

 

 

2. Remove the $omp

 

 

 

Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
b , Size( b , Dim = 1 ), &
0.0_wp, c4, Size( c4, Dim = 1 ) )

 

 

 

3. I got a reply from stackoverflow (I tried with lopenblas with gfortran)
https://stackoverflow.com/questions/66296334/openmp-with-blas/66312637

 

 

 

Integer :: ithr, istart, iend


!$omp parallel private(ithr, istart, iend)
ithr = omp_get_thread_num()

! First index (plane) in B for the current thread
istart = ithr * nd / omp_get_num_threads()
! First index (plane) in B for the next thread
iend = (ithr + 1) * nd / omp_get_num_threads()

Call dgemm('N', 'N', na, nc * (iend - istart), nb, 1.0_wp, a, nd, &
b(1, 1, 1 + istart), Size(b, Dim = 1), &
0.0_wp, c6(1, 1, 1 + istart), Size(c6, Dim = 1))
!$omp end parallel

 

 

 

Together with

(a)

 

 ifort mkl=parallel -qopenmp 

 

or 

(b)

 

ifort mkl -qopenmp

 

 

, there are 6 combinations. My experience is
(a)-1 and (b)-1 often got different results than nested loops. For other combinations, I don't find much difference in terms of timing and numthreads I put.

 

So far, it seems that, in ifort, I can use dgemm in mkl to speed up the matrix multiplication, but have not benefit from openmp.

 

 

0 Kudos
jimdempseyatthecove
Honored Contributor III
4,318 Views

b) should be

ifort mkl=sequential -qopenmp

or

ifort mkl=parallel -qopenmp
...
! in Fortran code
! divvy up the threads amungst omp and mkl such that
! (omp #threads) * (mkl * threads) <= omp_max_threads()
! the reduced value for OMP threads is to be determined through experimentation
call omp_set_num_threads(i_your_choice_for_number_of_threads_for_omp)

call mkl_set_num_threads(max(omp_get_max_threads()/i_your_choice_for_number_of_threads_for_omp,1))
!$omp parallel

Jim Dempsey

jimdempseyatthecove
Honored Contributor III
4,313 Views

Also

!$omp parallel
! Direct
Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
b , Size( b , Dim = 1 ), &
0.0_wp, c4, Size( c4, Dim = 1 ) )
!$omp end parallel

The above will have each OpenMP (assuming mkl=parallel) having MKL compute the same result into c4.

What you should be doing is to partition the sources and destinations such to produce unique subsets of outputs. This is called tiling (or division of work).

As to how you proportion the work between OMP and MKL, that is subject to experimentation.

Jim Dempsey

AlphaF20
New Contributor I
4,167 Views

By comparing the time with

  numthreads = 1

and

  numthreads = 6

I saw speedup of dgemm, in the input as 400 400 400 400. Thank you so much!

0 Kudos
John_Campbell
New Contributor II
3,333 Views

I thought it might be helpful to identify some changes to the code to improve OpenMP performance for the computation identified involving DO loops.

The main points are:

DO loop order: it is best to order the DO loops:

* so that memory is used sequentially ( for the inner DO )

* that different threads address different parts of memory to limit cache coherence problems, as well as “race conditions” ( for the outer OMP DO loop)
* SCHEDULE can also effect memory use order and SCHEDULE(STATIC) is good for memory control, while SCHEDULE(DYNAMIC) can be preferred when load sharing between threads is an important issue.

I have used a timer function, DELTA_SECOND, which is not thread safe, but as it is only used outside the !$OMP region this can be ok, while any functions used inside the !$OMP region must be thread safe.

The basic calculation is "c5(la,lc,ld) = c5(la,lc,ld) + a(la,lb) * b(lb, lc, ld)"

As "ld" is the outer (Fortran) dimension, this is the best for !$OMP DO. Also "nd" is much larger than "num_threads" so there is no need for the COLLAPSE clause to be used, while if nd & nc are about num_threads or less, COLLAPSE(2) could be useful.

In this case, la is the best choice for the inner loop, or even use array syntax, resulting in "c5(:,lc,ld) = c5(:,lc,ld) + a(:,lb) * b(lb, lc, ld)", which is basically daxpy syntax, with "b(lb, lc, ld)" being a constant. This is much preferred to lb as the inner loop, while "lc" as the inner loop would generate excessive memory access, unless "num_threads" times size "c5(la,:,ld)" and "b(lb, :, ld)" could fit in L3 cache.


The other point I would make is it is best to explicitly define all variables and arrays as either SHARED, PRIVATE or REDUCTION and it is best that private arrays are initially zero. This can eliminate a lot of unexpected results.

!$omp parallel do default(none) private(lb, lc, ld) shared(a,b,c6, nb,nc,nd) schedule(static)

I have modified the original code to demonstrate more do loop order alternatives that were in the original post.

Reply