MKL does not respect affinity on master thread

Tue_B_ · ‎08-05-2015

This is a follow-up (and semi-related) post to this rather long post:

https://software.intel.com/en-us/comment/1825717#comment-1825717

So like the title says:

MKL does not respect affinity on the master thread. Even though I force MKL to spawn threads on the NUMA node the master thread is running on, only one thread runs on this NUMA node and the rest get pushed to other NUMA nodes. This is only a problem for the master thread, since all the other threads actually respect the affinity they have been given.

The code below is made to run on systems with a minimum of 6 NUMA nodes with at least 6 cores on each node. (In my case I run it on a system with 8 NUMA nodes with 6 cores on each.)

It is very easy to see what happens if you comment/uncomment case (1) in the first nested region, i.e. this line:

       CASE(1)
          !call dgemm('N','N',dim,dim,dim,1.d0,A,dim,B,dim,0.d0,C1,dim)<--- comment out this one

This bug or whatever it is, is extremely problematic for us, since it in practice makes it impossible to use nested MKL on NUMA systems. Does anyone have a solution to it? Or does Intel have an idea about when this could be resolved?

program NumaAwareDGEMM
  ! Reproducer for nested MKL threading on a NUMA machine.
  !
  ! An outer OpenMP team of NoNUMANodes threads is created; each outer thread
  ! calls DGEMM once per round, and MKL is asked to use 5 threads per call.
  ! Requires Intel Fortran (IFPORT, KMP_* extensions) and Intel MKL.

  use IFPORT
  use omp_lib
  use mkl_service
  implicit none

  integer, parameter :: dp = kind(1.d0)   ! DGEMM operates on double precision

  logical(4) :: success
  integer :: NoNUMANodes, blocksize, dim
  integer :: i
  integer :: ID
  real(dp), allocatable, dimension(:,:) :: A, B, C1, C2, C3, C4, C5, C6, C7, C8

  NoNUMANodes = 6                   ! How many NUMA nodes to distribute calculations over
  success = SETENVQQ("OMP_DISPLAY_ENV=TRUE")
  success = SETENVQQ("OMP_PLACES={0:6},{6:6},{12:6},{18:6},{24:6},{30:6}")

  blocksize = 1000
  dim = blocksize*NoNUMANodes
  allocate(A(dim,dim))
  allocate(B(dim,dim))
  allocate(C1(dim,dim))
  allocate(C2(dim,dim))
  allocate(C3(dim,dim))
  allocate(C4(dim,dim))
  allocate(C5(dim,dim))
  allocate(C6(dim,dim))
  allocate(C7(dim,dim))
  allocate(C8(dim,dim))

  ! DGEMM reads A and B, so they must be defined before the first call.
  ! The C matrices need no initial value because beta is 0 in every call.
  A = 1.0_dp
  B = 1.0_dp

  call KMP_SET_STACKSIZE_S(990000000)
  ! omp_set_dynamic / omp_set_nested take LOGICAL arguments;
  ! mkl_set_dynamic takes an INTEGER flag.
  call omp_set_dynamic(.false.)
  call mkl_set_dynamic(0)
  call omp_set_nested(.true.)
  call MKL_SET_NUM_THREADS(5)

  ! Initialization region
  call omp_set_num_threads(NoNUMANodes) ! Outer parallelization across all NUMA nodes
  ! success is written by every thread, so it must be private here.
  !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,ID,success)
  !$OMP DO SCHEDULE(STATIC)
  do i = 1, NoNUMANodes
    ID = omp_get_thread_num()
    print *,'Thread binding for socket=',ID
    ! omp_get_thread_num() is 0-based and the master thread is 0, so the
    ! cases must start at 0 or the master thread never gets a place.
    ! The environment is process-wide; serialise the updates.
    ! NOTE(review): the OpenMP specification says environment-variable changes
    ! made after program start are ignored by the implementation, so these
    ! calls may not rebind anything - confirm against the Intel runtime.
    !$OMP CRITICAL (set_places)
    select case (ID)
      case(0)
        success = SETENVQQ("OMP_PLACES={0:6}")
      case(1)
        success = SETENVQQ("OMP_PLACES={6:6}")
      case(2)
        success = SETENVQQ("OMP_PLACES={12:6}")
      case(3)
        success = SETENVQQ("OMP_PLACES={18:6}")
      case(4)
        success = SETENVQQ("OMP_PLACES={24:6}")
      case(5)
        success = SETENVQQ("OMP_PLACES={30:6}")
      case(6)
        success = SETENVQQ("OMP_PLACES={36:6}")
      case(7)
        success = SETENVQQ("OMP_PLACES={42:6}")
    end select
    !$OMP END CRITICAL (set_places)
  end do
  !$OMP END DO
  !$OMP END PARALLEL
  print*,'Initialization over'

  ! First nested region: iteration i multiplies into its own result matrix Ci.
  !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i)
  !$OMP DO SCHEDULE(STATIC)
  do i = 1, NoNUMANodes
    select case (i)
      case(1)
        call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C1,dim)
      case(2)
        call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C2,dim)
      case(3)
        call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C3,dim)
      case(4)
        call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C4,dim)
      case(5)
        call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C5,dim)
      case(6)
        call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C6,dim)
      case(7)
        call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C7,dim)
      case(8)
        call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C8,dim)
    end select
  end do
  !$OMP END DO
  !$OMP END PARALLEL
  print*,'First MKL call done'

  ! Second nested region: the same set of multiplications repeated.
  !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i)
  !$OMP DO SCHEDULE(STATIC)
  do i = 1, NoNUMANodes
    select case (i)
      case(1)
        call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C1,dim)
      case(2)
        call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C2,dim)
      case(3)
        call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C3,dim)
      case(4)
        call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C4,dim)
      case(5)
        call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C5,dim)
      case(6)
        call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C6,dim)
      case(7)
        call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C7,dim)
      case(8)
        call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C8,dim)
    end select
  end do
  !$OMP END DO
  !$OMP END PARALLEL
end program NumaAwareDGEMM

dasfasd

TimP · ‎08-05-2015

Assuming the master thread sets ID to 0, it looks at first glance like you're not attempting to set its affinity.

Tue_B_ · ‎08-05-2015

Hey Tim

You are of course right!

I posted the wrong code (I had changed that part in an attempt to fix it). Anyway, here is the code, now with affinity settings for the master thread.

Now the problem has changed to the one I also observed in the previous post, namely that the master thread no longer spawns any additional threads.

Does anyone have a solution to this bug/problem?

 program NumaAwareDGEMM
   ! Reproducer for nested MKL threading on a NUMA machine.
   !
   ! An outer OpenMP team of NoNUMANodes threads is created; each outer thread
   ! calls DGEMM once per round, and MKL is asked to use 5 threads per call.
   ! Requires Intel Fortran (IFPORT, KMP_* extensions) and Intel MKL.

   use IFPORT
   use omp_lib
   use mkl_service
   implicit none

   integer, parameter :: dp = kind(1.d0)   ! DGEMM operates on double precision

   logical(4) :: success
   integer :: NoNUMANodes, blocksize, dim
   integer :: i
   integer :: ID
   real(dp), allocatable, dimension(:,:) :: A, B, C1, C2, C3, C4, C5, C6, C7, C8

   NoNUMANodes = 6                   ! How many NUMA nodes to distribute calculations over
   success = SETENVQQ("OMP_DISPLAY_ENV=TRUE")
   success = SETENVQQ("OMP_PLACES={0:6},{6:6},{12:6},{18:6},{24:6},{30:6}")

   blocksize = 1000
   dim = blocksize*NoNUMANodes
   allocate(A(dim,dim))
   allocate(B(dim,dim))
   allocate(C1(dim,dim))
   allocate(C2(dim,dim))
   allocate(C3(dim,dim))
   allocate(C4(dim,dim))
   allocate(C5(dim,dim))
   allocate(C6(dim,dim))
   allocate(C7(dim,dim))
   allocate(C8(dim,dim))

   ! DGEMM reads A and B, so they must be defined before the first call.
   ! The C matrices need no initial value because beta is 0 in every call.
   A = 1.0_dp
   B = 1.0_dp

   call KMP_SET_STACKSIZE_S(990000000)
   ! omp_set_dynamic / omp_set_nested take LOGICAL arguments;
   ! mkl_set_dynamic takes an INTEGER flag.
   call omp_set_dynamic(.false.)
   call mkl_set_dynamic(0)
   call omp_set_nested(.true.)
   call MKL_SET_NUM_THREADS(5)

   ! Initialization region
   call omp_set_num_threads(NoNUMANodes) ! Outer parallelization across all NUMA nodes
   ! success is written by every thread, so it must be private here.
   !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,ID,success)
   !$OMP DO SCHEDULE(STATIC)
   do i = 1, NoNUMANodes
     ID = omp_get_thread_num()       ! 0-based; the master thread is 0
     print *,'Thread binding for socket=',ID
     ! The environment is process-wide; serialise the updates.
     ! NOTE(review): the OpenMP specification says environment-variable changes
     ! made after program start are ignored by the implementation, so these
     ! calls may not rebind anything - confirm against the Intel runtime.
     !$OMP CRITICAL (set_places)
     select case (ID)
       case(0)
         success = SETENVQQ("OMP_PLACES={0:6}")
       case(1)
         success = SETENVQQ("OMP_PLACES={6:6}")
       case(2)
         success = SETENVQQ("OMP_PLACES={12:6}")
       case(3)
         success = SETENVQQ("OMP_PLACES={18:6}")
       case(4)
         success = SETENVQQ("OMP_PLACES={24:6}")
       case(5)
         success = SETENVQQ("OMP_PLACES={30:6}")
       case(6)
         success = SETENVQQ("OMP_PLACES={36:6}")
       case(7)
         success = SETENVQQ("OMP_PLACES={42:6}")
     end select
     !$OMP END CRITICAL (set_places)
   end do
   !$OMP END DO
   !$OMP END PARALLEL
   print*,'Initialization over'

   ! First nested region: iteration i multiplies into its own result matrix Ci.
   !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i)
   !$OMP DO SCHEDULE(STATIC)
   do i = 1, NoNUMANodes
     select case (i)
       case(1)
         call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C1,dim)
       case(2)
         call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C2,dim)
       case(3)
         call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C3,dim)
       case(4)
         call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C4,dim)
       case(5)
         call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C5,dim)
       case(6)
         call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C6,dim)
       case(7)
         call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C7,dim)
       case(8)
         call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C8,dim)
     end select
   end do
   !$OMP END DO
   !$OMP END PARALLEL
   print*,'First MKL call done'

   ! Second nested region: the same set of multiplications repeated.
   !$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i)
   !$OMP DO SCHEDULE(STATIC)
   do i = 1, NoNUMANodes
     select case (i)
       case(1)
         call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C1,dim)
       case(2)
         call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C2,dim)
       case(3)
         call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C3,dim)
       case(4)
         call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C4,dim)
       case(5)
         call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C5,dim)
       case(6)
         call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C6,dim)
       case(7)
         call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C7,dim)
       case(8)
         call dgemm('N','N',dim,dim,dim,1.0_dp,A,dim,B,dim,0.0_dp,C8,dim)
     end select
   end do
   !$OMP END DO
   !$OMP END PARALLEL
 end program NumaAwareDGEMM