- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
This is a follow-up/semi-related post to this rather long post:
https://software.intel.com/en-us/comment/1825717#comment-1825717
So like the title says:
MKL does not respect affinity on the master thread. Even though I force MKL to spawn its threads on the NUMA node the master thread is running on, only one thread runs on this NUMA node and the rest get pushed to other NUMA nodes. This is only a problem for the master thread, since all the other threads actually respect the affinity they have been given.
The code below is made to run on systems with a minimum of 6 NUMA nodes with at least 6 cores on each node. (In my case I run it on a system with 8 NUMA nodes with 6 cores on each.)
It is very clear to see what happens if you comment/uncomment case (1) in the first nested region, i.e. this line:
CASE(1) !call dgemm('N','N',dim,dim,dim,1.d0,A,dim,B,dim,0.d0,C1,dim)<--- comment out this one
This bug or whatever it is, is extremely problematic for us, since it in practice makes it impossible to use nested MKL on NUMA systems. Does anyone have a solution to it? Or does Intel have an idea about when this could be resolved?
program NumaAwareDGEMM
  ! Reproducer for nested OpenMP + threaded MKL on a NUMA machine.
  !
  ! One outer OpenMP thread is started per NUMA node; each outer thread then
  ! calls dgemm, which is asked to run with mkl_threads_per_node MKL threads.
  ! The dgemm round is executed twice so that the thread placement of the
  ! first and second nested region can be compared.
  !
  ! Changes relative to the originally posted version:
  !   * the per-thread affinity table is indexed from 0, because
  !     omp_get_thread_num() is 0-based (the master thread is 0) -- the
  !     1-based table left the master thread without any place setting;
  !   * omp_set_dynamic / omp_set_nested receive logical arguments, as their
  !     Fortran interfaces require;
  !   * the SETENVQQ result is thread-private and the call is serialized, so
  !     threads no longer race on a shared variable or on the environment;
  !   * A and B are given defined values before they are read by dgemm.
  use IFPORT
  use omp_lib
  use mkl_service
  use, intrinsic :: iso_fortran_env, only: dp => real64
  implicit none

  integer, parameter :: max_numa_nodes       = 8     ! entries in the place table
  integer, parameter :: cores_per_node       = 6     ! width of every place
  integer, parameter :: mkl_threads_per_node = 5     ! MKL threads per dgemm call
  integer, parameter :: blocksize            = 1000
  integer(kind=kmp_size_t_kind), parameter :: stack_bytes = 990000000_kmp_size_t_kind

  logical(4) :: success
  integer :: NoNUMANodes, mat_dim
  real(dp), allocatable, dimension(:,:) :: A, B, C1, C2, C3, C4, C5, C6, C7, C8
  external :: dgemm

  NoNUMANodes = 6   ! How many NUMA nodes to distribute calculations over

  ! Set before the first OpenMP call in the program.
  success = SETENVQQ("OMP_DISPLAY_ENV=TRUE")
  success = SETENVQQ("OMP_PLACES={0:6},{6:6},{12:6},{18:6},{24:6},{30:6}")

  mat_dim = blocksize*NoNUMANodes

  allocate(A(mat_dim,mat_dim))
  allocate(B(mat_dim,mat_dim))
  allocate(C1(mat_dim,mat_dim))
  allocate(C2(mat_dim,mat_dim))
  allocate(C3(mat_dim,mat_dim))
  allocate(C4(mat_dim,mat_dim))
  allocate(C5(mat_dim,mat_dim))
  allocate(C6(mat_dim,mat_dim))
  allocate(C7(mat_dim,mat_dim))
  allocate(C8(mat_dim,mat_dim))

  ! dgemm reads A and B; give them defined contents. (beta = 0, so the C
  ! matrices are only written and need no initialization.)
  ! NOTE(review): this touches A and B from the master thread first, which
  ! may influence on which NUMA node their pages end up -- confirm this is
  ! acceptable for the experiment.
  A = 1.0_dp
  B = 1.0_dp

  call kmp_set_stacksize_s(stack_bytes)
  call omp_set_dynamic(.false.)
  call mkl_set_dynamic(0)
  call omp_set_nested(.true.)
  call mkl_set_num_threads(mkl_threads_per_node)

  ! Initialization region: outer parallelization across all NUMA nodes.
  call omp_set_num_threads(NoNUMANodes)
  call bind_outer_threads()
  print*,'Initialization over'

  call dgemm_on_each_node()
  print*,'First MKL call done'

  call dgemm_on_each_node()

contains

  ! Each outer thread publishes an OMP_PLACES value that covers the
  ! cores_per_node cores starting at thread_id*cores_per_node.
  !
  ! NOTE(review): the OpenMP specification says that environment changes made
  ! after program start are ignored by the implementation, and the environment
  ! is process-wide rather than per-thread. Whether these calls affect where
  ! the nested MKL teams run is therefore runtime-specific -- confirm against
  ! the Intel OpenMP runtime in use.
  subroutine bind_outer_threads()
    integer :: i, id
    logical(4) :: ok
    character(len=32) :: place

    !$omp parallel default(shared) private(i, id, ok, place)
    !$omp do schedule(static)
    do i = 1, NoNUMANodes
      id = omp_get_thread_num()
      print *,'Thread binding for socket=',id
      ! Thread numbers are 0-based: 0 -> {0:6}, 1 -> {6:6}, ..., 7 -> {42:6}.
      if (id < max_numa_nodes) then
        ! Serialized: the environment is shared by all threads.
        !$omp critical (setenv_guard)
        write(place, '(a,i0,a,i0,a)') 'OMP_PLACES={', id*cores_per_node, ':', cores_per_node, '}'
        ok = SETENVQQ(trim(place))
        !$omp end critical (setenv_guard)
      end if
    end do
    !$omp end do
    !$omp end parallel
  end subroutine bind_outer_threads

  ! One dgemm per NUMA node: iteration i computes Ci = A*B. With the static
  ! schedule and NoNUMANodes outer threads, each thread gets one iteration.
  subroutine dgemm_on_each_node()
    integer :: i

    !$omp parallel default(shared) private(i)
    !$omp do schedule(static)
    do i = 1, NoNUMANodes
      select case (i)
      case (1)
        call dgemm('N','N',mat_dim,mat_dim,mat_dim,1.0_dp,A,mat_dim,B,mat_dim,0.0_dp,C1,mat_dim)
      case (2)
        call dgemm('N','N',mat_dim,mat_dim,mat_dim,1.0_dp,A,mat_dim,B,mat_dim,0.0_dp,C2,mat_dim)
      case (3)
        call dgemm('N','N',mat_dim,mat_dim,mat_dim,1.0_dp,A,mat_dim,B,mat_dim,0.0_dp,C3,mat_dim)
      case (4)
        call dgemm('N','N',mat_dim,mat_dim,mat_dim,1.0_dp,A,mat_dim,B,mat_dim,0.0_dp,C4,mat_dim)
      case (5)
        call dgemm('N','N',mat_dim,mat_dim,mat_dim,1.0_dp,A,mat_dim,B,mat_dim,0.0_dp,C5,mat_dim)
      case (6)
        call dgemm('N','N',mat_dim,mat_dim,mat_dim,1.0_dp,A,mat_dim,B,mat_dim,0.0_dp,C6,mat_dim)
      case (7)
        call dgemm('N','N',mat_dim,mat_dim,mat_dim,1.0_dp,A,mat_dim,B,mat_dim,0.0_dp,C7,mat_dim)
      case (8)
        call dgemm('N','N',mat_dim,mat_dim,mat_dim,1.0_dp,A,mat_dim,B,mat_dim,0.0_dp,C8,mat_dim)
      end select
    end do
    !$omp end do
    !$omp end parallel
  end subroutine dgemm_on_each_node

end program NumaAwareDGEMM
dasfasd
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hey Tim
You are of course right!
I posted the wrong code (I tried to change that in an attempt to fix it), anyway here is the code, now with affinity settings for the master thread.
Now the problem has changed to the one I also observed in the previous post, namely that the master thread no longer spawns any additional threads.
Does anyone have a solution to this bug/problem?
program NumaAwareDGEMM
  ! Second version of the reproducer: nested OpenMP + threaded MKL dgemm with
  ! one outer OpenMP thread per NUMA node, including a place setting for the
  ! master thread (thread number 0).
  !
  ! The dgemm round is run twice so the placement of the nested MKL threads
  ! in the first and second region can be compared.
  !
  ! Changes relative to the pasted text:
  !   * the missing "program NumaAwareDGEMM" statement is restored so that it
  !     matches the closing "end program NumaAwareDGEMM";
  !   * omp_set_dynamic / omp_set_nested receive logical arguments, as their
  !     Fortran interfaces require;
  !   * the SETENVQQ result is thread-private and the call is serialized, so
  !     threads no longer race on a shared variable or on the environment;
  !   * A and B are given defined values before dgemm reads them;
  !   * unused locals are removed and the repeated dgemm loop lives in one
  !     internal procedure.
  use IFPORT
  use omp_lib
  use mkl_service
  use, intrinsic :: iso_fortran_env, only: dp => real64
  implicit none

  integer, parameter :: max_numa_nodes       = 8     ! entries in the place table
  integer, parameter :: cores_per_node       = 6     ! width of every place
  integer, parameter :: mkl_threads_per_node = 5     ! MKL threads per dgemm call
  integer, parameter :: blocksize            = 1000
  integer(kind=kmp_size_t_kind), parameter :: stack_bytes = 990000000_kmp_size_t_kind

  logical(4) :: success
  integer :: NoNUMANodes, mat_dim
  real(dp), allocatable, dimension(:,:) :: A, B, C1, C2, C3, C4, C5, C6, C7, C8
  external :: dgemm

  NoNUMANodes = 6   ! How many NUMA nodes to distribute calculations over

  ! Set before the first OpenMP call in the program.
  success = SETENVQQ("OMP_DISPLAY_ENV=TRUE")
  success = SETENVQQ("OMP_PLACES={0:6},{6:6},{12:6},{18:6},{24:6},{30:6}")

  mat_dim = blocksize*NoNUMANodes

  allocate(A(mat_dim,mat_dim))
  allocate(B(mat_dim,mat_dim))
  allocate(C1(mat_dim,mat_dim))
  allocate(C2(mat_dim,mat_dim))
  allocate(C3(mat_dim,mat_dim))
  allocate(C4(mat_dim,mat_dim))
  allocate(C5(mat_dim,mat_dim))
  allocate(C6(mat_dim,mat_dim))
  allocate(C7(mat_dim,mat_dim))
  allocate(C8(mat_dim,mat_dim))

  ! dgemm reads A and B; give them defined contents. (beta = 0, so the C
  ! matrices are only written and need no initialization.)
  ! NOTE(review): this touches A and B from the master thread first, which
  ! may influence on which NUMA node their pages end up -- confirm this is
  ! acceptable for the experiment.
  A = 1.0_dp
  B = 1.0_dp

  call kmp_set_stacksize_s(stack_bytes)
  call omp_set_dynamic(.false.)
  call mkl_set_dynamic(0)
  call omp_set_nested(.true.)
  call mkl_set_num_threads(mkl_threads_per_node)

  ! Initialization region: outer parallelization across all NUMA nodes.
  call omp_set_num_threads(NoNUMANodes)
  call bind_outer_threads()
  print*,'Initialization over'

  call dgemm_on_each_node()
  print*,'First MKL call done'

  call dgemm_on_each_node()

contains

  ! Each outer thread publishes an OMP_PLACES value that covers the
  ! cores_per_node cores starting at thread_id*cores_per_node
  ! (thread 0 -> {0:6}, thread 1 -> {6:6}, ..., thread 7 -> {42:6}).
  !
  ! NOTE(review): the OpenMP specification says that environment changes made
  ! after program start are ignored by the implementation, and the environment
  ! is process-wide rather than per-thread. Whether these calls affect where
  ! the nested MKL teams run is therefore runtime-specific -- confirm against
  ! the Intel OpenMP runtime in use. Requesting the binding up front
  ! (e.g. OMP_PROC_BIND / OMP_PLACES set before launch) may be the more
  ! dependable route -- TODO verify.
  subroutine bind_outer_threads()
    integer :: i, id
    logical(4) :: ok
    character(len=32) :: place

    !$omp parallel default(shared) private(i, id, ok, place)
    !$omp do schedule(static)
    do i = 1, NoNUMANodes
      id = omp_get_thread_num()
      print *,'Thread binding for socket=',id
      if (id < max_numa_nodes) then
        ! Serialized: the environment is shared by all threads.
        !$omp critical (setenv_guard)
        write(place, '(a,i0,a,i0,a)') 'OMP_PLACES={', id*cores_per_node, ':', cores_per_node, '}'
        ok = SETENVQQ(trim(place))
        !$omp end critical (setenv_guard)
      end if
    end do
    !$omp end do
    !$omp end parallel
  end subroutine bind_outer_threads

  ! One dgemm per NUMA node: iteration i computes Ci = A*B. With the static
  ! schedule and NoNUMANodes outer threads, each thread gets one iteration.
  subroutine dgemm_on_each_node()
    integer :: i

    !$omp parallel default(shared) private(i)
    !$omp do schedule(static)
    do i = 1, NoNUMANodes
      select case (i)
      case (1)
        call dgemm('N','N',mat_dim,mat_dim,mat_dim,1.0_dp,A,mat_dim,B,mat_dim,0.0_dp,C1,mat_dim)
      case (2)
        call dgemm('N','N',mat_dim,mat_dim,mat_dim,1.0_dp,A,mat_dim,B,mat_dim,0.0_dp,C2,mat_dim)
      case (3)
        call dgemm('N','N',mat_dim,mat_dim,mat_dim,1.0_dp,A,mat_dim,B,mat_dim,0.0_dp,C3,mat_dim)
      case (4)
        call dgemm('N','N',mat_dim,mat_dim,mat_dim,1.0_dp,A,mat_dim,B,mat_dim,0.0_dp,C4,mat_dim)
      case (5)
        call dgemm('N','N',mat_dim,mat_dim,mat_dim,1.0_dp,A,mat_dim,B,mat_dim,0.0_dp,C5,mat_dim)
      case (6)
        call dgemm('N','N',mat_dim,mat_dim,mat_dim,1.0_dp,A,mat_dim,B,mat_dim,0.0_dp,C6,mat_dim)
      case (7)
        call dgemm('N','N',mat_dim,mat_dim,mat_dim,1.0_dp,A,mat_dim,B,mat_dim,0.0_dp,C7,mat_dim)
      case (8)
        call dgemm('N','N',mat_dim,mat_dim,mat_dim,1.0_dp,A,mat_dim,B,mat_dim,0.0_dp,C8,mat_dim)
      end select
    end do
    !$omp end do
    !$omp end parallel
  end subroutine dgemm_on_each_node

end program NumaAwareDGEMM

- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page