Intel® Fortran Compiler
Build applications that can scale for the future with optimized code designed for Intel® Xeon® and compatible processors.

do-concurrent is slower than ordinary-do and forall.

mehdi__chinoune
Beginner
413 Views

The do-concurrent construct is supposed to replace forall, but in this test it takes about twice as long to complete the same job when the indices are listed in storage-friendly order, and it is very slow when they are not:

program test_do_speed
  ! Times six equivalent ways of computing c = a + b on rank-3 arrays:
  !   1. DO CONCURRENT, index list written (k, j, i)
  !   2. DO CONCURRENT, index list written (i, j, k)
  !   3. ordinary nested DO loops, first index innermost
  !   4. whole-array assignment
  !   5. FORALL, index list written (k, j, i)
  !   6. FORALL, index list written (i, j, k)
  ! and prints the elapsed wall-clock time of each, in seconds.
  !
  ! Every kernel is timed with its own start/stop pair. After the stop
  ! reading, the result is spot-checked and c is cleared, so each kernel's
  ! output is actually consumed and the next check cannot pass on stale data.
  use, intrinsic :: iso_fortran_env, only: int64, error_unit
  implicit none

  integer, parameter :: n1 = 1024, n2 = 512, n3 = 512
  integer, parameter :: n_kernels = 6

  real, allocatable :: a(:,:,:), b(:,:,:), c(:,:,:)
  real :: elapsed(n_kernels)
  ! 64-bit counts: with default-kind integers some compilers use a coarser
  ! tick, and the counter wraps around much sooner.
  integer(int64) :: clock_rate, t_start, t_stop
  integer :: i, j, k, alloc_stat

  allocate (a(n1,n2,n3), b(n1,n2,n3), c(n1,n2,n3), stat=alloc_stat)
  if (alloc_stat /= 0) then
    write (error_unit, '(a)') 'TEST_DO_SPEED: could not allocate the work arrays'
    error stop 1
  end if

  call random_number(a)
  call random_number(b)
  ! c is written once before any timing starts. NOTE(review): presumably this
  ! keeps the cost of the first write to fresh memory out of kernel 1 - confirm.
  call random_number(c)

  call system_clock(count_rate=clock_rate)
  if (clock_rate <= 0_int64) then
    write (error_unit, '(a)') 'TEST_DO_SPEED: no usable system clock'
    error stop 1
  end if

  ! 1. DO CONCURRENT, index list (k, j, i)
  call system_clock(t_start)
  do concurrent (k = 1:n3, j = 1:n2, i = 1:n1)
    c(i,j,k) = a(i,j,k) + b(i,j,k)
  end do
  call system_clock(t_stop)
  call finish_kernel(1)

  ! 2. DO CONCURRENT, index list (i, j, k)
  call system_clock(t_start)
  do concurrent (i = 1:n1, j = 1:n2, k = 1:n3)
    c(i,j,k) = a(i,j,k) + b(i,j,k)
  end do
  call system_clock(t_stop)
  call finish_kernel(2)

  ! 3. Ordinary nested loops; the first (contiguous) index is innermost.
  call system_clock(t_start)
  do k = 1, n3
    do j = 1, n2
      do i = 1, n1
        c(i,j,k) = a(i,j,k) + b(i,j,k)
      end do
    end do
  end do
  call system_clock(t_stop)
  call finish_kernel(3)

  ! 4. Whole-array assignment. c(:,:,:) keeps the existing allocation.
  call system_clock(t_start)
  c(:,:,:) = a + b
  call system_clock(t_stop)
  call finish_kernel(4)

  ! 5. FORALL, index list (k, j, i). FORALL is obsolescent in Fortran 2018;
  !    it is kept here only because it is one of the constructs being compared.
  call system_clock(t_start)
  forall (k = 1:n3, j = 1:n2, i = 1:n1)
    c(i,j,k) = a(i,j,k) + b(i,j,k)
  end forall
  call system_clock(t_stop)
  call finish_kernel(5)

  ! 6. FORALL, index list (i, j, k)
  call system_clock(t_start)
  forall (i = 1:n1, j = 1:n2, k = 1:n3)
    c(i,j,k) = a(i,j,k) + b(i,j,k)
  end forall
  call system_clock(t_stop)
  call finish_kernel(6)

  print *, "DO CONCURRENT 1 : ", elapsed(1)
  print *, "DO CONCURRENT 2 : ", elapsed(2)
  print *, "  ORDINARY DO   : ", elapsed(3)
  print *, "   ARRAY DO     : ", elapsed(4)
  print *, "   FORALL 1     : ", elapsed(5)
  print *, "   FORALL 2     : ", elapsed(6)

  deallocate (a, b, c)

contains

  ! Stores the elapsed time of kernel `id` (from t_start/t_stop), spot-checks
  ! the first and last element of c against a + b, then clears c.
  ! Runs outside the timed interval.
  subroutine finish_kernel(id)
    integer, intent(in) :: id
    real, parameter :: tol = 4.0 * epsilon(1.0)

    elapsed(id) = real(t_stop - t_start) / real(clock_rate)

    if (abs(c(1,1,1) - (a(1,1,1) + b(1,1,1))) > tol .or. &
        abs(c(n1,n2,n3) - (a(n1,n2,n3) + b(n1,n2,n3))) > tol) then
      write (error_unit, '(a,i0,a)') 'TEST_DO_SPEED: kernel ', id, ' produced a wrong result'
      error stop 1
    end if

    c = 0.0
  end subroutine finish_kernel

end program test_do_speed

It gives:

 DO CONCURRENT 1 :   0.3280000
 DO CONCURRENT 2 :    2.250000
   ORDINARY DO   :    2.891000
    ARRAY DO     :   0.2810000
    FORALL 1     :   0.2810000
    FORALL 2     :   0.2820000

Edit: Sorry, I was wrong. After adding the line 'CALL RANDOM_NUMBER(C)', the timings have changed.

0 Kudos
2 Replies
andrew_4619
Honored Contributor II
413 Views

Read https://software.intel.com/en-us/forums/intel-visual-fortran-compiler-for-windows/topic/499840

 

0 Kudos
andrew_4619
Honored Contributor II
413 Views

For fun I did a variation of your code with 6 'runs' and took the average time. They are all the same except "Do concurrent 2", where the optimiser clearly must make some poor choices. Having said that, with such a simple test the optimiser might make some decisions to "simplify" the code in some of the cases anyway. 

DO CONCURRENT 1 : .08117
DO CONCURRENT 2 : 1.87650
  ORDINARY DO   : .07917
   ARRAY DO     : .08033
   FORALL 1     : .08033
   FORALL 2     : .08067

!  do_conc_test.f90
!
!  Times six equivalent ways of computing c = a + b on rank-3 arrays and
!  prints the average elapsed wall-clock time of each over n_tries passes:
!    1. DO CONCURRENT, index list written (k, j, i)
!    2. DO CONCURRENT, index list written (i, j, k)
!    3. ordinary nested DO loops, first index innermost
!    4. whole-array assignment
!    5. FORALL, index list written (k, j, i)
!    6. FORALL, index list written (i, j, k)
!
!  Every kernel is timed with its own start/stop pair. After the stop
!  reading, the result is spot-checked and c is cleared, so each kernel's
!  output is actually consumed and the next check cannot pass on stale data.
program test_do_speed
    use, intrinsic :: iso_fortran_env, only: int64, error_unit
    implicit none

    ! The original post used 1024 x 512 x 512; this variation is smaller.
    integer, parameter :: n1 = 1024, n2 = 256, n3 = 256
    integer, parameter :: n_tries = 6
    integer, parameter :: n_kernels = 6

    real, allocatable :: a(:,:,:), b(:,:,:), c(:,:,:)
    real :: t_all(n_kernels)
    ! 64-bit counts: with default-kind integers some compilers use a coarser
    ! tick, and the counter wraps around much sooner.
    integer(int64) :: clock_rate, t_start, t_stop
    integer :: i, j, k, nt, alloc_stat, io_stat
    character(len=1) :: pause_line

    allocate (a(n1,n2,n3), b(n1,n2,n3), c(n1,n2,n3), stat=alloc_stat)
    if (alloc_stat /= 0) then
        write (error_unit, '(a)') 'TEST_DO_SPEED: could not allocate the work arrays'
        error stop 1
    end if

    call random_number(a)
    call random_number(b)
    call random_number(c)

    call system_clock(count_rate=clock_rate)
    if (clock_rate <= 0_int64) then
        write (error_unit, '(a)') 'TEST_DO_SPEED: no usable system clock'
        error stop 1
    end if

    t_all = 0.0
    do nt = 1, n_tries

        ! 1. DO CONCURRENT, index list (k, j, i)
        call system_clock(t_start)
        do concurrent (k = 1:n3, j = 1:n2, i = 1:n1)
            c(i,j,k) = a(i,j,k) + b(i,j,k)
        end do
        call system_clock(t_stop)
        call finish_kernel(1)

        ! 2. DO CONCURRENT, index list (i, j, k)
        call system_clock(t_start)
        do concurrent (i = 1:n1, j = 1:n2, k = 1:n3)
            c(i,j,k) = a(i,j,k) + b(i,j,k)
        end do
        call system_clock(t_stop)
        call finish_kernel(2)

        ! 3. Ordinary nested loops; the first (contiguous) index is innermost.
        call system_clock(t_start)
        do k = 1, n3
            do j = 1, n2
                do i = 1, n1
                    c(i,j,k) = a(i,j,k) + b(i,j,k)
                end do
            end do
        end do
        call system_clock(t_stop)
        call finish_kernel(3)

        ! 4. Whole-array assignment. c(:,:,:) keeps the existing allocation.
        call system_clock(t_start)
        c(:,:,:) = a + b
        call system_clock(t_stop)
        call finish_kernel(4)

        ! 5. FORALL, index list (k, j, i). FORALL is obsolescent in Fortran
        !    2018; it is kept only because it is one of the constructs compared.
        call system_clock(t_start)
        forall (k = 1:n3, j = 1:n2, i = 1:n1)
            c(i,j,k) = a(i,j,k) + b(i,j,k)
        end forall
        call system_clock(t_stop)
        call finish_kernel(5)

        ! 6. FORALL, index list (i, j, k)
        call system_clock(t_start)
        forall (i = 1:n1, j = 1:n2, k = 1:n3)
            c(i,j,k) = a(i,j,k) + b(i,j,k)
        end forall
        call system_clock(t_stop)
        call finish_kernel(6)

    end do
    t_all = t_all / real(n_tries)

    write (*, '(A,F0.5)') "DO CONCURRENT 1 : ", t_all(1)
    write (*, '(A,F0.5)') "DO CONCURRENT 2 : ", t_all(2)
    write (*, '(A,F0.5)') "  ORDINARY DO   : ", t_all(3)
    write (*, '(A,F0.5)') "   ARRAY DO     : ", t_all(4)
    write (*, '(A,F0.5)') "   FORALL 1     : ", t_all(5)
    write (*, '(A,F0.5)') "   FORALL 2     : ", t_all(6)

    ! Wait for Enter so a console window stays open. iostat= keeps a
    ! non-interactive run (end-of-file on standard input) from aborting.
    read (*, '(a)', iostat=io_stat) pause_line

    deallocate (a, b, c)

contains

    ! Adds the elapsed time of kernel `id` (from t_start/t_stop) to its running
    ! total, spot-checks the first and last element of c against a + b, then
    ! clears c. Runs outside the timed interval.
    subroutine finish_kernel(id)
        integer, intent(in) :: id
        real, parameter :: tol = 4.0 * epsilon(1.0)

        t_all(id) = t_all(id) + real(t_stop - t_start) / real(clock_rate)

        if (abs(c(1,1,1) - (a(1,1,1) + b(1,1,1))) > tol .or. &
            abs(c(n1,n2,n3) - (a(n1,n2,n3) + b(n1,n2,n3))) > tol) then
            write (error_unit, '(a,i0,a)') 'TEST_DO_SPEED: kernel ', id, ' produced a wrong result'
            error stop 1
        end if

        c = 0.0
    end subroutine finish_kernel

end program test_do_speed

 

0 Kudos
Reply