Intel® Fortran Compiler
Build applications that can scale for the future with optimized code designed for Intel® Xeon® and compatible processors.
Announcements
Welcome to the Intel Community. If you get an answer you like, please mark it as an Accepted Solution to help others. Thank you!
26755 Discussions

do-concurrent is slower than ordinary-do and forall.

mehdi__chinoune
Beginner
129 Views

The do-concurrent construct is supposed to replace forall, but in this test it takes double the time to complete the same job when the indices are well ordered, and it is very slow when they are not:

! Benchmark: times the element-wise addition C = A + B written six ways
! (DO CONCURRENT in two index orders, nested DO, whole-array assignment,
! FORALL in two index orders) and prints the elapsed seconds of each.
PROGRAM TEST_DO_SPEED
  USE, INTRINSIC :: ISO_FORTRAN_ENV, ONLY: INT64
  IMPLICIT NONE

  REAL, ALLOCATABLE :: A(:,:,:), B(:,:,:), C(:,:,:)
  REAL :: TIC     ! clock ticks per second, used as the real divisor
  REAL :: SINK    ! sum of sampled results, printed so C is actually consumed
  ! 64-bit clock counts: on most compilers these tick finer and wrap much
  ! later than default-integer counts (the exact rate is processor dependent).
  INTEGER(INT64) :: RATE
  INTEGER(INT64) :: T0, T1, T2, T3, T4, T5, T6
  INTEGER :: I, J, K
  INTEGER :: IERR
  ! 1024*512*512 = 268,435,456 elements in each of the three arrays.
  INTEGER, PARAMETER :: L = 1024, M = 512, N = 512

  ! Fail with a clear message instead of an opaque runtime abort when the
  ! machine cannot provide the memory.
  ALLOCATE( A(L,M,N), B(L,M,N), C(L,M,N), STAT=IERR )
  IF (IERR /= 0) ERROR STOP "TEST_DO_SPEED: could not allocate A, B and C"

  CALL RANDOM_NUMBER(A)
  CALL RANDOM_NUMBER(B)
  ! C is written once before timing starts, so the first timed section is
  ! not also the first code to touch C.
  CALL RANDOM_NUMBER(C)

  SINK = 0.0

  CALL SYSTEM_CLOCK( T0, RATE)
  ! SYSTEM_CLOCK reports a rate of zero when no clock is available; stop
  ! rather than divide by zero below.
  IF (RATE <= 0) ERROR STOP "TEST_DO_SPEED: no usable system clock"
  TIC = REAL(RATE)

  ! Section 1: DO CONCURRENT, first array index listed last.
  DO CONCURRENT( K=1:N, J=1:M, I=1:L)
    C(I,J,K) = A(I,J,K) +B(I,J,K)
  END DO

  CALL SYSTEM_CLOCK(T1)
  ! Sample one result after every section. The single read costs nothing
  ! measurable but gives each section an observable effect, which
  ! discourages the optimiser from discarding the timed work.
  SINK = SINK + C(L,M,N)

  ! Section 2: DO CONCURRENT, first array index listed first.
  DO CONCURRENT( I=1:L, J=1:M, K=1:N)
    C(I,J,K) = A(I,J,K) +B(I,J,K)
  END DO

  CALL SYSTEM_CLOCK(T2)
  SINK = SINK + C(L,M,N)

  ! Section 3: ordinary nested DO, first array index innermost.
  DO K=1,N
    DO J=1,M
      DO I=1,L
        C(I,J,K) = A(I,J,K) +B(I,J,K)
      END DO
    END DO
  END DO

  CALL SYSTEM_CLOCK(T3)
  SINK = SINK + C(L,M,N)

  ! Section 4: whole-array assignment.
  C = A + B

  CALL SYSTEM_CLOCK(T4)
  SINK = SINK + C(L,M,N)

  ! Section 5: FORALL, first array index listed last.
  FORALL(K=1:N, J=1:M, I=1:L)
    C(I,J,K) = A(I,J,K) + B(I,J,K)
  END FORALL

  CALL SYSTEM_CLOCK(T5)
  SINK = SINK + C(L,M,N)

  ! Section 6: FORALL, first array index listed first.
  FORALL(I=1:L, J=1:M, K=1:N)
    C(I,J,K) = A(I,J,K) + B(I,J,K)
  END FORALL

  CALL SYSTEM_CLOCK(T6)
  SINK = SINK + C(L,M,N)

  ! Elapsed seconds per section = tick difference / ticks per second.
  PRINT*,"DO CONCURRENT 1 : ",REAL(T1-T0)/TIC
  PRINT*,"DO CONCURRENT 2 : ",REAL(T2-T1)/TIC
  PRINT*,"  ORDINARY DO   : ",REAL(T3-T2)/TIC
  PRINT*,"   ARRAY DO     : ",REAL(T4-T3)/TIC
  PRINT*,"   FORALL 1     : ",REAL(T5-T4)/TIC
  PRINT*,"   FORALL 2     : ",REAL(T6-T5)/TIC
  ! Printing the sampled sum is what makes the results "used".
  PRINT*,"   CHECKSUM     : ",SINK

END PROGRAM

It gives:

 DO CONCURRENT 1 :   0.3280000
 DO CONCURRENT 2 :    2.250000
   ORDINARY DO   :    2.891000
    ARRAY DO     :   0.2810000
    FORALL 1     :   0.2810000
    FORALL 2     :   0.2820000

Edit: Sorry, I was wrong. After adding the line 'CALL RANDOM_NUMBER(C)', the timings changed.

0 Kudos
2 Replies
andrew_4619
Honored Contributor I
129 Views

Read https://software.intel.com/en-us/forums/intel-visual-fortran-compiler-for-windows/topic/499840

 

andrew_4619
Honored Contributor I
129 Views

For fun I did a variation of your code with 6 'runs' and took the average time. They are all the same except "Do concurrent 2", where the optimiser clearly must make some poor choices. Having said that, with such a simple test the optimiser might make some decisions to "simplify" the code in some of the cases anyway.

DO CONCURRENT 1 : .08117
DO CONCURRENT 2 : 1.87650
  ORDINARY DO   : .07917
   ARRAY DO     : .08033
   FORALL 1     : .08033
   FORALL 2     : .08067

!  do_conc_test.f90
!
!  Benchmark: times the element-wise addition C = A + B written six ways
!  (DO CONCURRENT in two index orders, nested DO, whole-array assignment,
!  FORALL in two index orders), repeats the whole set N_tries times and
!  prints the average elapsed seconds of each variant.
PROGRAM TEST_DO_SPEED
    use, intrinsic :: iso_fortran_env, only: int64
    IMPLICIT NONE

    REAL, ALLOCATABLE :: A(:,:,:), B(:,:,:), C(:,:,:)
    REAL :: TIC          ! clock ticks per second, used as the real divisor
    REAL :: t_all(6)     ! accumulated, then averaged, seconds per variant
    REAL :: sink         ! sum of sampled results, printed so C is consumed
    ! 64-bit clock counts: on most compilers these tick finer and wrap much
    ! later than default-integer counts (the exact rate is processor dependent).
    integer(int64) :: t(0:6), rate
    INTEGER :: I, J, K, NT
    integer :: ierr, ios
    !INTEGER, PARAMETER :: L = 1024, M = 512, N = 512
    INTEGER, PARAMETER :: L = 1024, M = 256, N = 256
    integer, parameter :: N_tries = 6

    ! Fail with a clear message instead of an opaque runtime abort when the
    ! machine cannot provide the memory.
    ALLOCATE( A(L,M,N), B(L,M,N), C(L,M,N), stat=ierr )
    if (ierr /= 0) error stop "TEST_DO_SPEED: could not allocate A, B and C"

    CALL RANDOM_NUMBER(A)
    CALL RANDOM_NUMBER(B)
    ! C is written once before timing starts, so the first timed section is
    ! not also the first code to touch C.
    CALL RANDOM_NUMBER(C)

    ! The tick rate does not change between repeats, so query it once.
    ! SYSTEM_CLOCK reports a rate of zero when no clock is available; stop
    ! rather than divide by zero below.
    call system_clock(count_rate=rate)
    if (rate <= 0) error stop "TEST_DO_SPEED: no usable system clock"
    TIC = real(rate)

    t_all = 0.0
    sink  = 0.0
    do NT = 1 , N_tries
        CALL SYSTEM_CLOCK(T(0))

        ! Section 1: DO CONCURRENT, first array index listed last.
        DO CONCURRENT( K=1:N, J=1:M, I=1:L)
            C(I,J,K) = A(I,J,K) +B(I,J,K)
        END DO

        CALL SYSTEM_CLOCK(T(1))
        ! Sample one result after every section. The single read costs
        ! nothing measurable but gives each section an observable effect,
        ! which discourages the optimiser from discarding the timed work.
        sink = sink + C(L,M,N)

        ! Section 2: DO CONCURRENT, first array index listed first.
        DO CONCURRENT( I=1:L, J=1:M, K=1:N)
            C(I,J,K) = A(I,J,K) +B(I,J,K)
        END DO

        CALL SYSTEM_CLOCK(T(2))
        sink = sink + C(L,M,N)

        ! Section 3: ordinary nested DO, first array index innermost.
        DO K=1,N
            DO J=1,M
                DO I=1,L
                    C(I,J,K) = A(I,J,K) +B(I,J,K)
                END DO
            END DO
        END DO

        CALL SYSTEM_CLOCK(T(3))
        sink = sink + C(L,M,N)

        ! Section 4: whole-array assignment.
        C = A + B

        CALL SYSTEM_CLOCK(T(4))
        sink = sink + C(L,M,N)

        ! Section 5: FORALL, first array index listed last.
        FORALL(K=1:N, J=1:M, I=1:L)
            C(I,J,K) = A(I,J,K) + B(I,J,K)
        END FORALL

        CALL SYSTEM_CLOCK(T(5))
        sink = sink + C(L,M,N)

        ! Section 6: FORALL, first array index listed first.
        FORALL(I=1:L, J=1:M, K=1:N)
            C(I,J,K) = A(I,J,K) + B(I,J,K)
        END FORALL

        CALL SYSTEM_CLOCK(T(6))
        sink = sink + C(L,M,N)

        ! Section i ran between timestamps i-1 and i; add its seconds to
        ! the running total for that variant.
        t_all = t_all + real(t(1:6) - t(0:5)) / TIC
    enddo
    t_all = t_all / N_tries

    write(*,'(A,F0.5)') "DO CONCURRENT 1 : ", t_all(1)
    write(*,'(A,F0.5)') "DO CONCURRENT 2 : ", t_all(2)
    write(*,'(A,F0.5)') "  ORDINARY DO   : ", t_all(3)
    write(*,'(A,F0.5)') "   ARRAY DO     : ", t_all(4)
    write(*,'(A,F0.5)') "   FORALL 1     : ", t_all(5)
    write(*,'(A,F0.5)') "   FORALL 2     : ", t_all(6)
    ! Printing the sampled sum is what makes the results "used".
    write(*,'(A,G0)')   "   CHECKSUM     : ", sink

    ! Wait for input before exiting (presumably to keep a console window
    ! open). iostat= turns end-of-file on a closed or redirected stdin into
    ! a normal exit instead of a runtime error.
    read(*,*, iostat=ios) i
END PROGRAM

 

Reply