- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
The DO CONCURRENT construct is supposed to replace FORALL, but in this test it takes double the time to complete the same job when the indices are well ordered, and is very slow when they are not:
program test_do_speed
  ! Benchmark: times six equivalent ways of computing C = A + B element by
  ! element on rank-3 default-real arrays of shape (L, M, N):
  !   1. DO CONCURRENT with index list K, J, I
  !   2. DO CONCURRENT with index list I, J, K
  !   3. ordinary nested DO loops (K outermost, I innermost)
  !   4. whole-array assignment
  !   5. FORALL with index list K, J, I
  !   6. FORALL with index list I, J, K
  ! The elapsed wall-clock time of each variant is printed in seconds,
  ! followed by a checksum of the results.
  use, intrinsic :: iso_fortran_env, only: int64
  implicit none

  integer, parameter :: l = 1024, m = 512, n = 512

  real, allocatable :: a(:,:,:), b(:,:,:), c(:,:,:)
  ! 64-bit clock counts: with default-kind integers many compilers report a
  ! much coarser tick, which limits the resolution of the measured times.
  integer(int64) :: t(0:6), rate
  real :: tic        ! clock ticks per second, as a real for the divisions
  real :: checksum   ! consumes the results so the kernels are not dead code
  integer :: i, j, k, ierr

  allocate(a(l,m,n), b(l,m,n), c(l,m,n), stat=ierr)
  if (ierr /= 0) error stop "TEST_DO_SPEED: allocation of A, B, C failed"

  call random_number(a)
  call random_number(b)
  ! C is filled before timing starts so that the first timed kernel is not
  ! the first statement to write to C.
  call random_number(c)

  call system_clock(count=t(0), count_rate=rate)
  ! A count rate of zero means the processor has no clock.
  if (rate <= 0_int64) error stop "TEST_DO_SPEED: no system clock available"
  tic = real(rate)
  checksum = 0.0

  ! After each kernel one element of C is folded into the checksum. C is
  ! otherwise never read, which would allow an optimiser to drop the stores
  ! being timed; a single scalar read adds negligible time.

  ! 1. DO CONCURRENT, index list K, J, I
  do concurrent (k = 1:n, j = 1:m, i = 1:l)
    c(i,j,k) = a(i,j,k) + b(i,j,k)
  end do
  checksum = checksum + c(l,m,n)
  call system_clock(t(1))

  ! 2. DO CONCURRENT, index list I, J, K
  do concurrent (i = 1:l, j = 1:m, k = 1:n)
    c(i,j,k) = a(i,j,k) + b(i,j,k)
  end do
  checksum = checksum + c(l,m,n)
  call system_clock(t(2))

  ! 3. Ordinary nested DO loops; the innermost loop runs over the first
  !    (fastest-varying, column-major) index.
  do k = 1, n
    do j = 1, m
      do i = 1, l
        c(i,j,k) = a(i,j,k) + b(i,j,k)
      end do
    end do
  end do
  checksum = checksum + c(l,m,n)
  call system_clock(t(3))

  ! 4. Whole-array assignment
  c = a + b
  checksum = checksum + c(l,m,n)
  call system_clock(t(4))

  ! 5. FORALL, index list K, J, I (FORALL is obsolescent; it is kept here
  !    because comparing against it is the purpose of the benchmark).
  forall (k = 1:n, j = 1:m, i = 1:l)
    c(i,j,k) = a(i,j,k) + b(i,j,k)
  end forall
  checksum = checksum + c(l,m,n)
  call system_clock(t(5))

  ! 6. FORALL, index list I, J, K
  forall (i = 1:l, j = 1:m, k = 1:n)
    c(i,j,k) = a(i,j,k) + b(i,j,k)
  end forall
  checksum = checksum + c(l,m,n)
  call system_clock(t(6))

  ! Elapsed seconds = difference in clock counts / counts per second.
  print *, "DO CONCURRENT 1 : ", real(t(1) - t(0)) / tic
  print *, "DO CONCURRENT 2 : ", real(t(2) - t(1)) / tic
  print *, " ORDINARY DO : ", real(t(3) - t(2)) / tic
  print *, " ARRAY DO : ", real(t(4) - t(3)) / tic
  print *, " FORALL 1 : ", real(t(5) - t(4)) / tic
  print *, " FORALL 2 : ", real(t(6) - t(5)) / tic
  print *, " CHECKSUM : ", checksum
end program test_do_speed
It gives:
DO CONCURRENT 1 : 0.3280000
DO CONCURRENT 2 : 2.250000
ORDINARY DO : 2.891000
ARRAY DO : 0.2810000
FORALL 1 : 0.2810000
FORALL 2 : 0.2820000
Edit: Sorry, I was wrong. After adding the line 'CALL RANDOM_NUMBER(C)', the timings changed.
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Read https://software.intel.com/en-us/forums/intel-visual-fortran-compiler-for-windows/topic/499840
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
For fun I did a variation of your code with 6 'runs' and took the average time. They are all the same except "Do concurrent 2", where the optimiser clearly must make some poor choices. Having said that, with such a simple test the optimiser might make some decisions to "simplify" the code in some of the cases anyway.
DO CONCURRENT 1 : .08117
DO CONCURRENT 2 : 1.87650
ORDINARY DO : .07917
ARRAY DO : .08033
FORALL 1 : .08033
FORALL 2 : .08067
! do_conc_test.f90
!
program test_do_speed
  ! Benchmark: times six equivalent ways of computing C = A + B element by
  ! element on rank-3 default-real arrays of shape (L, M, N), repeats the
  ! whole sequence N_TRIES times, and prints the mean elapsed wall-clock
  ! time of each variant in seconds:
  !   1. DO CONCURRENT with index list K, J, I
  !   2. DO CONCURRENT with index list I, J, K
  !   3. ordinary nested DO loops (K outermost, I innermost)
  !   4. whole-array assignment
  !   5. FORALL with index list K, J, I
  !   6. FORALL with index list I, J, K
  ! A checksum of the results is printed after the timings, and the program
  ! then waits for a value on standard input before exiting.
  use, intrinsic :: iso_fortran_env, only: int64
  implicit none

  !integer, parameter :: l = 1024, m = 512, n = 512
  integer, parameter :: l = 1024, m = 256, n = 256
  integer, parameter :: n_tries = 6

  real, allocatable :: a(:,:,:), b(:,:,:), c(:,:,:)
  real :: tic        ! clock ticks per second, as a real for the divisions
  real :: t_all(6)   ! accumulated, then mean, seconds per variant
  real :: checksum   ! consumes the results so the kernels are not dead code
  ! 64-bit clock counts: with default-kind integers many compilers report a
  ! much coarser tick, which limits the resolution of the measured times.
  integer(int64) :: t(0:6), rate
  integer :: i, j, k, nt, ierr, ios

  allocate(a(l,m,n), b(l,m,n), c(l,m,n), stat=ierr)
  if (ierr /= 0) error stop "TEST_DO_SPEED: allocation of A, B, C failed"

  call random_number(a)
  call random_number(b)
  ! C is filled before timing starts so that the first timed kernel is not
  ! the first statement to write to C.
  call random_number(c)

  ! The clock rate is queried once, outside the repetition loop.
  call system_clock(count_rate=rate)
  ! A count rate of zero means the processor has no clock.
  if (rate <= 0_int64) error stop "TEST_DO_SPEED: no system clock available"
  tic = real(rate)

  t_all = 0.0
  checksum = 0.0

  do nt = 1, n_tries
    call system_clock(t(0))

    ! After each kernel one element of C is folded into the checksum. C is
    ! otherwise never read, which would allow an optimiser to drop the
    ! stores being timed; a single scalar read adds negligible time.

    ! 1. DO CONCURRENT, index list K, J, I
    do concurrent (k = 1:n, j = 1:m, i = 1:l)
      c(i,j,k) = a(i,j,k) + b(i,j,k)
    end do
    checksum = checksum + c(l,m,n)
    call system_clock(t(1))

    ! 2. DO CONCURRENT, index list I, J, K
    do concurrent (i = 1:l, j = 1:m, k = 1:n)
      c(i,j,k) = a(i,j,k) + b(i,j,k)
    end do
    checksum = checksum + c(l,m,n)
    call system_clock(t(2))

    ! 3. Ordinary nested DO loops; the innermost loop runs over the first
    !    (fastest-varying, column-major) index.
    do k = 1, n
      do j = 1, m
        do i = 1, l
          c(i,j,k) = a(i,j,k) + b(i,j,k)
        end do
      end do
    end do
    checksum = checksum + c(l,m,n)
    call system_clock(t(3))

    ! 4. Whole-array assignment
    c = a + b
    checksum = checksum + c(l,m,n)
    call system_clock(t(4))

    ! 5. FORALL, index list K, J, I (FORALL is obsolescent; it is kept
    !    here because comparing against it is the purpose of the benchmark).
    forall (k = 1:n, j = 1:m, i = 1:l)
      c(i,j,k) = a(i,j,k) + b(i,j,k)
    end forall
    checksum = checksum + c(l,m,n)
    call system_clock(t(5))

    ! 6. FORALL, index list I, J, K
    forall (i = 1:l, j = 1:m, k = 1:n)
      c(i,j,k) = a(i,j,k) + b(i,j,k)
    end forall
    checksum = checksum + c(l,m,n)
    call system_clock(t(6))

    ! Add this repetition's elapsed seconds for each variant:
    ! variant i ran between timestamps t(i-1) and t(i).
    do i = 1, size(t_all)
      t_all(i) = t_all(i) + real(t(i) - t(i-1)) / tic
    end do
  end do

  ! Convert the accumulated totals into means over the repetitions.
  t_all = t_all / n_tries

  write(*,'(A,F0.5)') "DO CONCURRENT 1 : ", t_all(1)
  write(*,'(A,F0.5)') "DO CONCURRENT 2 : ", t_all(2)
  write(*,'(A,F0.5)') " ORDINARY DO : ", t_all(3)
  write(*,'(A,F0.5)') " ARRAY DO : ", t_all(4)
  write(*,'(A,F0.5)') " FORALL 1 : ", t_all(5)
  write(*,'(A,F0.5)') " FORALL 2 : ", t_all(6)
  write(*,'(A,ES12.5)') " CHECKSUM : ", checksum

  ! Wait for input before exiting (presumably to keep a console window
  ! open). IOSTAT makes end-of-file or bad input on standard input a
  ! normal exit instead of a runtime error.
  read(*,*, iostat=ios) i
end program test_do_speed
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page