integer*4, parameter :: l = 1000000 integer*4, parameter :: m = 100 integer*4, parameter :: n = 10 ! real*8 a(l,m), at(m,l), b(m,n), c(l,n) real*8 te(0:5), t common /aaaa/ a, b, c equivalence (a,at) integer*4 i ! call cpu_time (te(0)) a = 1 b = 1 call cpu_time (t) te(0) = t-te(0) write (*,*) te(0),' initialise variables' call matmul_test (a,b,c, l,m,n, te(1:4)) call matmul_tran (at,b,c, l,m,n, te(5:5)) ! do i = 0,5 write (*,*) i,te(i) ! /cpu end do ! end subroutine matmul_test (A,B,C, l,m,n, te) ! integer*4 l,m,n, i,j,k real*8 A(l,m), B(m,n), C(l,n), row(m), s real*8 te(4), t ! A(1000000,100), 800 mb ! B(100,10), 8 kb ! C(1000000,10) 80 mb ! ! Conventional Matrix Multiplication, using accumulator "s" call cpu_time (te(1)) do i = 1,l do j = 1,n s = 0 do k = 1,m s = s + A(i,k) * B(k,j) end do C(i,j) = s end do end do call cpu_time (t) te(1) = t - te(1) write (*,*) te(1),' conventional loops' ! ! If s is not used - more general ( loop order swap for addressing A sequentially ) ! ( no need to transpose B ) call cpu_time (te(2)) C = 0 do k = 1,m do j = 1,n ! do i = 1,l ! C(i,j) = C(i,j) + A(i,k) * B(k,j) ! end do C(:,j) = C(:,j) + A(:,k) * B(k,j) end do end do call cpu_time (t) te(2) = t - te(2) write (*,*) te(2), ' vector addition' ! ! If row of A is used - sequential dot product, suits vector instruction ! call cpu_time (te(3)) do i = 1,l row(1:m) = A(i,1:m) do j = 1,n ! do k = 1,m ! s = s + A(i,k) * B(k,j) ! end do C(i,j) = Dot_Product (row, B(:,j)) end do end do call cpu_time (t) te(3) = t - te(3) write (*,*) te(3), ' row dot_product' ! ! If B is transposed - vector addition ! call cpu_time (te(4)) C = 0 do k = 1,m do j = 1,n ! do i = 1,l C(:,j) = C(:,j) + A(:,k) * B(k,j) ! * B(j,k) ! end do end do end do call cpu_time (t) te(4) = t - te(4) write (*,*) te(4), ' vector addition' ! end subroutine matmul_tran (A,B,C, l,m,n, te) ! integer*4 l,m,n, i,j real*8 A(m,l), B(m,n), C(l,n) real*8 te(1), t ! A(1000000,100), 800 mb ! B(100,10), 8 kb ! C(1000000,10) 80 mb ! ! If A is transposed - sequential dot product, suits vector instruction ! call cpu_time (te(1)) C = 0 do i = 1,l do j = 1,n ! do k = 1,m ! C(i,j) = C(i,j) + A(k,i) * B(k,j) ! end do C(i,j) = Dot_Product (A(:,i), B(:,j)) end do end do call cpu_time (t) te(1) = t - te(1) write (*,*) te(1), ' transpose dot_product' ! end