module transpose_mod implicit none private integer(4), parameter :: INT_SIZE = 8, LOG_SIZE = 4 ! consts integer(INT_SIZE), parameter :: ZERO = 0_INT_SIZE integer(INT_SIZE), parameter :: ONE = 1_INT_SIZE ! block indexes integer(INT_SIZE), parameter :: iStart = 1 integer(INT_SIZE), parameter :: iEnd = 2 integer(INT_SIZE), parameter :: jStart = 3 integer(INT_SIZE), parameter :: jEnd = 4 integer(INT_SIZE), parameter :: iind = 1 integer(INT_SIZE), parameter :: jind = 2 ! types of non-overlapped sub-blocks integer(INT_SIZE), parameter :: top = 1 integer(INT_SIZE), parameter :: left = 2 integer(INT_SIZE), parameter :: right = 3 integer(INT_SIZE), parameter :: bottom = 4 ! block type, bind(C) :: block_t integer(INT_SIZE) :: pos(4) integer(INT_SIZE) :: size(2) end type block_t ! non-overlapped sub-blocks type, bind(C) :: blocks_t type(block_t) :: blocks(4) end type blocks_t interface new_block module procedure new_block_pos, new_block_pos2_size end interface new_block private :: new_block_pos, new_block_pos2_size public :: block_t, new_block, inmemory_transpose contains type(block_t) function new_block_pos(pos) result(block_n) integer(INT_SIZE), intent(in) :: pos(4) block_n%pos = pos block_n%size(iind) = block_n%pos(iEnd) - block_n%pos(iStart) + ONE block_n%size(jind) = block_n%pos(jEnd) - block_n%pos(jStart) + ONE end function new_block_pos type(block_t) function new_block_pos2_size(pos2, size) result(block_n) integer(INT_SIZE), intent(in) :: pos2(2), size block_n%pos(iStart) = pos2(1) block_n%pos(jStart) = pos2(2) block_n%size = size block_n%pos(iEnd) = block_n%pos(iStart) + block_n%size(iind) - ONE block_n%pos(jEnd) = block_n%pos(jStart) + block_n%size(jind) - ONE end function new_block_pos2_size type(block_t) function overlap(block_to, block_from) type(block_t), intent(in) :: block_to, block_from overlap%pos(iStart) = max(block_to%pos(iStart), block_from%pos(iStart)) overlap%pos(iEnd) = min(block_to%pos(iEnd), block_from%pos(iEnd)) overlap%pos(jStart) = max(block_to%pos(jStart), block_from%pos(jStart)) overlap%pos(jEnd) = min(block_to%pos(jEnd), block_from%pos(jEnd)) overlap%size(iind) = overlap%pos(iEnd) - overlap%pos(iStart) + ONE overlap%size(jind) = overlap%pos(jEnd) - overlap%pos(jStart) + ONE end function overlap logical(LOG_SIZE) function is_transposable(block_to, block_from) type(block_t), intent(in) :: block_to, block_from is_transposable = .true. if (block_to%size(iind) /= block_from%size(jind) .or. block_to%size(jind) /= block_from%size(iind)) & is_transposable = .false. end function is_transposable logical(LOG_SIZE) function is_same(block_to, block_from) type(block_t), intent(in) :: block_to, block_from is_same = .false. if (all(block_to%pos == block_from%pos)) is_same = .true. end function is_same type(blocks_t) function generate_blocks(block_to, block_overlap) result(blocks) type(block_t), intent(in) :: block_to, block_overlap blocks%blocks = block_to if (all(block_overlap%size /= ZERO)) then blocks%blocks(top)%pos(jEnd) = min(block_to%pos(jEnd), block_overlap%pos(jStart) - ONE) blocks%blocks(left)%pos(iEnd) = min(block_to%pos(iEnd), block_overlap%pos(iStart) - ONE) blocks%blocks(right)%pos(iStart) = max(block_to%pos(iStart), block_overlap%pos(iEnd) + ONE) blocks%blocks(bottom)%pos(jStart) = max(block_to%pos(jStart), block_overlap%pos(jEnd) + ONE) blocks%blocks(top)%size(jind) = max(blocks%blocks(top)%pos(jEnd) - blocks%blocks(top)%pos(jStart) + ONE, ZERO) blocks%blocks(left)%size(iind) = max(blocks%blocks(left)%pos(iEnd) - blocks%blocks(left)%pos(iStart) + ONE, ZERO) blocks%blocks(right)%size(iind) = max(blocks%blocks(right)%pos(iEnd) - blocks%blocks(right)%pos(iStart) + ONE, ZERO) blocks%blocks(bottom)%size(jind) = max(blocks%blocks(bottom)%pos(jEnd) - blocks%blocks(bottom)%pos(jStart) + ONE, ZERO) end if end function generate_blocks type(block_t) function generate_subblock(block_to, block_from, typeid) result(subblock) type(block_t), intent(in) :: block_to, block_from integer(INT_SIZE), intent(in) :: typeid subblock = block_from subblock%size(1:2) = block_to%size(2:1:-1) select case (typeid) case (top,left) subblock%pos(iEnd) = subblock%pos(iStart) + subblock%size(iind) - ONE subblock%pos(jEnd) = subblock%pos(jStart) + subblock%size(jind) - ONE case (right,bottom) subblock%pos(iStart) = subblock%pos(iEnd) - subblock%size(iind) + ONE subblock%pos(jStart) = subblock%pos(jEnd) - subblock%size(jind) + ONE end select end function generate_subblock subroutine exclude_block(block_io, subblock) type(block_t), intent(inout) :: block_io type(block_t), intent(in) :: subblock if (block_io%pos(iStart) == subblock%pos(iStart) .and. block_io%pos(jStart) == subblock%pos(jStart)) then if (block_io%size(iind) == subblock%size(iind)) then block_io%pos(jStart) = block_io%pos(jStart) + subblock%size(jind) block_io%size(jind) = block_io%size(jind) - subblock%size(jind) else block_io%pos(iStart) = block_io%pos(iStart) + subblock%size(iind) block_io%size(iind) = block_io%size(iind) - subblock%size(iind) end if else if (block_io%size(iind) == subblock%size(iind)) then block_io%size(jind) = block_io%size(jind) - subblock%size(jind) block_io%pos(jEnd) = block_io%pos(jStart) + block_io%size(jind) - ONE else block_io%size(iind) = block_io%size(iind) - subblock%size(iind) block_io%pos(iEnd) = block_io%pos(iStart) + block_io%size(iind) - ONE end if end if end subroutine exclude_block subroutine transpose_block(S, block_fromto) real(8), intent(inout) :: S(:,:) type(block_t), intent(in) :: block_fromto integer(INT_SIZE) :: i, j, ishift, jshift real(8) :: tmp ishift = block_fromto%pos(iStart) jshift = block_fromto%pos(jStart) do concurrent (i = 0:block_fromto%size(iind)-ONE, & j = 0:block_fromto%size(iind)-ONE, & j < i) tmp = S(i+ishift,j+jshift) S(i+ishift,j+jshift) = S(j+ishift,i+jshift) S(j+ishift,i+jshift) = tmp end do end subroutine transpose_block subroutine copy_block(S, block_to, block_from) real(8), intent(inout) :: S(:,:) type(block_t), intent(in) :: block_to, block_from integer(INT_SIZE) :: i, j do concurrent (i = 0:block_to%size(iind) - ONE, & j = 0:block_to%size(jind) - ONE) S(i+block_to%pos(iStart),j+block_to%pos(jStart)) = S(j+block_from%pos(iStart),i+block_from%pos(jStart)) end do end subroutine copy_block subroutine inmemory_transpose(S, block_to, block_from) real(8), intent(inout) :: S(:,:) type(block_t), intent(in) :: block_to, block_from type(block_t) :: block_left_to, block_left_from, subblock, block_overlap type(blocks_t) :: blocks integer(INT_SIZE) :: i if (.not.is_transposable(block_to, block_from)) & error stop "Cannot be transposed: wrong shape!" block_left_to = block_to block_left_from = block_from do while(.true.) if (is_same(block_left_to, block_left_from)) then call transpose_block(S, block_left_to) return end if block_overlap = overlap(block_left_to, block_left_from) if (any(block_overlap%size == ZERO)) then call copy_block(S, block_left_to, block_left_from) return end if blocks = generate_blocks(block_left_to, block_left_from) do i = 1, size(blocks%blocks) associate(block_work => blocks%blocks(i)) if (all(block_work%size /= 0)) then subblock = generate_subblock(block_work, block_left_from, i) call copy_block(S, block_work, subblock) call exclude_block(block_left_to, block_work) call exclude_block(block_left_from, subblock) exit end if end associate end do end do end subroutine inmemory_transpose end module transpose_mod