The following code fails in the division when the loop is vectorized; it works fine if vectorization is disabled.
! Minimal reproducer: take the reciprocal of every element in row 1 of a
! 4-by-n array after asking the runtime to act on floating-point exceptions.
! Row 1 holds only ones and rows 2-4 hold only zeros, so the source never
! divides by zero and the printed maximum should be 1.
program div_test
integer, parameter :: n = 180
! Fourth argument of the ieee_flags call below.
! NOTE(review): presumably receives a status string from it - confirm.
character*16 :: out
! Only row 1 is read and written by the loop; Fortran stores arrays
! column-major, so consecutive upp(1,i) are 4 elements apart in memory.
real upp(4,n)
! i first holds the return value of ieee_flags, then serves as the loop index.
integer i
! Declares the result type of the external function ieee_flags. It is not
! defined in this program; it is a non-standard, vendor-supplied routine.
integer :: ieee_flags
upp = 0
upp(1,:) = 1
! NOTE(review): presumably enables trapping (or flagging) of all
! floating-point exceptions, which is what exposes the failure in the
! division - confirm against the compiler's ieee_flags documentation.
i = ieee_flags('set', 'exception', 'all', out)
! The doubled '!' makes the next line an ordinary comment, i.e. the directive
! is currently inactive. NOTE(review): presumably reducing it to a single '!'
! turns vectorization of the loop off, which the report says avoids the
! failure - confirm.
!!DIR$ NOVECTOR
do i = 1 ,n
! Divisor is always 1 here (row 1 was set to 1 above).
upp(1,i) = 1. / upp(1,i)
enddo
! Ones in row 1 and zeros elsewhere: a correct run prints 1.
print *, maxval(upp)
end
Looking at the assembly it is obvious why:
# Excerpt of the vectorized loop body; 8 elements are handled per pass
# (see the addl $8 below). Operands are in AT&T order: source, destination.
# NOTE(review): xmm0 is set outside this excerpt; presumably it holds the
# packed 1.0 numerator - confirm.
#
# Scalar loads: each movss fetches one single-precision value at a 16-byte
# stride and, for the load-from-memory form, clears the upper three lanes of
# the destination register. The movaps copies give each divps its own copy of
# xmm0 to overwrite.
movss 16+div_test_$UPP.0.1(%rax), %xmm1 #20.15
movaps %xmm0, %xmm3 #20.15
movss div_test_$UPP.0.1(%rax), %xmm2 #20.15
movaps %xmm0, %xmm6 #20.15
movss 48+div_test_$UPP.0.1(%rax), %xmm4 #20.15
movaps %xmm0, %xmm9 #20.15
movss 32+div_test_$UPP.0.1(%rax), %xmm5 #20.15
movaps %xmm0, %xmm12 #20.15
movss 80+div_test_$UPP.0.1(%rax), %xmm7 #20.15
addl $8, %edx #19.12
movss 64+div_test_$UPP.0.1(%rax), %xmm8 #20.15
movss 112+div_test_$UPP.0.1(%rax), %xmm10 #20.15
movss 96+div_test_$UPP.0.1(%rax), %xmm11 #20.15
# One level of interleaving only: each destination ends up with two loaded
# values in lanes 0-1 and the zeros left by movss in lanes 2-3.
unpcklps %xmm1, %xmm2 #20.15
unpcklps %xmm4, %xmm5 #20.15
unpcklps %xmm7, %xmm8 #20.15
unpcklps %xmm10, %xmm11 #20.15
# Packed divide, destination = destination / source, on all four lanes: the
# divisor registers still carry zeros in lanes 2-3 at this point.
divps %xmm2, %xmm3 #20.15
divps %xmm5, %xmm6 #20.15
divps %xmm8, %xmm9 #20.15
divps %xmm11, %xmm12 #20.15
The xmm* registers carry some zeroes into the division. It is worth noting that the previous compiler version (14.0.1) seems to behave in a more sensible way, adding two more unpcklps instructions that move the zeroes out of the registers:
# Excerpt of the vectorized loop body from the older compiler; 8 elements are
# handled per pass (see the addl $8 below). Operands are in AT&T order:
# source, destination.
#
# Scalar loads: each movss fetches one single-precision value at a 16-byte
# stride and clears the upper three lanes of the destination register.
movss 48+div_test_$UPP.0.1(%rax), %xmm0 #20.31
addl $8, %edx #19.12
movss 16+div_test_$UPP.0.1(%rax), %xmm2 #20.31
movss 32+div_test_$UPP.0.1(%rax), %xmm1 #20.31
movss div_test_$UPP.0.1(%rax), %xmm3 #20.31
movss 64+div_test_$UPP.0.1(%rax), %xmm8 #20.31
movss 112+div_test_$UPP.0.1(%rax), %xmm5 #20.31
movss 80+div_test_$UPP.0.1(%rax), %xmm7 #20.31
movss 96+div_test_$UPP.0.1(%rax), %xmm6 #20.31
# First level of interleaving: pairs of loaded values land in lanes 0-1,
# with lanes 2-3 still zero.
unpcklps %xmm0, %xmm2 #20.31
unpcklps %xmm1, %xmm3 #20.31
unpcklps %xmm5, %xmm7 #20.31
unpcklps %xmm6, %xmm8 #20.31
# Second level of interleaving: merges two pairs, so xmm3 holds the values
# from offsets 0, 16, 32 and 48 in lanes 0-3 and no zero lanes remain.
unpcklps %xmm2, %xmm3 #20.31
# NOTE(review): .L_2il0floatpacket.0 is not shown in this excerpt; presumably
# it is the packed 1.0 numerator - confirm.
movaps .L_2il0floatpacket.0(%rip), %xmm4 #20.15
movaps .L_2il0floatpacket.0(%rip), %xmm9 #20.15
# Same second-level merge for the values from offsets 64, 80, 96 and 112.
unpcklps %xmm7, %xmm8 #20.31
# Packed divide, destination = destination / source: all four divisor lanes
# now hold values loaded from the array.
divps %xmm3, %xmm4 #20.15
divps %xmm8, %xmm9 #20.15
連結已複製
0 回應