FPE in vectorized division using ifort 19.0.3

gn164 · ‎05-30-2019

The following code fails with a floating-point exception (FPE) in the division; it works fine if vectorization is disabled.

        program div_test
           ! Minimal reproducer: takes the reciprocal of row 1 of a 4-by-n
           ! array in a loop. Every divisor the loop reads is 1.0, so the
           ! source itself never divides by zero.


           ! Number of columns in upp, and the trip count of the loop below.
           integer, parameter :: n = 180
           ! Passed as the fourth argument of ieee_flags; never read afterwards.
           character*16 :: out
           ! Only upp(1,:) is set non-zero; rows 2-4 stay at 0 for the whole run.
           real upp(4,n)
           ! Holds the ieee_flags result first, then is reused as the loop index.
           integer i
           ! Result type of the external function ieee_flags.
           ! NOTE(review): vendor portability routine, not a standard
           ! intrinsic -- confirm it is available with the compiler in use.
           integer :: ieee_flags

           upp = 0
           upp(1,:) = 1

           ! NOTE(review): presumably turns on trapping for all floating-point
           ! exceptions so that a bad division aborts the run -- confirm
           ! against the compiler's ieee_flags documentation. The returned
           ! status is not checked; i is overwritten by the loop below.
           i = ieee_flags('set', 'exception', 'all', out)

           ! The doubled '!' leaves the NOVECTOR directive below inactive.
           ! NOTE(review): with a single '!' it should suppress vectorization
           ! of the loop, which is reported to avoid the failure -- confirm.
           !!DIR$ NOVECTOR
           do i = 1 ,n
              ! Touches only row 1, so successive iterations are 4 reals apart.
              upp(1,i) = 1. / upp(1,i)
           enddo

           ! Row 1 ends up as 1/1 = 1 and the other rows are 0, so the maximum
           ! printed is 1 when the loop completes.
           print *, maxval(upp)

        end

Looking at the assembly it is obvious why:

movss     16+div_test_$UPP.0.1(%rax), %xmm1             #20.15
        movaps    %xmm0, %xmm3                                  #20.15
        movss     div_test_$UPP.0.1(%rax), %xmm2                #20.15
        movaps    %xmm0, %xmm6                                  #20.15
        movss     48+div_test_$UPP.0.1(%rax), %xmm4             #20.15
        movaps    %xmm0, %xmm9                                  #20.15
        movss     32+div_test_$UPP.0.1(%rax), %xmm5             #20.15
        movaps    %xmm0, %xmm12                                 #20.15
        movss     80+div_test_$UPP.0.1(%rax), %xmm7             #20.15
        addl      $8, %edx                                      #19.12
        movss     64+div_test_$UPP.0.1(%rax), %xmm8             #20.15
        movss     112+div_test_$UPP.0.1(%rax), %xmm10           #20.15
        movss     96+div_test_$UPP.0.1(%rax), %xmm11            #20.15
        unpcklps  %xmm1, %xmm2                                  #20.15
        unpcklps  %xmm4, %xmm5                                  #20.15
        unpcklps  %xmm7, %xmm8                                  #20.15
        unpcklps  %xmm10, %xmm11                                #20.15
        divps     %xmm2, %xmm3                                  #20.15
        divps     %xmm5, %xmm6                                  #20.15
        divps     %xmm8, %xmm9                                  #20.15
        divps     %xmm11, %xmm12                                #20.15

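The xmm* registers carry some zeroes into the division: each movss load leaves the upper three lanes of its register at zero, and a single unpcklps fills only two of the four lanes, so each divps divides by zero in its upper two lanes. It is worth noting that a previous compiler version (14.0.1) seems to behave in a more sensible way, adding two more unpcklps instructions that move the zeroes out of the registers before the division.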

movss     48+div_test_$UPP.0.1(%rax), %xmm0             #20.31
        addl      $8, %edx                                      #19.12
        movss     16+div_test_$UPP.0.1(%rax), %xmm2             #20.31
        movss     32+div_test_$UPP.0.1(%rax), %xmm1             #20.31
        movss     div_test_$UPP.0.1(%rax), %xmm3                #20.31
        movss     64+div_test_$UPP.0.1(%rax), %xmm8             #20.31
        movss     112+div_test_$UPP.0.1(%rax), %xmm5            #20.31
        movss     80+div_test_$UPP.0.1(%rax), %xmm7             #20.31
        movss     96+div_test_$UPP.0.1(%rax), %xmm6             #20.31
        unpcklps  %xmm0, %xmm2                                  #20.31
        unpcklps  %xmm1, %xmm3                                  #20.31
        unpcklps  %xmm5, %xmm7                                  #20.31
        unpcklps  %xmm6, %xmm8                                  #20.31
        unpcklps  %xmm2, %xmm3                                  #20.31
        movaps    .L_2il0floatpacket.0(%rip), %xmm4             #20.15
        movaps    .L_2il0floatpacket.0(%rip), %xmm9             #20.15
        unpcklps  %xmm7, %xmm8                                  #20.31
        divps     %xmm3, %xmm4                                  #20.15
        divps     %xmm8, %xmm9                                  #20.15