Optimization bug using parameter in function

jimdempseyatthecove · ‎07-24-2020

The following code works in debug mode but not in release mode.

The purpose of the code is to replace the IEEE_IS_NAN function, which is abysmally slow and is not vectorizable, with a function that is very lightweight and vectorizable.

!  TestNaN.f90 
module MOD_FAST_IS_NAN
    integer, parameter :: IEEE754_SP_SIGN = Z'80000000'
    integer, parameter :: IEEE754_SP_EXPONENT = Z'7F800000'
   integer, parameter :: IEEE754_SP_MANTISSA = Z'007FFFFF'
   
    integer(8), parameter :: IEEE754_DP_SIGN = Z'8000000000000000'
    integer(8), parameter :: IEEE754_DP_EXPONENT = Z'7FFFFF0000000000'
   integer(8), parameter :: IEEE754_DP_MANTISSA =  Z'000000FFFFFFFFFF'
   
    interface FAST_IS_NAN
        LOGICAL FUNCTION FAST_IS_NAN_SP(X)
            REAL :: X
        END FUNCTION FAST_IS_NAN_SP
        LOGICAL FUNCTION FAST_IS_NAN_DP(X)
            DOUBLE PRECISION :: X
        END FUNCTION FAST_IS_NAN_DP
    end interface FAST_IS_NAN
    CONTAINS
end module MOD_FAST_IS_NAN
    LOGICAL FUNCTION FAST_IS_NAN_SP(X)
        REAL :: X
        ! Bug in 2020.0.166 (either way)
!       FAST_IS_NAN_SP = ((TRANSFER(ABS(X),0)+IEEE754_SP_MANTISSA) .LT. 0)
        FAST_IS_NAN_SP = ((TRANSFER(ABS(X),0)+Z'007FFFFF') .LT. 0)
    END FUNCTION FAST_IS_NAN_SP
    
    LOGICAL FUNCTION FAST_IS_NAN_DP(X)
        DOUBLE PRECISION :: X
        ! Bug in 2020.0.166 (either way)
!       FAST_IS_NAN_DP = ((TRANSFER(ABS(X),0_8)+IEEE754_SP_MANTISSA) .LT. 0_8)
        FAST_IS_NAN_DP = ((TRANSFER(ABS(X),0_8)+Z'000000FFFFFFFFFF') .LT. 0_8)
    END FUNCTION FAST_IS_NAN_DP
    
program TestNaN
    USE, INTRINSIC :: IEEE_ARITHMETIC
    use MOD_FAST_IS_NAN
    use omp_lib
    implicit none
    
    integer, parameter :: N = 100000000
    real, allocatable :: SP(:)
    real :: SP_NAN
    double precision, allocatable :: DP(:)
    real :: DP_NAN
   double precision :: T0, T1
    integer :: I, J
    LOGICAL :: FOUND_NAN
    
    ALLOCATE(SP(N), DP(N))
    CALL RANDOM_NUMBER(SP)
    CALL RANDOM_NUMBER(DP)
    
    ! ASSURE CONSTANTS ARE CORRECT
    SP_NAN = TRANSFER(IEEE754_SP_EXPONENT+1, 0.0)
    FOUND_NAN = IEEE_IS_NAN(SP_NAN)
    PRINT *,'IEEE_IS_NAN(SP_NAN)', FOUND_NAN
    SP_NAN = TRANSFER(IEEE754_SP_EXPONENT+1, 0.0)
    FOUND_NAN = FAST_IS_NAN(SP_NAN)
    PRINT *,'FAST_IS_NAN(SP_NAN)', FOUND_NAN
    
    ! ASSURE CONSTANTS ARE CORRECT
    DP_NAN = TRANSFER(IEEE754_DP_EXPONENT+1, 0.0_8)
    FOUND_NAN = IEEE_IS_NAN(DP_NAN)
    PRINT *,'IEEE_IS_NAN(DP_NAN)', FOUND_NAN
    DP_NAN = TRANSFER(IEEE754_DP_EXPONENT+1, 0.0_8)
    FOUND_NAN = FAST_IS_NAN(DP_NAN)
    PRINT *,'FAST_IS_NAN(DP_NAN)', FOUND_NAN
    
  ! Make pass without report to precondition cache to some stable state
    FOUND_NAN = .FALSE.
    DO I=1,N
        SP(I) = SP(I) * 1.00001
        IF(IEEE_IS_NAN(SP(I))) FOUND_NAN = .TRUE.
    END DO
    
    ! Now timed passes
    T0 = OMP_GET_WTIME()
    FOUND_NAN = .FALSE.
    DO J=1, 100
        DO I=1,N
            SP(I) = SP(I) * 1.00001
            IF(IEEE_IS_NAN(SP(I))) FOUND_NAN = .TRUE.
        END DO
    END DO
    T1 = OMP_GET_WTIME()
    PRINT *, "IEEE_IS_NAN(SP(I))", T1-T0, FOUND_NAN

    ! Make pass without report to precondition cache to some stable state
    FOUND_NAN = .FALSE.
    DO I=1,N
        SP(I) = SP(I) * 1.00001
        IF(FAST_IS_NAN(SP(I))) FOUND_NAN = .TRUE.
    END DO
    
    ! Now timed passes
    T0 = OMP_GET_WTIME()
    FOUND_NAN = .FALSE.
    DO J=1, 100
        DO I=1,N
            SP(I) = SP(I) * 1.00001
            IF(FAST_IS_NAN(SP(I))) FOUND_NAN = .TRUE.
        END DO
    END DO
    T1 = OMP_GET_WTIME()
    PRINT *, "FAST_IS_NAN(SP(I))", T1-T0, FOUND_NAN
    
    ! Make pass without report to precondition cache to some stable state
    FOUND_NAN = .FALSE.
    DO I=1,N
        DP(I) = DP(I) * 1.00001_8
        IF(IEEE_IS_NAN(DP(I))) FOUND_NAN = .TRUE.
    END DO
    
    ! Now timed passes
    T0 = OMP_GET_WTIME()
    FOUND_NAN = .FALSE.
    DO J=1, 100
        DO I=1,N
            DP(I) = DP(I) * 1.00001_8
            IF(IEEE_IS_NAN(DP(I))) FOUND_NAN = .TRUE.
        END DO
    END DO
    T1 = OMP_GET_WTIME()
    PRINT *, "IEEE_IS_NAN(DP(I))", T1-T0, FOUND_NAN
    
    ! Make pass without report to precondition cache to some stable state
    FOUND_NAN = .FALSE.
    DO I=1,N
        DP(I) = DP(I) * 1.00001_8
        IF(FAST_IS_NAN(DP(I))) FOUND_NAN = .TRUE.
    END DO
    
    ! Now timed passes
    T0 = OMP_GET_WTIME()
    FOUND_NAN = .FALSE.
    DO J=1, 100
        DO I=1,N
            DP(I) = DP(I) * 1.00001_8
            IF(FAST_IS_NAN(DP(I))) FOUND_NAN = .TRUE.
        END DO
    END DO
    T1 = OMP_GET_WTIME()
    PRINT *, "FAST_IS_NAN(DP(I))", T1-T0, FOUND_NAN
    
end program TestNaN

Note, I just downloaded 2020 update 2, will give that a try.

Jim Dempsey

jimdempseyatthecove · ‎07-24-2020

This was with /QxHost on Core i7 2600K

jimdempseyatthecove · ‎07-24-2020

>>Note, I just downloaded 2020 update 2, will give that a try

Oops, that blew out MS VS 2019, ... reinstalling MS VS 2019 to try again.

Jim Dempsey

jimdempseyatthecove · ‎07-25-2020

Intel PS 2020u2 did not correct the problem.

Jim Dempsey

Steve_Lionel · ‎07-25-2020

I've spent some time looking at the assembly for this, and I think I understand what is happening. Here is the /Od (no optimization) code for your single precision routine (the part that does the test):

        movss     xmm0, DWORD PTR [eax]                         ;24.28
        andps     xmm0, XMMWORD PTR [_2il0floatpacket.0]        ;24.28
        movss     DWORD PTR [-16+ebp], xmm0                     ;24.28
        mov       eax, DWORD PTR [-16+ebp]                      ;24.28
        mov       DWORD PTR [-12+ebp], eax                      ;24.28
        mov       eax, 8388607                                  ;24.9
        add       eax, DWORD PTR [-12+ebp]                      ;24.46
        js        .B2.3         ; Prob 50%                      ;24.60

This is very straightforward, as you'd expect. The movss and andps are doing the ABS(X). It then adds Z'007FFFFF' (8388607) and jumps if the result is negative.

Here's the optimized version:

        movss     xmm0, DWORD PTR [eax]                         ;24.28
        xor       eax, eax                                      ;25.5
        andps     xmm0, XMMWORD PTR [_2il0floatpacket.0]        ;24.28
        movd      edx, xmm0                                     ;24.28
        cmp       edx, -8388607                                 ;25.5
        cmovl     eax, ecx                                      ;25.5

It starts out largely the same, doing the ABS the same way. But instead of adding 7FFFFF it compares the value to 0-7FFFFF. This is fine, arithmetically, but you were relying on an undetected integer overflow changing the sign bit, and you can't depend on that with Fortran signed integers.

This is not a compiler bug.

Steve_Lionel · ‎07-25-2020

Incidentally, the reason IEEE_IS_NAN is slower is that it relies on the FP_CLASS intrinsic (extension) and then tests the result to see if it is a quiet or signaling NaN. FP_CLASS isn't particularly complicated, but the overall sequence is longer than your "fast" version. I'd argue that for most purposes, it's fine.

FortranFan · ‎07-25-2020

Jim,

Will it be possible for you to explain why would need anything more complicated than `x /= x` check for your "fast" version?

   generic :: FAST_IS_NAN => FAST_IS_NAN_SP, FAST_IS_NAN_DP

contains

   elemental function FAST_IS_NAN_SP(X) result(is_nan)
      real(SP), intent(in) :: X
      logical :: is_nan
      is_nan = ( X /= X )
      return
   end function FAST_IS_NAN_SP

   elemental function FAST_IS_NAN_DP(X) result(is_nan)
      real(DP), intent(in) :: X
      logical :: is_nan
      is_nan = ( X /= X )
      return
   end function FAST_IS_NAN_DP

FortranFan · ‎07-25-2020

Jim,

Here's a modified version of your code from the original post. It'll be useful if you can try it as-is and report here the output you see on your workstation i.e., whether you can reproduce the trend in the results I show below with this modified code as in:

this "simple" version identifies the same number of NAN values as the IEEE intrinsic in Fortran with Intel Fortran compiler optimization, and
there is about 10x improvement in CPU timing response with this "fast version.

module kinds_m
   integer, parameter :: SP = kind(1.0)
   integer, parameter :: DP = kind(1D0)
end module

module fpe_support_m

   use, intrinsic :: iso_fortran_env, only : I4 => int32, I8 => int64
   use kinds_m, only : SP, DP

   integer(I4), parameter :: IEEE754_SP_EXPONENT = int( Z'7F800000', kind=I4 )
   integer(I8), parameter :: IEEE754_DP_EXPONENT = int( Z'7FFFFF0000000000', kind=I8 )

   generic :: FAST_IS_NAN => FAST_IS_NAN_SP, FAST_IS_NAN_DP

contains

   elemental function FAST_IS_NAN_SP(X) result(is_nan)
      real(SP), intent(in) :: X
      logical :: is_nan
      is_nan = ( X /= X )
      return
   end function FAST_IS_NAN_SP

   elemental function FAST_IS_NAN_DP(X) result(is_nan)
      real(DP), intent(in) :: X
      logical :: is_nan
      is_nan = ( X /= X )
      return
   end function FAST_IS_NAN_DP

end module fpe_support_m

module data_m

   use, intrinsic :: ieee_arithmetic, only : ieee_value, ieee_quiet_nan
   use kinds_m, only : SP, DP

   logical, save :: init_rng_sp = .false.
   logical, save :: init_rng_dp = .false.

   generic :: calc_data => calc_data_sp, calc_data_dp

contains

   subroutine calc_data_sp ( vals )
      real(SP), intent(inout) :: vals(:)
      real(SP) :: r, dr
      if ( .not. init_rng_sp ) then
         call random_seed()
         call random_number( vals )
         call random_number( r )
         dr = r / 10.0_sp
         where ( (vals > (r-dr)) .and. (vals < (r+dr)) )
            vals = ieee_value( r, ieee_quiet_nan )
         end where
         init_rng_sp = .true.
      end if
      rvals = rvals * 1.00001_sp
      return
   end subroutine

   subroutine calc_data_dp ( vals )
      real(DP), intent(inout) :: vals(:)
      real(DP) :: r, dr
      if ( .not. init_rng_dp ) then
         call random_seed()
         call random_number( vals )
         call random_number( r )
         dr = r / 10.0_dp
         where ( (vals > (r-dr)) .and. (vals < (r+dr)) )
            vals = ieee_value( r, ieee_quiet_nan )
         end where
         init_rng_dp = .true.
      end if
      rvals = rvals * 1.00001_dp
      return
   end subroutine

end module

program TestNaN

   use, intrinsic :: IEEE_ARITHMETIC
   use kinds_m, only : SP, DP
   use data_m, only : calc_data
   use fpe_support_m, only : IEEE754_SP_EXPONENT, IEEE754_DP_EXPONENT, FAST_IS_NAN
   use omp_lib

   implicit none

   integer, parameter :: N = 100000000
   integer, parameter :: NPASS = 100
   real(SP), allocatable :: rvals(:)
   real(SP) :: SP_NAN
   real(DP), allocatable :: dvals(:)
   real(DP) :: DP_NAN
   real(DP) :: T0, T1, NN
   integer :: J
   integer :: num_nan
   logical :: FOUND_NAN
   character(len=*), parameter :: fmth = "(g0,t25,g0,t50,g0)"
   character(len=*), parameter :: fmtd = "(g0,t25,g10.2,t50,g0)"

   allocate(rvals(N), dvals(N))
   call calc_data( rvals )
   call calc_data( dvals )

   ! ASSURE CONSTANTS ARE CORRECT
   SP_NAN = transfer(IEEE754_SP_EXPONENT+1, 0.0)
   FOUND_NAN = IEEE_IS_NAN(SP_NAN)
   print *,'IEEE_IS_NAN(SP_NAN)', FOUND_NAN
   SP_NAN = transfer(IEEE754_SP_EXPONENT+1, 0.0)
   FOUND_NAN = FAST_IS_NAN(SP_NAN)
   print *,'FAST_IS_NAN(SP_NAN)', FOUND_NAN

   ! ASSURE CONSTANTS ARE CORRECT
   DP_NAN = transfer(IEEE754_DP_EXPONENT+1, 0.0_8)
   FOUND_NAN = IEEE_IS_NAN(DP_NAN)
   print *,'IEEE_IS_NAN(DP_NAN)', FOUND_NAN
   DP_NAN = transfer(IEEE754_DP_EXPONENT+1, 0.0_8)
   FOUND_NAN = FAST_IS_NAN(DP_NAN)
   print *,'FAST_IS_NAN(DP_NAN)', FOUND_NAN

   num_nan = count( ieee_is_nan(rvals) )
   print *, "Expected number of SP NAN values: ", num_nan
   num_nan = count( ieee_is_nan(dvals) )
   print *, "Expected number of DP NAN values: ", num_nan
   print *
   print fmth, "IS_NAN Method", "OMP time", "Number of NAN values found"
   print fmth, "", "(secs)", ""

   ! Make pass without report to precondition cache to some stable state
   call calc_data( rvals )
   num_nan = count( ieee_is_nan(rvals) )

   ! Now timed passes
   T0 = OMP_GET_WTIME()
   NN = 0.0_dp
   do J = 1, NPASS
      call calc_data( rvals )
      NN = NN + count( ieee_is_nan(rvals) )
   end do
   T1 = OMP_GET_WTIME()
   num_nan = int( NN/real(NPASS,kind=kind(NN)), kind=kind(num_nan))
   print fmtd, "IEEE_IS_NAN(rvals(I))", T1-T0, num_nan

   ! Make pass without report to precondition cache to some stable state
   call calc_data( rvals )
   num_nan = count( fast_is_nan(rvals) )

   ! Now timed passes
   T0 = OMP_GET_WTIME()
   NN = 0.0_dp
   do J = 1, NPASS
      call calc_data( rvals )
      NN = NN + count( fast_is_nan(rvals) )
   end do
   T1 = OMP_GET_WTIME()
   num_nan = int( NN/real(NPASS,kind=kind(NN)), kind=kind(num_nan))
   print fmtd, "FAST_IS_NAN(rvals(I))", T1-T0, num_nan

   ! Make pass without report to precondition cache to some stable state
   call calc_data( dvals )
   num_nan = count( ieee_is_nan(dvals) )

   ! Now timed passes
   T0 = OMP_GET_WTIME()
   NN = 0.0_dp
   do J = 1, NPASS
      call calc_data( dvals )
      NN = NN + count( ieee_is_nan(dvals) )
   end do
   T1 = OMP_GET_WTIME()
   num_nan = int( NN/real(NPASS,kind=kind(NN)), kind=kind(num_nan))
   print fmtd, "IEEE_IS_NAN(dvals(I))", T1-T0, num_nan

   ! Make pass without report to precondition cache to some stable state
   call calc_data( dvals )
   num_nan = count( fast_is_nan(rvals) )

   ! Now timed passes
   T0 = OMP_GET_WTIME()
   NN = 0.0_dp
   do J = 1, NPASS
      call calc_data( dvals )
      NN = NN + count( fast_is_nan(dvals) )
   end do
   T1 = OMP_GET_WTIME()
   num_nan = int( NN/real(NPASS,kind=kind(NN)), kind=kind(num_nan))
   print fmtd, "FAST_IS_NAN(dvals(I))", T1-T0, num_nan

end program TestNaN

Intel Fortran console output:

C:\temp>ifort /O2 /QxHost /Qopenmp ff-nan.f90 /exe:ff-nan.exe
Intel(R) Visual Fortran Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.1.1.216 Build 20200306
Copyright (C) 1985-2020 Intel Corporation.  All rights reserved.

Microsoft (R) Incremental Linker Version 14.25.28612.0
Copyright (C) Microsoft Corporation.  All rights reserved.

-out:ff-nan.exe
-subsystem:console
-defaultlib:libiomp5md.lib
-nodefaultlib:vcomp.lib
-nodefaultlib:vcompd.lib
ff-nan.obj

C:\temp>ff-nan.exe
 IEEE_IS_NAN(SP_NAN) T
 FAST_IS_NAN(SP_NAN) T
 IEEE_IS_NAN(DP_NAN) T
 FAST_IS_NAN(DP_NAN) T
 Expected number of SP NAN values:      8429746
 Expected number of DP NAN values:      5060414

IS_NAN Method           OMP time                 Number of NAN values found
                        (secs)
IEEE_IS_NAN(rvals(I))      37.                   8429746
FAST_IS_NAN(rvals(I))      2.0                   8429746
IEEE_IS_NAN(dvals(I))      32.                   5060414
FAST_IS_NAN(dvals(I))      3.9                   5060414

C:\temp>

Steve_Lionel · ‎07-25-2020

@FortranFan , x/=x may trigger an exception if x is a sNaN, and this also requires compiling with /fp:strict that will slow down overall performance, otherwise the optimizer may remove the test.

FortranFan · ‎07-25-2020

I did think about signaling NAN and that's why I wrote the code as I did. And with signaling nan, I hadn't noticed a difference. Here's with /fp:strict:

   subroutine calc_data_dp ( vals )
      real(DP), intent(inout) :: vals(:)
      real(DP) :: r, dr
      if ( .not. init_rng_dp ) then
         call random_seed()
         call random_number( vals )
         call random_number( r )
         dr = r / 10.0_dp
         where ( (vals > (r-dr)) .and. (vals < (r+dr)) )
            vals = ieee_value( r, ieee_signaling_nan ) !<-- ***
         end where
         init_rng_dp = .true.
      end if
      rvals = rvals * 1.00001_dp
      return
   end subroutine

C:\temp>ifort /O2 /QxHost /Qopenmp ff-nan.f90 /fp:strict /exe:ff-nan.exe
Intel(R) Visual Fortran Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.1.1.216 Build 20200306
Copyright (C) 1985-2020 Intel Corporation.  All rights reserved.

Microsoft (R) Incremental Linker Version 14.25.28612.0
Copyright (C) Microsoft Corporation.  All rights reserved.

-out:ff-nan.exe
-subsystem:console
-defaultlib:libiomp5md.lib
-nodefaultlib:vcomp.lib
-nodefaultlib:vcompd.lib
ff-nan.obj

C:\temp>ff-nan.exe
 IEEE_IS_NAN(SP_NAN) T
 FAST_IS_NAN(SP_NAN) T
 IEEE_IS_NAN(DP_NAN) T
 FAST_IS_NAN(DP_NAN) T
 Expected number of SP NAN values:      4766943
 Expected number of DP NAN values:       758712

IS_NAN Method           OMP time                 Number of NAN values found
                        (secs)
IEEE_IS_NAN(rvals(I))      35.                   4766943
FAST_IS_NAN(rvals(I))      2.0                   4766943
IEEE_IS_NAN(dvals(I))      27.                   758712
FAST_IS_NAN(dvals(I))      3.8                   758712

C:\temp>

jimdempseyatthecove · ‎07-25-2020

FortranFan, thanks for the suggestion to use (X/=X), not sure I will use that method. For three reasons:

1) Compiler optimization may elide the test and return .false.
2) Steve's concern about an assert
3) I have an alternate requirement for vectorization to be optimal.

I will post my updated code after my 3rd attempt at reinstall of PS 2020u2 (may revert back to 2020u1)

I found 2 issues with my earlier attempt (in the test). One was a programming error (assumption) on my part, and the second was (I assume) a compiler bug. The compiler bug occurs when one of my constants is defined as a parameter. When defined as a variable, the new code works (though a tad slower than it could).

RE: vectorization & optimization

Returning returns an unspecified, but generally and INTEGER(4) value with an indicator of .true. or .false.. For Fortran, the specification uses the lsb as a flag. In order to place the flag there, (depending on compiler optimization) an additional conditional move may be required.

The problem with the default LOGICAL byte-size, itit gzzzzzz

INTEL PLEASE FIX YOUR INSTALLER it just not popped-up a dialog box on top of this forum edit box, with a Cancel button located where I was typing.... thus inadvertently having my typing unwittingly incorectlly answer the pop-up.

back to "The problem with..." when processing an array of variables, it is preferable to use (compiler generated) vectors, having different width vectors complicates this as it may require a convert from one format to the other. While a LOGICAL(8) can be returned, I think it may be more optimal to return an integer of size of REAL or DOUBLE with:

>=0 as .NOT. NaN
<0 as NaN

Then the potential NaN's can be accumulated with:

FOUND_NAN = IOR(FOUND_NAN, INTEGER_IS_NAN(X))

that should vectorize and registerize within a tight loop.

Now I can get back to figure out the install problem with PS 2020u2

Jim Dempsey

jimdempseyatthecove · ‎07-25-2020

Results:

 IEEE_IS_NAN(SP_NAN) T
 NEGATIVE_IS_NAN(SP_NAN) T
 FAST_IS_NAN(SP_NAN) T
 IEEE_IS_NAN(DP_NAN) T
 NEGATIVE_IS_NAN(DP_NAN) T
 FAST_IS_NAN(DP_NAN) T
 IEEE_IS_NAN(SP(I))             28.1037067166981      F
 NEGATIVE_IS_NAN(SP(I)) < 0    0.766837673545979      F
 FAST_IS_NAN(SP(I))             1.12401320825484      F
 IEEE_IS_NAN(DP(I))             26.7647927954972      F
 NEGATIVE_IS_NAN(DP(I)) < 0_8   2.06496960600361      F
 NEGATIVE_IS_NAN(DP(I))   2.63807039399626      F

Revised code:

!  TestNaN.f90 
module MOD_NEGATIVE_IS_NAN
    
    integer, parameter :: IEEE754_SP_SIGN = Z'80000000'
    integer, parameter :: IEEE754_SP_EXPONENT = Z'7F800000'
    ! *** Due to compiler (Intel 2020u1/u2) optimization bug we cannot use a parameter
!    integer, parameter :: IEEE754_SP_MANTISSA = Z'007FFFFF'
    ! instead using memory stored variable works
    ! at some point this can be restored to parameter
    integer :: IEEE754_SP_MANTISSA = Z'007FFFFF'
   
    integer(8), parameter :: IEEE754_DP_SIGN = Z'8000000000000000'
    integer(8), parameter :: IEEE754_DP_EXPONENT = Z'7FFFFF0000000000'
    ! *** Due to compiler (Intel 2020u1/u2) optimization bug we cannot use a parameter
!    integer(8), parameter :: IEEE754_DP_MANTISSA =  Z'000000FFFFFFFFFF'
    ! instead using memory stored variable works
    ! at some point this can be restored to parameter
    integer(8) :: IEEE754_DP_MANTISSA =  Z'000000FFFFFFFFFF'
   
    ! Function that returns a negative integer (of size of REAL/DOUBLE PRECISION)
    ! when input argument is a NaN  (QNaN as SNaN would have caused an abort)
    ! these functions are elemental and capable of operating on vectors
    interface NEGATIVE_IS_NAN
        MODULE PROCEDURE NEGATIVE_IS_NAN_SP
        MODULE PROCEDURE NEGATIVE_IS_NAN_DP
    end interface NEGATIVE_IS_NAN
    
    interface FAST_IS_NAN
        MODULE PROCEDURE FAST_IS_NAN_SP
        MODULE PROCEDURE FAST_IS_NAN_DP
    end interface FAST_IS_NAN
    CONTAINS
    
    INTEGER(4) ELEMENTAL FUNCTION NEGATIVE_IS_NAN_SP(X)
        REAL, INTENT(IN) :: X
        NEGATIVE_IS_NAN_SP = TRANSFER(ABS(X),0)+IEEE754_SP_MANTISSA
    END FUNCTION NEGATIVE_IS_NAN_SP
    
    INTEGER(8) ELEMENTAL FUNCTION NEGATIVE_IS_NAN_DP(X)
        DOUBLE PRECISION, INTENT(IN) :: X
        NEGATIVE_IS_NAN_DP = TRANSFER(ABS(X),0_8)+IEEE754_DP_MANTISSA
    END FUNCTION NEGATIVE_IS_NAN_DP
    
    LOGICAL FUNCTION FAST_IS_NAN_SP(X)
        REAL, INTENT(IN) :: X
        FAST_IS_NAN_SP = (NEGATIVE_IS_NAN_SP(X) < 0)
    END FUNCTION FAST_IS_NAN_SP
    
    LOGICAL FUNCTION FAST_IS_NAN_DP(X)
        DOUBLE PRECISION, INTENT(IN) :: X
        FAST_IS_NAN_DP = (NEGATIVE_IS_NAN_DP(X) < 0_8)
    END FUNCTION FAST_IS_NAN_DP
end module MOD_NEGATIVE_IS_NAN
    
program TestNaN
    USE, INTRINSIC :: IEEE_ARITHMETIC
    use MOD_NEGATIVE_IS_NAN
    use omp_lib
    implicit none
    
    integer, parameter :: N = 100000000     ! some number larger than Last Level Cache
    integer, parameter :: reps = 50         ! number of times to scann for NaN
    real, allocatable :: SP(:)
    real :: SP_NAN
    double precision, allocatable :: DP(:)
    real(8) :: DP_NAN
    double precision :: T0, T1
    integer :: I, J
    LOGICAL :: FOUND_NAN
    
    ALLOCATE(SP(N), DP(N))
    CALL RANDOM_NUMBER(SP)
    CALL RANDOM_NUMBER(DP)
    
    ! ASSURE CONSTANTS ARE CORRECT
    SP_NAN = TRANSFER(IEEE754_SP_EXPONENT+1234, 0.0)    ! Arbitrary QNaN
    FOUND_NAN = IEEE_IS_NAN(SP_NAN)
    PRINT *,'IEEE_IS_NAN(SP_NAN)', FOUND_NAN
    FOUND_NAN = (NEGATIVE_IS_NAN(SP_NAN) < 0)
    PRINT *,'NEGATIVE_IS_NAN(SP_NAN)', FOUND_NAN
    FOUND_NAN = FAST_IS_NAN(SP_NAN)
    PRINT *,'FAST_IS_NAN(SP_NAN)', FOUND_NAN
    
    ! ASSURE CONSTANTS ARE CORRECT
    DP_NAN = TRANSFER(IEEE754_DP_EXPONENT+1234_8, 0.0_8)    ! Arbitrary QNaN
    FOUND_NAN = IEEE_IS_NAN(DP_NAN)
    PRINT *,'IEEE_IS_NAN(DP_NAN)', FOUND_NAN
    FOUND_NAN = (NEGATIVE_IS_NAN(DP_NAN) < 0_8)
    PRINT *,'NEGATIVE_IS_NAN(DP_NAN)', FOUND_NAN
    FOUND_NAN = FAST_IS_NAN(DP_NAN)
    PRINT *,'FAST_IS_NAN(DP_NAN)', FOUND_NAN
    
  ! Make pass without report to precondition cache to some stable state
    FOUND_NAN = .FALSE.
    DO I=1,N
        SP(I) = SP(I) * 1.00001
        IF(IEEE_IS_NAN(SP(I))) FOUND_NAN = .TRUE.
    END DO
    
    ! Now timed passes
    T0 = OMP_GET_WTIME()
    FOUND_NAN = .FALSE.
    DO J=1, reps
        DO I=1,N
            SP(I) = SP(I) * 1.00001
            IF(IEEE_IS_NAN(SP(I))) FOUND_NAN = .TRUE.
        END DO
    END DO
    T1 = OMP_GET_WTIME()
    PRINT *, "IEEE_IS_NAN(SP(I))          ", T1-T0, FOUND_NAN

    ! Make pass without report to precondition cache to some stable state
    FOUND_NAN = .FALSE.
    DO I=1,N
        SP(I) = SP(I) * 1.00001
        IF(NEGATIVE_IS_NAN(SP(I)) < 0) FOUND_NAN = .TRUE.
    END DO
    
    ! Now timed passes
    T0 = OMP_GET_WTIME()
    FOUND_NAN = .FALSE.
    DO J=1, reps
        DO I=1,N
            SP(I) = SP(I) * 1.00001
            IF(NEGATIVE_IS_NAN(SP(I)) < 0) FOUND_NAN = .TRUE.
        END DO
    END DO
    T1 = OMP_GET_WTIME()
    PRINT *, "NEGATIVE_IS_NAN(SP(I)) < 0  ", T1-T0, FOUND_NAN
    

    ! Make pass without report to precondition cache to some stable state
    FOUND_NAN = .FALSE.
    DO I=1,N
        SP(I) = SP(I) * 1.00001
        IF(FAST_IS_NAN(SP(I))) FOUND_NAN = .TRUE.
    END DO
    
    ! Now timed passes
    T0 = OMP_GET_WTIME()
    FOUND_NAN = .FALSE.
    DO J=1, reps
        DO I=1,N
            SP(I) = SP(I) * 1.00001
            IF(FAST_IS_NAN(SP(I))) FOUND_NAN = .TRUE.
        END DO
    END DO
    T1 = OMP_GET_WTIME()
    PRINT *, "FAST_IS_NAN(SP(I))          ", T1-T0, FOUND_NAN
    
    ! Make pass without report to precondition cache to some stable state
    FOUND_NAN = .FALSE.
    DO I=1,N
        DP(I) = DP(I) * 1.00001_8
        IF(IEEE_IS_NAN(DP(I))) FOUND_NAN = .TRUE.
    END DO
    
    ! Now timed passes
    T0 = OMP_GET_WTIME()
    FOUND_NAN = .FALSE.
    DO J=1, reps
        DO I=1,N
            DP(I) = DP(I) * 1.00001_8
            IF(IEEE_IS_NAN(DP(I))) FOUND_NAN = .TRUE.
        END DO
    END DO
    T1 = OMP_GET_WTIME()
    PRINT *, "IEEE_IS_NAN(DP(I))          ", T1-T0, FOUND_NAN
    
    ! Make pass without report to precondition cache to some stable state
    FOUND_NAN = .FALSE.
    DO I=1,N
        DP(I) = DP(I) * 1.00001_8
        IF(NEGATIVE_IS_NAN(DP(I)) < 0_8) FOUND_NAN = .TRUE.
    END DO
    
    ! Now timed passes
    T0 = OMP_GET_WTIME()
    FOUND_NAN = .FALSE.
    DO J=1, reps
        DO I=1,N
            DP(I) = DP(I) * 1.00001_8
            IF(NEGATIVE_IS_NAN(DP(I)) < 0_8) FOUND_NAN = .TRUE.
        END DO
    END DO
    T1 = OMP_GET_WTIME()
    PRINT *, "NEGATIVE_IS_NAN(DP(I)) < 0_8", T1-T0, FOUND_NAN
    
    ! Make pass without report to precondition cache to some stable state
    FOUND_NAN = .FALSE.
    DO I=1,N
        DP(I) = DP(I) * 1.00001_8
        IF(FAST_IS_NAN(DP(I))) FOUND_NAN = .TRUE.
    END DO
    
    ! Now timed passes
    T0 = OMP_GET_WTIME()
    FOUND_NAN = .FALSE.
    DO J=1, reps
        DO I=1,N
            DP(I) = DP(I) * 1.00001_8
            IF(FAST_IS_NAN(DP(I))) FOUND_NAN = .TRUE.
        END DO
    END DO
    T1 = OMP_GET_WTIME()
    PRINT *, "NEGATIVE_IS_NAN(DP(I))", T1-T0, FOUND_NAN
end program TestNaN

Jim Dempsey

P.S. Re-installation of PS 2020u2 "succeeded", but did not integrate VTune etc... into MS VS 2019 grrrr...

Steve_Lionel · ‎07-25-2020

Jim, please file a support ticket on the install issues. That's the only way any attention will be paid to them.

FortranFan · ‎07-25-2020

Jim,

Your latest "fast" version code doesn't work for "DOUBLE PRECISION".

!  TestNaN.f90
module MOD_NEGATIVE_IS_NAN

    integer, parameter :: IEEE754_SP_SIGN = Z'80000000'
    integer, parameter :: IEEE754_SP_EXPONENT = Z'7F800000'
    ! *** Due to compiler (Intel 2020u1/u2) optimization bug we cannot use a parameter
!    integer, parameter :: IEEE754_SP_MANTISSA = Z'007FFFFF'
    ! instead using memory stored variable works
    ! at some point this can be restored to parameter
    integer :: IEEE754_SP_MANTISSA = Z'007FFFFF'

    integer(8), parameter :: IEEE754_DP_SIGN = Z'8000000000000000', kind=8
    integer(8), parameter :: IEEE754_DP_EXPONENT = Z'7FFFFF0000000000'
    ! *** Due to compiler (Intel 2020u1/u2) optimization bug we cannot use a parameter
!    integer(8), parameter :: IEEE754_DP_MANTISSA =  Z'000000FFFFFFFFFF'
    ! instead using memory stored variable works
    ! at some point this can be restored to parameter
    integer(8) :: IEEE754_DP_MANTISSA =  Z'000000FFFFFFFFFF'

    ! Function that returns a negative integer (of size of REAL/DOUBLE PRECISION)
    ! when input argument is a NaN  (QNaN as SNaN would have caused an abort)
    ! these functions are elemental and capable of operating on vectors
    interface NEGATIVE_IS_NAN
        MODULE PROCEDURE NEGATIVE_IS_NAN_SP
        MODULE PROCEDURE NEGATIVE_IS_NAN_DP
    end interface NEGATIVE_IS_NAN

    interface FAST_IS_NAN
        MODULE PROCEDURE FAST_IS_NAN_SP
        MODULE PROCEDURE FAST_IS_NAN_DP
    end interface FAST_IS_NAN
    CONTAINS

    INTEGER(4) ELEMENTAL FUNCTION NEGATIVE_IS_NAN_SP(X)
        REAL, INTENT(IN) :: X
        NEGATIVE_IS_NAN_SP = TRANSFER(ABS(X),0)+IEEE754_SP_MANTISSA
    END FUNCTION NEGATIVE_IS_NAN_SP

    INTEGER(8) ELEMENTAL FUNCTION NEGATIVE_IS_NAN_DP(X)
        DOUBLE PRECISION, INTENT(IN) :: X
        NEGATIVE_IS_NAN_DP = TRANSFER(ABS(X),0_8)+IEEE754_DP_MANTISSA
    END FUNCTION NEGATIVE_IS_NAN_DP

    elemental LOGICAL FUNCTION FAST_IS_NAN_SP(X)
        REAL, INTENT(IN) :: X
        FAST_IS_NAN_SP = (NEGATIVE_IS_NAN_SP(X) < 0)
    END FUNCTION FAST_IS_NAN_SP

    elemental LOGICAL FUNCTION FAST_IS_NAN_DP(X)
        DOUBLE PRECISION, INTENT(IN) :: X
        FAST_IS_NAN_DP = (NEGATIVE_IS_NAN_DP(X) < 0_8)
    END FUNCTION FAST_IS_NAN_DP
end module MOD_NEGATIVE_IS_NAN

program TestNan

   use, intrinsic :: ieee_arithmetic, only : ieee_value, ieee_quiet_nan, ieee_is_nan
   use, intrinsic :: ieee_exceptions, only : ieee_set_halting_mode, ieee_divide_by_zero
   use MOD_NEGATIVE_IS_NAN

   double precision :: a(1000), r, dr

   call ieee_set_halting_mode( flag=ieee_divide_by_zero, halting=.false. )
   call random_number( a )
   call random_number( r )
   dr = r / 10.0D0
   where ( (a > (r-dr)).and.(a < (r+dr)) )
      a = ieee_value( r, ieee_quiet_nan )
   end where

   print *, "IEEE_IS_NAN: count of NAN values = ", count( IEEE_IS_NAN(a) )
   print *, "FAST_IS_NAN: count of NAN values = ", count( FAST_IS_NAN(a) )

end program

C:\Temp>ifort /O2 /QxHost /Qopenmp TestNan.f90
Intel(R) Visual Fortran Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.1.2.254 Build 20200623
Copyright (C) 1985-2020 Intel Corporation.  All rights reserved.

Microsoft (R) Incremental Linker Version 14.26.28806.0
Copyright (C) Microsoft Corporation.  All rights reserved.

-out:TestNan.exe
-subsystem:console
-defaultlib:libiomp5md.lib
-nodefaultlib:vcomp.lib
-nodefaultlib:vcompd.lib
TestNan.obj

C:\Temp>TestNan
 IEEE_IS_NAN: count of NAN values =          135
 FAST_IS_NAN: count of NAN values =            0

C:\Temp>

FortranFan · ‎07-25-2020

@jimdempseyatthecove wrote:

 .. (X/=X), not sure I will use that method. For three reasons:

1) Compiler optimization may elide the test and return .false.
..
3) I have an alternate  requirement for vectorization to be optimal.

Jim,

Re: point 1, have you seen a recent compiler that elides that test? It used to be that a while ago compilers had "a little knowledge" which often proved "dangerous". But it's unclear if modern compilers fall under the same trap.

Re: point 3 though, "fast" doesn't appear to go with the requirement for optimal.

See my previous post where your latest version with "negative_is_nan" doesn't seem to work for "double precision".

You may want to take note there has got to be something which can be gained by simplicity, a trivial check which makes use of the simple IEEE definition which is only that the exponent bits are all one for "infinity" (and NAN) may get you more than the complex transformations you've shown here:

   use, intrinsic :: iso_fortran_env, only : I1 => int8, I2 => int16, I4 => int32, I8 => int64
   ..
   generic :: FAST_IS_NAN => FAST_IS_NAN_SP, FAST_IS_NAN_DP

contains

   elemental function FAST_IS_NAN_SP( x ) result(is_nan)

      ! Argument list
      real(SP), intent(in) :: x
      ! Function result
      logical :: is_nan

      ! Local variables
      integer(I1), parameter :: INFINITY = int( B'11111111', kind=I1 )
      integer, parameter :: NUM_EXP_BITS = 8
      integer, parameter :: POS_BEGIN_EXPONENT = 23
      integer(I4) :: n
      integer(I1) :: expnt

      n = transfer( source=x, mold=n )
      expnt = ibits( n, pos=POS_BEGIN_EXPONENT, len=NUM_EXP_BITS )
      is_nan = ( expnt == INFINITY )

      return

   end function

   elemental function FAST_IS_NAN_DP( x ) result(is_nan)

      ! Argument list
      real(DP), intent(in) :: x
      ! Function result
      logical :: is_nan

      ! Local variables
      integer(I2), parameter :: INFINITY = int( B'11111111111', kind=I2 )
      integer, parameter :: NUM_EXP_BITS = 11
      integer, parameter :: POS_BEGIN_EXPONENT = 52
      integer(I8) :: n
      integer(I2) :: expnt

      n = transfer( source=x, mold=n )
      expnt = ibits( n, pos=POS_BEGIN_EXPONENT, len=NUM_EXP_BITS )
      is_nan = ( expnt == INFINITY )

      return

   end function

Using this simple code, the test in the previous post yields:

C:\Temp>ifort /O2 /QxHost /Qopenmp TestNan.f90
Intel(R) Visual Fortran Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.1.2.254 Build 20200623
Copyright (C) 1985-2020 Intel Corporation.  All rights reserved.

Microsoft (R) Incremental Linker Version 14.26.28806.0
Copyright (C) Microsoft Corporation.  All rights reserved.

-out:TestNan.exe
-subsystem:console
-defaultlib:libiomp5md.lib
-nodefaultlib:vcomp.lib
-nodefaultlib:vcompd.lib
TestNan.obj

C:\Temp>TestNan
 IEEE_IS_NAN: count of NAN values =          135
 FAST_IS_NAN: count of NAN values =          135

C:\Temp>

FortranFan · ‎07-25-2020

And compared to IEEE_IS_NAN function, the simple code I show above does give 5x to 10x improvement in performance:

C:\Temp>ff-nan.exe
 IEEE_IS_NAN(SP_NAN) T
 FAST_IS_NAN(SP_NAN) T
 IEEE_IS_NAN(DP_NAN) T
 FAST_IS_NAN(DP_NAN) T
 Expected number of SP NAN values:     14834547
 Expected number of DP NAN values:     17695573

IS_NAN Method           OMP time                 Number of NAN values found
                        (secs)
IEEE_IS_NAN(rvals(I))      62.                   14834547
FAST_IS_NAN(rvals(I))      6.4                   14834547
IEEE_IS_NAN(dvals(I))      67.                   17695573
FAST_IS_NAN(dvals(I))      12.                   17695573

C:\Temp>

jimdempseyatthecove · ‎07-26-2020

>>Your latest "fast" version code doesn't work for "DOUBLE PRECISION".

Will look into this using your Test program (mine only did one test).

>>You may want to take note there has got to be something which can be gained by simplicity, a trivial check which makes use of the simple IEEE definition which is only that the exponent bits are all one for "infinity" (and NAN)

Infinity and NaN both use exponent of all 1's
Infinity has mantissa of 0
sNaN has msb of mantissa 1 and the remaining bits undefined
qNaN has msb of mantissa 0 and the remaining bits undefined but not all 0's

For all three, the sign bit is not material.

The test I devised is:

ABS to remove the sign bit. Assuming the number is already in a simd register this is one instruction, 1/2 clock
Integer add with all mantissa bits set, both NaN formats will carry into sign bit (infinity will not, regular and sub-normal numbers will not), 1/2 clock

When inlined, test is reduced to 1 clock cycle (excluding how the data got in a simd register and how it is disposed of). Accumulation of condition can be made with IOR (another 1/2 clock cycle). Counting is a different story. Haven't decided yet, it should be doable in two 1/2 clock cycles.

I cannot experiment right now as installation of PS2020u2 has caused a cascade of problems, that I am in the process of unsnarling.

Jim Dempsey

jimdempseyatthecove · ‎07-26-2020

re: DP incorrect

My bad, when I typed in the parameters, I entered 3 nibbles as 3 bytes.

The correct values are:

    integer(8), parameter :: IEEE754_DP_SIGN = Z'8000000000000000'
    integer(8), parameter :: IEEE754_DP_EXPONENT = Z'7FF0000000000000'
    ! *** Due to compiler (Intel 2020u1/u2) optimization bug we cannot use a parameter
!    integer(8), parameter :: IEEE754_DP_MANTISSA =  Z'000000FFFFFFFFFF'
    ! instead using memory stored variable works
    ! at some point this can be restored to parameter
    integer(8) :: IEEE754_DP_MANTISSA =  Z'000FFFFFFFFFFFFF'

using your array prep (salting arrays with qNaNs) and performing count of NaNs in place of any NaNs (and adding one extra count loop where counter is INTEGER(8)):

IEEE_IS_NAN(SP_NAN) T
 NEGATIVE_IS_NAN(SP_NAN) T
 FAST_IS_NAN(SP_NAN) T
 IEEE_IS_NAN(DP_NAN) T
 NEGATIVE_IS_NAN(DP_NAN) T
 FAST_IS_NAN(DP_NAN) T
 IEEE_IS_NAN(SP(I))                   27.2217424000264         838154450
 NEGATIVE_IS_NAN(SP(I)) < 0            5.68242450000253        838154450
 FAST_IS_NAN(SP(I))                    6.12305860000197        838154450
 IEEE_IS_NAN(DP(I))                   27.1758321999805         831400050
 NEGATIVE_IS_NAN(DP(I)) < 0_8          5.70193209999707        831400050
 NEGATIVE_IS_NAN(DP(I)) COUNT_NANS_8   5.68927289999556        831400050
 FAST_IS_NAN(DP(I))                    6.13013249996584        831400050

(I hand edited the output to align the columns)

The above results were with using arrays of dimension 100,000,000 (much larger than LLC)

Reducing to your array sizes of 1000 (and increasing reps)

IEEE_IS_NAN(SP_NAN) T
 NEGATIVE_IS_NAN(SP_NAN) T
 FAST_IS_NAN(SP_NAN) T
 IEEE_IS_NAN(DP_NAN) T
 NEGATIVE_IS_NAN(DP_NAN) T
 FAST_IS_NAN(DP_NAN) T
 IEEE_IS_NAN(SP(I))                   22.7590829000110         675000000
 NEGATIVE_IS_NAN(SP(I)) < 0            6.31552950001787        675000000
 FAST_IS_NAN(SP(I))                    6.46725129999686        675000000
 IEEE_IS_NAN(DP(I))                   18.7217154999962          65000000
 NEGATIVE_IS_NAN(DP(I)) < 0_8          6.22576640004991         65000000
 NEGATIVE_IS_NAN(DP(I)) COUNT_NANS_8   6.30040659999941         65000000
 FAST_IS_NAN(DP(I))                    6.29617970000254         65000000

The specific loop structure with counts is not as vector friendly as it could be, this may require a little reconfiguration of the loop. I will get back on this. The above run was on a system with AVX512, the earlier run on AVX (not AVX2).

Thank you for testing the code and finding the error..

Jim Dempsey

jimdempseyatthecove · ‎07-26-2020

Now I modified the code to count NaNs to vectorize by way of local array.

    ! Now timed passes
    T0 = OMP_GET_WTIME()
    DO J=1, reps
        COUNT_NANS = 0
        BLOCK
            INTEGER :: COUNT_NANS_4(0:15)
            INTEGER :: J
            COUNT_NANS_4 = 0
            DO I=1,N - MOD(N,16), 16
                DO J=0,15
                    SP(I+J) = SP(I+J) * 1.00001
                    IF(NEGATIVE_IS_NAN(SP(I+J)) < 0) COUNT_NANS_4(J) = COUNT_NANS_4(J) + 1
                END DO
            END DO
            DO J=0,15
                COUNT_NANS = COUNT_NANS + COUNT_NANS_4(J)
            END DO
            DO I=N-MOD(N,16)+1,N
                SP(I) = SP(I) * 1.00001
                IF(NEGATIVE_IS_NAN(SP(I)) < 0) COUNT_NANS = COUNT_NANS + 1
            END DO
        END BLOCK
    END DO
    T1 = OMP_GET_WTIME()
    PRINT *, "NEGATIVE_IS_NAN(SP(I)) COUNT_NANS_4", T1-T0, COUNT_NANS

That is the first double nested loops (vector sum of NaN's of each vector of array as it is being modified)

DP loop:

    ! Now timed passes
    T0 = OMP_GET_WTIME()
    DO J=1, reps
        COUNT_NANS = 0
        BLOCK
            INTEGER(8) :: COUNT_NANS_8(0:7)
            INTEGER :: J
            COUNT_NANS_8 = 0_8
            DO I=1,N - MOD(N,8), 8
                DO J=0,7
                    DP(I+J) = DP(I+J) * 1.00001_8
                    IF(NEGATIVE_IS_NAN(DP(I+J)) < 0_8) COUNT_NANS_8(J) = COUNT_NANS_8(J) + 1_8
                END DO
            END DO
            DO J=0,7
                COUNT_NANS = COUNT_NANS + COUNT_NANS_8(J)
            END DO
            DO I=N-MOD(N,8)+1,N
                DP(I) = DP(I) * 1.00001_8
                IF(NEGATIVE_IS_NAN(DP(I)) < 0_8) COUNT_NANS = COUNT_NANS + 1
            END DO
        END BLOCK
    END DO
    T1 = OMP_GET_WTIME()
    PRINT *, "NEGATIVE_IS_NAN(DP(I)) COUNT_NANS_8", T1-T0, COUNT_NANS

If testing shows I can use the parameter (as opposed to variable) for the NaN to carry I can eliminate the memory reference for the first vpaddq and vpandq. A little more experimentation to get the parameters into registers.

Jim Dempsey

jimdempseyatthecove · ‎07-26-2020

I see the issue now, the ABS is using a memory value to strip off the sign bit.

This can be done a little better. Will experiment by adding specialize vector variants to functions, if that be an issue.

Jim Dempsey