Intel® Fortran Compiler

Bizarre results obtained with a simple program

curro_p__b_
Beginner
2,058 Views

I have some experience with the gfortran compiler but no experience whatsoever with ifort, so I am probably doing something terribly wrong in what follows. I have recently installed parallel_studio_xe_2016_update1 on my laptop running Debian (Jessie release). I have prepared a very simple program with a very naive approach to computing Pi using the Leibniz series. It is probably one of the worst ways of computing Pi, but I just wanted to check vectorization and the stability of the program. The program has a version for single precision and another for double precision.
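
(For reference, the Leibniz series is pi/4 = 1 - 1/3 + 1/5 - 1/7 + ..., i.e. the sum over i >= 0 of (-1)**i/(2*i+1), which is what the loops below accumulate.)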

The single precision version is the following:

 

PROGRAM VECT_TEST_SINGLE
  IMPLICIT NONE
  INTEGER, PARAMETER :: SP = KIND(1.0)
  INTEGER, PARAMETER :: DP = KIND(1.0D0)
  !
  REAL(KIND=SP) :: Pi_app
  REAL(KIND=DP) :: time_start, time_end, time_tot
  INTEGER :: I,J
  !
  time_tot = 0.0_dp
  Pi_app = 0.0_SP
  !
  DO J = 0, 10
     Pi_app = 0.0_SP
     !
     CALL CPU_TIME(time_start)
     !
     DO I = 0, (2**J)*100000
        Pi_app = Pi_app + (-1.0_SP)**I/REAL(2*I+1,SP)
     END DO
     !
     CALL CPU_TIME(time_end)
     !
     time_tot = time_tot + time_end-time_start
     !
     PRINT*, (2**J)*100000, time_tot, 4.0_SP*Pi_app, 4.0_SP*Pi_app-ACOS(-1.0_SP)
     !
  ENDDO
  !
END PROGRAM VECT_TEST_SINGLE

The double precision version has only minor variations:

PROGRAM VECT_TEST_DOUBLE
  IMPLICIT NONE
  INTEGER, PARAMETER :: DP = KIND(1.0D0)
  !
  REAL(KIND=DP) :: Pi_app
  REAL(KIND=DP) :: time_start, time_end, time_tot
  INTEGER :: I,J
  !
  time_tot = 0.0_dp
  Pi_app = 0.0_DP
  !
  DO J = 0, 10
     Pi_app = 0.0_DP
     CALL CPU_TIME(time_start)
     !
     DO I = 0, (2**J)*100000
        Pi_app = Pi_app + (-1.0_DP)**I/REAL(2*I+1,DP)
     END DO
     !
     CALL CPU_TIME(time_end)
     !
     time_tot = time_tot + time_end-time_start
     !
     PRINT*, (2**J)*100000, time_tot, 4.0_DP*Pi_app, 4.0_DP*Pi_app-ACOS(-1.0_DP)
     !
  ENDDO
  !     
END PROGRAM VECT_TEST_DOUBLE

In both cases each output line shows the number of terms of the series summed, the cumulative time taken to do the sums, the approximate value of Pi, and the difference from ACOS(-1). I compiled the programs with the following Makefile for gfortran:

#
OPT=-O3
FC = gfortran
FLAGS = -march=native -fopt-info-vec-missed -fopt-info-vec-optimized


all: test_vectorize_Single test_vectorize_Double

.FORCE:	

test_vectorize_Single:	test_vectorize_Single.f90 Makefile .FORCE
	$(FC) $(OPT) $(FLAGS) -o $@_$(FC) $<

test_vectorize_Double:	test_vectorize_Double.f90 Makefile .FORCE
	$(FC) $(OPT) $(FLAGS)  -o $@_$(FC) $<

clean:	
	rm -f *.o *.s *.exe *.lst *.ppm  test_vectorize_Single_$(FC) test_vectorize_Double_$(FC)

and a different Makefile for ifort:

# 
OPT=-fast
INFO=-qopt-report=3
FLAGS = -L${MKLROOT}/lib/intel64 -lmkl_intel_lp64 -lmkl_core -lmkl_sequential -lpthread -lm
FC = ifort

all: test_vectorize_Single test_vectorize_Double

.FORCE:	

test_vectorize_Single:	test_vectorize_Single.f90 Makefile_ifort .FORCE
	$(FC) $(OPT)  $(INFO) -o $@_$(FC) $< $(FLAGS)

test_vectorize_Double:	test_vectorize_Double.f90 Makefile_ifort .FORCE
	$(FC) $(OPT) $(INFO) -o $@_$(FC) $< $(FLAGS)


clean:	
	rm -f *.o *.s *.exe *.lst *.ppm   test_vectorize_Single_$(FC) test_vectorize_Double_$(FC)

 

I find the results surprising, especially for single precision:

$ time ./test_vectorize_Single_gfortran 
      100000   0.0000000000000000        3.14160585       1.31130219E-05
      200000   0.0000000000000000        3.14160132       8.58306885E-06
      400000   4.0000000000000001E-003   3.14159846       5.72204590E-06
      800000   8.0000000000000002E-003   3.14159727       4.52995300E-06
     1600000   2.0000000000000000E-002   3.14159703       4.29153442E-06
     3200000   3.2000000000000001E-002   3.14159703       4.29153442E-06
     6400000   5.1999999999999991E-002   3.14159703       4.29153442E-06
    12800000   8.3999999999999991E-002   3.14159703       4.29153442E-06
    25600000  0.14799999999999999        3.14159679       4.05311584E-06
    51200000  0.26799999999999990        3.14159679       4.05311584E-06
   102400000  0.50400000000000000        3.14159679       4.05311584E-06

real	0m0.506s
user	0m0.504s
sys	0m0.000s

$ time ./test_vectorize_Single_ifort 
      100000  8.000000000000000E-003   3.141608      1.5497208E-05
      200000  2.800000000000000E-002   3.141596      3.5762787E-06
      400000  5.200000000000000E-002   3.141694      1.0156631E-04
      800000  8.799999999999999E-002   3.141549     -4.4107437E-05
     1600000  0.160000000000000        3.142432      8.3971024E-04
     3200000  0.304000000000000        3.140750     -8.4304810E-04
     6400000  0.604000000000000        3.147671      6.0782433E-03
    12800000   1.28000000000000        3.121287     -2.0306110E-02
    25600000   2.56000000000000        3.064596     -7.6996565E-02
    51200000   5.20000000000000        3.031954     -0.1096389    
   102400000   10.5840000000000        3.031954     -0.1096389    

real	0m10.582s
user	0m10.580s
sys	0m0.004s

The ifort-compiled program took much longer to execute, and its results were completely off the mark compared to the gfortran one. In the double precision case the results are quite similar, though the ifort-compiled executable still took much longer than the alternative:

$ time ./test_vectorize_Double_gfortran 
      100000   0.0000000000000000        3.1416026534897203        9.9998999272266076E-006
      200000   4.0000000000000001E-003   3.1415976535647618        4.9999749687223982E-006
      400000   8.0000000000000002E-003   3.1415951535834941        2.4999937009440032E-006
      800000   1.6000000000000000E-002   3.1415939035881548        1.2499983617075827E-006
     1600000   2.7999999999999997E-002   3.1415932785894536        6.2499966047013800E-007
     3200000   5.1999999999999991E-002   3.1415929660895618        3.1249976872871343E-007
     6400000   8.0000000000000016E-002   3.1415928098397976        1.5625000449048798E-007
    12800000  0.14000000000000001        3.1415927317148120        7.8125018898589360E-008
    25600000  0.25200000000000000        3.1415926926522508        3.9062457712901733E-008
    51200000  0.47999999999999998        3.1415926731216950        1.9531901873648394E-008
   102400000  0.93599999999999994        3.1415926633549081        9.7651149388866543E-009

real	0m0.938s
user	0m0.936s
sys	0m0.000s

$ time ./test_vectorize_Double_ifort 
      100000  1.200000000000000E-002   3.14160265348968       9.999899887258579E-006
      200000  2.800000000000000E-002   3.14159765356467       4.999974872355040E-006
      400000  5.599999999999999E-002   3.14159515358340       2.499993601912109E-006
      800000  0.108000000000000        3.14159390358802       1.249998222263571E-006
     1600000  0.192000000000000        3.14159327858927       6.249994810580972E-007
     3200000  0.364000000000000        3.14159296608931       3.124995213710235E-007
     6400000  0.720000000000000        3.14159280983955       1.562497522478168E-007
    12800000   1.44800000000000        3.14159273171429       7.812449842603542E-008
    25600000   2.93600000000000        3.14159269265213       3.906233603245823E-008
    51200000   5.98000000000000        3.14159267312262       1.953282380284804E-008
   102400000   12.2040000000000        3.14159266335631       9.766513375808472E-009

real	0m12.203s
user	0m12.208s
sys	0m0.000s

I am probably doing something wrong or using the wrong options; any explanation of what's going on, suggestion, or remark will be very welcome.

0 Kudos
12 Replies
TimP
Honored Contributor III
2,057 Views
I wouldn't have expected gfortran to vectorize with the options quoted, nor to take any measures to preserve accuracy. I may not have time the next few days to attempt conversion of your post to compilable form but may get to it eventually.
0 Kudos
JVanB
Valued Contributor II
2,059 Views

The precision results aren't that surprising. Pretend that the sum is partitioned into a sum of even terms and a sum of odd terms. For the even terms, consider that sum([(1.0/(4*i+1),i=0,N)]) = 0.710289793064215 + log(4.0*N+3)/4, approximately. Then when i in your sum hits 2**21, the next even term will be 1.0/(2*2**21+1) = 1.999995*2.0**(-23), while the sum of even terms will have reached about 1.131*2**2. Since the binary exponents now differ by 25, no terms will contribute to the sum of even terms beyond this point.

For the odd terms, sum([(1.0/(4*i+3),i=0,N)]) = -7.510837033330731E-002 + log(4.0*N+5)/4, so that when again i = 2**21, the next odd term will be 1.0/(2*2**21+3) = 1.9999986*2.0**(-23), but the sum will only be 1.869*2**1. Now the odd terms will be just off the least significant bit of the running sum, but with rounding they should still contribute until about i = 2**22, when the next term will be about 1.9999993*2.0**(-24) and the sum will be 1.955*2**1. So the odd sum gets roughly an extra 2**21 terms, each contributing 2.0**(-22), thus about 0.5 extra before it stops accumulating. This would make your value of pi low by about 2.0.

Now it's not so simple, because 2 accumulators was the worst-case scenario, and ifort probably uses something like 4 or 8 accumulators depending on your processor. I haven't repeated my analysis for different numbers of accumulators, but you could probably figure that number out from your final value of pi.

gfortran doesn't seem to be vectorizing with multiple accumulators. I don't know why ifort was so slow; maybe it did something dumb with the (-1.0_SP)**I term there. What happens if you change that to (-1)**I? To really tell what's happening, write out a subroutine containing only that inner loop and post the assembly code you get with -S for gfortran, and probably the same flag for ifort.
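
For what it's worth, here is a tiny single precision demo of the mechanism in the first paragraph (the program and names are mine, purely illustrative; the constants are the ones quoted above):

program ulp_demo
  implicit none
  integer, parameter :: SP = kind(1.0)
  real(SP) :: s, t
  s = 1.131_SP*2.0_SP**2           ! roughly the even-term partial sum when i reaches 2**21
  t = 1.0_SP/real(2*2**21+1, SP)   ! the next even term, about 1.999995*2.0**(-23)
  print *, 'term/ulp(sum) =', t/spacing(s)   ! just under 0.5
  print *, 's + t == s ?  ', s + t == s      ! T: the term no longer changes the sum
end program ulp_demo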

 

0 Kudos
TimP
Honored Contributor III
2,058 Views

-fp-model source would make the ifort setting more consistent with the gfortran choice. As RO pointed out, batching sums to enable vectorization can go bad. ifort will use up to 64 sums depending on the target architecture, under the default -fp-model fast. gfortran uses only enough batched sums to enable parallel reduction, and only if you permit it, e.g. by setting -ffast-math.

gfortran has an -Ofast option; if you adhere to the frequent recommendation not to use it, you should likewise not use ifort's -fast option. gfortran -ffast-math is not as risky as the same option in gcc/g++. gfortran -ffast-math -fno-cx-limited-range resembles ifort -fp-model fast -assume protect_parens. I don't think anything you showed in your code would require protect_parens, but I would recommend setting that or -standard-semantics.

0 Kudos
curro_p__b_
Beginner
2,057 Views

Thank you for your comments, and thanks to RO for his answer, which clarifies quite well what's going on. Following the suggestion, I substituted

        Pi_app = Pi_app + (-1.0_DP)**i/REAL(2*I+1,DP)

with

        Pi_app = Pi_app +  REAL((-1)**I,DP)/REAL(2*I+1,DP)

in the double precision case, and made a similar replacement in the single precision case. I also added the -ffast-math option to gfortran, and now the results are much more similar.

In the single precision case:

$ time ./test_vectorize_Single_gfortran 
      100000   0.0000000000000000        3.14155102      -4.17232513E-05
      200000   0.0000000000000000        3.14197350       3.80754471E-04
      400000   0.0000000000000000        3.14095759      -6.35147095E-04
      800000   4.0000000000000001E-003   3.14433718       2.74443626E-03
     1600000   4.0000000000000001E-003   3.13760018      -3.99255753E-03
     3200000   1.2000000000000000E-002   3.12833524      -1.32575035E-02
     6400000   2.0000000000000000E-002   3.12833476      -1.32579803E-02
    12800000   3.2000000000000001E-002   2.98875833     -0.152834415    
    25600000   5.1999999999999998E-002   2.98875809     -0.152834654    
    51200000   8.7999999999999995E-002   2.98875809     -0.152834654    
   102400000  0.16000000000000000        2.98875809     -0.152834654    

real	0m0.162s
user	0m0.160s
sys	0m0.000s

$ time ./test_vectorize_Single_ifort 
      100000  0.000000000000000E+000   3.141590     -2.6226044E-06
      200000  4.000000000000000E-003   3.141572     -2.1219254E-05
      400000  4.000000000000000E-003   3.141636      4.3392181E-05
      800000  1.600000000000000E-002   3.141636      4.3630600E-05
     1600000  3.200000000000000E-002   3.141635      4.2438507E-05
     3200000  5.200000000000000E-002   3.139176     -2.4166107E-03
     6400000  8.000000000000000E-002   3.158610      1.7016888E-02
    12800000  0.132000000000000        3.174740      3.3147335E-02
    25600000  0.224000000000000        3.076813     -6.4779997E-02
    51200000  0.412000000000000        3.076813     -6.4779997E-02
   102400000  0.792000000000000        3.076813     -6.4779997E-02

real	0m0.791s
user	0m0.788s
sys	0m0.008s

 

Now both results are off, probably for the same reason: loss of precision in the batched sums.

In the double precision case gfortran is still faster, but no longer by a factor of more than 10:

$ time ./test_vectorize_Double_gfortran 
      100000   0.0000000000000000        3.1416026534898136        9.9999000204853417E-006
      200000   0.0000000000000000        3.1415976535645713        4.9999747782081272E-006
      400000   4.0000000000000001E-003   3.1415951535836721        2.4999938790237763E-006
      800000   8.0000000000000002E-003   3.1415939035886247        1.2499988315539667E-006
     1600000   1.6000000000000000E-002   3.1415932785909066        6.2500111353003263E-007
     3200000   3.2000000000000001E-002   3.1415929660924249        3.1250263177184934E-007
     6400000   4.3999999999999997E-002   3.1415928098437020        1.5625390892282098E-007
    12800000   7.1999999999999995E-002   3.1415927317166967        7.8126903613195964E-008
    25600000  0.13200000000000001        3.1415926926538842        3.9064091073015561E-008
    51200000  0.24800000000000000        3.1415926731198458        1.9530052686178578E-008
   102400000  0.47599999999999998        3.1415926633545195        9.7647263608280355E-009

real	0m0.477s
user	0m0.476s
sys	0m0.000s


$ time ./test_vectorize_Double_ifort 
      100000  0.000000000000000E+000   3.14160265348968       9.999899887258579E-006
      200000  0.000000000000000E+000   3.14159765356467       4.999974872355040E-006
      400000  4.000000000000000E-003   3.14159515358340       2.499993601912109E-006
      800000  1.200000000000000E-002   3.14159390358802       1.249998222263571E-006
     1600000  2.800000000000000E-002   3.14159327858927       6.249994810580972E-007
     3200000  4.800000000000000E-002   3.14159296608931       3.124995213710235E-007
     6400000  8.400000000000001E-002   3.14159280983955       1.562497522478168E-007
    12800000  0.136000000000000        3.14159273171429       7.812449842603542E-008
    25600000  0.232000000000000        3.14159269265213       3.906233603245823E-008
    51200000  0.416000000000000        3.14159267312262       1.953282380284804E-008
   102400000  0.788000000000000        3.14159266335631       9.766513375808472E-009

real	0m0.789s
user	0m0.788s
sys	0m0.000s

I am not used to reading assembly, but I tried to get the assembly code for the loop in the single precision case. For gfortran it is:

  13:test_vectorize_Single.f90 ****   DO J = 0, 10
  14:test_vectorize_Single.f90 ****      Pi_app = 0.0_SP
  15:test_vectorize_Single.f90 ****      !
  16:test_vectorize_Single.f90 ****      CALL CPU_TIME(time_start)
  17:test_vectorize_Single.f90 ****      !
  18:test_vectorize_Single.f90 ****      DO I = 0, (2**J)*100000
  31                            .loc 1 18 0
  32 0008 41BCA086              movl    $100000, %r12d
  32      0100
  33                    .LBE3:
  34                    .LBE2:
   1:test_vectorize_Single.f90 **** PROGRAM VECT_TEST_SINGLE
  35                            .loc 1 1 0
  36 000e 55                    pushq   %rbp
  37                            .cfi_def_cfa_offset 32
  38                            .cfi_offset 6, -32
  39                    .LBB21:
  40                    .LBB12:
  41                    .LBB4:
  42                    .LBB5:
  19:test_vectorize_Single.f90 ****         Pi_app = Pi_app + REAL((-1)**I,SP)/REAL(2*I+1,SP)
  43                            .loc 1 19 0
  44 000f BD010000              movl    $1, %ebp
  44      00
  45                    .LBE5:
  46                    .LBE4:
  47                    .LBE12:
  48                    .LBE21:
   1:test_vectorize_Single.f90 **** PROGRAM VECT_TEST_SINGLE
  49                            .loc 1 1 0
  50 0014 53                    pushq   %rbx
  51                            .cfi_def_cfa_offset 40
  52                            .cfi_offset 3, -40
  13:test_vectorize_Single.f90 ****      Pi_app = 0.0_SP
  53                            .loc 1 13 0
  54 0015 31DB                  xorl    %ebx, %ebx
   1:test_vectorize_Single.f90 ****   INTEGER, PARAMETER :: DP = KIND(1.0D0)
  55                            .loc 1 1 0
  56 0017 4881EC18              subq    $536, %rsp
  56      020000
  57                            .cfi_def_cfa_offset 576
  58 001e C5FB1134              vmovsd  %xmm6, (%rsp)
  58      24
  59                    .LVL1:
  60                            .p2align 4,,10
  61 0023 0F1F4400              .p2align 3
  61      00
  62                    .L7:
  63                    .LBB22:
  16:test_vectorize_Single.f90 ****      !
  64                            .loc 1 16 0
  65 0028 488D7C24              leaq    32(%rsp), %rdi
  65      20
  66 002d 31C0                  xorl    %eax, %eax
  67                    .LBB13:
  18:test_vectorize_Single.f90 ****         Pi_app = Pi_app + REAL((-1)**I,SP)/REAL(2*I+1,SP)
  68                            .loc 1 18 0
  69 002f 4589E5                movl    %r12d, %r13d
  70                    .LBE13:
  71                            .loc 1 16 0
  72 0032 E8000000              call    _gfortran_cpu_time_8
  72      00
  73                    .LVL2:
  74                    .LBB14:
  18:test_vectorize_Single.f90 ****         Pi_app = Pi_app + REAL((-1)**I,SP)/REAL(2*I+1,SP)
  75                            .loc 1 18 0
  76 0037 89D9                  movl    %ebx, %ecx
  77 0039 C5F857C0              vxorps  %xmm0, %xmm0, %xmm0
  78 003d C5F96F25              vmovdqa .LC1(%rip), %xmm4
  78      00000000 
  79 0045 C5F96F3D              vmovdqa .LC3(%rip), %xmm7
  79      00000000 
  80 004d 41D3E5                sall    %cl, %r13d
  81                    .LVL3:
  82 0050 31C9                  xorl    %ecx, %ecx
  83 0052 418D45FD              leal    -3(%r13), %eax
  84 0056 418D7501              leal    1(%r13), %esi
  85 005a C1E802                shrl    $2, %eax
  86 005d 83C001                addl    $1, %eax
  87 0060 8D148500              leal    0(,%rax,4), %edx
  87      000000
  88                    .LVL4:
  89                    .L3:
  90                    .LBB11:
  91                    .LBB6:
  92                            .loc 1 19 0
  93 0067 C5D9DB0D              vpand   .LC3(%rip), %xmm4, %xmm1
  93      00000000 
  94 006f C5F172F1              vpslld  $1, %xmm1, %xmm1
  94      01
  95 0074 83C101                addl    $1, %ecx
  96 0077 C5C1FAC9              vpsubd  %xmm1, %xmm7, %xmm1
  97 007b C5F85BD1              vcvtdq2ps       %xmm1, %xmm2
  98 007f C5F172F4              vpslld  $1, %xmm4, %xmm1
  98      01
  99 0084 C5D9FE25              vpaddd  .LC2(%rip), %xmm4, %xmm4
  99      00000000 
 100 008c C5F1FECF              vpaddd  %xmm7, %xmm1, %xmm1
 101 0090 C5F85BC9              vcvtdq2ps       %xmm1, %xmm1
 102 0094 C5F853D9              vrcpps  %xmm1, %xmm3
 103 0098 C5E059C9              vmulps  %xmm1, %xmm3, %xmm1
 104 009c C5E059C9              vmulps  %xmm1, %xmm3, %xmm1
 105 00a0 C5E058DB              vaddps  %xmm3, %xmm3, %xmm3
 106 00a4 C5E05CC9              vsubps  %xmm1, %xmm3, %xmm1
 107 00a8 C5E859C9              vmulps  %xmm1, %xmm2, %xmm1
 108 00ac C5F858C1              vaddps  %xmm1, %xmm0, %xmm0
 109 00b0 39C1                  cmpl    %eax, %ecx
 110 00b2 72B3                  jb      .L3
 111 00b4 C5FB7CC0              vhaddps %xmm0, %xmm0, %xmm0
 112 00b8 C5FB7CC0              vhaddps %xmm0, %xmm0, %xmm0
 113 00bc 39F2                  cmpl    %esi, %edx
 114 00be 746F                  je      .L5
 115                    .LVL5:
 116 00c0 C5FA1035              vmovss  .LC7(%rip), %xmm6
 116      00000000 
 117 00c8 8D441201              leal    1(%rdx,%rdx), %eax
 118 00cc C5F057C9              vxorps  %xmm1, %xmm1, %xmm1
 119 00d0 C5F22AC8              vcvtsi2ss       %eax, %xmm1, %xmm1
 120                    .LBE6:
  18:test_vectorize_Single.f90 ****         Pi_app = Pi_app + REAL((-1)**I,SP)/REAL(2*I+1,SP)
 121                            .loc 1 18 0
 122 00d4 8D4A01                leal    1(%rdx), %ecx
 123                    .LBB7:
 124                            .loc 1 19 0
 125 00d7 C5CA5EC9              vdivss  %xmm1, %xmm6, %xmm1
 126 00db C5FA58C1              vaddss  %xmm1, %xmm0, %xmm0
 127                    .LVL6:
 128                    .LBE7:
  18:test_vectorize_Single.f90 ****         Pi_app = Pi_app + REAL((-1)**I,SP)/REAL(2*I+1,SP)
 129                            .loc 1 18 0
 130 00df 4139D5                cmpl    %edx, %r13d
 131 00e2 744B                  je      .L5
 132                    .LBB8:
 133                            .loc 1 19 0
 134 00e4 89C8                  movl    %ecx, %eax
 135 00e6 89EE                  movl    %ebp, %esi
 136 00e8 C5F057C9              vxorps  %xmm1, %xmm1, %xmm1
 137 00ec C5E857D2              vxorps  %xmm2, %xmm2, %xmm2
 138 00f0 83E001                andl    $1, %eax
 139                    .LBE8:
  18:test_vectorize_Single.f90 ****         Pi_app = Pi_app + REAL((-1)**I,SP)/REAL(2*I+1,SP)
 140                            .loc 1 18 0
 141 00f3 83C202                addl    $2, %edx
 142                    .LBB9:
 143                            .loc 1 19 0
 144 00f6 01C0                  addl    %eax, %eax
 145 00f8 29C6                  subl    %eax, %esi
 146 00fa 8D440901              leal    1(%rcx,%rcx), %eax
 147 00fe C5F22ACE              vcvtsi2ss       %esi, %xmm1, %xmm1
 148 0102 C5EA2AD0              vcvtsi2ss       %eax, %xmm2, %xmm2
 149 0106 C5F25ECA              vdivss  %xmm2, %xmm1, %xmm1
 150 010a C5FA58C1              vaddss  %xmm1, %xmm0, %xmm0
 151                    .LVL7:
 152                    .LBE9:
  18:test_vectorize_Single.f90 ****         Pi_app = Pi_app + REAL((-1)**I,SP)/REAL(2*I+1,SP)
 153                            .loc 1 18 0
 154 010e 4139CD                cmpl    %ecx, %r13d
 155 0111 741C                  je      .L5
 156                    .LBB10:
 157                            .loc 1 19 0
 158 0113 8D441201              leal    1(%rdx,%rdx), %eax
 159 0117 C5F057C9              vxorps  %xmm1, %xmm1, %xmm1
 160 011b C5E857D2              vxorps  %xmm2, %xmm2, %xmm2
 161 011f C5F22ACD              vcvtsi2ss       %ebp, %xmm1, %xmm1
 162 0123 C5EA2AD0              vcvtsi2ss       %eax, %xmm2, %xmm2
 163 0127 C5F25ECA              vdivss  %xmm2, %xmm1, %xmm1
 164 012b C5FA58C1              vaddss  %xmm1, %xmm0, %xmm0
 165                    .LVL8:
 166                    .L5:
 167                    .LBE10:
 168                    .LBE11:
 169                    .LBE14:
  20:test_vectorize_Single.f90 ****      END DO

The same for ifort is:

###   DO J = 0, 10
###      Pi_app = 0.0_SP
###      !
###      CALL CPU_TIME(time_start)
###      !
###      DO I = 0, (2**J)*100000
###         Pi_app = Pi_app + REAL((-1)**I,SP)/REAL(2*I+1,SP)

        movl      $8, %eax                                      #19.36
..LN20:
        movl      $16, %edx                                     #19.44
..LN21:
        .loc    1  13  is_stmt 1
        xorl      %ecx, %ecx                                    #13.3
..LN22:
        .loc    1  10  is_stmt 1
        vxorpd    %xmm9, %xmm9, %xmm9                           #10.3
..LN23:
        .loc    1  19  is_stmt 1
        movl      %ecx, %r12d                                   #19.44
..LN24:
        .loc    1  26  is_stmt 1

###      END DO
###      !
###      CALL CPU_TIME(time_end)
###      !
###      time_tot = time_tot + time_end-time_start
###      !
###      PRINT*, (2**J)*100000, time_tot, 4.0_SP*Pi_app, 4.0_SP*Pi_app-ACOS(-1.0_SP)

Apart from being much shorter, probably due to the options I used in each case, it seems that gfortran breaks the sum into more batched lots, but I am not at all sure of this. Anyway, thanks again for the clarifications.

0 Kudos
JVanB
Valued Contributor II
2,058 Views

For gfortran the inner loop is lines 89:110 and is computing sums 4 abreast. I can't believe that gfortran is winning the race with this code; one would think that one could sum 8 terms simultaneously in 8 instructions. What kind of processor are you using, such that gfortran isn't using the full-width ymm registers or FMA instructions?

The code shown for ifort doesn't seem to have anything to do with the loop. What I had hoped you might do is to split out the inner loop as a subroutine and compile it as a separate file, like this:

! inner.f90
subroutine inner(Pi_app,J)
   implicit none
   integer, parameter :: SP = kind(1.0)
   real(SP) Pi_app
   integer I, J
   Pi_app = 0
     DO I = 0, (2**J)*100000
        Pi_app = Pi_app + (-1)**I/REAL(2*I+1,SP)
     END DO
 end subroutine inner

And then just CALL inner(Pi_app,J) in your main program. That would take out a lot of the noise so you could post the whole *.s file without it being too wordy. Also, don't tell the compiler to include a Fortran source listing in the assembly file; that's just noise, too. The modularization might make it possible for someone to write some assembly code for comparison.
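
For reference, the single precision driver would then look something like this (just the obvious modification of your program, untested):

! main.f90 -- sketch of the driver calling the split-out inner loop
PROGRAM VECT_TEST_SINGLE
  IMPLICIT NONE
  INTEGER, PARAMETER :: SP = KIND(1.0)
  INTEGER, PARAMETER :: DP = KIND(1.0D0)
  REAL(KIND=SP) :: Pi_app
  REAL(KIND=DP) :: time_start, time_end, time_tot
  INTEGER :: J
  time_tot = 0.0_DP
  DO J = 0, 10
     CALL CPU_TIME(time_start)
     CALL inner(Pi_app, J)
     CALL CPU_TIME(time_end)
     time_tot = time_tot + time_end - time_start
     PRINT*, (2**J)*100000, time_tot, 4.0_SP*Pi_app, 4.0_SP*Pi_app - ACOS(-1.0_SP)
  END DO
END PROGRAM VECT_TEST_SINGLE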

 

0 Kudos
TimP
Honored Contributor III
2,057 Views

More suggestions (a quick sketch combining both follows the list):

1. Reverse the loop so as to add in increasing magnitude order.  

2. (-1)**I replaced by merge(1,-1,Iand(I,1)/=1)
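
For concreteness, a sketch of the inner routine with both changes applied (single precision, untested; the name is mine):

! inner_rev.f90
subroutine inner_rev(Pi_app,J)
   implicit none
   integer, parameter :: SP = kind(1.0)
   real(SP) Pi_app
   integer I, J
   Pi_app = 0.0_SP
   ! 1. reversed loop: the smallest terms are added first
   DO I = (2**J)*100000, 0, -1
      ! 2. merge/iand instead of (-1)**I
      Pi_app = Pi_app + MERGE(1,-1,IAND(I,1)/=1)/REAL(2*I+1,SP)
   END DO
end subroutine inner_rev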

0 Kudos
jimdempseyatthecove
Honored Contributor III
2,057 Views

This is a good example of:

When using calculations with limited precision (single, double, or quad for that matter) in a convergence loop...
Sometimes more calculations (iterations) == less accuracy.

Meaning computational machines differ from abstract mathematics.

Too many times I have seen code using convergence routines that iterate a specific number of times as opposed to self-determining when to exit.
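
For illustration, a self-terminating version of the single precision loop might look like this (a sketch; the exit test is the point, not the details):

program self_stop
  implicit none
  integer, parameter :: SP = kind(1.0)
  real(SP) :: Pi_app, term
  integer :: i
  Pi_app = 0.0_SP
  do i = 0, huge(i)/2 - 1
     term = merge(1,-1,iand(i,1)/=1)/real(2*i+1, SP)
     ! stop as soon as the next term can no longer change the running sum
     if (Pi_app + term == Pi_app) exit
     Pi_app = Pi_app + term
  end do
  print *, 'terms used:', i, '4*sum:', 4.0_SP*Pi_app
end program self_stop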

Jim Dempsey

0 Kudos
curro_p__b_
Beginner
2,058 Views

Thanks for all the comments. It is always surprising, and rewarding, how much one can learn from a little piece of code.

I made the two changes to the code that TP suggested in #8, and these are the new results:

## Single precision gfortran

$ time ./test_vectorize_Single_gfortran 
      100000   0.0000000000000000        3.14160252       9.77516174E-06
      200000   0.0000000000000000        3.14159870       5.96046448E-06
      400000   0.0000000000000000        3.14159775       5.00679016E-06
      800000   4.0000000000000001E-003   3.14159679       4.05311584E-06
     1600000   4.0000000000000001E-003   3.14159107      -1.66893005E-06
     3200000   1.2000000000000000E-002   3.14159393       1.19209290E-06
     6400000   2.0000000000000000E-002   3.14159393       1.19209290E-06
    12800000   3.5999999999999997E-002   3.14157295      -1.97887421E-05
    25600000   5.9999999999999998E-002   3.14157200      -2.07424164E-05
    51200000  0.10400000000000000        3.14156818      -2.45571136E-05
   102400000  0.17199999999999999        3.14157200      -2.07424164E-05

real	0m0.176s
user	0m0.172s
sys	0m0.000s
## Single precision ifort
$ time ./test_vectorize_Single_ifort 
      100000  0.000000000000000E+000   3.141605      1.2636185E-05
      200000  0.000000000000000E+000   3.141601      7.8678131E-06
      400000  0.000000000000000E+000   3.141597      4.0531158E-06
      800000  0.000000000000000E+000   3.141594      1.1920929E-06
     1600000  4.000000000000000E-003   3.141592     -7.1525574E-07
     3200000  8.000000000000000E-003   3.141591     -1.6689301E-06
     6400000  1.600000000000000E-002   3.141586     -6.4373016E-06
    12800000  2.800000000000000E-002   3.141595      2.1457672E-06
    25600000  5.200000000000000E-002   3.141596      3.0994415E-06
    51200000  8.799999999999999E-002   3.141596      3.0994415E-06
   102400000  0.144000000000000        3.141596      3.0994415E-06

real	0m0.146s
user	0m0.144s
sys	0m0.000s

 

 

Now the times are similar, though ifort beats gfortran, if not by much. Results for double precision are the following:

## Double precision gfortran 
$ time ./test_vectorize_Double_gfortran 
      100000   0.0000000000000000        3.1416026534897941        9.9999000009454164E-006
      200000   4.0000000000000001E-003   3.1415976535647934        4.9999750002527321E-006
      400000   1.2000000000000000E-002   3.1415951535835434        2.4999937502379055E-006
      800000   2.0000000000000000E-002   3.1415939035882308        1.2499984376468376E-006
     1600000   3.5999999999999997E-002   3.1415932785894025        6.2499960939987886E-007
     3200000   6.4000000000000001E-002   3.1415929660896955        3.1249990239956560E-007
     6400000  0.11199999999999999        3.1415928098397687        1.5624997562468934E-007
    12800000  0.18800000000000000        3.1415927317147876        7.8124994473682818E-008
    25600000  0.30800000000000000        3.1415926926522921        3.9062499013198249E-008
    51200000  0.53599999999999992        3.1415926731210431        1.9531249950688334E-008
   102400000  0.99600000000000000        3.1415926633554183        9.7656251973887720E-009

real	0m0.996s
user	0m0.996s
sys	0m0.004s
## Double precision ifort
$ time ./test_vectorize_Double_ifort 
      100000  0.000000000000000E+000   3.14160265348980       9.999900008494933E-006
      200000  0.000000000000000E+000   3.14159765356482       4.999975027786263E-006
      400000  4.000000000000000E-003   3.14159515358354       2.499993748017459E-006
      800000  8.000000000000000E-003   3.14159390358823       1.249998440755462E-006
     1600000  1.600000000000000E-002   3.14159327858937       6.249995774254558E-007
     3200000  3.200000000000000E-002   3.14159296608969       3.124998997350303E-007
     6400000  5.600000000000000E-002   3.14159280983973       1.562499356566605E-007
    12800000  9.200000000000000E-002   3.14159273171482       7.812503000081961E-008
    25600000  0.156000000000000        3.14159269265219       3.906239243178788E-008
    51200000  0.276000000000000        3.14159267312113       1.953133832444109E-008
   102400000  0.504000000000000        3.14159266335480       9.765006581119451E-009

real	0m0.508s
user	0m0.504s
sys	0m0.000s

 

Wow... Now this is what I (naively) expected from the beginning. The results are numerically sound, and double precision consistently takes more time than single precision (a factor of 5 for ifort, 10 for gfortran). Now ifort is also faster than gfortran in double precision.

Changing the summation order is a well-known and very nice trick; thanks to TP for reminding me of it in #8. As mentioned by JD in #9, floating point calculation is a field full of traps where your expectations are sometimes grossly unfulfilled. I guess this is the reason why numerical analysis is a branch of Mathematics in its own right, and an important one. One should be aware of this; my fault for forgetting it in this simple program.

I have to admit that the MERGE(1,-1,IAND(I,1)/=1) part was new to me, and until I checked what goes on in this snippet of code I could only guess its purpose. It is beautiful. I had seen MERGE before (not often), but not IAND, and never this (cute) combination. Now that I understand it, it is clear why it speeds the code up so much. As RO noted, the (-1)**I term was a bottleneck, and of course the conditional in the new version takes much less time than the "raised to" ** operator.

With regard to RO's #7, thank you for your help on the assembly side. Sadly, I find it very hard to distinguish noise from information in assembly, and I sent you the wrong assembly output because I thought that just adding the -c -S -fsource-asm flags to ifort was the way to get it. With gfortran I used the -c -g -Wa,-a,-ad flags. I can only begin to glimpse how powerful some fluency in assembly could be.

I am very thankful for all the suggestions. I didn't expect to learn so much from this message to the forum, and it has been very rewarding.

PS: RO, I am not sure this detail is needed anymore, but currently I am trying all this at home on my laptop, whose processor is an i5:

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 58
model name	: Intel(R) Core(TM) i5-3230M CPU @ 2.60GHz
stepping	: 9
microcode	: 0x12
cpu MHz		: 1206.460
cache size	: 3072 KB
physical id	: 0
siblings	: 4
core id		: 0
cpu cores	: 2
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms
bogomips	: 5188.03
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

processor	: 1
vendor_id	: GenuineIntel
cpu family	: 6
model		: 58
model name	: Intel(R) Core(TM) i5-3230M CPU @ 2.60GHz
stepping	: 9
microcode	: 0x12
cpu MHz		: 1200.062
cache size	: 3072 KB
physical id	: 0
siblings	: 4
core id		: 0
cpu cores	: 2
apicid		: 1
initial apicid	: 1
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms
bogomips	: 5188.03
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

processor	: 2
vendor_id	: GenuineIntel
cpu family	: 6
model		: 58
model name	: Intel(R) Core(TM) i5-3230M CPU @ 2.60GHz
stepping	: 9
microcode	: 0x12
cpu MHz		: 1204.937
cache size	: 3072 KB
physical id	: 0
siblings	: 4
core id		: 1
cpu cores	: 2
apicid		: 2
initial apicid	: 2
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms
bogomips	: 5188.03
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

processor	: 3
vendor_id	: GenuineIntel
cpu family	: 6
model		: 58
model name	: Intel(R) Core(TM) i5-3230M CPU @ 2.60GHz
stepping	: 9
microcode	: 0x12
cpu MHz		: 1199.960
cache size	: 3072 KB
physical id	: 0
siblings	: 4
core id		: 1
cpu cores	: 2
apicid		: 3
initial apicid	: 3
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms
bogomips	: 5188.03
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

Thanks again. 

0 Kudos
jimdempseyatthecove
Honored Contributor III
2,058 Views

Tim P. wrote:

More suggestions 

1. Reverse the loop so as to add in increasing magnitude order.  

2. (-1)**I replaced by merge(1,-1,Iand(I,1)/=1)

or  -(iand(I,1)*2 - 1)

The merge will (may) be faster on CPUs that support the conditional move. This will permit a non-branching evaluation.

The "obscure" expression I present will have no branches as well. Note, the compiler may optimize *2 with add to self. (and, add, dec, neg)This would be available on all CPU's and may potentially be faster than conditional move (though the compiler optimization might be smart enough to omit the conditional move).

Jim Dempsey

0 Kudos
TimP
Honored Contributor III
2,058 Views

For an old CPU without an efficient merge, 1-2*Iand(I,1) should work; I felt this might be the least self-explanatory option. You could cause this to use FMA on a current CPU in case you want more FP and fewer integer ops.
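
A trivial check that the alternatives agree with (-1)**I (illustrative only):

program sign_forms
  implicit none
  integer :: i
  ! all four columns should print 1, -1, 1, -1, ...
  do i = 0, 5
     print *, (-1)**i, merge(1,-1,iand(i,1)/=1), -(iand(i,1)*2-1), 1-2*iand(i,1)
  end do
end program sign_forms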

0 Kudos
JVanB
Valued Contributor II
2,058 Views

The ifort code turned out to be ugly, calling a library routine to compute (-1.0_SP)**I for each numerator, every time through the loop. With (-1.0_SP)**I and /QxCORE-AVX2, my results were:

      100000  0.000000000000000E+000   3.141608      1.5497208E-05
      200000  0.000000000000000E+000   3.141596      3.5762787E-06
      400000  1.562500000000000E-002   3.141694      1.0156631E-04
      800000  4.687500000000000E-002   3.141549     -4.4107437E-05
     1600000  9.375000000000000E-002   3.142432      8.3971024E-04
     3200000  0.218750000000000        3.140750     -8.4304810E-04
     6400000  0.453125000000000        3.147671      6.0782433E-03
    12800000  0.937500000000000        3.121287     -2.0306110E-02
    25600000   1.93750000000000        3.064596     -7.6996565E-02
    51200000   3.96875000000000        3.031954     -0.1096389
   102400000   8.15625000000000        3.031954     -0.1096389

Changing to (-1)**I as in Quote #6 helped as noted in Quote #5:

      100000  0.000000000000000E+000   3.141608      1.5497208E-05
      200000  0.000000000000000E+000   3.141596      3.5762787E-06
      400000  0.000000000000000E+000   3.141694      1.0156631E-04
      800000  0.000000000000000E+000   3.141549     -4.4107437E-05
     1600000  0.000000000000000E+000   3.142432      8.3971024E-04
     3200000  1.562500000000000E-002   3.140750     -8.4304810E-04
     6400000  3.125000000000000E-002   3.147671      6.0782433E-03
    12800000  6.250000000000000E-002   3.121287     -2.0306110E-02
    25600000  0.125000000000000        3.064596     -7.6996565E-02
    51200000  0.250000000000000        3.031954     -0.1096389
   102400000  0.500000000000000        3.031954     -0.1096389

To see what a little assembly language could do I tried:

format MS64 coff

public INNER
section '.text' code readable executable
INNER:
   xchg rax, rcx
   mov ecx, [rdx]
   mov edx, 100000
   shl edx, cl
   vmovdqu ymm0, [offsets]
   vpbroadcastq ymm1, [deltas]
   vbroadcastss ymm3, [two]
   vxorps ymm2, ymm2, ymm2
   sub edx, 8
   jl .skip
   .loop:
      vcvtdq2ps ymm4, ymm0
      vrcpps ymm5, ymm4
      vfnmadd132ps ymm4,ymm3,ymm5
      vfmadd231ps ymm2,ymm4,ymm5
      vpaddd ymm0, ymm0, ymm1
      sub edx, 8
      jge .loop
   .skip:
   vhaddps ymm2, ymm2, ymm2
   vhaddps ymm2, ymm2, ymm2
   vperm2f128 ymm5, ymm2, ymm2, 81h
   vaddss xmm5, xmm5, xmm2
   vmovss [rax], xmm5
   ret

section '.data' readable writeable align 16
align 16
   label offsets qqword
      dd 1,-3,5,-7,9,-11,13,-15
   label deltas qword
      dd 16,-16
   two dd 40000000h

And got:

      100000  0.000000000000000E+000   3.141568     -2.4318695E-05
      200000  0.000000000000000E+000   3.141561     -3.1948090E-05
      400000  0.000000000000000E+000   3.141631      3.7908554E-05
      800000  0.000000000000000E+000   3.141632      3.9100647E-05
     1600000  0.000000000000000E+000   3.141633      3.9815903E-05
     3200000  0.000000000000000E+000   3.139172     -2.4206638E-03
     6400000  0.000000000000000E+000   3.158608      1.7015219E-02
    12800000  0.000000000000000E+000   3.174737      3.3144474E-02
    25600000  0.000000000000000E+000   3.076810     -6.4783096E-02
    51200000  1.562500000000000E-002   3.076810     -6.4783096E-02
   102400000  3.125000000000000E-002   3.076810     -6.4783096E-02

Shows how wrong I was: it only took 7 instructions in the inner loop. Of course I am taking some shortcuts in my code for brevity in that there is no epilog to handle sums which aren't a multiple of 8 terms.

I tried flopping the accumulators to improve accuracy:

format MS64 coff

public INNER
section '.text' code readable executable
INNER:
   xchg rax, rcx
   mov ecx, [rdx]
   mov edx, 100000
   shl edx, cl
   vmovdqu [rsp+8], ymm6
   vmovdqu ymm6, [perms]
   vmovdqu ymm0, [offsets]
   vpbroadcastq ymm1, [deltas]
   vbroadcastss ymm3, [two]
   vxorps ymm2, ymm2, ymm2
   sub edx, 8
   jl .skip
   .loop:
      vcvtdq2ps ymm4, ymm0
      vpermps ymm0, ymm6, ymm0
      vpermps ymm1, ymm6, ymm1
      vrcpps ymm5, ymm4
      vfnmadd132ps ymm4,ymm3,ymm5
      vfmadd231ps ymm2,ymm4,ymm5
      vpaddd ymm0, ymm0, ymm1
      sub edx, 8
      jge .loop
   .skip:
   vhaddps ymm2, ymm2, ymm2
   vhaddps ymm2, ymm2, ymm2
   vperm2f128 ymm5, ymm2, ymm2, 81h
   vaddss xmm5, xmm5, xmm2
   vmovss [rax], xmm5
   vmovdqu ymm6, [rsp+8]
   ret

section '.data' readable writeable align 16
align 16
   label offsets qqword
      dd 1,-3,5,-7,9,-11,13,-15
   label perms qqword
      dd 1,0,3,2,5,4,7,6
   label deltas qword
      dd 16,-16
   two dd 40000000h

With result:

      100000  0.000000000000000E+000   3.141576     -1.6450882E-05
      200000  0.000000000000000E+000   3.141580     -1.3113022E-05
      400000  0.000000000000000E+000   3.141583     -1.0013580E-05
      800000  0.000000000000000E+000   3.141584     -8.5830688E-06
     1600000  0.000000000000000E+000   3.141585     -7.8678131E-06
     3200000  0.000000000000000E+000   3.141585     -7.8678131E-06
     6400000  0.000000000000000E+000   3.141585     -7.3909760E-06
    12800000  0.000000000000000E+000   3.141585     -7.3909760E-06
    25600000  0.000000000000000E+000   3.141585     -7.3909760E-06
    51200000  1.562500000000000E-002   3.141585     -7.3909760E-06
   102400000  3.125000000000000E-002   3.141585     -7.3909760E-06

But this last result is really cheating: I don't think one could reasonably expect the compiler to perform accumulator flopping for alternating series like an assembly language programmer would!

Edit: removed an extraneous ABI-violating instruction, left over from an old version, that invalidated the timing results.

 

0 Kudos
JVanB
Valued Contributor II
2,058 Views

A few things. First, I had some old code left over in the first assembly version in Quote #7: the save and restore of ymm6 wasn't there but the write to ymm6 still was, so it violated the Windows x64 ABI, with the result that the timing results were invalidated. Fixing this issue changed the time to 2 ticks of 1/64 of a second rather than 1.

Second, why should (-1)**I be replaced by some less obvious expression? Shouldn't the compiler be able to eliminate (-1)**i or (-1.0_SP)**I when it does strength reduction? I would have to say that failure to do so is a bug.
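
For reference, strength reduction here would amount to carrying the sign along and negating it each trip instead of recomputing a power, something like this sketch:

subroutine inner_sr(Pi_app,J)
   implicit none
   integer, parameter :: SP = kind(1.0)
   real(SP) Pi_app, sgn
   integer I, J
   Pi_app = 0
   sgn = 1
   DO I = 0, (2**J)*100000
      Pi_app = Pi_app + sgn/REAL(2*I+1,SP)
      sgn = -sgn       ! the strength-reduced form of (-1.0_SP)**I
   END DO
end subroutine inner_sr

The sgn = -sgn recurrence could, in principle, become a constant sign vector once the loop is unrolled by the vector width.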

OK, then what if we write out the first assembly example in good Fortran, thus telling the compiler how to vectorize the inner loop and apply strength reduction?

recursive subroutine inner(Pi_app,J)
   implicit none
   integer, parameter :: SP = kind(1.0)
   integer, parameter :: offsets(8) = [1,-3,5,-7,9,-11,13,-15]
   real(SP) Pi_app
   integer I, J
   integer dens(8), deltas(8)
   real sums(8)
   deltas = [16,-16,16,-16,16,-16,16,-16]
   sums = 0
   dens = offsets
     DO I = 0, (2**J)*100000-7, 8
        sums = sums+1/real(dens,SP)
        dens = dens+deltas
     END DO
   pi_app = 0
     DO I = I, (2**J)*100000
        Pi_app = Pi_app + (-1)**I/REAL(2*I+1,SP)
     END DO
   Pi_app = Pi_app+sum(sums)
end subroutine inner

The results were:
      100000  0.000000000000000E+000   3.141590     -2.6226044E-06
      200000  0.000000000000000E+000   3.141573     -1.9311905E-05
      400000  0.000000000000000E+000   3.141638      4.5299530E-05
      800000  1.562500000000000E-002   3.141636      4.3630600E-05
     1600000  1.562500000000000E-002   3.141637      4.4345856E-05
     3200000  1.562500000000000E-002   3.139176     -2.4166107E-03
     6400000  3.125000000000000E-002   3.158610      1.7016888E-02
    12800000  6.250000000000000E-002   3.174740      3.3147335E-02
    25600000  0.109375000000000        3.076813     -6.4779997E-02
    51200000  0.203125000000000        3.076813     -6.4779997E-02
   102400000  0.390625000000000        3.076813     -6.4779997E-02

Still horrible code, even though I made it easy for the compiler by leaving out my accumulator flopping stuff. In this case the inner loop was performed exclusively with scalar operations, and rather poor ones at that, using a division rather than a reciprocal followed by a Newton-Raphson iteration. Failure to recognize the idiom of performing 8 REAL32 or INT32 operations when written in vector form can only be described as another bug.

Finally, reordering the sums is not really necessary or even all that helpful, because vectorization changes one conditionally convergent series into several divergent series. Even if you take partial sums of these divergent series in reverse order, there will be stretches where the next term to be added is smaller than half a unit in the last place of the current sum, so chunks of the series won't contribute to the sum. Also the partial sums, even if they are accurate, are each significantly bigger than the final result, so significant figures are lost at the end. Accumulator flopping fixes that by summing several conditionally convergent series.
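
For what it's worth, the accumulator flopping trick written out in Fortran along the lines of the example above might look like this (a sketch; I haven't checked what a compiler makes of it):

recursive subroutine inner_flop(Pi_app,J)
   implicit none
   integer, parameter :: SP = kind(1.0)
   integer, parameter :: swap(8) = [2,1,4,3,6,5,8,7]
   real(SP) Pi_app
   integer I, J
   integer dens(8), deltas(8)
   real(SP) sums(8)
   dens = [1,-3,5,-7,9,-11,13,-15]
   deltas = [16,-16,16,-16,16,-16,16,-16]
   sums = 0
     DO I = 0, (2**J)*100000-7, 8
        sums = sums + 1/real(dens,SP)
        dens = dens(swap)       ! adjacent lanes trade denominators ("flopping"),
        deltas = deltas(swap)   ! so each lane sums an alternating series
        dens = dens + deltas
     END DO
   pi_app = 0
     DO I = I, (2**J)*100000
        Pi_app = Pi_app + (-1)**I/REAL(2*I+1,SP)
     END DO
   Pi_app = Pi_app+sum(sums)
end subroutine inner_flop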

Have you seen what can be done with just the first 13 terms of this series in https://en.wikipedia.org/wiki/Van_Wijngaarden_transformation ?
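
As I understand it, that transformation is essentially repeated averaging of adjacent partial sums; a quick double precision sketch of the idea (illustrative only):

program vw_demo
  implicit none
  integer, parameter :: DP = kind(1.0d0)
  integer, parameter :: n = 13
  real(DP) :: row(n), s
  integer :: i, k
  ! the first 13 partial sums of the Leibniz series
  s = 0.0_DP
  do i = 1, n
     s = s + (-1.0_DP)**(i-1)/real(2*i-1,DP)
     row(i) = s
  end do
  ! repeatedly average adjacent entries; the survivor estimates pi/4
  do k = n-1, 1, -1
     row(1:k) = 0.5_DP*(row(1:k) + row(2:k+1))
  end do
  print *, 4.0_DP*row(1), 4.0_DP*row(1) - acos(-1.0_DP)
end program vw_demo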

I tried summing 16 abreast rather than 8 because I thought that latency on the accumulators due to the 5-cycle vfmadd231ps instruction might be slowing things down, so that with twice the accumulators, latency should be half as problematic. Also I separated out the first 16 terms and only added them in at the end so that roundoff errors in the convergent series (because I applied accumulator flopping) would be lessened. Thus I got a solution that sometimes got done in one tick and sometimes was good to 1 unit in the last place.

format MS64 coff

public INNER
section '.text' code readable executable align 16
align 16
INNER:
   vmovdqu [rsp+8], ymm6
   xchg rax, rcx
   mov ecx, [rdx]
   mov edx, 100000
   shl edx, cl
   vmovdqu ymm0, [offsets]
   vpbroadcastq ymm1, [deltas]
   vbroadcastss ymm3, [two]
   vxorps ymm2, ymm2, ymm2
   vxorps ymm6, ymm6, ymm6
   vpaddd ymm0, ymm0, ymm1
   vpaddd ymm0, ymm0, ymm1
   sub edx, 32
   jl .skip
   .loop:
      vcvtdq2ps ymm4, ymm0
      vpshufd ymm0, ymm0, 0b1h
      vpshufd ymm1, ymm1, 0b1h
      vrcpps ymm5, ymm4
      vfnmadd132ps ymm4,ymm3,ymm5
      vfmadd231ps ymm2,ymm4,ymm5
      vpaddd ymm0, ymm0, ymm1
      vcvtdq2ps ymm4, ymm0
      vrcpps ymm5, ymm4
      vfnmadd132ps ymm4,ymm3,ymm5
      vfmadd231ps ymm6,ymm4,ymm5
      vpaddd ymm0, ymm0, ymm1
      sub edx, 16
      jge .loop
   vmovdqu ymm0, [offsets]
   vpbroadcastq ymm1, [deltas]
   vaddps ymm6, ymm6, ymm2
   vcvtdq2ps ymm4, ymm0
   vrcpps ymm5, ymm4
   vfnmadd132ps ymm4,ymm3,ymm5
   vmulps ymm2,ymm4,ymm5
   vpaddd ymm0, ymm0, ymm1
   vcvtdq2ps ymm4, ymm0
   vrcpps ymm5, ymm4
   vfnmadd132ps ymm4,ymm3,ymm5
   vfmadd231ps ymm6,ymm4,ymm5
   .skip:
   vaddps ymm2, ymm2, ymm6
   vhaddps ymm2, ymm2, ymm2
   vhaddps ymm2, ymm2, ymm2
   vextractf128 xmm5, ymm2, 01h
   vaddss xmm5, xmm5, xmm2
   vmovss [rax], xmm5
   vmovdqu ymm6, [rsp+8]
   ret

section '.data' readable writeable align 16
align 16
   label offsets qqword
      dd 1,-3,5,-7,9,-11,13,-15
   label deltas qword
      dd 16,-16
   two dd 40000000h

The results:
      100000  0.000000000000000E+000   3.141582     -1.0967255E-05
      200000  0.000000000000000E+000   3.141587     -5.7220459E-06
      400000  0.000000000000000E+000   3.141589     -3.3378601E-06
      800000  0.000000000000000E+000   3.141591     -1.9073486E-06
     1600000  0.000000000000000E+000   3.141591     -1.4305115E-06
     3200000  0.000000000000000E+000   3.141592     -1.1920929E-06
     6400000  0.000000000000000E+000   3.141592     -7.1525574E-07
    12800000  0.000000000000000E+000   3.141592     -7.1525574E-07
    25600000  0.000000000000000E+000   3.141592     -7.1525574E-07
    51200000  0.000000000000000E+000   3.141592     -7.1525574E-07
   102400000  1.562500000000000E-002   3.141593     -2.3841858E-07

 

0 Kudos