Intel® Fortran Compiler
Build applications that can scale for the future with optimized code designed for Intel® Xeon® and compatible processors.
28436 Discussions

## SIGSEGV in an argument on a subroutine

Beginner
689 Views

Hi, I was trying to vectorize a loop that contains a lot of IF blocks, using the following directive:

```  m=0.0
! form banded matrix of Poisson equation
pok=3.
call annotate_site_begin( "pressure" )
do 99 k=2,kbm1
do 99 i=2,imm1
!DIR\$ SIMD LASTPRIVATE(bb2) REDUCTION(+:m, gc2, gc1, gen)
do 99 j=2,jmm1

if (k+1<=kb.and.i+1<=im)then
aa1(i+1,j,k+1)=.25e0*aaf(i+1,j,k+1)
1 *.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)/art(i,j)
1 +.25e0*aaf(i,j,k+1)*dq(i+1,j)/dq(i,j)
1 *dy(i,j)/ddx(i,j)/art(i,j)
endif

if (k+1<=kb.and.i-1>=1)then
aa2(i-1,j,k+1)=-.25e0*aaf(i-1,j,k+1)
1 *.5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)/art(i,j)
1 -.25e0*aaf(i,j,k+1)*dq(i-1,j)/dq(i,j)
1 *dy(i,j)/ddx(i,j)/art(i,j)
end if
if (k-1>=1.and.i+1<=im)then
aa3(i+1,j,k-1)=-.25e0*aaf(i+1,j,k-1)
1 *.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)/art(i,j)
1 -.25e0*aaf(i,j,k-1)*dq(i+1,j)/dq(i,j)
1 *dy(i,j)/ddx(i,j)/art(i,j)
end if
if (k-1>=1.and.i-1>=1)then
aa4(i-1,j,k-1)=.25e0*aaf(i-1,j,k-1)
1 *.5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)/art(i,j)
1 +.25e0*aaf(i,j,k-1)*dq(i-1,j)/dq(i,j)
1 *dy(i,j)/ddx(i,j)/art(i,j)
end if
if (k+1<=kb.and.j+1<=jm)then
bb1(i,j+1,k+1)=.25e0*bbf(i,j+1,k+1)
1 *.5*(dx(i,j)+dx(i,j+1))/ddy(i,j)/art(i,j)
1 +.25e0*bbf(i,j,k+1)*dq(i,j+1)/dq(i,j)
1 *dx(i,j)/ddy(i,j)/art(i,j)
end if
if (k+1<=kb.and.j-1>=1)then
bb2(i,j-1,k+1)=-.25e0*bbf(i,j-1,k+1)
1 *.5*(dx(i,j)+dx(i,j-1))/ddy(i,j-1)/art(i,j)
1 -.25e0*bbf(i,j,k+1)*dq(i,j-1)/dq(i,j)
1 *dx(i,j)/ddy(i,j)/art(i,j)
end if
if (k-1>=1.and.j+1<=jm)then
bb3(i,j+1,k-1)=-.25e0*bbf(i,j+1,k-1)
1 *.5*(dx(i,j)+dx(i,j+1))/ddy(i,j)/art(i,j)
1 -.25e0*bbf(i,j,k-1)*dq(i,j+1)/dq(i,j)
1 *dx(i,j)/ddy(i,j)/art(i,j)
end if
if (k-1>=1.and.j-1>=1) then
bb4(i,j-1,k-1)=.25e0*bbf(i,j-1,k-1)
1 *.5*(dx(i,j)+dx(i,j-1))/ddx(i,j-1)/art(i,j)
1 +.25e0*bbf(i,j,k-1)*dq(i,j-1)/dq(i,j)
1 *dx(i,j)/ddy(i,j)/art(i,j)
end if
if (i+1<=im) then
ga1(i+1,j,k)=dz(k)*dq(i+1,j)
1 *.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)/art(i,j)
end if
if (i-1>=1) then
ga2(i-1,j,k)=dz(k)*dq(i-1,j)
1 *.5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)/art(i,j)
end if
if (j+1<=jm) then
gb1(i,j+1,k)=dz(k)*dq(i,j+1)
1 *.5*(dx(i,j)+dx(i,j+1))/ddy(i,j)/art(i,j)
end if
if (j-1>=1) then
gb2(i,j-1,k)=dz(k)*dq(i,j-1)
1 *.5*(dx(i,j)+dx(i,j-1))/ddy(i,j-1)/art(i,j)
end if
if (k+1<=kb) then
gc1(i,j,k+1)=1.e0/(dzz(k)*dq(i,j))*
1 (art(i,j)+.5*(aaf(i,j,k+1)+aaf(i,j,k))*aaf(i,j,k+1)*dy(i,j)/
1 dx(i,j)+.5*(bbf(i,j,k+1)+bbf(i,j,k))*bbf(i,j,k+1)*dx(i,j)/
1 dy(i,j))/art(i,j)
end if
if (k-1>=1) then
gc2(i,j,k-1)=1.e0/(dzz(k-1)*dq(i,j))*
1 (art(i,j)+.5*(aaf(i,j,k-1)+aaf(i,j,k))*aaf(i,j,k-1)*dy(i,j)/
1 dx(i,j)+.5*(bbf(i,j,k-1)+bbf(i,j,k))*bbf(i,j,k-1)*dx(i,j)/
1 dy(i,j))/art(i,j)
end if
!   if(iint==5)stop
if (i-1>=1.and.j-1>=1.and.k-1>=1.) then
gen(i,j,k)=(-dq(i,j)*dz(k)*(.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)+
1 .5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)+
1 .5*(dx(i,j)+dx(i,j+1))/ddy(i,j)+
1 .5*(dx(i,j)+dx(i,j-1))/ddy(i,j-1)))/art(i,j)-
2 (1.e0/dzz(k-1)+1.e0/dzz(k))/dq(i,j)*(art(i,j)
1 +.5*(aaf(i,j,k+1)+aaf(i,j,k))*aaf(i,j,k)*dy(i,j)/dx(i,j)+
1 .5*(bbf(i,j,k+1)+bbf(i,j,k))*bbf(i,j,k)*dx(i,j)/dy(i,j)
1 )/art(i,j)
else
gen(i,j,k)=(-dq(i,j)*dz(k)*(dy(i,j)/ddx(i,j)+
1 dy(i,j)/ddx(i-1,j)+
1 dx(i,j)/ddy(i,j)+
1 dx(i,j)/ddy(i,j-1)))/art(i,j)-
2 (2.e0/dzz(k))/dq(i,j)*
1 (art(i,j)+.5*(aaf(i,j,k+1)+aaf(i,j,k))*aaf(i,j,k)*dy(i,j)/dx(i,j)+
1 .5*(bbf(i,j,k+1)+bbf(i,j,k))*bbf(i,j,k)*dx(i,j)/dy(i,j)
1 )/art(i,j)
end if

if (((k-1)*(k-kb)*(i-1)*(i-im)*(j-1)*(j-jm)).ne.0) then
m=m+1

if (k==kbm1) then
ga1(i+1,j,k)=ga1(i+1,j,k)+aa1(i+1,j,k+1)
ga2(i-1,j,k)=ga2(i-1,j,k)+aa2(i-1,j,k+1)
gb1(i,j+1,k)=gb1(i,j+1,k)+bb1(i,j+1,k+1)
gb2(i,j-1,k)=gb2(i,j-1,k)+bb2(i,j-1,k+1)
gen(i,j,k)=gen(i,j,k)+gc1(i,j,k+1)
endif

if (k==2) then ! free surface (comment restored from mis-encoded Cyrillic text)
aa2(i+1,j,k-1)=0.
bb2(i,j+1,k-1)=0.
bb4(i,j-1,k-1)=0.
gc2(i,j,k-1)=0.
endif

if (i==2) then
gc2(i,j,k-1)=gc2(i,j,k-1)+aa4(i-1,j,k-1)
gc1(i,j,k+1)=gc1(i,j,k+1)+aa2(i-1,j,k+1)
gen(i,j,k)=gen(i,j,k)+ga2(i-1,j,k)
endif

if (i==imm1) then
gc2(i,j,k-1)=gc2(i,j,k-1)+aa3(i+1,j,k-1)
gc1(i,j,k+1)=gc1(i,j,k+1)+aa1(i+1,j,k+1)
gen(i,j,k)=gen(i,j,k)+ga1(i+1,j,k)
endif

if (j==2) then
gc2(i,j,k-1)=gc2(i,j,k-1)+bb4(i,j-1,k-1)
gc1(i,j,k+1)=gc1(i,j,k+1)+bb2(i,j-1,k+1)
gen(i,j,k)=gen(i,j,k)+gb2(i,j-1,k)
endif

if (j==jmm1) then
gc2(i,j,k-1)=gc2(i,j,k-1)+bb3(i,j+1,k-1)
gc1(i,j,k+1)=gc1(i,j,k+1)+bb1(i,j+1,k+1)
gen(i,j,k)=gen(i,j,k)+gb1(i,j+1,k)
endif

endif

if (maa1+m<=lm) then
if (k+1>kbm1.or.i+1>imm1) then
apr(m)=0.0
else
apr(m)=aa1(i+1,j,k+1)
end if
ja(m)=ind(m+maa1)
ia(m)=ind(m)
end if

lapr=ma1
if (mbb1+m<=lm) then
if (k+1>kbm1.or.j+1>jmm1) then
apr(lapr+m)=0.0
else
apr(lapr+m)=bb1(i,j+1,k+1)
end if
ja(m+lapr)=ind(m+mbb1)
ia(m+lapr)=ind(m)
end if

lapr=ma1+mb1
if (mgc+m<=lm) then
if (k+1>kbm1) then
apr(lapr+m)=0.0
else
apr(lapr+m)=gc1(i,j,k+1)
end if
ja(m+lapr)=ind(m+mgc)
ia(m+lapr)=ind(m)
end if

lapr=ma1+mb1+mc
if (mbb2+m<=lm) then
if (k+1>kbm1.or.j-1<2) then
apr(lapr+m)=0.0
else
apr(lapr+m)=bb2(i,j-1,k+1)
end if

ja(m+lapr)=ind(m+mbb2)
ia(m+lapr)=ind(m)
end if

lapr=ma1+mb1+mc+mb2
if (maa2+m<=lm) then
if (k+1>kbm1.or.i-1<2) then
apr(lapr+m)=0.0
else
apr(lapr+m)=aa2(i-1,j,k+1)
endif
ja(m+lapr)=ind(m+maa2)
ia(m+lapr)=ind(m)
end if

lapr=ma1+mb1+mc+mb2+ma2
if (mga+m<=lm) then
if (i+1>imm1) then
apr(lapr+m)=0.0
else
apr(lapr+m)=ga1(i+1,j,k)
end if
ja(m+lapr)=ind(m+mga)
ia(m+lapr)=ind(m)
end if

lapr=ma1+mb1+mc+mb2+ma2+ma
if (1+m<=lm) then
if (j+1>jmm1) then
apr(lapr+m)=0.0
else
apr(lapr+m)=gb1(i,j+1,k)
end if
ja(m+lapr)=ind(m+mgb)
ia(m+lapr)=ind(m)
end if

lapr=ma1+mb1+mc+mb2+ma2+ma+mb
apr(lapr+m)=gen(i,j,k)
ja(m+lapr)=ind(m)
ia(m+lapr)=ind(m)
lapr=ma1+mb1+mc+mb2+ma2+ma+mb+lm
if (m-mgb>=1) then
if (j-1<2) then
apr(m-mgb+lapr)=0.0
else
apr(m-mgb+lapr)=gb2(i,j-1,k)
end if
ja(m-mgb+lapr)=ind(m-mgb)
ia(m-mgb+lapr)=ind(m)
end if

lapr=ma1+mb1+mc+mb2+ma2+ma+2*mb+lm
if (m-mga>=1) then
if (i-1<2) then
apr(m-mga+lapr)=0.0
else
apr(m-mga+lapr)=ga2(i-1,j,k)
end if

ia(m-mga+lapr)=ind(m)
ja(m-mga+lapr)=ind(-mga+m)
end if

lapr=ma1+mb1+mc+mb2+ma2+2*ma+2*mb+lm
if (m-maa2>=1) then
if (k-1<2.or.i+1>imm1) then
apr(m-maa2+lapr)=0.0
else
apr(m-maa2+lapr)=aa3(i+1,j,k-1)
endif

ia(m-maa2+lapr)=ind(m)
ja(m-maa2+lapr)=ind(-maa2+m)
end if

lapr=ma1+mb1+mc+mb2+2*ma2+2*ma+2*mb+lm
if (m-mbb2>=1) then
if (j+1>jmm1.or.k-1<2) then
apr(m-mbb2+lapr)=0.0
else
apr(m-mbb2+lapr)=bb3(i,j+1,k-1)
endif

ia(m-mbb2+lapr)=ind(m)
ja(m-mbb2+lapr)=ind(-mbb2+m)
end if

lapr=ma1+mb1+mc+2*mb2+2*ma2+2*ma+2*mb+lm

if(m-mgc>=1)then
if (k-1>kbm1) then
apr(m-mgc+lapr)=0.0
else
apr(m-mgc+lapr)=gc2(i,j,k-1)
end if
ja(m-mgc+lapr)=ind(m-mgc)
ia(m-mgc+lapr)=ind(m)
end if

lapr=ma1+mb1+2*mc+2*mb2+2*ma2+2*ma+2*mb+lm
if (m-mbb1>=1) then
if (j-1<2.or.k-1<2)then
apr(m-mbb1+lapr)=0.0
else
apr(m-mbb1+lapr)=bb4(i,j-1,k-1)
endif

ia(m-mbb1+lapr)=ind(m)
ja(m-mbb1+lapr)=ind(m-mbb1)
end if

lapr=ma1+2*mb1+2*mc+2*mb2+2*ma2+2*ma+2*mb+lm
if (m-maa1>=1) then
if (i-1<2.or.k-1<2)then
apr(m-maa1+lapr)=0.0
else
apr(m-maa1+lapr)=aa4(i-1,j,k-1)
endif

ia(m-maa1+lapr)=ind(m)
ja(m-maa1+lapr)=ind(m-maa1)
end if

99  continue

call annotate_site_end```

However, when I run it, I get this SIGSEGV:

forrtl: severe (174): SIGSEGV, segmentation fault occurred
Image                       PC                             Routine                     Line        Source
nohydropom_intel    000000000044B7F3  Unknown               Unknown  Unknown
nohydropom_intel    000000000042BEBD  pressure1_                       92  pressure1.for
nohydropom_intel    000000000040C755  MAIN__.R                        614  Main.for
nohydropom_intel    0000000000403D32  Unknown                 Unknown  Unknown
libc-2.23.so              00007F1CF1F92830  __libc_start_main    Unknown  Unknown
nohydropom_intel    0000000000403C29  Unknown                  Unknown  Unknown

I debugged my code. I set a breakpoint at line 614 of Main.for. Then I found that the SIGSEGV is associated with one of the arguments, which is an array.

Breakpoint 1, main () at ../Main.for:614
614        call pressure1(dti,q)
No symbol "dti" in current context.
Symbol "q" is static storage at address 0x1088280.
(gdb) print dti
No symbol "dti" in current context.
(gdb) print q
\$1 = (( ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...) ...) ...)
(gdb) info line 614
Line 614 of "../Main.for" starts at address 0x40c743 <main+35251> and ends at 0x40c755 <main+35269>.
(gdb) disas 0x40c743, 0x40c755
Dump of assembler code from 0x40c743 to 0x40c755:
=> 0x000000000040c743 <main+35251>:    mov    \$0x6458900,%edi
0x000000000040c748 <main+35256>:    mov    \$0x1088280,%esi
0x000000000040c74d <main+35261>:    vzeroupper
0x000000000040c750 <main+35264>:    callq  0x42bb30 <pressure1>
End of assembler dump.
(gdb) continue
Continuing.

Program received signal SIGSEGV, Segmentation fault.
0x000000000042bebd in pressure1 (dt2=1.9762625833649862e-323, q1=<error reading variable: Cannot access memory at address 0x2>)
at ../pressure1.for:92
92        allocate(apr(n_apr))
(gdb) info stack
#0  0x000000000042bebd in pressure1 (dt2=1.9762625833649862e-323, q1=<error reading variable: Cannot access memory at address 0x2>)
at ../pressure1.for:92
#1  0x000000000040c755 in main () at ../Main.for:614
#2  0x0000000000403d32 in main ()
(gdb) info frame
Stack level 0, frame at 0x7fffffffbf80:
rip = 0x42bebd in pressure1 (../pressure1.for:92); saved rip = 0x40c755
called by frame at 0x7fffffffc400
source language fortran.
Arglist at 0x7fffffffbf70, args: dt2=1.9762625833649862e-323, q1=<error reading variable: Cannot access memory at address 0x2>
Locals at 0x7fffffffbf70, Previous frame's sp is 0x7fffffffbf80
Saved registers:
rbx at 0x7fffffffbf38, rbp at 0x7fffffffbf70, r12 at 0x7fffffffbf58, r13 at 0x7fffffffbf50, r14 at 0x7fffffffbf48,
r15 at 0x7fffffffbf40, rip at 0x7fffffffbf78
Symbol "q1" is a complex DWARF expression:
0: DW_OP_breg4 0 [\$rsi]
.
(gdb) whatis q1
type = REAL(8) (400,6,80)
(gdb) up
#1  0x000000000040c755 in main () at ../Main.for:614
614        call pressure1(dti,q)
(gdb) whatis q
type = REAL(8) (400,6,80)

When I delete the SIMD directive and recompile, my code runs. This is the content of my makefile:

EXE= nohydropom_intel
FC= ifort
FFLAGS+= -O2 -m64 -mavx -mtune=core-avx-i -axAVX -real-size 64 -fp-model precise -fp-model source \
-fast-transcendentals -fimf-use-svml=true -fma -g -ipo -qopt-report=5 \
-traceback
#FFLAGS+= -O2 -xHost -real-size 64 -parallel -ipo -fstack-protector-all
#LDFLAGS = -lslatec -llapack
#LIBDIR = -L/usr/local/lib -L/usr/lib/lapack

OBJS = \
Advsm.o    Subr.o    Bcond1.o    Vertstruct.o    S_t_subr.o \
Coef.o    Main.o    ztosig.o    pprint.o    pressure1.o    Wveloc.o \

\${EXE}:  \${OBJS}
\$(FC) \$(FFLAGS) -o \$(EXE) \$^ \$(INCDIR) \$(LIBDIR) \$(LDFLAGS)

\${OBJS}: %.o: ../%.for
\${FC} \${FFLAGS} -c -o \$@ \$< \$(INCDIR)

clean:
rm -f *.o \$(EXE)

I have many questions about this issue. The variables listed in the SIMD directive are not related to q1. I checked the declarations of q in Main.for and q1 in pressure1.for, and they are the same:

Main.for => DIMENSION  q(im,jm,kb)

pressure1.for  =>   dimension q1(im,jm,kb)

im, jm, kb are defined as   PARAMETER (IM=400,JM=6,KB=80,ks=80)   in a file called comblk98.h.

So, I don't think that this is an issue with my code. Somehow, the compiler seems to be mishandling the combination of these options with the SIMD directive. I wonder if any of you know whether the SIMD directive affects the way an array is passed to a subroutine. If not, I will open a ticket with Intel.

By the way, this is my environment

epardo@epardohome:~/nohydro/intel\$ ifort -v
ifort version 19.0.3.199
epardo@epardohome:~/nohydro/intel\$ uname -a
Linux epardohome 4.4.0-128-generic #154-Ubuntu SMP Fri May 25 14:15:18 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux
epardo@epardohome:~/nohydro/intel\$ cat /etc/os-release
NAME="Ubuntu"
VERSION="16.04.6 LTS (Xenial Xerus)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 16.04.6 LTS"
VERSION_ID="16.04"
HOME_URL="http://www.ubuntu.com/"
SUPPORT_URL="http://help.ubuntu.com/"
VERSION_CODENAME=xenial
UBUNTU_CODENAME=xenial

9 Replies
Beginner
689 Views

I have opened ticket 04186524 with Intel.

Honored Contributor III
689 Views

The traceback that you provided in #1 clearly shows that the last source line executed was in pressure1.for, namely, line 92. The hundreds of lines of source code that you provided do not tell us what was on that line, and there is insufficient information to guess whether the problem is with the code in the subroutine, the arguments passed to the subroutine, or some combination of both.

The reported "Cannot access memory at address 0x2" is also an important clue that should be followed up. That sort of address should not be referenced in a user mode program.

I suggest that you run a serial version of your program with the same input data and with similar optimization levels as with the parallel version. Until such a run works and yields reasonable results, the complications associated with parallel code are tough to understand and investigate.

Beginner
689 Views

Hi, I have uploaded the Main.for, pressure1.for, and makefile files. I didn't include any parallel or multithread options in the makefile, so I assume that my code was compiled as serial. Please let me know if you need more files for your analysis.

Honored Contributor III
689 Views

The include file 'comblk98.h' probably contains some type declarations that are pertinent.  Do you use implicit typing in your sources?

How complicated is the source for advisor_annotate.mod? In other words, would it be asking too much to request that you provide all the sources needed for an independent compilation of the source file pressure1.for?

There are some instances of nonstandard Fortran expressions, such as '+' followed by '-' in lines 421, 422 of main.for: ...ADVUA(I,J) +- ARU(I,J)... I hope that you are aware of the implications of such usage.

Beginner
689 Views

I have uploaded all my code in a zip file. For your analysis, you must use the makefile in the release directory to compile the code (that is the one that builds the executable that is crashing). My code is very old (Fortran 77), so it uses implicit typing. I don't know how complicated the source of advisor_annotate is, since it is part of Intel Advisor 19 Update 3 (a tool that I was using to guide me in vectorizing and parallelizing my code). However, I will attach the source files for advisor_annotate too. I will check the lines that you mentioned later.

Thanks.

Honored Contributor III
689 Views

FWIW

SIMD generated code performs the calculations on all lanes of the SIMD (small vector) with applicable mask (depending on instruction set) and then performs a masked store.

In the code presented in post #1 you have exception sections for the perimeter (surface boundary) of the volume being computed. It would be more (most) efficient to compute (SIMD) the interior volume separated from the (scalar) perimeter (surface boundary) and thus eliminate complications of having the boundary tests (and unnecessary code that is masked out).

Jim Dempsey

Honored Contributor III
689 Views

Ernesto, the code that you provided in #6 has bugs, I think. For instance, at the point where Seamount() is called from main.f, the array variables X and Y are undefined, but their values are used in the double DO loops after the call to DEPTH() in Seamount.f. Do you agree?

Beginner
688 Views

Hi mecej4. You are right about the undefined variables in Seamount; that should be marked as a bug. I will try to fix it. Also, you were right regarding the signs in the expression that you mentioned in post #5. Regarding post #7 from Jim, although I'm not very familiar with SIMD programming, his comments have shown me that there could be a better way to restructure my code to improve optimization. Thanks to both of you.

Honored Contributor III
688 Views

>> although I'm not very familiar with SIMD programming

Perhaps something to provide insight will help.

SIMD = Single Instruction Multiple Data aka vector operations.

In a loop that is vectorized something like this happens:

```...
if(simpleLogicalExpression) then
out(i) = ExpressionSuitableForVectorization
Also(i) = ...
ThisToo(i) = ...
else
out(i) = OtherExpressionSuitableForVectorization
Also(i) = Other...
ThisToo(i) = Other...
endif

Becomes: (sketch code)

temp1A = ExpressionSuitableForVectorization
temp2A = ...
temp3A = ...
temp1B = OtherExpressionSuitableForVectorization
temp2B = Other...
Temp3B = Other...