- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi, I was trying to vectorize a loop that contains many IF statements, using the following directive:
m=0.0 ! form banded matrix of Puasson equastion pok=3. call annotate_site_begin( "pressure" ) call annotate_iteration_task( "pressure-task" ) do 99 k=2,kbm1 do 99 i=2,imm1 !DIR$ SIMD LASTPRIVATE(bb2) REDUCTION(+:m, gc2, gc1, gen) do 99 j=2,jmm1 if (k+1<=kb.and.i+1<=im)then aa1(i+1,j,k+1)=.25e0*aaf(i+1,j,k+1) 1 *.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)/art(i,j) 1 +.25e0*aaf(i,j,k+1)*dq(i+1,j)/dq(i,j) 1 *dy(i,j)/ddx(i,j)/art(i,j) endif if (k+1<=kb.and.i-1>=1)then aa2(i-1,j,k+1)=-.25e0*aaf(i-1,j,k+1) 1 *.5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)/art(i,j) 1 -.25e0*aaf(i,j,k+1)*dq(i-1,j)/dq(i,j) 1 *dy(i,j)/ddx(i,j)/art(i,j) end if if (k-1>=1.and.i+1<=im)then aa3(i+1,j,k-1)=-.25e0*aaf(i+1,j,k-1) 1 *.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)/art(i,j) 1 -.25e0*aaf(i,j,k-1)*dq(i+1,j)/dq(i,j) 1 *dy(i,j)/ddx(i,j)/art(i,j) end if if (k-1>=1.and.i-1>=1)then aa4(i-1,j,k-1)=.25e0*aaf(i-1,j,k-1) 1 *.5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)/art(i,j) 1 +.25e0*aaf(i,j,k-1)*dq(i-1,j)/dq(i,j) 1 *dy(i,j)/ddx(i,j)/art(i,j) end if if (k+1<=kb.and.j+1<=jm)then bb1(i,j+1,k+1)=.25e0*bbf(i,j+1,k+1) 1 *.5*(dx(i,j)+dx(i,j+1))/ddy(i,j)/art(i,j) 1 +.25e0*bbf(i,j,k+1)*dq(i,j+1)/dq(i,j) 1 *dx(i,j)/ddy(i,j)/art(i,j) end if if (k+1<=kb.and.j-1>=1)then bb2(i,j-1,k+1)=-.25e0*bbf(i,j-1,k+1) 1 *.5*(dx(i,j)+dx(i,j-1))/ddy(i,j-1)/art(i,j) 1 -.25e0*bbf(i,j,k+1)*dq(i,j-1)/dq(i,j) 1 *dx(i,j)/ddy(i,j)/art(i,j) end if if (k-1>=1.and.j+1<=jm)then bb3(i,j+1,k-1)=-.25e0*bbf(i,j+1,k-1) 1 *.5*(dx(i,j)+dx(i,j+1))/ddy(i,j)/art(i,j) 1 -.25e0*bbf(i,j,k-1)*dq(i,j+1)/dq(i,j) 1 *dx(i,j)/ddy(i,j)/art(i,j) end if if (k-1>=1.and.j-1>=1) then bb4(i,j-1,k-1)=.25e0*bbf(i,j-1,k-1) 1 *.5*(dx(i,j)+dx(i,j-1))/ddx(i,j-1)/art(i,j) 1 +.25e0*bbf(i,j,k-1)*dq(i,j-1)/dq(i,j) 1 *dx(i,j)/ddy(i,j)/art(i,j) end if if (i+1<=im) then ga1(i+1,j,k)=dz(k)*dq(i+1,j) 1 *.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)/art(i,j) end if if (i-1>=1) then ga2(i-1,j,k)=dz(k)*dq(i-1,j) 1 *.5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)/art(i,j) end if if (j+1<=jm) then gb1(i,j+1,k)=dz(k)*dq(i,j+1) 1 
*.5*(dx(i,j)+dx(i,j+1))/ddy(i,j)/art(i,j) end if if (j-1>=1) then gb2(i,j-1,k)=dz(k)*dq(i,j-1) 1 *.5*(dx(i,j)+dx(i,j-1))/ddy(i,j-1)/art(i,j) end if if (k+1<=kb) then gc1(i,j,k+1)=1.e0/(dzz(k)*dq(i,j))* 1 (art(i,j)+.5*(aaf(i,j,k+1)+aaf(i,j,k))*aaf(i,j,k+1)*dy(i,j)/ 1 dx(i,j)+.5*(bbf(i,j,k+1)+bbf(i,j,k))*bbf(i,j,k+1)*dx(i,j)/ 1 dy(i,j))/art(i,j) end if if (k-1>=1) then gc2(i,j,k-1)=1.e0/(dzz(k-1)*dq(i,j))* 1 (art(i,j)+.5*(aaf(i,j,k-1)+aaf(i,j,k))*aaf(i,j,k-1)*dy(i,j)/ 1 dx(i,j)+.5*(bbf(i,j,k-1)+bbf(i,j,k))*bbf(i,j,k-1)*dx(i,j)/ 1 dy(i,j))/art(i,j) end if ! if(iint==5)stop if (i-1>=1.and.j-1>=1.and.k-1>=1.) then gen(i,j,k)=(-dq(i,j)*dz(k)*(.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)+ 1 .5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)+ 1 .5*(dx(i,j)+dx(i,j+1))/ddy(i,j)+ 1 .5*(dx(i,j)+dx(i,j-1))/ddy(i,j-1)))/art(i,j)- 2 (1.e0/dzz(k-1)+1.e0/dzz(k))/dq(i,j)*(art(i,j) 1 +.5*(aaf(i,j,k+1)+aaf(i,j,k))*aaf(i,j,k)*dy(i,j)/dx(i,j)+ 1 .5*(bbf(i,j,k+1)+bbf(i,j,k))*bbf(i,j,k)*dx(i,j)/dy(i,j) 1 )/art(i,j) else gen(i,j,k)=(-dq(i,j)*dz(k)*(dy(i,j)/ddx(i,j)+ 1 dy(i,j)/ddx(i-1,j)+ 1 dx(i,j)/ddy(i,j)+ 1 dx(i,j)/ddy(i,j-1)))/art(i,j)- 2 (2.e0/dzz(k))/dq(i,j)* 1 (art(i,j)+.5*(aaf(i,j,k+1)+aaf(i,j,k))*aaf(i,j,k)*dy(i,j)/dx(i,j)+ 1 .5*(bbf(i,j,k+1)+bbf(i,j,k))*bbf(i,j,k)*dx(i,j)/dy(i,j) 1 )/art(i,j) end if if (((k-1)*(k-kb)*(i-1)*(i-im)*(j-1)*(j-jm)).ne.0) then m=m+1 if (k==kbm1) then ga1(i+1,j,k)=ga1(i+1,j,k)+aa1(i+1,j,k+1) ga2(i-1,j,k)=ga2(i-1,j,k)+aa2(i-1,j,k+1) gb1(i,j+1,k)=gb1(i,j+1,k)+bb1(i,j+1,k+1) gb2(i,j-1,k)=gb2(i,j-1,k)+bb2(i,j-1,k+1) gen(i,j,k)=gen(i,j,k)+gc1(i,j,k+1) endif if (k==2) then !¸òþñþôýð ÿþòõ¨¿ýþ¸ª¹ aa2(i+1,j,k-1)=0. bb2(i,j+1,k-1)=0. bb4(i,j-1,k-1)=0. gc2(i,j,k-1)=0. 
endif if (i==2) then gc2(i,j,k-1)=gc2(i,j,k-1)+aa4(i-1,j,k-1) gc1(i,j,k+1)=gc1(i,j,k+1)+aa2(i-1,j,k+1) gen(i,j,k)=gen(i,j,k)+ga2(i-1,j,k) endif if (i==imm1) then gc2(i,j,k-1)=gc2(i,j,k-1)+aa3(i+1,j,k-1) gc1(i,j,k+1)=gc1(i,j,k+1)+aa1(i+1,j,k+1) gen(i,j,k)=gen(i,j,k)+ga1(i+1,j,k) endif if (j==2) then gc2(i,j,k-1)=gc2(i,j,k-1)+bb4(i,j-1,k-1) gc1(i,j,k+1)=gc1(i,j,k+1)+bb2(i,j-1,k+1) gen(i,j,k)=gen(i,j,k)+gb2(i,j-1,k) endif if (j==jmm1) then gc2(i,j,k-1)=gc2(i,j,k-1)+bb3(i,j+1,k-1) gc1(i,j,k+1)=gc1(i,j,k+1)+bb1(i,j+1,k+1) gen(i,j,k)=gen(i,j,k)+gb1(i,j+1,k) endif endif if (maa1+m<=lm) then if (k+1>kbm1.or.i+1>imm1) then apr(m)=0.0 else apr(m)=aa1(i+1,j,k+1) end if ja(m)=ind(m+maa1) ia(m)=ind(m) end if lapr=ma1 if (mbb1+m<=lm) then if (k+1>kbm1.or.j+1>jmm1) then apr(lapr+m)=0.0 else apr(lapr+m)=bb1(i,j+1,k+1) end if ja(m+lapr)=ind(m+mbb1) ia(m+lapr)=ind(m) end if lapr=ma1+mb1 if (mgc+m<=lm) then if (k+1>kbm1) then apr(lapr+m)=0.0 else apr(lapr+m)=gc1(i,j,k+1) end if ja(m+lapr)=ind(m+mgc) ia(m+lapr)=ind(m) end if lapr=ma1+mb1+mc if (mbb2+m<=lm) then if (k+1>kbm1.or.j-1<2) then apr(lapr+m)=0.0 else apr(lapr+m)=bb2(i,j-1,k+1) end if ja(m+lapr)=ind(m+mbb2) ia(m+lapr)=ind(m) end if lapr=ma1+mb1+mc+mb2 if (maa2+m<=lm) then if (k+1>kbm1.or.i-1<2) then apr(lapr+m)=0.0 else apr(lapr+m)=aa2(i-1,j,k+1) endif ja(m+lapr)=ind(m+maa2) ia(m+lapr)=ind(m) end if lapr=ma1+mb1+mc+mb2+ma2 if (mga+m<=lm) then if (i+1>imm1) then apr(lapr+m)=0.0 else apr(lapr+m)=ga1(i+1,j,k) end if ja(m+lapr)=ind(m+mga) ia(m+lapr)=ind(m) end if lapr=ma1+mb1+mc+mb2+ma2+ma if (1+m<=lm) then if (j+1>jmm1) then apr(lapr+m)=0.0 else apr(lapr+m)=gb1(i,j+1,k) end if ja(m+lapr)=ind(m+mgb) ia(m+lapr)=ind(m) end if lapr=ma1+mb1+mc+mb2+ma2+ma+mb apr(lapr+m)=gen(i,j,k) ja(m+lapr)=ind(m) ia(m+lapr)=ind(m) lapr=ma1+mb1+mc+mb2+ma2+ma+mb+lm if (m-mgb>=1) then if (j-1<2) then apr(m-mgb+lapr)=0.0 else apr(m-mgb+lapr)=gb2(i,j-1,k) end if ja(m-mgb+lapr)=ind(m-mgb) ia(m-mgb+lapr)=ind(m) end if lapr=ma1+mb1+mc+mb2+ma2+ma+2*mb+lm if 
(m-mga>=1) then if (i-1<2) then apr(m-mga+lapr)=0.0 else apr(m-mga+lapr)=ga2(i-1,j,k) end if ia(m-mga+lapr)=ind(m) ja(m-mga+lapr)=ind(-mga+m) end if lapr=ma1+mb1+mc+mb2+ma2+2*ma+2*mb+lm if (m-maa2>=1) then if (k-1<2.or.i+1>imm1) then apr(m-maa2+lapr)=0.0 else apr(m-maa2+lapr)=aa3(i+1,j,k-1) endif ia(m-maa2+lapr)=ind(m) ja(m-maa2+lapr)=ind(-maa2+m) end if lapr=ma1+mb1+mc+mb2+2*ma2+2*ma+2*mb+lm if (m-mbb2>=1) then if (j+1>jmm1.or.k-1<2) then apr(m-mbb2+lapr)=0.0 else apr(m-mbb2+lapr)=bb3(i,j+1,k-1) endif ia(m-mbb2+lapr)=ind(m) ja(m-mbb2+lapr)=ind(-mbb2+m) end if lapr=ma1+mb1+mc+2*mb2+2*ma2+2*ma+2*mb+lm if(m-mgc>=1)then if (k-1>kbm1) then apr(m-mgc+lapr)=0.0 else apr(m-mgc+lapr)=gc2(i,j,k-1) end if ja(m-mgc+lapr)=ind(m-mgc) ia(m-mgc+lapr)=ind(m) end if lapr=ma1+mb1+2*mc+2*mb2+2*ma2+2*ma+2*mb+lm if (m-mbb1>=1) then if (j-1<2.or.k-1<2)then apr(m-mbb1+lapr)=0.0 else apr(m-mbb1+lapr)=bb4(i,j-1,k-1) endif ia(m-mbb1+lapr)=ind(m) ja(m-mbb1+lapr)=ind(m-mbb1) end if lapr=ma1+2*mb1+2*mc+2*mb2+2*ma2+2*ma+2*mb+lm if (m-maa1>=1) then if (i-1<2.or.k-1<2)then apr(m-maa1+lapr)=0.0 else apr(m-maa1+lapr)=aa4(i-1,j,k-1) endif ia(m-maa1+lapr)=ind(m) ja(m-maa1+lapr)=ind(m-maa1) end if 99 continue call annotate_site_end
However, when I run it, I get this SIGSEGV:
forrtl: severe (174): SIGSEGV, segmentation fault occurred
Image PC Routine Line Source
nohydropom_intel 000000000044B7F3 Unknown Unknown Unknown
libpthread-2.23.s 00007F1CF2551390 Unknown Unknown Unknown
nohydropom_intel 000000000042BEBD pressure1_ 92 pressure1.for
nohydropom_intel 000000000040C755 MAIN__.R 614 Main.for
nohydropom_intel 0000000000403D32 Unknown Unknown Unknown
libc-2.23.so 00007F1CF1F92830 __libc_start_main Unknown Unknown
nohydropom_intel 0000000000403C29 Unknown Unknown Unknown
I debugged my code. I set a breakpoint at line 614 of Main.for and found that the SIGSEGV is reported on one of the arguments, which is an array.
Breakpoint 1, main () at ../Main.for:614
614 call pressure1(dti,q)
(gdb) info address dti
No symbol "dti" in current context.
(gdb) info address q
Symbol "q" is static storage at address 0x1088280.
(gdb) print dti
No symbol "dti" in current context.
(gdb) print q
$1 = (( ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...) ...) ...)
(gdb) info line 614
Line 614 of "../Main.for" starts at address 0x40c743 <main+35251> and ends at 0x40c755 <main+35269>.
(gdb) disas 0x40c743, 0x40c755
Dump of assembler code from 0x40c743 to 0x40c755:
=> 0x000000000040c743 <main+35251>: mov $0x6458900,%edi
0x000000000040c748 <main+35256>: mov $0x1088280,%esi
0x000000000040c74d <main+35261>: vzeroupper
0x000000000040c750 <main+35264>: callq 0x42bb30 <pressure1>
End of assembler dump.
(gdb) continue
Continuing.
Program received signal SIGSEGV, Segmentation fault.
0x000000000042bebd in pressure1 (dt2=1.9762625833649862e-323, q1=<error reading variable: Cannot access memory at address 0x2>)
at ../pressure1.for:92
92 allocate(apr(n_apr))
(gdb) info stack
#0 0x000000000042bebd in pressure1 (dt2=1.9762625833649862e-323, q1=<error reading variable: Cannot access memory at address 0x2>)
at ../pressure1.for:92
#1 0x000000000040c755 in main () at ../Main.for:614
#2 0x0000000000403d32 in main ()
(gdb) info frame
Stack level 0, frame at 0x7fffffffbf80:
rip = 0x42bebd in pressure1 (../pressure1.for:92); saved rip = 0x40c755
called by frame at 0x7fffffffc400
source language fortran.
Arglist at 0x7fffffffbf70, args: dt2=1.9762625833649862e-323, q1=<error reading variable: Cannot access memory at address 0x2>
Locals at 0x7fffffffbf70, Previous frame's sp is 0x7fffffffbf80
Saved registers:
rbx at 0x7fffffffbf38, rbp at 0x7fffffffbf70, r12 at 0x7fffffffbf58, r13 at 0x7fffffffbf50, r14 at 0x7fffffffbf48,
r15 at 0x7fffffffbf40, rip at 0x7fffffffbf78
(gdb) info address q1
Symbol "q1" is a complex DWARF expression:
0: DW_OP_breg4 0 [$rsi]
.
(gdb) whatis q1
type = REAL(8) (400,6,80)
(gdb) up
#1 0x000000000040c755 in main () at ../Main.for:614
614 call pressure1(dti,q)
(gdb) whatis q
type = REAL(8) (400,6,80)
When I delete the DIR SIMD directive and recompile, my code runs. This is the content of my makefile:
# Build description for the nohydropom_intel executable using Intel Fortran.
# NOTE(review): the command lines under each rule appear here without a
# leading tab -- presumably lost when the file was pasted; make requires one.

# Name of the executable to produce.
EXE= nohydropom_intel
# Fortran compiler driver.
FC= ifort
# Compiler options, appended to any FFLAGS already set; the same options are
# also passed on the link line below.
FFLAGS+= -O2 -m64 -mavx -mtune=core-avx-i -axAVX -real-size 64 -fp-model precise -fp-model source \
-fast-transcendentals -fimf-use-svml=true -fma -g -ipo -qopt-report=5 \
-traceback
# Alternative option set, currently disabled.
#FFLAGS+= -O2 -xHost -real-size 64 -parallel -ipo -fstack-protector-all
# SLATEC/LAPACK link settings, currently disabled.
#LDFLAGS = -lslatec -llapack
#LIBDIR = -L/usr/local/lib -L/usr/lib/lapack
# Link against the Intel Advisor library; LIBDIR and INCDIR point at the
# Advisor library and include directories.
LDFLAGS = -ladvisor
LIBDIR = -L/opt/intel/advisor/lib64
INCDIR = -I/opt/intel/advisor/include/intel64
# Object files that make up the executable.
OBJS = \
Advsm.o Subr.o Bcond1.o Vertstruct.o S_t_subr.o \
Coef.o Main.o ztosig.o pprint.o pressure1.o Wveloc.o \
Depth.o seamount.o Liadv.o Slap.o
# Link step: combine all objects into the executable.
${EXE}: ${OBJS}
$(FC) $(FFLAGS) -o $(EXE) $^ $(INCDIR) $(LIBDIR) $(LDFLAGS)
# Static pattern rule: each object is compiled from ../<name>.for.
${OBJS}: %.o: ../%.for
${FC} ${FFLAGS} -c -o $@ $< $(INCDIR)
# Remove the object files and the executable.
clean:
rm -f *.o $(EXE)
I have many questions about this issue. The variables in the DIR SIMD directive are not related to q1. I checked the declarations of q in Main.for and q1 in pressure1.for, and they are the same:
Main.for => DIMENSION q(im,jm,kb)
pressure1.for => dimension q1(im,jm,kb)
im, jm, kb are defined as PARAMETER (IM=400,JM=6,KB=80,ks=80) in a file called comblk98.h.
So, I don't think this is an issue with my code. Somehow, the compiler is mishandling something when the DIR SIMD directive is present. I wonder if any of you know whether the DIR SIMD directive affects the way an array is passed to a subroutine. If not, I will open a ticket with Intel.
By the way, this is my environment
epardo@epardohome:~/nohydro/intel$ ifort -v
ifort version 19.0.3.199
epardo@epardohome:~/nohydro/intel$ uname -a
Linux epardohome 4.4.0-128-generic #154-Ubuntu SMP Fri May 25 14:15:18 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux
epardo@epardohome:~/nohydro/intel$ cat /etc/os-release
NAME="Ubuntu"
VERSION="16.04.6 LTS (Xenial Xerus)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 16.04.6 LTS"
VERSION_ID="16.04"
HOME_URL="http://www.ubuntu.com/"
SUPPORT_URL="http://help.ubuntu.com/"
BUG_REPORT_URL="http://bugs.launchpad.net/ubuntu/"
VERSION_CODENAME=xenial
UBUNTU_CODENAME=xenial
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I have opened ticket 04186524 with Intel.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
The traceback that you provided in #1 clearly shows that the last source line executed was in pressure1.for, namely, line 92. The hundreds of lines of source code that you provided do not tell us what was on that line, and there is insufficient information to guess whether the problem is with the code in the subroutine, the arguments passed to the subroutine, or some combination of both.
The reported "Cannot access memory at address 0x2" is also an important clue that should be followed up. That sort of address should not be referenced in a user mode program.
I suggest that you run a serial version of your program with the same input data and with similar optimization levels as with the parallel version. Until such a run works and yields reasonable results, the complications associated with parallel code are tough to understand and investigate.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
The include file 'comblk98.h' probably contains some type declarations that are pertinent. Do you use implicit typing in your sources?
How complicated is the source for advisor_annotate.mod? In other words, would it be asking too much to request that you provide all the sources needed for an independent compilation of the source file pressure1.for?
There are some instances of nonstandard Fortran expressions, such as '+' followed by '-' in lines 421, 422 of main.for: ...ADVUA(I,J) +- ARU(I,J)... I hope that you are aware of the implications of such usage.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I have uploaded all my code in a zip file. For your analysis, you must use the makefile in the release directory to compile the code (that is the one that builds the executable that is crashing). My code is very old (Fortran 77), so it uses implicit typing. I don't know how complicated the source of advisor_annotate is, since it is part of Intel Advisor 19 Update 3 (a tool that I was using to guide me in vectorizing and parallelizing my code). However, I will attach the source files for advisor_annotate too. I will check the lines that you mentioned later.
Thanks.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
FWIW
SIMD generated code performs the calculations on all lanes of the SIMD (small vector) with applicable mask (depending on instruction set) and then performs a masked store.
In the code presented in post #1 you have exception sections for the perimeter (surface boundary) of the volume being computed. It would be more (most) efficient to compute (SIMD) the interior volume separated from the (scalar) perimeter (surface boundary) and thus eliminate complications of having the boundary tests (and unnecessary code that is masked out).
Jim Dempsey
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Ernesto, the code that you provided in #6 has bugs, I think. For instance, at the point where Seamount() is called from main.f, the array variables X and Y are undefined, but their values are used in the double DO loops after the call to DEPTH() in Seamount.f. Do you agree?
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Hi mecej4. You are right about the undefined variables in seamount; that should be marked as a bug. I will try to fix it. Also, you were right regarding the signs in the expression that you mentioned in post #5. Regarding post #7 from Jim, although I'm not very familiar with SIMD programming, his comments have shown me that there could be a better way to change my code to improve optimization. Thanks to both of you.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
>> although I'm not very familiar with SIMD programming
Perhaps something to provide insight will help.
SIMD = Single Instruction Multiple Data aka vector operations.
In a loop that is vectorized something like this happens:
...
if(simpleLogicalExpression) then
  out(i) = ExpressionSuitableForVectorization
  Also(i) = ...
  ThisToo(i) = ...
else
  out(i) = OtherExpressionSuitableForVectorization
  Also(i) = Other...
  ThisToo(i) = Other...
endif

Becomes: (sketch code)

temp1A = ExpressionSuitableForVectorization
temp2A = ...
temp3A = ...
temp1B = OtherExpressionSuitableForVectorization
temp2B = Other...
Temp3B = Other...
mask = (simpleLogicalExpression)
out(i) = (mask) ? temp1A : temp1B ! like in C++
Also(i) = (mask) ? temp2A : temp2B
ThisToo(i) = (mask) ? temp3A : temp3B
Note, both branches of the IF statement are evaluated into temporary registers (across the width of the SIMD vector)
Then a mask is made (across the width of the SIMD vector)
Then two masked moves are made (across the width of the SIMD vector)
Any improvement in vectorization comes at the expense of computing both halves of the IF statement plus the mask creation and additional conditional move. The benefit is there is no out of line branching for the code run.
With simple (one or two) statements on each branch of the IF you generally see a net benefit. However, in problems such as yours, typically only one branch is executed at the periphery, and the other branch is executed in the interior. With this in mind, it makes sense to "effectively" duplicate the code inclusive of only the appropriate branch, one section for the periphery, and one for the interior.
Jim Dempsey
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page