Intel® Fortran Compiler
Build applications that can scale for the future with optimized code designed for Intel® Xeon® and compatible processors.
28435 Discussions

SIGSEGV in an argument on a subroutine

Pardo_Arroyo__Ernest
688 Views

Hi, I was trying to vectorize a loop that contains a lot of IF statements, using the following directive:

  m=0.0
   ! form banded matrix of Poisson equation
   pok=3.
    call annotate_site_begin( "pressure" )
      call annotate_iteration_task( "pressure-task" )
     do 99 k=2,kbm1
     do 99 i=2,imm1       
!DIR$ SIMD LASTPRIVATE(bb2) REDUCTION(+:m, gc2, gc1, gen)
     do 99 j=2,jmm1           

     if (k+1<=kb.and.i+1<=im)then
     aa1(i+1,j,k+1)=.25e0*aaf(i+1,j,k+1)
    1 *.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)/art(i,j)
    1 +.25e0*aaf(i,j,k+1)*dq(i+1,j)/dq(i,j)
    1 *dy(i,j)/ddx(i,j)/art(i,j)   
    endif

    if (k+1<=kb.and.i-1>=1)then
     aa2(i-1,j,k+1)=-.25e0*aaf(i-1,j,k+1)
    1 *.5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)/art(i,j)
    1 -.25e0*aaf(i,j,k+1)*dq(i-1,j)/dq(i,j)
    1 *dy(i,j)/ddx(i,j)/art(i,j)   
    end if
    if (k-1>=1.and.i+1<=im)then
     aa3(i+1,j,k-1)=-.25e0*aaf(i+1,j,k-1)
    1 *.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)/art(i,j)
    1 -.25e0*aaf(i,j,k-1)*dq(i+1,j)/dq(i,j)
    1 *dy(i,j)/ddx(i,j)/art(i,j)   
    end if
    if (k-1>=1.and.i-1>=1)then
     aa4(i-1,j,k-1)=.25e0*aaf(i-1,j,k-1)
    1 *.5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)/art(i,j)
    1 +.25e0*aaf(i,j,k-1)*dq(i-1,j)/dq(i,j)
    1 *dy(i,j)/ddx(i,j)/art(i,j)   
    end if
    if (k+1<=kb.and.j+1<=jm)then
     bb1(i,j+1,k+1)=.25e0*bbf(i,j+1,k+1)
    1 *.5*(dx(i,j)+dx(i,j+1))/ddy(i,j)/art(i,j)
    1 +.25e0*bbf(i,j,k+1)*dq(i,j+1)/dq(i,j)
    1 *dx(i,j)/ddy(i,j)/art(i,j)   
    end if
    if (k+1<=kb.and.j-1>=1)then
     bb2(i,j-1,k+1)=-.25e0*bbf(i,j-1,k+1)
    1 *.5*(dx(i,j)+dx(i,j-1))/ddy(i,j-1)/art(i,j)
    1 -.25e0*bbf(i,j,k+1)*dq(i,j-1)/dq(i,j)
    1 *dx(i,j)/ddy(i,j)/art(i,j)   
    end if
   if (k-1>=1.and.j+1<=jm)then
     bb3(i,j+1,k-1)=-.25e0*bbf(i,j+1,k-1)
    1 *.5*(dx(i,j)+dx(i,j+1))/ddy(i,j)/art(i,j)
    1 -.25e0*bbf(i,j,k-1)*dq(i,j+1)/dq(i,j)
    1 *dx(i,j)/ddy(i,j)/art(i,j)   
    end if
    if (k-1>=1.and.j-1>=1) then
     bb4(i,j-1,k-1)=.25e0*bbf(i,j-1,k-1)
    1 *.5*(dx(i,j)+dx(i,j-1))/ddx(i,j-1)/art(i,j)
    1 +.25e0*bbf(i,j,k-1)*dq(i,j-1)/dq(i,j)
    1 *dx(i,j)/ddy(i,j)/art(i,j)   
    end if
    if (i+1<=im) then
     ga1(i+1,j,k)=dz(k)*dq(i+1,j)
    1 *.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)/art(i,j)
    end if
    if (i-1>=1) then
     ga2(i-1,j,k)=dz(k)*dq(i-1,j)
    1 *.5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)/art(i,j)
    end if
    if (j+1<=jm) then
     gb1(i,j+1,k)=dz(k)*dq(i,j+1)
    1 *.5*(dx(i,j)+dx(i,j+1))/ddy(i,j)/art(i,j)
    end if
    if (j-1>=1) then
     gb2(i,j-1,k)=dz(k)*dq(i,j-1)
    1 *.5*(dx(i,j)+dx(i,j-1))/ddy(i,j-1)/art(i,j)
    end if
      if (k+1<=kb) then
     gc1(i,j,k+1)=1.e0/(dzz(k)*dq(i,j))*
    1 (art(i,j)+.5*(aaf(i,j,k+1)+aaf(i,j,k))*aaf(i,j,k+1)*dy(i,j)/
    1 dx(i,j)+.5*(bbf(i,j,k+1)+bbf(i,j,k))*bbf(i,j,k+1)*dx(i,j)/
    1 dy(i,j))/art(i,j)
    end if
      if (k-1>=1) then
     gc2(i,j,k-1)=1.e0/(dzz(k-1)*dq(i,j))*
    1 (art(i,j)+.5*(aaf(i,j,k-1)+aaf(i,j,k))*aaf(i,j,k-1)*dy(i,j)/
    1 dx(i,j)+.5*(bbf(i,j,k-1)+bbf(i,j,k))*bbf(i,j,k-1)*dx(i,j)/
    1 dy(i,j))/art(i,j)
    end if
!   if(iint==5)stop
    if (i-1>=1.and.j-1>=1.and.k-1>=1.) then
     gen(i,j,k)=(-dq(i,j)*dz(k)*(.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)+
    1 .5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)+
    1 .5*(dx(i,j)+dx(i,j+1))/ddy(i,j)+
    1 .5*(dx(i,j)+dx(i,j-1))/ddy(i,j-1)))/art(i,j)-
     2 (1.e0/dzz(k-1)+1.e0/dzz(k))/dq(i,j)*(art(i,j)
    1 +.5*(aaf(i,j,k+1)+aaf(i,j,k))*aaf(i,j,k)*dy(i,j)/dx(i,j)+
    1 .5*(bbf(i,j,k+1)+bbf(i,j,k))*bbf(i,j,k)*dx(i,j)/dy(i,j)
    1 )/art(i,j)
    else
     gen(i,j,k)=(-dq(i,j)*dz(k)*(dy(i,j)/ddx(i,j)+
    1 dy(i,j)/ddx(i-1,j)+
    1 dx(i,j)/ddy(i,j)+
    1 dx(i,j)/ddy(i,j-1)))/art(i,j)-
    2 (2.e0/dzz(k))/dq(i,j)*
    1 (art(i,j)+.5*(aaf(i,j,k+1)+aaf(i,j,k))*aaf(i,j,k)*dy(i,j)/dx(i,j)+
    1 .5*(bbf(i,j,k+1)+bbf(i,j,k))*bbf(i,j,k)*dx(i,j)/dy(i,j)
    1 )/art(i,j)   
      end if   
     
   if (((k-1)*(k-kb)*(i-1)*(i-im)*(j-1)*(j-jm)).ne.0) then
       m=m+1

       if (k==kbm1) then
           ga1(i+1,j,k)=ga1(i+1,j,k)+aa1(i+1,j,k+1)
           ga2(i-1,j,k)=ga2(i-1,j,k)+aa2(i-1,j,k+1)
            gb1(i,j+1,k)=gb1(i,j+1,k)+bb1(i,j+1,k+1)
            gb2(i,j-1,k)=gb2(i,j-1,k)+bb2(i,j-1,k+1)
            gen(i,j,k)=gen(i,j,k)+gc1(i,j,k+1)
          endif
           
        if (k==2) then ! free surface
            aa2(i+1,j,k-1)=0.
           bb2(i,j+1,k-1)=0.
            bb4(i,j-1,k-1)=0.
            gc2(i,j,k-1)=0.
       endif
 
       if (i==2) then
            gc2(i,j,k-1)=gc2(i,j,k-1)+aa4(i-1,j,k-1)
            gc1(i,j,k+1)=gc1(i,j,k+1)+aa2(i-1,j,k+1)
            gen(i,j,k)=gen(i,j,k)+ga2(i-1,j,k)
        endif
     
         if (i==imm1) then
           gc2(i,j,k-1)=gc2(i,j,k-1)+aa3(i+1,j,k-1)
           gc1(i,j,k+1)=gc1(i,j,k+1)+aa1(i+1,j,k+1)
           gen(i,j,k)=gen(i,j,k)+ga1(i+1,j,k)
        endif

        if (j==2) then
           gc2(i,j,k-1)=gc2(i,j,k-1)+bb4(i,j-1,k-1)
          gc1(i,j,k+1)=gc1(i,j,k+1)+bb2(i,j-1,k+1)
          gen(i,j,k)=gen(i,j,k)+gb2(i,j-1,k)
       endif
       
       if (j==jmm1) then
           gc2(i,j,k-1)=gc2(i,j,k-1)+bb3(i,j+1,k-1)
            gc1(i,j,k+1)=gc1(i,j,k+1)+bb1(i,j+1,k+1)
           gen(i,j,k)=gen(i,j,k)+gb1(i,j+1,k)
      endif

   endif

   if (maa1+m<=lm) then
       if (k+1>kbm1.or.i+1>imm1) then
            apr(m)=0.0 
        else
          apr(m)=aa1(i+1,j,k+1)
       end if
        ja(m)=ind(m+maa1)
       ia(m)=ind(m)
   end if

   lapr=ma1
    if (mbb1+m<=lm) then
        if (k+1>kbm1.or.j+1>jmm1) then
            apr(lapr+m)=0.0
        else
            apr(lapr+m)=bb1(i,j+1,k+1)
       end if
       ja(m+lapr)=ind(m+mbb1)
       ia(m+lapr)=ind(m)
    end if

    lapr=ma1+mb1
    if (mgc+m<=lm) then 
        if (k+1>kbm1) then
            apr(lapr+m)=0.0
        else
            apr(lapr+m)=gc1(i,j,k+1)
        end if
        ja(m+lapr)=ind(m+mgc)
        ia(m+lapr)=ind(m)
   end if

    lapr=ma1+mb1+mc
    if (mbb2+m<=lm) then
        if (k+1>kbm1.or.j-1<2) then
            apr(lapr+m)=0.0            
       else
            apr(lapr+m)=bb2(i,j-1,k+1)
        end if 

        ja(m+lapr)=ind(m+mbb2)
       ia(m+lapr)=ind(m)
   end if

    lapr=ma1+mb1+mc+mb2
    if (maa2+m<=lm) then 
        if (k+1>kbm1.or.i-1<2) then
            apr(lapr+m)=0.0
        else
            apr(lapr+m)=aa2(i-1,j,k+1)
        endif
        ja(m+lapr)=ind(m+maa2)
        ia(m+lapr)=ind(m)
   end if

    lapr=ma1+mb1+mc+mb2+ma2
    if (mga+m<=lm) then
       if (i+1>imm1) then
            apr(lapr+m)=0.0
       else  
            apr(lapr+m)=ga1(i+1,j,k)
       end if
        ja(m+lapr)=ind(m+mga)
       ia(m+lapr)=ind(m)
   end if

   lapr=ma1+mb1+mc+mb2+ma2+ma
    if (1+m<=lm) then
        if (j+1>jmm1) then
            apr(lapr+m)=0.0
        else 
            apr(lapr+m)=gb1(i,j+1,k)
        end if
         ja(m+lapr)=ind(m+mgb)
        ia(m+lapr)=ind(m)
   end if

    lapr=ma1+mb1+mc+mb2+ma2+ma+mb
    apr(lapr+m)=gen(i,j,k)
    ja(m+lapr)=ind(m)
    ia(m+lapr)=ind(m)
    lapr=ma1+mb1+mc+mb2+ma2+ma+mb+lm
    if (m-mgb>=1) then
        if (j-1<2) then
            apr(m-mgb+lapr)=0.0
       else 
           apr(m-mgb+lapr)=gb2(i,j-1,k)
       end if
       ja(m-mgb+lapr)=ind(m-mgb)
        ia(m-mgb+lapr)=ind(m)
   end if

     lapr=ma1+mb1+mc+mb2+ma2+ma+2*mb+lm
    if (m-mga>=1) then
        if (i-1<2) then
           apr(m-mga+lapr)=0.0
       else
           apr(m-mga+lapr)=ga2(i-1,j,k)
       end if

        ia(m-mga+lapr)=ind(m)
        ja(m-mga+lapr)=ind(-mga+m)
    end if

      lapr=ma1+mb1+mc+mb2+ma2+2*ma+2*mb+lm
    if (m-maa2>=1) then
       if (k-1<2.or.i+1>imm1) then
            apr(m-maa2+lapr)=0.0
        else
            apr(m-maa2+lapr)=aa3(i+1,j,k-1)
        endif

        ia(m-maa2+lapr)=ind(m)
        ja(m-maa2+lapr)=ind(-maa2+m)
   end if

    lapr=ma1+mb1+mc+mb2+2*ma2+2*ma+2*mb+lm
    if (m-mbb2>=1) then
        if (j+1>jmm1.or.k-1<2) then
            apr(m-mbb2+lapr)=0.0
        else
           apr(m-mbb2+lapr)=bb3(i,j+1,k-1)
       endif

        ia(m-mbb2+lapr)=ind(m)
        ja(m-mbb2+lapr)=ind(-mbb2+m)
   end if

     lapr=ma1+mb1+mc+2*mb2+2*ma2+2*ma+2*mb+lm

    if(m-mgc>=1)then
        if (k-1>kbm1) then
           apr(m-mgc+lapr)=0.0
        else
           apr(m-mgc+lapr)=gc2(i,j,k-1)
       end if
       ja(m-mgc+lapr)=ind(m-mgc)
       ia(m-mgc+lapr)=ind(m)
   end if

      lapr=ma1+mb1+2*mc+2*mb2+2*ma2+2*ma+2*mb+lm
    if (m-mbb1>=1) then
        if (j-1<2.or.k-1<2)then
            apr(m-mbb1+lapr)=0.0
        else
              apr(m-mbb1+lapr)=bb4(i,j-1,k-1)
        endif

        ia(m-mbb1+lapr)=ind(m)
        ja(m-mbb1+lapr)=ind(m-mbb1)
   end if

     lapr=ma1+2*mb1+2*mc+2*mb2+2*ma2+2*ma+2*mb+lm
   if (m-maa1>=1) then
       if (i-1<2.or.k-1<2)then
           apr(m-maa1+lapr)=0.0
       else
           apr(m-maa1+lapr)=aa4(i-1,j,k-1)
       endif

        ia(m-maa1+lapr)=ind(m)
        ja(m-maa1+lapr)=ind(m-maa1)
   end if

99  continue

    call annotate_site_end

However, when I run it, I get this SIGSEGV:

forrtl: severe (174): SIGSEGV, segmentation fault occurred
Image                       PC                             Routine                     Line        Source             
nohydropom_intel    000000000044B7F3  Unknown               Unknown  Unknown
libpthread-2.23.s      00007F1CF2551390  Unknown               Unknown  Unknown
nohydropom_intel    000000000042BEBD  pressure1_                       92  pressure1.for
nohydropom_intel    000000000040C755  MAIN__.R                        614  Main.for
nohydropom_intel    0000000000403D32  Unknown                 Unknown  Unknown
libc-2.23.so              00007F1CF1F92830  __libc_start_main    Unknown  Unknown
nohydropom_intel    0000000000403C29  Unknown                  Unknown  Unknown
 

I debugged my code. I set a breakpoint at line 614 of Main.for, and found that the SIGSEGV appears to involve one of the arguments, which is an array.

Breakpoint 1, main () at ../Main.for:614
614        call pressure1(dti,q)
(gdb) info address dti
No symbol "dti" in current context.
(gdb) info address q
Symbol "q" is static storage at address 0x1088280.
(gdb) print dti
No symbol "dti" in current context.
(gdb) print q
$1 = (( ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...) ...) ...)
(gdb) info line 614
Line 614 of "../Main.for" starts at address 0x40c743 <main+35251> and ends at 0x40c755 <main+35269>.
(gdb) disas 0x40c743, 0x40c755
Dump of assembler code from 0x40c743 to 0x40c755:
=> 0x000000000040c743 <main+35251>:    mov    $0x6458900,%edi
   0x000000000040c748 <main+35256>:    mov    $0x1088280,%esi
   0x000000000040c74d <main+35261>:    vzeroupper 
   0x000000000040c750 <main+35264>:    callq  0x42bb30 <pressure1>
End of assembler dump.
(gdb) continue
Continuing.

Program received signal SIGSEGV, Segmentation fault.
0x000000000042bebd in pressure1 (dt2=1.9762625833649862e-323, q1=<error reading variable: Cannot access memory at address 0x2>)
    at ../pressure1.for:92
92        allocate(apr(n_apr))
(gdb) info stack
#0  0x000000000042bebd in pressure1 (dt2=1.9762625833649862e-323, q1=<error reading variable: Cannot access memory at address 0x2>)
    at ../pressure1.for:92
#1  0x000000000040c755 in main () at ../Main.for:614
#2  0x0000000000403d32 in main ()
(gdb) info frame
Stack level 0, frame at 0x7fffffffbf80:
 rip = 0x42bebd in pressure1 (../pressure1.for:92); saved rip = 0x40c755
 called by frame at 0x7fffffffc400
 source language fortran.
 Arglist at 0x7fffffffbf70, args: dt2=1.9762625833649862e-323, q1=<error reading variable: Cannot access memory at address 0x2>
 Locals at 0x7fffffffbf70, Previous frame's sp is 0x7fffffffbf80
 Saved registers:
  rbx at 0x7fffffffbf38, rbp at 0x7fffffffbf70, r12 at 0x7fffffffbf58, r13 at 0x7fffffffbf50, r14 at 0x7fffffffbf48,
  r15 at 0x7fffffffbf40, rip at 0x7fffffffbf78
(gdb) info address q1
Symbol "q1" is a complex DWARF expression:
     0: DW_OP_breg4 0 [$rsi]
.
(gdb) whatis q1
type = REAL(8) (400,6,80)
(gdb) up
#1  0x000000000040c755 in main () at ../Main.for:614
614        call pressure1(dti,q)
(gdb) whatis q
type = REAL(8) (400,6,80)

 

When I delete the !DIR$ SIMD directive and recompile, my code runs. This is the content of my makefile:

 

# Build description for the nohydropom_intel executable.
# Sources are the ../*.for files; objects are written to the current directory.

# Name of the executable produced by the link rule below.
EXE= nohydropom_intel
# Compiler driver used for both compiling and linking.
FC= ifort
# Flags passed to every compile step AND to the link step (appended to any
# FFLAGS value already set by the caller/environment because of "+=").
FFLAGS+= -O2 -m64 -mavx -mtune=core-avx-i -axAVX -real-size 64 -fp-model precise -fp-model source \
-fast-transcendentals -fimf-use-svml=true -fma -g -ipo -qopt-report=5 \
 -traceback 
# Alternative flag/library settings kept for reference (currently disabled).
#FFLAGS+= -O2 -xHost -real-size 64 -parallel -ipo -fstack-protector-all
#LDFLAGS = -lslatec -llapack
#LIBDIR = -L/usr/local/lib -L/usr/lib/lapack
# Library, library search path and include path taken from /opt/intel/advisor
# (the loop in the post calls annotate_* routines).
LDFLAGS = -ladvisor
LIBDIR = -L/opt/intel/advisor/lib64
INCDIR = -I/opt/intel/advisor/include/intel64

# Object files that make up the executable; each is built from ../<name>.for.
OBJS = \
Advsm.o    Subr.o    Bcond1.o    Vertstruct.o    S_t_subr.o \
Coef.o    Main.o    ztosig.o    pprint.o    pressure1.o    Wveloc.o \
Depth.o    seamount.o    Liadv.o    Slap.o

# Link step: all objects ($^) are linked into $(EXE) using the same FFLAGS.
${EXE}:  ${OBJS}
    $(FC) $(FFLAGS) -o $(EXE) $^ $(INCDIR) $(LIBDIR) $(LDFLAGS)

# Static pattern rule: compile ../<name>.for ($<) into <name>.o ($@).
${OBJS}: %.o: ../%.for
    ${FC} ${FFLAGS} -c -o $@ $< $(INCDIR)

# Remove all object files and the executable.
clean:
    rm -f *.o $(EXE)

 

I have many questions about this issue. The variables in the !DIR$ SIMD directive are not related to q1. I checked the declarations of q in Main.for and of q1 in pressure1.for, and they are declared identically:

Main.for => DIMENSION  q(im,jm,kb)

pressure1.for  =>   dimension q1(im,jm,kb)

im, jm, kb are defined as   PARAMETER (IM=400,JM=6,KB=80,ks=80)   in a file called comblk98.h.

So, I don't think this is an issue with my code. Somehow, the compiler seems to be mishandling things when the !DIR$ SIMD directive is present. I wonder if any of you know whether the SIMD directive affects the way an array is passed to a subroutine. If not, I will open a ticket with Intel.

 

By the way, this is my environment

epardo@epardohome:~/nohydro/intel$ ifort -v
ifort version 19.0.3.199
epardo@epardohome:~/nohydro/intel$ uname -a
Linux epardohome 4.4.0-128-generic #154-Ubuntu SMP Fri May 25 14:15:18 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux
epardo@epardohome:~/nohydro/intel$ cat /etc/os-release 
NAME="Ubuntu"
VERSION="16.04.6 LTS (Xenial Xerus)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 16.04.6 LTS"
VERSION_ID="16.04"
HOME_URL="http://www.ubuntu.com/"
SUPPORT_URL="http://help.ubuntu.com/"
BUG_REPORT_URL="http://bugs.launchpad.net/ubuntu/"
VERSION_CODENAME=xenial
UBUNTU_CODENAME=xenial

0 Kudos
9 Replies
Pardo_Arroyo__Ernest
688 Views

I have opened ticket 04186524 to Intel.

0 Kudos
mecej4
Honored Contributor III
688 Views

The traceback that you provided in #1 clearly shows that the last source line executed was in pressure1.for, namely, line 92. The hundreds of lines of source code that you provided do not tell us what was on that line, and there is insufficient information to guess whether the problem is with the code in the subroutine, the arguments passed to the subroutine, or some combination of both.

The reported "Cannot access memory at address 0x2" is also an important clue that should be followed up. That sort of address should not be referenced in a user mode program.

I suggest that you run a serial version of your program with the same input data and with similar optimization levels as with the parallel version. Until such a run works and yields reasonable results, the complications associated with parallel code are tough to understand and investigate.

0 Kudos
Pardo_Arroyo__Ernest
688 Views

Hi, I have uploaded the Main.for, pressure1.for, and makefile files. I didn't include any parallel or multithread options in the makefile, so I assume that my code was compiled as serial. Please let me know if you need more files for your analysis. 

0 Kudos
mecej4
Honored Contributor III
688 Views

The include file 'comblk98.h' probably contains some type declarations that are pertinent.  Do you use implicit typing in your sources?

How complicated is the source for advisor_annotate.mod? In other words, would it be asking too much to request that you provide all the sources needed for an independent compilation of the source file pressure1.for?

There are some instances of nonstandard Fortran expressions, such as '+' followed by '-' in lines 421, 422 of main.for: ...ADVUA(I,J) +- ARU(I,J)... I hope that you are aware of the implications of such usage.

0 Kudos
Pardo_Arroyo__Ernest
688 Views

I have uploaded all my code in a zip. For your analysis, you must use the makefile in the release directory to compile the code (that is the one that builds the executable that is crashing). My code is very old (Fortran 77), so it uses implicit typing. I don't know how complicated the source of advisor_annotate is, since it is part of Intel Advisor 19 Update 3 (a tool that I was using to guide me in vectorizing and parallelizing my code). However, I will attach the source files for advisor_annotate too. I will check the lines that you mentioned later.

Thanks.

0 Kudos
jimdempseyatthecove
Honored Contributor III
688 Views

FWIW

SIMD generated code performs the calculations on all lanes of the SIMD (small vector) with applicable mask (depending on instruction set) and then performs a masked store.

In the code presented in post #1 you have exception sections for the perimeter (surface boundary) of the volume being computed. It would be more (most) efficient to compute (SIMD) the interior volume separated from the (scalar) perimeter (surface boundary) and thus eliminate complications of having the boundary tests (and unnecessary code that is masked out).

Jim Dempsey 

0 Kudos
mecej4
Honored Contributor III
688 Views

Ernesto, the code that you provided in #6 has bugs, I think. For instance, at the point where Seamount() is called from main.f, the array variables X and Y are undefined, but their values are used in the double DO loops after the call to DEPTH() in Seamount.f. Do you agree?

0 Kudos
Pardo_Arroyo__Ernest
687 Views

Hi mecej4. You are right about the undefined variables in seamount; that should be marked as a bug, and I will try to fix it. You were also right regarding the signs in the expression that you mentioned in post #5. Regarding post #7 from Jim, although I'm not very familiar with SIMD programming, his comments have shown me that there could be a better way to restructure my code to improve optimization. Thanks to both of you.

0 Kudos
jimdempseyatthecove
Honored Contributor III
687 Views

>> although I'm not very familiar with SIMD programming

Perhaps something to provide insight will help.

SIMD = Single Instruction Multiple Data aka vector operations.

In a loop that is vectorized something like this happens:

...
if(simpleLogicalExpression) then
   out(i) = ExpressionSuitableForVectorization
   Also(i) = ...
   ThisToo(i) = ...
else
   out(i) = OtherExpressionSuitableForVectorization
   Also(i) = Other...
   ThisToo(i) = Other...
endif

Becomes: (sketch code)

   temp1A = ExpressionSuitableForVectorization
   temp2A = ...
   temp3A = ...
   temp1B = OtherExpressionSuitableForVectorization
   temp2B = Other...
   Temp3B = Other...
   mask = (simpleLogicalExpression)
   out(i) = (mask) ? temp1A : temp1B ! like in C++
   Also(i) = (mask) ? temp2A : temp2B
   ThisToo(i) = (mask) ? temp3A : temp3B

Note, both branches of the IF statement are evaluated into temporary registers (across the width of the SIMD vector)
Then a mask is made  (across the width of the SIMD vector)
Then two masked moves are made  (across the width of the SIMD vector)

Any improvement in vectorization comes at the expense of computing both halves of the IF statement plus the mask creation and additional conditional move. The benefit is there is no out of line branching for the code run.

With simple (one or two) statements on each branch of the IF you generally see a net benefit. However, in problems such as yours, typically only one branch is executed at the periphery, and the other branch is executed in the interior. With this in mind, it makes sense to "effectively" duplicate the code inclusive of only the appropriate branch, one section for the periphery, and one for the interior.

Jim Dempsey

0 Kudos
Reply