Intel® Fortran Compiler
Build applications that can scale for the future with optimized code designed for Intel® Xeon® and compatible processors.
28446 Discussions

stack overflow caused by !DIR$ SIMD LASTPRIVATE REDUCTION

Pardo_Arroyo__Ernest
243 Views

I analyzed my code with Intel Advisor 2019. Then, I decided to try the vectorize the most consuming CPU loop of my code. Hence, I compiled it with /Qopt-report:5 option as guide to vectorize the loop. Finally, I was able to vectorize with the next directive

m=0.0 


		! form banded matrix of Puasson equastion
	pok=3.
	call annotate_site_begin( "pressure" )
      call annotate_iteration_task( "pressure-task" )
	do 99 k=2,kbm1
! REVERTED Change the order of the do, so they can fit the model of array in fortran   
      
      do 99 j=2,jmm1
!DIR$ SIMD LASTPRIVATE(bb2) REDUCTION(+:m, gc2, gc1, gen)
      do 99 i=2,imm1
           	
      if (k+1<=kb.and.i+1<=im)then
	 aa1(i+1,j,k+1)=.25e0*aaf(i+1,j,k+1)
	1 *.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)/art(i,j)
	1 +.25e0*aaf(i,j,k+1)*dq(i+1,j)/dq(i,j)
	1 *dy(i,j)/ddx(i,j)/art(i,j)	
	endif
	if (k+1<=kb.and.i-1>=1)then
	 aa2(i-1,j,k+1)=-.25e0*aaf(i-1,j,k+1)
	1 *.5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)/art(i,j)
	1 -.25e0*aaf(i,j,k+1)*dq(i-1,j)/dq(i,j)
	1 *dy(i,j)/ddx(i,j)/art(i,j)	
	end if
	if (k-1>=1.and.i+1<=im)then
	 aa3(i+1,j,k-1)=-.25e0*aaf(i+1,j,k-1)
	1 *.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)/art(i,j)
	1 -.25e0*aaf(i,j,k-1)*dq(i+1,j)/dq(i,j)
	1 *dy(i,j)/ddx(i,j)/art(i,j)	
	end if
	if (k-1>=1.and.i-1>=1)then
	 aa4(i-1,j,k-1)=.25e0*aaf(i-1,j,k-1)
	1 *.5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)/art(i,j)
	1 +.25e0*aaf(i,j,k-1)*dq(i-1,j)/dq(i,j)
	1 *dy(i,j)/ddx(i,j)/art(i,j)	
	end if
	if (k+1<=kb.and.j+1<=jm)then
	 bb1(i,j+1,k+1)=.25e0*bbf(i,j+1,k+1)
	1 *.5*(dx(i,j)+dx(i,j+1))/ddy(i,j)/art(i,j)
	1 +.25e0*bbf(i,j,k+1)*dq(i,j+1)/dq(i,j)
	1 *dx(i,j)/ddy(i,j)/art(i,j)	
	end if
	if (k+1<=kb.and.j-1>=1)then
	 bb2(i,j-1,k+1)=-.25e0*bbf(i,j-1,k+1)
	1 *.5*(dx(i,j)+dx(i,j-1))/ddy(i,j-1)/art(i,j)
	1 -.25e0*bbf(i,j,k+1)*dq(i,j-1)/dq(i,j)
	1 *dx(i,j)/ddy(i,j)/art(i,j)	
	end if
	if (k-1>=1.and.j+1<=jm)then
	 bb3(i,j+1,k-1)=-.25e0*bbf(i,j+1,k-1)
	1 *.5*(dx(i,j)+dx(i,j+1))/ddy(i,j)/art(i,j)
	1 -.25e0*bbf(i,j,k-1)*dq(i,j+1)/dq(i,j)
	1 *dx(i,j)/ddy(i,j)/art(i,j)	
	end if
	if (k-1>=1.and.j-1>=1) then
	 bb4(i,j-1,k-1)=.25e0*bbf(i,j-1,k-1)
	1 *.5*(dx(i,j)+dx(i,j-1))/ddx(i,j-1)/art(i,j)
	1 +.25e0*bbf(i,j,k-1)*dq(i,j-1)/dq(i,j)
	1 *dx(i,j)/ddy(i,j)/art(i,j)	
	end if
	if (i+1<=im) then
	 ga1(i+1,j,k)=dz(k)*dq(i+1,j)
	1 *.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)/art(i,j)
	end if
	if (i-1>=1) then
	 ga2(i-1,j,k)=dz(k)*dq(i-1,j)
	1 *.5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)/art(i,j)
	end if
	if (j+1<=jm) then
	 gb1(i,j+1,k)=dz(k)*dq(i,j+1)
	1 *.5*(dx(i,j)+dx(i,j+1))/ddy(i,j)/art(i,j)
	end if
	if (j-1>=1) then
	 gb2(i,j-1,k)=dz(k)*dq(i,j-1)
	1 *.5*(dx(i,j)+dx(i,j-1))/ddy(i,j-1)/art(i,j)
	end if
      if (k+1<=kb) then
	 gc1(i,j,k+1)=1.e0/(dzz(k)*dq(i,j))*
	1 (art(i,j)+.5*(aaf(i,j,k+1)+aaf(i,j,k))*aaf(i,j,k+1)*dy(i,j)/
     1 dx(i,j)+.5*(bbf(i,j,k+1)+bbf(i,j,k))*bbf(i,j,k+1)*dx(i,j)/
	1 dy(i,j))/art(i,j)
	end if
      if (k-1>=1) then
	 gc2(i,j,k-1)=1.e0/(dzz(k-1)*dq(i,j))*
	1 (art(i,j)+.5*(aaf(i,j,k-1)+aaf(i,j,k))*aaf(i,j,k-1)*dy(i,j)/
	1 dx(i,j)+.5*(bbf(i,j,k-1)+bbf(i,j,k))*bbf(i,j,k-1)*dx(i,j)/
	1 dy(i,j))/art(i,j)
	end if
!	if(iint==5)stop
	if (i-1>=1.and.j-1>=1.and.k-1>=1.) then
	 gen(i,j,k)=(-dq(i,j)*dz(k)*(.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)+
	1 .5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)+
	1 .5*(dx(i,j)+dx(i,j+1))/ddy(i,j)+
	1 .5*(dx(i,j)+dx(i,j-1))/ddy(i,j-1)))/art(i,j)-
     2 (1.e0/dzz(k-1)+1.e0/dzz(k))/dq(i,j)*(art(i,j)
	1 +.5*(aaf(i,j,k+1)+aaf(i,j,k))*aaf(i,j,k)*dy(i,j)/dx(i,j)+
	1 .5*(bbf(i,j,k+1)+bbf(i,j,k))*bbf(i,j,k)*dx(i,j)/dy(i,j)
	1 )/art(i,j)
	else
	 gen(i,j,k)=(-dq(i,j)*dz(k)*(dy(i,j)/ddx(i,j)+
	1 dy(i,j)/ddx(i-1,j)+
	1 dx(i,j)/ddy(i,j)+
	1 dx(i,j)/ddy(i,j-1)))/art(i,j)-
     2 (2.e0/dzz(k))/dq(i,j)*
	1 (art(i,j)+.5*(aaf(i,j,k+1)+aaf(i,j,k))*aaf(i,j,k)*dy(i,j)/dx(i,j)+
	1 .5*(bbf(i,j,k+1)+bbf(i,j,k))*bbf(i,j,k)*dx(i,j)/dy(i,j)
	1 )/art(i,j)	
      end if	
	
	if (((k-1)*(k-kb)*(i-1)*(i-im)*(j-1)*(j-jm)).ne.0) then
	    m=m+1

		if (k==kbm1) then
	        ga1(i+1,j,k)=ga1(i+1,j,k)+aa1(i+1,j,k+1)
	        ga2(i-1,j,k)=ga2(i-1,j,k)+aa2(i-1,j,k+1)
	        gb1(i,j+1,k)=gb1(i,j+1,k)+bb1(i,j+1,k+1)
	        gb2(i,j-1,k)=gb2(i,j-1,k)+bb2(i,j-1,k+1)
	        gen(i,j,k)=gen(i,j,k)+gc1(i,j,k+1)
          endif
          
	    if (k==2) then !¸òþñþôýð  ÿþòõ¨¿ýþ¸ª¹
	        aa2(i+1,j,k-1)=0.
	        bb2(i,j+1,k-1)=0.
	        bb4(i,j-1,k-1)=0.
	        gc2(i,j,k-1)=0.
	    endif

	    if (i==2) then
	        gc2(i,j,k-1)=gc2(i,j,k-1)+aa4(i-1,j,k-1)
	        gc1(i,j,k+1)=gc1(i,j,k+1)+aa2(i-1,j,k+1)
	        gen(i,j,k)=gen(i,j,k)+ga2(i-1,j,k)
          endif
	
          if (i==imm1) then
	        gc2(i,j,k-1)=gc2(i,j,k-1)+aa3(i+1,j,k-1)
	        gc1(i,j,k+1)=gc1(i,j,k+1)+aa1(i+1,j,k+1)
	        gen(i,j,k)=gen(i,j,k)+ga1(i+1,j,k)
          endif
          
	    if (j==2) then
	        gc2(i,j,k-1)=gc2(i,j,k-1)+bb4(i,j-1,k-1)
	        gc1(i,j,k+1)=gc1(i,j,k+1)+bb2(i,j-1,k+1)
	        gen(i,j,k)=gen(i,j,k)+gb2(i,j-1,k) 
          endif
          
	    if (j==jmm1) then
	        gc2(i,j,k-1)=gc2(i,j,k-1)+bb3(i,j+1,k-1)
	        gc1(i,j,k+1)=gc1(i,j,k+1)+bb1(i,j+1,k+1)
	        gen(i,j,k)=gen(i,j,k)+gb1(i,j+1,k) 
	    endif
	endif

	if (maa1+m<=lm) then
		if (k+1>kbm1.or.i+1>imm1) then
		    apr(m)=0.0	
		else
		    apr(m)=aa1(i+1,j,k+1)
		end if
		ja(m)=ind(m+maa1)
		ia(m)=ind(m)
	end if
	lapr=ma1
	if (mbb1+m<=lm) then
		if (k+1>kbm1.or.j+1>jmm1) then
		    apr(lapr+m)=0.0
	    else
		    apr(lapr+m)=bb1(i,j+1,k+1)
		end if
		ja(m+lapr)=ind(m+mbb1)
		ia(m+lapr)=ind(m)
	end if
	lapr=ma1+mb1
	if (mgc+m<=lm) then  
		if (k+1>kbm1) then
		    apr(lapr+m)=0.0
	    else
		    apr(lapr+m)=gc1(i,j,k+1)
		end if
		ja(m+lapr)=ind(m+mgc)
		ia(m+lapr)=ind(m)
	end if
	lapr=ma1+mb1+mc
	if (mbb2+m<=lm) then
		if (k+1>kbm1.or.j-1<2) then
		    apr(lapr+m)=0.0           	
		else
		    apr(lapr+m)=bb2(i,j-1,k+1)
		end if	
		ja(m+lapr)=ind(m+mbb2)
		ia(m+lapr)=ind(m)
	end if
	lapr=ma1+mb1+mc+mb2
	if (maa2+m<=lm) then  
		if (k+1>kbm1.or.i-1<2) then
		    apr(lapr+m)=0.0
		else
		    apr(lapr+m)=aa2(i-1,j,k+1)
		endif
		ja(m+lapr)=ind(m+maa2)
		ia(m+lapr)=ind(m)
	end if
	lapr=ma1+mb1+mc+mb2+ma2
	if (mga+m<=lm) then
		if (i+1>imm1) then 
		    apr(lapr+m)=0.0
		else   
		    apr(lapr+m)=ga1(i+1,j,k)
		end if
          ja(m+lapr)=ind(m+mga)
		ia(m+lapr)=ind(m)
	end if
	lapr=ma1+mb1+mc+mb2+ma2+ma
	if (1+m<=lm) then 
		if (j+1>jmm1) then 
		    apr(lapr+m)=0.0
		else  
		    apr(lapr+m)=gb1(i,j+1,k)
		end if
          ja(m+lapr)=ind(m+mgb)
		ia(m+lapr)=ind(m)
	end if 
	lapr=ma1+mb1+mc+mb2+ma2+ma+mb
	apr(lapr+m)=gen(i,j,k)
	ja(m+lapr)=ind(m)
	ia(m+lapr)=ind(m)
	lapr=ma1+mb1+mc+mb2+ma2+ma+mb+lm
	if (m-mgb>=1) then
		if (j-1<2) then 
		    apr(m-mgb+lapr)=0.0
		else  
		    apr(m-mgb+lapr)=gb2(i,j-1,k)
		end if
		ja(m-mgb+lapr)=ind(m-mgb)
	    ia(m-mgb+lapr)=ind(m)
	end if
      lapr=ma1+mb1+mc+mb2+ma2+ma+2*mb+lm
	if (m-mga>=1) then
		if (i-1<2) then 
		    apr(m-mga+lapr)=0.0
		else
		    apr(m-mga+lapr)=ga2(i-1,j,k)
		end if
		ia(m-mga+lapr)=ind(m)
	    ja(m-mga+lapr)=ind(-mga+m)
	end if
      lapr=ma1+mb1+mc+mb2+ma2+2*ma+2*mb+lm
	if (m-maa2>=1) then 
		if (k-1<2.or.i+1>imm1) then
		    apr(m-maa2+lapr)=0.0
		else
		    apr(m-maa2+lapr)=aa3(i+1,j,k-1)
		endif
		ia(m-maa2+lapr)=ind(m)
	    ja(m-maa2+lapr)=ind(-maa2+m)
	end if
      lapr=ma1+mb1+mc+mb2+2*ma2+2*ma+2*mb+lm
	if (m-mbb2>=1) then 
		if (j+1>jmm1.or.k-1<2) then
		    apr(m-mbb2+lapr)=0.0
		else
		    apr(m-mbb2+lapr)=bb3(i,j+1,k-1)
		endif
		ia(m-mbb2+lapr)=ind(m)
	    ja(m-mbb2+lapr)=ind(-mbb2+m)
	end if
      lapr=ma1+mb1+mc+2*mb2+2*ma2+2*ma+2*mb+lm
	if(m-mgc>=1)then
		if (k-1>kbm1) then
		    apr(m-mgc+lapr)=0.0
		else 
		    apr(m-mgc+lapr)=gc2(i,j,k-1)
		end if
		ja(m-mgc+lapr)=ind(m-mgc)
	    ia(m-mgc+lapr)=ind(m)
	end if
      lapr=ma1+mb1+2*mc+2*mb2+2*ma2+2*ma+2*mb+lm
	if (m-mbb1>=1) then 
		if (j-1<2.or.k-1<2)then
		    apr(m-mbb1+lapr)=0.0
		else
              apr(m-mbb1+lapr)=bb4(i,j-1,k-1)
		endif
		ia(m-mbb1+lapr)=ind(m)
	    ja(m-mbb1+lapr)=ind(m-mbb1)
	end if
      lapr=ma1+2*mb1+2*mc+2*mb2+2*ma2+2*ma+2*mb+lm
	if (m-maa1>=1) then 
		if (i-1<2.or.k-1<2)then
		    apr(m-maa1+lapr)=0.0
		else
		    apr(m-maa1+lapr)=aa4(i-1,j,k-1)
		endif
		ia(m-maa1+lapr)=ind(m)
	    ja(m-maa1+lapr)=ind(m-maa1)
	end if

99	continue
	call annotate_site_end

However, when I run my code I'm getting a stack overflow exception

 

forrtl: severe (170): Program Exception - stack overflow
Image              PC                Routine         Line     Source      
nohydroPOM.exe     00007FF7B4D83E38  Unknown         Unknown  Unknown
nohydroPOM.exe     00007FF7B4D64484  PRESSURE1            1   pressure1.for
nohydroPOM.exe     00007FF7B4D4EABB  MAIN__             620  Main.for
nohydroPOM.exe     00007FF7B4D83C02  Unknown        Unknown  Unknown
nohydroPOM.exe     00007FF7B4D8406D  Unknown        Unknown  Unknown
KERNEL32.DLL       00007FFD4FE14034  Unknown        Unknown  Unknown
ntdll.dll          00007FFD500F3691  Unknown        Unknown  Unknown

I noticed some messages like this

pressure1.for(273,7):remark #15329: vectorization support: irregularly indexed store was emulated for the variable <APR(M)>, masked, part of index is linear but may overflow

I have looking for more information about this message and how to fix it, but I haven't had any success so far. I have been looking for ways to troubleshoot stack overflow issues and tried some solutions -like increasing the size of stack, using heaparrays option, etc- but I'm still getting the same error. I will include the whole solution files in this post. I'm open to any suggestion about how to vectorize this loop. I will appreciate if someone provide more information about diagnostic message 15329 too.

My environment is 

Windows 10 Home Version 10.0.17134 Build 17134

Visual Studio Community 2015 Version 14.0.25431.01 Update 3

Intel Parallel Studio XE 2019 Update 3 Cluster Edition for Windows

 

Regards,

0 Kudos
0 Replies
Reply