Intel® Fortran Compiler
Build applications that can scale for the future with optimized code designed for Intel® Xeon® and compatible processors.
Announcements
Welcome to the Intel Community. If you get an answer you like, please mark it as an Accepted Solution to help others. Thank you!
26736 Discussions

stack overflow caused by !DIR$ SIMD LASTPRIVATE REDUCTION

Pardo_Arroyo__Ernest
93 Views

I analyzed my code with Intel Advisor 2019. Then, I decided to try the vectorize the most consuming CPU loop of my code. Hence, I compiled it with /Qopt-report:5 option as guide to vectorize the loop. Finally, I was able to vectorize with the next directive

m=0.0 


		! form banded matrix of Puasson equastion
	pok=3.
	call annotate_site_begin( "pressure" )
      call annotate_iteration_task( "pressure-task" )
	do 99 k=2,kbm1
! REVERTED Change the order of the do, so they can fit the model of array in fortran   
      
      do 99 j=2,jmm1
!DIR$ SIMD LASTPRIVATE(bb2) REDUCTION(+:m, gc2, gc1, gen)
      do 99 i=2,imm1
           	
      if (k+1<=kb.and.i+1<=im)then
	 aa1(i+1,j,k+1)=.25e0*aaf(i+1,j,k+1)
	1 *.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)/art(i,j)
	1 +.25e0*aaf(i,j,k+1)*dq(i+1,j)/dq(i,j)
	1 *dy(i,j)/ddx(i,j)/art(i,j)	
	endif
	if (k+1<=kb.and.i-1>=1)then
	 aa2(i-1,j,k+1)=-.25e0*aaf(i-1,j,k+1)
	1 *.5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)/art(i,j)
	1 -.25e0*aaf(i,j,k+1)*dq(i-1,j)/dq(i,j)
	1 *dy(i,j)/ddx(i,j)/art(i,j)	
	end if
	if (k-1>=1.and.i+1<=im)then
	 aa3(i+1,j,k-1)=-.25e0*aaf(i+1,j,k-1)
	1 *.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)/art(i,j)
	1 -.25e0*aaf(i,j,k-1)*dq(i+1,j)/dq(i,j)
	1 *dy(i,j)/ddx(i,j)/art(i,j)	
	end if
	if (k-1>=1.and.i-1>=1)then
	 aa4(i-1,j,k-1)=.25e0*aaf(i-1,j,k-1)
	1 *.5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)/art(i,j)
	1 +.25e0*aaf(i,j,k-1)*dq(i-1,j)/dq(i,j)
	1 *dy(i,j)/ddx(i,j)/art(i,j)	
	end if
	if (k+1<=kb.and.j+1<=jm)then
	 bb1(i,j+1,k+1)=.25e0*bbf(i,j+1,k+1)
	1 *.5*(dx(i,j)+dx(i,j+1))/ddy(i,j)/art(i,j)
	1 +.25e0*bbf(i,j,k+1)*dq(i,j+1)/dq(i,j)
	1 *dx(i,j)/ddy(i,j)/art(i,j)	
	end if
	if (k+1<=kb.and.j-1>=1)then
	 bb2(i,j-1,k+1)=-.25e0*bbf(i,j-1,k+1)
	1 *.5*(dx(i,j)+dx(i,j-1))/ddy(i,j-1)/art(i,j)
	1 -.25e0*bbf(i,j,k+1)*dq(i,j-1)/dq(i,j)
	1 *dx(i,j)/ddy(i,j)/art(i,j)	
	end if
	if (k-1>=1.and.j+1<=jm)then
	 bb3(i,j+1,k-1)=-.25e0*bbf(i,j+1,k-1)
	1 *.5*(dx(i,j)+dx(i,j+1))/ddy(i,j)/art(i,j)
	1 -.25e0*bbf(i,j,k-1)*dq(i,j+1)/dq(i,j)
	1 *dx(i,j)/ddy(i,j)/art(i,j)	
	end if
	if (k-1>=1.and.j-1>=1) then
	 bb4(i,j-1,k-1)=.25e0*bbf(i,j-1,k-1)
	1 *.5*(dx(i,j)+dx(i,j-1))/ddx(i,j-1)/art(i,j)
	1 +.25e0*bbf(i,j,k-1)*dq(i,j-1)/dq(i,j)
	1 *dx(i,j)/ddy(i,j)/art(i,j)	
	end if
	if (i+1<=im) then
	 ga1(i+1,j,k)=dz(k)*dq(i+1,j)
	1 *.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)/art(i,j)
	end if
	if (i-1>=1) then
	 ga2(i-1,j,k)=dz(k)*dq(i-1,j)
	1 *.5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)/art(i,j)
	end if
	if (j+1<=jm) then
	 gb1(i,j+1,k)=dz(k)*dq(i,j+1)
	1 *.5*(dx(i,j)+dx(i,j+1))/ddy(i,j)/art(i,j)
	end if
	if (j-1>=1) then
	 gb2(i,j-1,k)=dz(k)*dq(i,j-1)
	1 *.5*(dx(i,j)+dx(i,j-1))/ddy(i,j-1)/art(i,j)
	end if
      if (k+1<=kb) then
	 gc1(i,j,k+1)=1.e0/(dzz(k)*dq(i,j))*
	1 (art(i,j)+.5*(aaf(i,j,k+1)+aaf(i,j,k))*aaf(i,j,k+1)*dy(i,j)/
     1 dx(i,j)+.5*(bbf(i,j,k+1)+bbf(i,j,k))*bbf(i,j,k+1)*dx(i,j)/
	1 dy(i,j))/art(i,j)
	end if
      if (k-1>=1) then
	 gc2(i,j,k-1)=1.e0/(dzz(k-1)*dq(i,j))*
	1 (art(i,j)+.5*(aaf(i,j,k-1)+aaf(i,j,k))*aaf(i,j,k-1)*dy(i,j)/
	1 dx(i,j)+.5*(bbf(i,j,k-1)+bbf(i,j,k))*bbf(i,j,k-1)*dx(i,j)/
	1 dy(i,j))/art(i,j)
	end if
!	if(iint==5)stop
	if (i-1>=1.and.j-1>=1.and.k-1>=1.) then
	 gen(i,j,k)=(-dq(i,j)*dz(k)*(.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)+
	1 .5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)+
	1 .5*(dx(i,j)+dx(i,j+1))/ddy(i,j)+
	1 .5*(dx(i,j)+dx(i,j-1))/ddy(i,j-1)))/art(i,j)-
     2 (1.e0/dzz(k-1)+1.e0/dzz(k))/dq(i,j)*(art(i,j)
	1 +.5*(aaf(i,j,k+1)+aaf(i,j,k))*aaf(i,j,k)*dy(i,j)/dx(i,j)+
	1 .5*(bbf(i,j,k+1)+bbf(i,j,k))*bbf(i,j,k)*dx(i,j)/dy(i,j)
	1 )/art(i,j)
	else
	 gen(i,j,k)=(-dq(i,j)*dz(k)*(dy(i,j)/ddx(i,j)+
	1 dy(i,j)/ddx(i-1,j)+
	1 dx(i,j)/ddy(i,j)+
	1 dx(i,j)/ddy(i,j-1)))/art(i,j)-
     2 (2.e0/dzz(k))/dq(i,j)*
	1 (art(i,j)+.5*(aaf(i,j,k+1)+aaf(i,j,k))*aaf(i,j,k)*dy(i,j)/dx(i,j)+
	1 .5*(bbf(i,j,k+1)+bbf(i,j,k))*bbf(i,j,k)*dx(i,j)/dy(i,j)
	1 )/art(i,j)	
      end if	
	
	if (((k-1)*(k-kb)*(i-1)*(i-im)*(j-1)*(j-jm)).ne.0) then
	    m=m+1

		if (k==kbm1) then
	        ga1(i+1,j,k)=ga1(i+1,j,k)+aa1(i+1,j,k+1)
	        ga2(i-1,j,k)=ga2(i-1,j,k)+aa2(i-1,j,k+1)
	        gb1(i,j+1,k)=gb1(i,j+1,k)+bb1(i,j+1,k+1)
	        gb2(i,j-1,k)=gb2(i,j-1,k)+bb2(i,j-1,k+1)
	        gen(i,j,k)=gen(i,j,k)+gc1(i,j,k+1)
          endif
          
	    if (k==2) then !¸òþñþôýð  ÿþòõ¨¿ýþ¸ª¹
	        aa2(i+1,j,k-1)=0.
	        bb2(i,j+1,k-1)=0.
	        bb4(i,j-1,k-1)=0.
	        gc2(i,j,k-1)=0.
	    endif

	    if (i==2) then
	        gc2(i,j,k-1)=gc2(i,j,k-1)+aa4(i-1,j,k-1)
	        gc1(i,j,k+1)=gc1(i,j,k+1)+aa2(i-1,j,k+1)
	        gen(i,j,k)=gen(i,j,k)+ga2(i-1,j,k)
          endif
	
          if (i==imm1) then
	        gc2(i,j,k-1)=gc2(i,j,k-1)+aa3(i+1,j,k-1)
	        gc1(i,j,k+1)=gc1(i,j,k+1)+aa1(i+1,j,k+1)
	        gen(i,j,k)=gen(i,j,k)+ga1(i+1,j,k)
          endif
          
	    if (j==2) then
	        gc2(i,j,k-1)=gc2(i,j,k-1)+bb4(i,j-1,k-1)
	        gc1(i,j,k+1)=gc1(i,j,k+1)+bb2(i,j-1,k+1)
	        gen(i,j,k)=gen(i,j,k)+gb2(i,j-1,k) 
          endif
          
	    if (j==jmm1) then
	        gc2(i,j,k-1)=gc2(i,j,k-1)+bb3(i,j+1,k-1)
	        gc1(i,j,k+1)=gc1(i,j,k+1)+bb1(i,j+1,k+1)
	        gen(i,j,k)=gen(i,j,k)+gb1(i,j+1,k) 
	    endif
	endif

	if (maa1+m<=lm) then
		if (k+1>kbm1.or.i+1>imm1) then
		    apr(m)=0.0	
		else
		    apr(m)=aa1(i+1,j,k+1)
		end if
		ja(m)=ind(m+maa1)
		ia(m)=ind(m)
	end if
	lapr=ma1
	if (mbb1+m<=lm) then
		if (k+1>kbm1.or.j+1>jmm1) then
		    apr(lapr+m)=0.0
	    else
		    apr(lapr+m)=bb1(i,j+1,k+1)
		end if
		ja(m+lapr)=ind(m+mbb1)
		ia(m+lapr)=ind(m)
	end if
	lapr=ma1+mb1
	if (mgc+m<=lm) then  
		if (k+1>kbm1) then
		    apr(lapr+m)=0.0
	    else
		    apr(lapr+m)=gc1(i,j,k+1)
		end if
		ja(m+lapr)=ind(m+mgc)
		ia(m+lapr)=ind(m)
	end if
	lapr=ma1+mb1+mc
	if (mbb2+m<=lm) then
		if (k+1>kbm1.or.j-1<2) then
		    apr(lapr+m)=0.0           	
		else
		    apr(lapr+m)=bb2(i,j-1,k+1)
		end if	
		ja(m+lapr)=ind(m+mbb2)
		ia(m+lapr)=ind(m)
	end if
	lapr=ma1+mb1+mc+mb2
	if (maa2+m<=lm) then  
		if (k+1>kbm1.or.i-1<2) then
		    apr(lapr+m)=0.0
		else
		    apr(lapr+m)=aa2(i-1,j,k+1)
		endif
		ja(m+lapr)=ind(m+maa2)
		ia(m+lapr)=ind(m)
	end if
	lapr=ma1+mb1+mc+mb2+ma2
	if (mga+m<=lm) then
		if (i+1>imm1) then 
		    apr(lapr+m)=0.0
		else   
		    apr(lapr+m)=ga1(i+1,j,k)
		end if
          ja(m+lapr)=ind(m+mga)
		ia(m+lapr)=ind(m)
	end if
	lapr=ma1+mb1+mc+mb2+ma2+ma
	if (1+m<=lm) then 
		if (j+1>jmm1) then 
		    apr(lapr+m)=0.0
		else  
		    apr(lapr+m)=gb1(i,j+1,k)
		end if
          ja(m+lapr)=ind(m+mgb)
		ia(m+lapr)=ind(m)
	end if 
	lapr=ma1+mb1+mc+mb2+ma2+ma+mb
	apr(lapr+m)=gen(i,j,k)
	ja(m+lapr)=ind(m)
	ia(m+lapr)=ind(m)
	lapr=ma1+mb1+mc+mb2+ma2+ma+mb+lm
	if (m-mgb>=1) then
		if (j-1<2) then 
		    apr(m-mgb+lapr)=0.0
		else  
		    apr(m-mgb+lapr)=gb2(i,j-1,k)
		end if
		ja(m-mgb+lapr)=ind(m-mgb)
	    ia(m-mgb+lapr)=ind(m)
	end if
      lapr=ma1+mb1+mc+mb2+ma2+ma+2*mb+lm
	if (m-mga>=1) then
		if (i-1<2) then 
		    apr(m-mga+lapr)=0.0
		else
		    apr(m-mga+lapr)=ga2(i-1,j,k)
		end if
		ia(m-mga+lapr)=ind(m)
	    ja(m-mga+lapr)=ind(-mga+m)
	end if
      lapr=ma1+mb1+mc+mb2+ma2+2*ma+2*mb+lm
	if (m-maa2>=1) then 
		if (k-1<2.or.i+1>imm1) then
		    apr(m-maa2+lapr)=0.0
		else
		    apr(m-maa2+lapr)=aa3(i+1,j,k-1)
		endif
		ia(m-maa2+lapr)=ind(m)
	    ja(m-maa2+lapr)=ind(-maa2+m)
	end if
      lapr=ma1+mb1+mc+mb2+2*ma2+2*ma+2*mb+lm
	if (m-mbb2>=1) then 
		if (j+1>jmm1.or.k-1<2) then
		    apr(m-mbb2+lapr)=0.0
		else
		    apr(m-mbb2+lapr)=bb3(i,j+1,k-1)
		endif
		ia(m-mbb2+lapr)=ind(m)
	    ja(m-mbb2+lapr)=ind(-mbb2+m)
	end if
      lapr=ma1+mb1+mc+2*mb2+2*ma2+2*ma+2*mb+lm
	if(m-mgc>=1)then
		if (k-1>kbm1) then
		    apr(m-mgc+lapr)=0.0
		else 
		    apr(m-mgc+lapr)=gc2(i,j,k-1)
		end if
		ja(m-mgc+lapr)=ind(m-mgc)
	    ia(m-mgc+lapr)=ind(m)
	end if
      lapr=ma1+mb1+2*mc+2*mb2+2*ma2+2*ma+2*mb+lm
	if (m-mbb1>=1) then 
		if (j-1<2.or.k-1<2)then
		    apr(m-mbb1+lapr)=0.0
		else
              apr(m-mbb1+lapr)=bb4(i,j-1,k-1)
		endif
		ia(m-mbb1+lapr)=ind(m)
	    ja(m-mbb1+lapr)=ind(m-mbb1)
	end if
      lapr=ma1+2*mb1+2*mc+2*mb2+2*ma2+2*ma+2*mb+lm
	if (m-maa1>=1) then 
		if (i-1<2.or.k-1<2)then
		    apr(m-maa1+lapr)=0.0
		else
		    apr(m-maa1+lapr)=aa4(i-1,j,k-1)
		endif
		ia(m-maa1+lapr)=ind(m)
	    ja(m-maa1+lapr)=ind(m-maa1)
	end if

99	continue
	call annotate_site_end

However, when I run my code I'm getting a stack overflow exception

 

forrtl: severe (170): Program Exception - stack overflow
Image              PC                Routine         Line     Source      
nohydroPOM.exe     00007FF7B4D83E38  Unknown         Unknown  Unknown
nohydroPOM.exe     00007FF7B4D64484  PRESSURE1            1   pressure1.for
nohydroPOM.exe     00007FF7B4D4EABB  MAIN__             620  Main.for
nohydroPOM.exe     00007FF7B4D83C02  Unknown        Unknown  Unknown
nohydroPOM.exe     00007FF7B4D8406D  Unknown        Unknown  Unknown
KERNEL32.DLL       00007FFD4FE14034  Unknown        Unknown  Unknown
ntdll.dll          00007FFD500F3691  Unknown        Unknown  Unknown

I noticed some messages like this

pressure1.for(273,7):remark #15329: vectorization support: irregularly indexed store was emulated for the variable <APR(M)>, masked, part of index is linear but may overflow

I have looking for more information about this message and how to fix it, but I haven't had any success so far. I have been looking for ways to troubleshoot stack overflow issues and tried some solutions -like increasing the size of stack, using heaparrays option, etc- but I'm still getting the same error. I will include the whole solution files in this post. I'm open to any suggestion about how to vectorize this loop. I will appreciate if someone provide more information about diagnostic message 15329 too.

My environment is 

Windows 10 Home Version 10.0.17134 Build 17134

Visual Studio Community 2015 Version 14.0.25431.01 Update 3

Intel Parallel Studio XE 2019 Update 3 Cluster Edition for Windows

 

Regards,

0 Kudos
0 Replies
Reply