- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
I analyzed my code with Intel Advisor 2019. Then I decided to try to vectorize the most CPU-consuming loop in my code, so I compiled it with the /Qopt-report:5 option as a guide to vectorizing the loop. Finally, I was able to vectorize it with the following directive:
m=0.0 ! form banded matrix of Puasson equastion pok=3. call annotate_site_begin( "pressure" ) call annotate_iteration_task( "pressure-task" ) do 99 k=2,kbm1 ! REVERTED Change the order of the do, so they can fit the model of array in fortran do 99 j=2,jmm1 !DIR$ SIMD LASTPRIVATE(bb2) REDUCTION(+:m, gc2, gc1, gen) do 99 i=2,imm1 if (k+1<=kb.and.i+1<=im)then aa1(i+1,j,k+1)=.25e0*aaf(i+1,j,k+1) 1 *.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)/art(i,j) 1 +.25e0*aaf(i,j,k+1)*dq(i+1,j)/dq(i,j) 1 *dy(i,j)/ddx(i,j)/art(i,j) endif if (k+1<=kb.and.i-1>=1)then aa2(i-1,j,k+1)=-.25e0*aaf(i-1,j,k+1) 1 *.5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)/art(i,j) 1 -.25e0*aaf(i,j,k+1)*dq(i-1,j)/dq(i,j) 1 *dy(i,j)/ddx(i,j)/art(i,j) end if if (k-1>=1.and.i+1<=im)then aa3(i+1,j,k-1)=-.25e0*aaf(i+1,j,k-1) 1 *.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)/art(i,j) 1 -.25e0*aaf(i,j,k-1)*dq(i+1,j)/dq(i,j) 1 *dy(i,j)/ddx(i,j)/art(i,j) end if if (k-1>=1.and.i-1>=1)then aa4(i-1,j,k-1)=.25e0*aaf(i-1,j,k-1) 1 *.5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)/art(i,j) 1 +.25e0*aaf(i,j,k-1)*dq(i-1,j)/dq(i,j) 1 *dy(i,j)/ddx(i,j)/art(i,j) end if if (k+1<=kb.and.j+1<=jm)then bb1(i,j+1,k+1)=.25e0*bbf(i,j+1,k+1) 1 *.5*(dx(i,j)+dx(i,j+1))/ddy(i,j)/art(i,j) 1 +.25e0*bbf(i,j,k+1)*dq(i,j+1)/dq(i,j) 1 *dx(i,j)/ddy(i,j)/art(i,j) end if if (k+1<=kb.and.j-1>=1)then bb2(i,j-1,k+1)=-.25e0*bbf(i,j-1,k+1) 1 *.5*(dx(i,j)+dx(i,j-1))/ddy(i,j-1)/art(i,j) 1 -.25e0*bbf(i,j,k+1)*dq(i,j-1)/dq(i,j) 1 *dx(i,j)/ddy(i,j)/art(i,j) end if if (k-1>=1.and.j+1<=jm)then bb3(i,j+1,k-1)=-.25e0*bbf(i,j+1,k-1) 1 *.5*(dx(i,j)+dx(i,j+1))/ddy(i,j)/art(i,j) 1 -.25e0*bbf(i,j,k-1)*dq(i,j+1)/dq(i,j) 1 *dx(i,j)/ddy(i,j)/art(i,j) end if if (k-1>=1.and.j-1>=1) then bb4(i,j-1,k-1)=.25e0*bbf(i,j-1,k-1) 1 *.5*(dx(i,j)+dx(i,j-1))/ddx(i,j-1)/art(i,j) 1 +.25e0*bbf(i,j,k-1)*dq(i,j-1)/dq(i,j) 1 *dx(i,j)/ddy(i,j)/art(i,j) end if if (i+1<=im) then ga1(i+1,j,k)=dz(k)*dq(i+1,j) 1 *.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)/art(i,j) end if if (i-1>=1) then ga2(i-1,j,k)=dz(k)*dq(i-1,j) 1 
*.5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)/art(i,j) end if if (j+1<=jm) then gb1(i,j+1,k)=dz(k)*dq(i,j+1) 1 *.5*(dx(i,j)+dx(i,j+1))/ddy(i,j)/art(i,j) end if if (j-1>=1) then gb2(i,j-1,k)=dz(k)*dq(i,j-1) 1 *.5*(dx(i,j)+dx(i,j-1))/ddy(i,j-1)/art(i,j) end if if (k+1<=kb) then gc1(i,j,k+1)=1.e0/(dzz(k)*dq(i,j))* 1 (art(i,j)+.5*(aaf(i,j,k+1)+aaf(i,j,k))*aaf(i,j,k+1)*dy(i,j)/ 1 dx(i,j)+.5*(bbf(i,j,k+1)+bbf(i,j,k))*bbf(i,j,k+1)*dx(i,j)/ 1 dy(i,j))/art(i,j) end if if (k-1>=1) then gc2(i,j,k-1)=1.e0/(dzz(k-1)*dq(i,j))* 1 (art(i,j)+.5*(aaf(i,j,k-1)+aaf(i,j,k))*aaf(i,j,k-1)*dy(i,j)/ 1 dx(i,j)+.5*(bbf(i,j,k-1)+bbf(i,j,k))*bbf(i,j,k-1)*dx(i,j)/ 1 dy(i,j))/art(i,j) end if ! if(iint==5)stop if (i-1>=1.and.j-1>=1.and.k-1>=1.) then gen(i,j,k)=(-dq(i,j)*dz(k)*(.5*(dy(i,j)+dy(i+1,j))/ddx(i,j)+ 1 .5*(dy(i,j)+dy(i-1,j))/ddx(i-1,j)+ 1 .5*(dx(i,j)+dx(i,j+1))/ddy(i,j)+ 1 .5*(dx(i,j)+dx(i,j-1))/ddy(i,j-1)))/art(i,j)- 2 (1.e0/dzz(k-1)+1.e0/dzz(k))/dq(i,j)*(art(i,j) 1 +.5*(aaf(i,j,k+1)+aaf(i,j,k))*aaf(i,j,k)*dy(i,j)/dx(i,j)+ 1 .5*(bbf(i,j,k+1)+bbf(i,j,k))*bbf(i,j,k)*dx(i,j)/dy(i,j) 1 )/art(i,j) else gen(i,j,k)=(-dq(i,j)*dz(k)*(dy(i,j)/ddx(i,j)+ 1 dy(i,j)/ddx(i-1,j)+ 1 dx(i,j)/ddy(i,j)+ 1 dx(i,j)/ddy(i,j-1)))/art(i,j)- 2 (2.e0/dzz(k))/dq(i,j)* 1 (art(i,j)+.5*(aaf(i,j,k+1)+aaf(i,j,k))*aaf(i,j,k)*dy(i,j)/dx(i,j)+ 1 .5*(bbf(i,j,k+1)+bbf(i,j,k))*bbf(i,j,k)*dx(i,j)/dy(i,j) 1 )/art(i,j) end if if (((k-1)*(k-kb)*(i-1)*(i-im)*(j-1)*(j-jm)).ne.0) then m=m+1 if (k==kbm1) then ga1(i+1,j,k)=ga1(i+1,j,k)+aa1(i+1,j,k+1) ga2(i-1,j,k)=ga2(i-1,j,k)+aa2(i-1,j,k+1) gb1(i,j+1,k)=gb1(i,j+1,k)+bb1(i,j+1,k+1) gb2(i,j-1,k)=gb2(i,j-1,k)+bb2(i,j-1,k+1) gen(i,j,k)=gen(i,j,k)+gc1(i,j,k+1) endif if (k==2) then !¸òþñþôýð ÿþòõ¨¿ýþ¸ª¹ aa2(i+1,j,k-1)=0. bb2(i,j+1,k-1)=0. bb4(i,j-1,k-1)=0. gc2(i,j,k-1)=0. 
endif if (i==2) then gc2(i,j,k-1)=gc2(i,j,k-1)+aa4(i-1,j,k-1) gc1(i,j,k+1)=gc1(i,j,k+1)+aa2(i-1,j,k+1) gen(i,j,k)=gen(i,j,k)+ga2(i-1,j,k) endif if (i==imm1) then gc2(i,j,k-1)=gc2(i,j,k-1)+aa3(i+1,j,k-1) gc1(i,j,k+1)=gc1(i,j,k+1)+aa1(i+1,j,k+1) gen(i,j,k)=gen(i,j,k)+ga1(i+1,j,k) endif if (j==2) then gc2(i,j,k-1)=gc2(i,j,k-1)+bb4(i,j-1,k-1) gc1(i,j,k+1)=gc1(i,j,k+1)+bb2(i,j-1,k+1) gen(i,j,k)=gen(i,j,k)+gb2(i,j-1,k) endif if (j==jmm1) then gc2(i,j,k-1)=gc2(i,j,k-1)+bb3(i,j+1,k-1) gc1(i,j,k+1)=gc1(i,j,k+1)+bb1(i,j+1,k+1) gen(i,j,k)=gen(i,j,k)+gb1(i,j+1,k) endif endif if (maa1+m<=lm) then if (k+1>kbm1.or.i+1>imm1) then apr(m)=0.0 else apr(m)=aa1(i+1,j,k+1) end if ja(m)=ind(m+maa1) ia(m)=ind(m) end if lapr=ma1 if (mbb1+m<=lm) then if (k+1>kbm1.or.j+1>jmm1) then apr(lapr+m)=0.0 else apr(lapr+m)=bb1(i,j+1,k+1) end if ja(m+lapr)=ind(m+mbb1) ia(m+lapr)=ind(m) end if lapr=ma1+mb1 if (mgc+m<=lm) then if (k+1>kbm1) then apr(lapr+m)=0.0 else apr(lapr+m)=gc1(i,j,k+1) end if ja(m+lapr)=ind(m+mgc) ia(m+lapr)=ind(m) end if lapr=ma1+mb1+mc if (mbb2+m<=lm) then if (k+1>kbm1.or.j-1<2) then apr(lapr+m)=0.0 else apr(lapr+m)=bb2(i,j-1,k+1) end if ja(m+lapr)=ind(m+mbb2) ia(m+lapr)=ind(m) end if lapr=ma1+mb1+mc+mb2 if (maa2+m<=lm) then if (k+1>kbm1.or.i-1<2) then apr(lapr+m)=0.0 else apr(lapr+m)=aa2(i-1,j,k+1) endif ja(m+lapr)=ind(m+maa2) ia(m+lapr)=ind(m) end if lapr=ma1+mb1+mc+mb2+ma2 if (mga+m<=lm) then if (i+1>imm1) then apr(lapr+m)=0.0 else apr(lapr+m)=ga1(i+1,j,k) end if ja(m+lapr)=ind(m+mga) ia(m+lapr)=ind(m) end if lapr=ma1+mb1+mc+mb2+ma2+ma if (1+m<=lm) then if (j+1>jmm1) then apr(lapr+m)=0.0 else apr(lapr+m)=gb1(i,j+1,k) end if ja(m+lapr)=ind(m+mgb) ia(m+lapr)=ind(m) end if lapr=ma1+mb1+mc+mb2+ma2+ma+mb apr(lapr+m)=gen(i,j,k) ja(m+lapr)=ind(m) ia(m+lapr)=ind(m) lapr=ma1+mb1+mc+mb2+ma2+ma+mb+lm if (m-mgb>=1) then if (j-1<2) then apr(m-mgb+lapr)=0.0 else apr(m-mgb+lapr)=gb2(i,j-1,k) end if ja(m-mgb+lapr)=ind(m-mgb) ia(m-mgb+lapr)=ind(m) end if lapr=ma1+mb1+mc+mb2+ma2+ma+2*mb+lm if 
(m-mga>=1) then if (i-1<2) then apr(m-mga+lapr)=0.0 else apr(m-mga+lapr)=ga2(i-1,j,k) end if ia(m-mga+lapr)=ind(m) ja(m-mga+lapr)=ind(-mga+m) end if lapr=ma1+mb1+mc+mb2+ma2+2*ma+2*mb+lm if (m-maa2>=1) then if (k-1<2.or.i+1>imm1) then apr(m-maa2+lapr)=0.0 else apr(m-maa2+lapr)=aa3(i+1,j,k-1) endif ia(m-maa2+lapr)=ind(m) ja(m-maa2+lapr)=ind(-maa2+m) end if lapr=ma1+mb1+mc+mb2+2*ma2+2*ma+2*mb+lm if (m-mbb2>=1) then if (j+1>jmm1.or.k-1<2) then apr(m-mbb2+lapr)=0.0 else apr(m-mbb2+lapr)=bb3(i,j+1,k-1) endif ia(m-mbb2+lapr)=ind(m) ja(m-mbb2+lapr)=ind(-mbb2+m) end if lapr=ma1+mb1+mc+2*mb2+2*ma2+2*ma+2*mb+lm if(m-mgc>=1)then if (k-1>kbm1) then apr(m-mgc+lapr)=0.0 else apr(m-mgc+lapr)=gc2(i,j,k-1) end if ja(m-mgc+lapr)=ind(m-mgc) ia(m-mgc+lapr)=ind(m) end if lapr=ma1+mb1+2*mc+2*mb2+2*ma2+2*ma+2*mb+lm if (m-mbb1>=1) then if (j-1<2.or.k-1<2)then apr(m-mbb1+lapr)=0.0 else apr(m-mbb1+lapr)=bb4(i,j-1,k-1) endif ia(m-mbb1+lapr)=ind(m) ja(m-mbb1+lapr)=ind(m-mbb1) end if lapr=ma1+2*mb1+2*mc+2*mb2+2*ma2+2*ma+2*mb+lm if (m-maa1>=1) then if (i-1<2.or.k-1<2)then apr(m-maa1+lapr)=0.0 else apr(m-maa1+lapr)=aa4(i-1,j,k-1) endif ia(m-maa1+lapr)=ind(m) ja(m-maa1+lapr)=ind(m-maa1) end if 99 continue call annotate_site_end
However, when I run my code, I get a stack overflow exception:
forrtl: severe (170): Program Exception - stack overflow Image PC Routine Line Source nohydroPOM.exe 00007FF7B4D83E38 Unknown Unknown Unknown nohydroPOM.exe 00007FF7B4D64484 PRESSURE1 1 pressure1.for nohydroPOM.exe 00007FF7B4D4EABB MAIN__ 620 Main.for nohydroPOM.exe 00007FF7B4D83C02 Unknown Unknown Unknown nohydroPOM.exe 00007FF7B4D8406D Unknown Unknown Unknown KERNEL32.DLL 00007FFD4FE14034 Unknown Unknown Unknown ntdll.dll 00007FFD500F3691 Unknown Unknown Unknown
I noticed some messages like this
pressure1.for(273,7):remark #15329: vectorization support: irregularly indexed store was emulated for the variable <APR(M)>, masked, part of index is linear but may overflow
I have been looking for more information about this message and how to fix it, but I haven't had any success so far. I have also been looking for ways to troubleshoot stack overflow issues and have tried some solutions (such as increasing the stack size, using the /heap-arrays option, etc.), but I'm still getting the same error. I will include the whole solution files in this post. I'm open to any suggestions about how to vectorize this loop. I would also appreciate it if someone could provide more information about diagnostic message 15329.
My environment is
Windows 10 Home Version 10.0.17134 Build 17134
Visual Studio Community 2015 Version 14.0.25431.01 Update 3
Intel Parallel Studio XE 2019 Update 3 Cluster Edition for Windows
Regards,
Link Copied
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page