Intel® Moderncode for Parallel Architectures
Support for developing parallel programming applications on Intel® Architecture. Beginner
195 Views
My parallel source is:
! Double loop over index pairs (ii,kk) with ii < kk <= nion.  For every pair
! that passes the "proceed" and rc2 <= off2 tests, a pair term e and its
! derivative terms are computed and accumulated into ec, dec(:,*) and vir(:,:).
! The inner kk loop is the one that is work-shared with OpenMP.
do ii = 1, nion-1
! Per-ii quantities, set by the thread that runs the outer loop and only
! read inside the parallel region below.
i = iion(ii)
in = jion(ii)
ic = kion(ii)
xic = x(ic)
yic = y(ic)
zic = z(ic)
xi = x(i) - xic
yi = y(i) - yic
zi = z(i) - zic
fi = f * pchg(ii)
iuse = (use(i) .or. use(ic))
! NOTE(review): the parallel region is opened and closed inside the ii loop,
! so it is entered and left once for every value of ii.
c\$omp parallel
! The sections construct fills cscale for the current "in": first every
! cscale(iion(j)) is set to 1.0d0, then the entries listed in i12..i15 are
! overwritten with c2scale..c5scale.
! NOTE(review): a "single" construct appears directly inside "sections",
! ahead of the first "section" directive.  OpenMP does not permit a
! worksharing construct to be closely nested inside another worksharing
! construct -- confirm how the compiler treats this.
! NOTE(review): different sections may be run by different threads in any
! order, so the reset to 1.0d0 is not guaranteed to happen before the
! c2scale..c5scale writes as it would in the serial code -- TODO confirm.
c\$omp sections
c\$omp single

do j = 1, nion
cscale(iion(j)) = 1.0d0
end do
c\$omp end single
c\$omp section

do j = 1, n12(in)
cscale(i12(j,in)) = c2scale
end do
c\$omp section
do j = 1, n13(in)
cscale(i13(j,in)) = c3scale
end do
c\$omp section
do j = 1, n14(in)
cscale(i14(j,in)) = c4scale
end do
c\$omp section
do j = 1, n15(in)
cscale(i15(j,in)) = c5scale
end do
c\$omp end sections
! Work-shared loop over the partner index kk.
! NOTE(review): the private clause lists only k,kn,kc,proceed,xr,yr,zr.
! The loop body also assigns xc,yc,zc,rc2,r2,r,fik,e,de,dc,shift,rc,rc3..rc7,
! taper,dtaper,trans,dtrans,dedx,dedy,dedz,dedxc,dedyc,dedzc,vxx,vyx,vzx,
! vyy,vzy,vzz, and passes fgrp to groups (presumably as an output).  With no
! default clause visible these are shared between threads; they look like
! they should be private.
! NOTE(review): dec is named as a whole array in the reduction clause, so
! every thread works on its own copy of the full array and the copies are
! combined at the end of this construct, i.e. once per ii.  The size of dec
! is not visible here.
c\$omp do
c\$omp& private(k,kn,kc,proceed,xr,yr,zr)
c\$omp& reduction( + : ec, dec, vir )
do kk = ii+1, nion
k = iion(kk)
kn = jion(kk)
kc = kion(kk)
! Decide whether this pair is evaluated at all.
proceed = .true.
! NOTE(review): groups and image are called from inside the parallel loop;
! whether they touch shared state cannot be determined from this excerpt.
if (use_group) call groups (proceed,fgrp,i,k,0,0,0,0)
if (proceed) proceed = (iuse .or. use(k) .or. use(kc))
if (proceed) then
! Separation between the ic and kc positions; image may modify xc,yc,zc.
xc = xic - x(kc)
yc = yic - y(kc)
zc = zic - z(kc)
if (use_image) call image (xc,yc,zc,0)
rc2 = xc*xc + yc*yc + zc*zc
! Only pairs with rc2 <= off2 contribute.
if (rc2 .le. off2) then
! Separation between positions i and k, built on top of (xc,yc,zc).
xr = xc + xi - x(k) + x(kc)
yr = yc + yi - y(k) + y(kc)
zr = zc + zi - z(k) + z(kc)
r2 = xr*xr + yr*yr + zr*zr
r = sqrt(r2)
! Pair prefactor, scaled by the cscale entry prepared in the sections above.
fik = fi * pchg(kk) * cscale(kn)
e = fik / r
de = -fik / r2
dc = 0.0d0
! Constant offset subtracted from the pair term.
shift = fik / (0.5d0*(off+cut))
e = e - shift
! For cut2 < rc2 <= off2 the pair term is multiplied by a fifth-order
! polynomial in rc (taper) and a seventh-order polynomial (trans) is added;
! dtaper and dtrans are their derivatives with respect to rc.
if (rc2 .gt. cut2) then
rc = sqrt(rc2)
rc3 = rc2 * rc
rc4 = rc2 * rc2
rc5 = rc2 * rc3
rc6 = rc3 * rc3
rc7 = rc3 * rc4
taper = c5*rc5 + c4*rc4 + c3*rc3
& + c2*rc2 + c1*rc + c0
dtaper = 5.0d0*c5*rc4 + 4.0d0*c4*rc3
& + 3.0d0*c3*rc2 + 2.0d0*c2*rc + c1
trans = fik * (f7*rc7 + f6*rc6 + f5*rc5 + f4*rc4
& + f3*rc3 + f2*rc2 + f1*rc + f0)
dtrans = fik * (7.0d0*f7*rc6 + 6.0d0*f6*rc5
& + 5.0d0*f5*rc4 + 4.0d0*f4*rc3
& + 3.0d0*f3*rc2 + 2.0d0*f2*rc + f1)
dc = (e * dtaper + dtrans) / rc
de = de * taper
e = e * taper + trans
end if
! Optional scaling of all pair quantities by fgrp.
if (use_group) then
e = e * fgrp
de = de * fgrp
dc = dc * fgrp
end if
! Cartesian components: de acts along (xr,yr,zr), dc along (xc,yc,zc).
de = de / r
dedx = de * xr
dedy = de * yr
dedz = de * zr
dedxc = dc * xc
dedyc = dc * yc
dedzc = dc * zc
! Accumulate into the reduction variables.  Only the columns i, ic, k and
! kc of dec are touched for a given pair.
ec = ec + e
dec(1,i) = dec(1,i) + dedx
dec(2,i) = dec(2,i) + dedy
dec(3,i) = dec(3,i) + dedz
dec(1,ic) = dec(1,ic) + dedxc
dec(2,ic) = dec(2,ic) + dedyc
dec(3,ic) = dec(3,ic) + dedzc
dec(1,k) = dec(1,k) - dedx
dec(2,k) = dec(2,k) - dedy
dec(3,k) = dec(3,k) - dedz
dec(1,kc) = dec(1,kc) - dedxc
dec(2,kc) = dec(2,kc) - dedyc
dec(3,kc) = dec(3,kc) - dedzc
! Six independent components of a symmetric 3x3 contribution to vir.
vxx = xr*dedx + xc*dedxc
vyx = yr*dedx + yc*dedxc
vzx = zr*dedx + zc*dedxc
vyy = yr*dedy + yc*dedyc
vzy = zr*dedy + zc*dedyc
vzz = zr*dedz + zc*dedzc
vir(1,1) = vir(1,1) + vxx
vir(2,1) = vir(2,1) + vyx
vir(3,1) = vir(3,1) + vzx
vir(1,2) = vir(1,2) + vyx
vir(2,2) = vir(2,2) + vyy
vir(3,2) = vir(3,2) + vzy
vir(1,3) = vir(1,3) + vzx
vir(2,3) = vir(2,3) + vzy
vir(3,3) = vir(3,3) + vzz
! NOTE(review): einter is accumulated here but is neither private nor in the
! reduction clause, so concurrent updates from several threads are
! unsynchronised.
if (molcule(i) .ne. molcule(k)) then
einter = einter + e
end if
end if
end if
end do
c\$omp end do
c\$omp end parallel
end do
But after parallelizing, my code runs about 10 times slower, and I want to know why,
especially with reduction( + : vir,ec,dec).
If anyone knows the reason, please tell me.
For example:
If nion=1000, the serial time is 56 seconds, but the parallel time is 560 seconds.
If nion=180, the serial time is 0.24 seconds, but the parallel time is 2.4 seconds.
The larger nion is, the more the parallel time exceeds the serial time.
7 Replies Black Belt
195 Views

Is your serial code running without runtime checks for array bounds?
Is your parallel code running with runtime checks for array bounds? Beginner
195 Views

Is your serial code running without runtime checks for array bounds?
Is your parallel code running with runtime checks for array bounds?
First, I must say thank you very much, but what should I do to improve performance? Black Belt
195 Views
Quoting - wert7588
First, I must say thank you very much, but what should I do to improve performance?

Turn off the run time checks (after test runs indicate no run-time errors).
The checks for using uninitialized variables and subscript out of bounds checks will slow down your program. Beginner
195 Views

Turn off the run time checks (after test runs indicate no run-time errors).
The checks for using uninitialized variables and subscript out of bounds checks will slow down your program.

Sorry, I turned off the run-time checks, but the run is still slow.
I added -check none to the makefile. Some people tell me that my problem is multi-core communication time ("communication overhead").
The cause of the slowdown may be the communication: if I use 4 cores the time = 10, and if I use 2 cores the time = 5,
so I feel this may be the true reason.
But I want to ask you: what should I do to improve performance and reduce the run time? Black Belt
195 Views
Of course, the suspicion is that you have false sharing, with all threads modifying the same cache lines. You don't give enough information to verify this. Black Belt
195 Views

Are any of the subroutines/functions called from within the parallel region calling a random number generator?

In your parallel loop you have:

if (use_image) call image (xc,yc,zc,0)

There is no thread context in the calling args. xc,yc,zc are different for each thread. However, check to see if image is writing to the same buffer for each thread. You may want to have different image buffers for each thread.

Jim Dempsey New Contributor III
195 Views
Many variables that are assigned inside the loop are not declared as private. Every thread will then use the same variable, which causes corruption and slow-down. Try putting the content of the loop into a subroutine.

You will no longer need to keep track of so many private variables, and your code may become easier to understand and to debug!

Here is an example of re-working your code, probably not correct yet.

```[cpp]

! Sketch of the reworked kk loop: the per-pair arithmetic is moved into
! doSomeThingInParallel and only the accumulation into ec, dec and vir is
! left in the loop body.  "data..." and "vyy...." are placeholders for
! elided arguments, not Fortran syntax.
! NOTE(review): k, kc, xr, yr, zr, xc, yc, zc, dedxc, dedyc and dedzc are
! used below but are not assigned in this loop; unless they are among the
! elided arguments they are undefined here.
! NOTE(review): the accumulation below runs for every kk, while the
! subroutine assigns dedx, dedy, dedz and e only when its proceed and
! rc2 <= off2 tests succeed, so skipped pairs would add stale or
! undefined values.
c\$omp do
c\$omp& private(dedx, dedy, dedz, vxx, vyx, vyy...., e)
c\$omp& reduction( + : ec, dec, vir )
do kk = ii+1, nion
call doSomeThingInParallel(data..., kk, dedx, dedy, dedz, vxx, vyx, vyy...., e)
! Accumulate the pair results into the reduction variables.
ec = ec + e
dec(1,i) = dec(1,i) + dedx
dec(2,i) = dec(2,i) + dedy
dec(3,i) = dec(3,i) + dedz
dec(1,ic) = dec(1,ic) + dedxc
dec(2,ic) = dec(2,ic) + dedyc
dec(3,ic) = dec(3,ic) + dedzc
dec(1,k) = dec(1,k) - dedx
dec(2,k) = dec(2,k) - dedy
dec(3,k) = dec(3,k) - dedz
dec(1,kc) = dec(1,kc) - dedxc
dec(2,kc) = dec(2,kc) - dedyc
dec(3,kc) = dec(3,kc) - dedzc
! vxx..vzz are passed to the subroutine above and then recomputed here.
vxx = xr*dedx + xc*dedxc
vyx = yr*dedx + yc*dedxc
vzx = zr*dedx + zc*dedxc
vyy = yr*dedy + yc*dedyc
vzy = zr*dedy + zc*dedyc
vzz = zr*dedz + zc*dedzc
vir(1,1) = vir(1,1) + vxx
vir(2,1) = vir(2,1) + vyx
vir(3,1) = vir(3,1) + vzx
vir(1,2) = vir(1,2) + vyx
vir(2,2) = vir(2,2) + vyy
vir(3,2) = vir(3,2) + vzy
vir(1,3) = vir(1,3) + vzx
vir(2,3) = vir(2,3) + vzy
vir(3,3) = vir(3,3) + vzz
end do
c\$omp end do
c\$omp end parallel
end do
end subroutine

! Sketch of a per-pair worker: evaluates the pair term e and the components
! dedx, dedy, dedz for partner index kk.  "dataInputs...", "vyy...." and
! "declarations here..." are placeholders; the real argument list and
! declarations are not shown.
! Values read but not set here (i, xic, yic, zic, xi, yi, zi, fi, iuse, the
! cutoff and polynomial coefficients, and the arrays) would have to arrive
! through the elided inputs.
! NOTE(review): e, dedx, dedy and dedz are assigned only when proceed is true
! and rc2 <= off2; on every other path they are left untouched.
! NOTE(review): dedxc, dedyc and dedzc are computed below but do not appear
! in the visible part of the argument list.
subroutine doSomeThingInParallel(dataInputs..., kk, dedx, dedy, dedz, vxx, vyx, vyy...., e)
declarations here...
k = iion(kk)
kn = jion(kk)
kc = kion(kk)
! Decide whether this pair is evaluated at all.
proceed = .true.
if (use_group)  call groups (proceed,fgrp,i,k,0,0,0,0)
if (proceed)  proceed = (iuse .or. use(k) .or. use(kc))
if (proceed) then
! Separation between the ic and kc positions; image may modify xc,yc,zc.
xc = xic - x(kc)
yc = yic - y(kc)
zc = zic - z(kc)
if (use_image)  call image (xc,yc,zc,0)
rc2 = xc*xc + yc*yc + zc*zc
! Only pairs with rc2 <= off2 contribute.
if (rc2 .le. off2) then
xr = xc + xi - x(k) + x(kc)
yr = yc + yi - y(k) + y(kc)
zr = zc + zi - z(k) + z(kc)
r2 = xr*xr + yr*yr + zr*zr
r = sqrt(r2)
fik = fi * pchg(kk) * cscale(kn)
e = fik / r
de = -fik / r2
dc = 0.0d0
! Constant offset subtracted from the pair term.
shift = fik / (0.5d0*(off+cut))
e = e - shift
! For cut2 < rc2 <= off2 the pair term is multiplied by a fifth-order
! polynomial in rc (taper) and a seventh-order polynomial (trans) is added.
if (rc2 .gt. cut2) then
rc = sqrt(rc2)
rc3 = rc2 * rc
rc4 = rc2 * rc2
rc5 = rc2 * rc3
rc6 = rc3 * rc3
rc7 = rc3 * rc4
taper = c5*rc5 + c4*rc4 + c3*rc3
&                          + c2*rc2 + c1*rc + c0
dtaper = 5.0d0*c5*rc4 + 4.0d0*c4*rc3
&                          + 3.0d0*c3*rc2 + 2.0d0*c2*rc + c1
trans = fik * (f7*rc7 + f6*rc6 + f5*rc5 + f4*rc4
&                              + f3*rc3 + f2*rc2 + f1*rc + f0)
dtrans = fik * (7.0d0*f7*rc6 + 6.0d0*f6*rc5
&                              + 5.0d0*f5*rc4 + 4.0d0*f4*rc3
&                            + 3.0d0*f3*rc2 + 2.0d0*f2*rc + f1)
dc = (e * dtaper + dtrans) / rc
de = de * taper
e = e * taper + trans
end if
! Optional scaling of all pair quantities by fgrp.
if (use_group) then
e = e * fgrp
de = de * fgrp
dc = dc * fgrp
end if
! Cartesian components: de acts along (xr,yr,zr), dc along (xc,yc,zc).
de = de / r
dedx = de * xr
dedy = de * yr
dedz = de * zr
dedxc = dc * xc
dedyc = dc * yc
dedzc = dc * zc
! NOTE(review): einter is updated inside the worker; if it refers to a
! variable shared by all threads this update is unsynchronised -- TODO
! confirm how einter is passed.
if (molcule(i) .ne. molcule(k)) then
einter = einter + e
end if
end if
end if
end subroutine
[/cpp]``` 