- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
my parallel source is:
do ii = 1, nion-1
i = iion(ii)
in = jion(ii)
ic = kion(ii)
xic = x(ic)
yic = y(ic)
zic = z(ic)
xi = x(i) - xic
yi = y(i) - yic
zi = z(i) - zic
fi = f * pchg(ii)
iuse = (use(i) .or. use(ic))
c$omp parallel
c$omp sections
c$omp single
do j = 1, nion
cscale(iion(j)) = 1.0d0
end do
c$omp end single
c$omp section
do j = 1, n12(in)
cscale(i12(j,in)) = c2scale
end do
c$omp section
do j = 1, n13(in)
cscale(i13(j,in)) = c3scale
end do
c$omp section
do j = 1, n14(in)
cscale(i14(j,in)) = c4scale
end do
c$omp section
do j = 1, n15(in)
cscale(i15(j,in)) = c5scale
end do
c$omp end sections
c$omp do
c$omp& private(k,kn,kc,proceed,xr,yr,zr)
c$omp& reduction( + : ec, dec, vir )
do kk = ii+1, nion
k = iion(kk)
kn = jion(kk)
kc = kion(kk)
proceed = .true.
if (use_group) call groups (proceed,fgrp,i,k,0,0,0,0)
if (proceed) proceed = (iuse .or. use(k) .or. use(kc))
if (proceed) then
xc = xic - x(kc)
yc = yic - y(kc)
zc = zic - z(kc)
if (use_image) call image (xc,yc,zc,0)
rc2 = xc*xc + yc*yc + zc*zc
if (rc2 .le. off2) then
xr = xc + xi - x(k) + x(kc)
yr = yc + yi - y(k) + y(kc)
zr = zc + zi - z(k) + z(kc)
r2 = xr*xr + yr*yr + zr*zr
r = sqrt(r2)
fik = fi * pchg(kk) * cscale(kn)
e = fik / r
de = -fik / r2
dc = 0.0d0
shift = fik / (0.5d0*(off+cut))
e = e - shift
if (rc2 .gt. cut2) then
rc = sqrt(rc2)
rc3 = rc2 * rc
rc4 = rc2 * rc2
rc5 = rc2 * rc3
rc6 = rc3 * rc3
rc7 = rc3 * rc4
taper = c5*rc5 + c4*rc4 + c3*rc3
& + c2*rc2 + c1*rc + c0
dtaper = 5.0d0*c5*rc4 + 4.0d0*c4*rc3
& + 3.0d0*c3*rc2 + 2.0d0*c2*rc + c1
trans = fik * (f7*rc7 + f6*rc6 + f5*rc5 + f4*rc4
& + f3*rc3 + f2*rc2 + f1*rc + f0)
dtrans = fik * (7.0d0*f7*rc6 + 6.0d0*f6*rc5
& + 5.0d0*f5*rc4 + 4.0d0*f4*rc3
& + 3.0d0*f3*rc2 + 2.0d0*f2*rc + f1)
dc = (e * dtaper + dtrans) / rc
de = de * taper
e = e * taper + trans
end if
if (use_group) then
e = e * fgrp
de = de * fgrp
dc = dc * fgrp
end if
de = de / r
dedx = de * xr
dedy = de * yr
dedz = de * zr
dedxc = dc * xc
dedyc = dc * yc
dedzc = dc * zc
ec = ec + e
dec(1,i) = dec(1,i) + dedx
dec(2,i) = dec(2,i) + dedy
dec(3,i) = dec(3,i) + dedz
dec(1,ic) = dec(1,ic) + dedxc
dec(2,ic) = dec(2,ic) + dedyc
dec(3,ic) = dec(3,ic) + dedzc
dec(1,k) = dec(1,k) - dedx
dec(2,k) = dec(2,k) - dedy
dec(3,k) = dec(3,k) - dedz
dec(1,kc) = dec(1,kc) - dedxc
dec(2,kc) = dec(2,kc) - dedyc
dec(3,kc) = dec(3,kc) - dedzc
vxx = xr*dedx + xc*dedxc
vyx = yr*dedx + yc*dedxc
vzx = zr*dedx + zc*dedxc
vyy = yr*dedy + yc*dedyc
vzy = zr*dedy + zc*dedyc
vzz = zr*dedz + zc*dedzc
vir(1,1) = vir(1,1) + vxx
vir(2,1) = vir(2,1) + vyx
vir(3,1) = vir(3,1) + vzx
vir(1,2) = vir(1,2) + vyx
vir(2,2) = vir(2,2) + vyy
vir(3,2) = vir(3,2) + vzy
vir(1,3) = vir(1,3) + vzx
vir(2,3) = vir(2,3) + vzy
vir(3,3) = vir(3,3) + vzz
if (molcule(i) .ne. molcule(k)) then
einter = einter + e
end if
end if
end if
end do
c$omp end do
c$omp end parallel
end do
but after parallel,my codeReducing efficiency 10-foldso i want to know ,why?
especially in reduction( + : vir,ec,dec)
ifone know the reason,please tell me
for example:
if nion=1000 Serial time=56 seconds,but parallel time =560 seconds
if nion=180,serial time=0.24 seconds,but parallel time = 2.4 seconds
if nion larger, the parallel time is more larger than serial time
do ii = 1, nion-1
i = iion(ii)
in = jion(ii)
ic = kion(ii)
xic = x(ic)
yic = y(ic)
zic = z(ic)
xi = x(i) - xic
yi = y(i) - yic
zi = z(i) - zic
fi = f * pchg(ii)
iuse = (use(i) .or. use(ic))
c$omp parallel
c$omp sections
c$omp single
do j = 1, nion
cscale(iion(j)) = 1.0d0
end do
c$omp end single
c$omp section
do j = 1, n12(in)
cscale(i12(j,in)) = c2scale
end do
c$omp section
do j = 1, n13(in)
cscale(i13(j,in)) = c3scale
end do
c$omp section
do j = 1, n14(in)
cscale(i14(j,in)) = c4scale
end do
c$omp section
do j = 1, n15(in)
cscale(i15(j,in)) = c5scale
end do
c$omp end sections
c$omp do
c$omp& private(k,kn,kc,proceed,xr,yr,zr)
c$omp& reduction( + : ec, dec, vir )
do kk = ii+1, nion
k = iion(kk)
kn = jion(kk)
kc = kion(kk)
proceed = .true.
if (use_group) call groups (proceed,fgrp,i,k,0,0,0,0)
if (proceed) proceed = (iuse .or. use(k) .or. use(kc))
if (proceed) then
xc = xic - x(kc)
yc = yic - y(kc)
zc = zic - z(kc)
if (use_image) call image (xc,yc,zc,0)
rc2 = xc*xc + yc*yc + zc*zc
if (rc2 .le. off2) then
xr = xc + xi - x(k) + x(kc)
yr = yc + yi - y(k) + y(kc)
zr = zc + zi - z(k) + z(kc)
r2 = xr*xr + yr*yr + zr*zr
r = sqrt(r2)
fik = fi * pchg(kk) * cscale(kn)
e = fik / r
de = -fik / r2
dc = 0.0d0
shift = fik / (0.5d0*(off+cut))
e = e - shift
if (rc2 .gt. cut2) then
rc = sqrt(rc2)
rc3 = rc2 * rc
rc4 = rc2 * rc2
rc5 = rc2 * rc3
rc6 = rc3 * rc3
rc7 = rc3 * rc4
taper = c5*rc5 + c4*rc4 + c3*rc3
& + c2*rc2 + c1*rc + c0
dtaper = 5.0d0*c5*rc4 + 4.0d0*c4*rc3
& + 3.0d0*c3*rc2 + 2.0d0*c2*rc + c1
trans = fik * (f7*rc7 + f6*rc6 + f5*rc5 + f4*rc4
& + f3*rc3 + f2*rc2 + f1*rc + f0)
dtrans = fik * (7.0d0*f7*rc6 + 6.0d0*f6*rc5
& + 5.0d0*f5*rc4 + 4.0d0*f4*rc3
& + 3.0d0*f3*rc2 + 2.0d0*f2*rc + f1)
dc = (e * dtaper + dtrans) / rc
de = de * taper
e = e * taper + trans
end if
if (use_group) then
e = e * fgrp
de = de * fgrp
dc = dc * fgrp
end if
de = de / r
dedx = de * xr
dedy = de * yr
dedz = de * zr
dedxc = dc * xc
dedyc = dc * yc
dedzc = dc * zc
ec = ec + e
dec(1,i) = dec(1,i) + dedx
dec(2,i) = dec(2,i) + dedy
dec(3,i) = dec(3,i) + dedz
dec(1,ic) = dec(1,ic) + dedxc
dec(2,ic) = dec(2,ic) + dedyc
dec(3,ic) = dec(3,ic) + dedzc
dec(1,k) = dec(1,k) - dedx
dec(2,k) = dec(2,k) - dedy
dec(3,k) = dec(3,k) - dedz
dec(1,kc) = dec(1,kc) - dedxc
dec(2,kc) = dec(2,kc) - dedyc
dec(3,kc) = dec(3,kc) - dedzc
vxx = xr*dedx + xc*dedxc
vyx = yr*dedx + yc*dedxc
vzx = zr*dedx + zc*dedxc
vyy = yr*dedy + yc*dedyc
vzy = zr*dedy + zc*dedyc
vzz = zr*dedz + zc*dedzc
vir(1,1) = vir(1,1) + vxx
vir(2,1) = vir(2,1) + vyx
vir(3,1) = vir(3,1) + vzx
vir(1,2) = vir(1,2) + vyx
vir(2,2) = vir(2,2) + vyy
vir(3,2) = vir(3,2) + vzy
vir(1,3) = vir(1,3) + vzx
vir(2,3) = vir(2,3) + vzy
vir(3,3) = vir(3,3) + vzz
if (molcule(i) .ne. molcule(k)) then
einter = einter + e
end if
end if
end if
end do
c$omp end do
c$omp end parallel
end do
but after parallel,my codeReducing efficiency 10-foldso i want to know ,why?
especially in reduction( + : vir,ec,dec)
ifone know the reason,please tell me
for example:
if nion=1000 Serial time=56 seconds,but parallel time =560 seconds
if nion=180,serial time=0.24 seconds,but parallel time = 2.4 seconds
if nion larger, the parallel time is more larger than serial time
Link Copied
7 Replies
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Is your serial code running without Runtime checks for array bounds
Is your parallel code running with Runtime checks for array bounds
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Quoting - jimdempseyatthecove
Is your serial code running without Runtime checks for array bounds
Is your parallel code running with Runtime checks for array bounds
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Quoting - wert7588
first,i must to say thank you very much,but what i should do to improve performance
Turn off the run time checks (after test runs indicate no run-time errors).
The checks for using uninitialized variables and subscript out of bounds checks will slow down your program.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Quoting - jimdempseyatthecove
Turn off the run time checks (after test runs indicate no run-time errors).
The checks for using uninitialized variables and subscript out of bounds checks will slow down your program.
sorry,i turn off the run-time check,but the run time is also slow
i add -check none in makefile,some people tell me that my problem is ''Multi-core communication time ,"communicationoverhead"
the slow reason may be the communication,if i use 4 core the time = 10 ,if i use 2 core the time = 5
so i feel may be the this is the true reason
but i want to ask you ,what should i do to improve performance and diminish the run time?
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Of course, the suspicion is that you have false sharing, with all threads modifying the same cache lines. You don't give enough information to verify this.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Are any of the subroutines/functions called from within the parallel region calling a random number generator?
In your parallel loop you have:
if (use_image) call image (xc,yc,zc,0)
There is no thread context in the calling args. xc,yc,zc are different for each thread. However, check to see if image is writing to the same buffer for each thread. You may want to have different image buffers for each thread.
Jim Dempsey
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
[cpp]There are many assigned variables not declared as private. Each thread will use the same variable and will cause corruption and slow-down. Try putting the content of the loop into a subroutine. You will no longer need to keep track of so many private variables and your code may become easier to understand and track down errors! Here is an example of re-working your code, probably not correct yet. c$omp do c$omp& private(dedx, dedy, dedz, vxx, vyx, vyy...., e) c$omp& reduction( + : ec, dec, vir ) do kk = ii+1, nion call doSomeThingInParallel(data..., kk, dedx, dedy, dedz, vxx, vyx, vyy...., e) ec = ec + e dec(1,i) = dec(1,i) + dedx dec(2,i) = dec(2,i) + dedy dec(3,i) = dec(3,i) + dedz dec(1,ic) = dec(1,ic) + dedxc dec(2,ic) = dec(2,ic) + dedyc dec(3,ic) = dec(3,ic) + dedzc dec(1,k) = dec(1,k) - dedx dec(2,k) = dec(2,k) - dedy dec(3,k) = dec(3,k) - dedz dec(1,kc) = dec(1,kc) - dedxc dec(2,kc) = dec(2,kc) - dedyc dec(3,kc) = dec(3,kc) - dedzc vxx = xr*dedx + xc*dedxc vyx = yr*dedx + yc*dedxc vzx = zr*dedx + zc*dedxc vyy = yr*dedy + yc*dedyc vzy = zr*dedy + zc*dedyc vzz = zr*dedz + zc*dedzc vir(1,1) = vir(1,1) + vxx vir(2,1) = vir(2,1) + vyx vir(3,1) = vir(3,1) + vzx vir(1,2) = vir(1,2) + vyx vir(2,2) = vir(2,2) + vyy vir(3,2) = vir(3,2) + vzy vir(1,3) = vir(1,3) + vzx vir(2,3) = vir(2,3) + vzy vir(3,3) = vir(3,3) + vzz end do c$omp end do c$omp end parallel end do end subroutine subroutine doSomeThingInParallel(dataInputs..., kk, dedx, dedy, dedz, vxx, vyx, vyy...., e) declarations here... k = iion(kk) kn = jion(kk) kc = kion(kk) proceed = .true. if (use_group) call groups (proceed,fgrp,i,k,0,0,0,0) if (proceed) proceed = (iuse .or. use(k) .or. use(kc)) if (proceed) then xc = xic - x(kc) yc = yic - y(kc) zc = zic - z(kc) if (use_image) call image (xc,yc,zc,0) rc2 = xc*xc + yc*yc + zc*zc if (rc2 .le. off2) then xr = xc + xi - x(k) + x(kc) yr = yc + yi - y(k) + y(kc) zr = zc + zi - z(k) + z(kc) r2 = xr*xr + yr*yr + zr*zr r = sqrt(r2) fik = fi * pchg(kk) * cscale(kn) e = fik / r de = -fik / r2 dc = 0.0d0 shift = fik / (0.5d0*(off+cut)) e = e - shift if (rc2 .gt. cut2) then rc = sqrt(rc2) rc3 = rc2 * rc rc4 = rc2 * rc2 rc5 = rc2 * rc3 rc6 = rc3 * rc3 rc7 = rc3 * rc4 taper = c5*rc5 + c4*rc4 + c3*rc3 & + c2*rc2 + c1*rc + c0 dtaper = 5.0d0*c5*rc4 + 4.0d0*c4*rc3 & + 3.0d0*c3*rc2 + 2.0d0*c2*rc + c1 trans = fik * (f7*rc7 + f6*rc6 + f5*rc5 + f4*rc4 & + f3*rc3 + f2*rc2 + f1*rc + f0) dtrans = fik * (7.0d0*f7*rc6 + 6.0d0*f6*rc5 & + 5.0d0*f5*rc4 + 4.0d0*f4*rc3 & + 3.0d0*f3*rc2 + 2.0d0*f2*rc + f1) dc = (e * dtaper + dtrans) / rc de = de * taper e = e * taper + trans end if if (use_group) then e = e * fgrp de = de * fgrp dc = dc * fgrp end if de = de / r dedx = de * xr dedy = de * yr dedz = de * zr dedxc = dc * xc dedyc = dc * yc dedzc = dc * zc if (molcule(i) .ne. molcule(k)) then einter = einter + e end if end if end if end subroutine [/cpp]
Reply
Topic Options
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page