      program laplace
      implicit double precision (a-h,o-z)
      parameter (mp=12)
! mp (number of processors) must be the same
! as in the execution command:  mpiexec -np 12 laplace
      parameter (mn=201,mnode=mn*mn,mt=4)
c mn = number of nodes in the i and j directions (odd number)
c mnode = total number of nodes
c mt = number of different tasks
      common /t/t(mn,mn),told(mn,mn),nbc(mn,mn)
      dimension nod14m(mn,mn),nim(mnode),njm(mnode)
c nod14m(mn,mn) = global node numbering
c nim(mnode) = i direction location for each node
c njm(mnode) = j direction location for each node
      dimension ntaskt(mt),ntaskp(mt),ntaskn(mt)
c 'ntaskt(i)' = total number of nodes for task i
c 'ntaskp(i)' = number of node calculations per processor in task i
c 'ntaskn(i)' = number of processors in task i that calculate
c               'ntaskp(i)' nodes; the remaining processors
c               calculate 'ntaskp(i)-1' nodes
      parameter (mtp=(mn-1)/2+1)
      parameter (mtpm=mtp*mtp)
      dimension nod14t(mtpm,mt)
c nod14t(mtpm,mt) = nodes on each task
c mtp = maximum number of nodes in the i/j directions of a task
c mtpm = maximum number of node calculations for any task
      dimension ntasks(mp,mt),ntaske(mp,mt)
c ntasks(np,nt) = start counter* on processor np, in task nt
c ntaske(np,nt) = end counter* on processor np, in task nt
c *the counter is the entry n in nod14t(n,nt) for task nt
      real e,etime,et(2)
c MPI
      include 'mpif.h'
      integer status(MPI_STATUS_SIZE)
      call mpi_init(ierr)
      call mpi_comm_rank(MPI_COMM_WORLD,my_rank,ierr)
      call mpi_comm_size(MPI_COMM_WORLD,nsize,ierr)
c ('SHARED' is a nonstandard open() extension; remove if unsupported)
      open(unit=1,file='t.dat',form='formatted',SHARED)
      if (my_rank.eq.0) then
c Global node numbering
      nod=0
      do 2 j=1,mn
      do 2 i=1,mn
      nod=nod+1
      nod14m(i,j)=nod
      nim(nod)=i
      njm(nod)=j
    2 continue
c Tasks
c The four tasks are based on the checkerboard methodology.
c In the actual software developed for my work, any CFD node
c will eventually depend on all eight surrounding nodes (in 2D),
c so here there are 4 tasks instead of 2.  In 3D there are more.
      npi=mn
      npj=mn
      nei=(mn-1)/2
      nej=(mn-1)/2
      do 3 nt=1,mt
      do 3 ntpm=1,mtpm
      nod14t(ntpm,nt)=0
    3 continue
      nt=1                  ! task 1
      n=0
      do 4 j=1,npj,2
      do 4 i=1,npi,2
      n=n+1
      nod14t(n,nt)=nod14m(i,j)
    4 continue
      ntaskt(nt)=n
      nt=2                  ! task 2
      n=0
      do 5 j=1,npj,2
      do 5 i=2,npi-1,2
      n=n+1
      nod14t(n,nt)=nod14m(i,j)
    5 continue
      ntaskt(nt)=n
      nt=3                  ! task 3
      n=0
      do 6 j=2,npj-1,2
      do 6 i=1,npi,2
      n=n+1
      nod14t(n,nt)=nod14m(i,j)
    6 continue
      ntaskt(nt)=n
      nt=4                  ! task 4
      n=0
      do 7 j=2,npj-1,2
      do 7 i=2,npi-1,2
      n=n+1
      nod14t(n,nt)=nod14m(i,j)
    7 continue
      ntaskt(nt)=n
c Divide the nodes of each task among the mp processors
      do 8 nt=1,mt
      ntp=ntaskt(nt)/mp
      if (ntp*mp.eq.ntaskt(nt)) then
      ntaskp(nt)=ntp
      ntaskn(nt)=mp
      else
      ntaskp(nt)=ntp+1
      ntaskn(nt)=ntaskt(nt)-ntp*mp
      endif
    8 continue
      do 11 nt=1,mt
      ncs=1
      ntn=ntaskn(nt)
      ntp=ntaskp(nt)
      do 9 np=1,ntn
      ntasks(np,nt)=ncs
      nce=ncs+ntp-1
      ntaske(np,nt)=nce
      ncs=nce+1
    9 continue
      if (ntn.eq.mp) go to 11
      ntp=ntp-1
      do 10 np=ntn+1,mp
      ntasks(np,nt)=ncs
      nce=ncs+ntp-1
      ntaske(np,nt)=nce
      ncs=nce+1
   10 continue
   11 continue
      do 12 nt=1,mt
      if (ntaske(mp,nt).ne.ntaskt(nt)) then
      write(*,*) '# of nodes calculated in processors'
      write(*,*) 'does not correspond in task ',nt
      stop
      endif
   12 continue
c Initialize
      error=dlog10(1.0d-06)
      nc=1
      dx=1.0d0/dble(mn-1)
      do 15 i=1,mn
      do 15 j=1,mn
      t(i,j)=0.0d0
      nbc(i,j)=0
   15 continue
c Boundary conditions
      ny=mn/10
      do 20 j=2,ny
      t(1,j)=dble(j-1)/dble(ny)
      nbc(1,j)=1
   20 continue
      do 30 j=ny+1,mn
      t(1,j)=1.0d0
      nbc(1,j)=1
   30 continue
      do 40 i=2,mn
      t(i,mn)=1.0d0
      nbc(i,mn)=1
   40 continue
      do 41 i=1,mn
      nbc(i,1)=1
   41 continue
      do 42 j=2,mn-1
      nbc(mn,j)=2
   42 continue
c told(i,j)
      do 45 i=1,mn
      do 45 j=1,mn
      told(i,j)=t(i,j)
   45 continue
      e=etime(et)
      endif
c Broadcast user input
      call MPI_Bcast(ntasks,mp*mt,MPI_INTEGER,0,MPI_COMM_WORLD,ierr)
      call MPI_Bcast(ntaske,mp*mt,MPI_INTEGER,0,MPI_COMM_WORLD,ierr)
      call MPI_Bcast(nod14t,mtpm*mt,MPI_INTEGER,0,MPI_COMM_WORLD,ierr)
      call MPI_Bcast(nim,mnode,MPI_INTEGER,0,MPI_COMM_WORLD,ierr)
      call MPI_Bcast(njm,mnode,MPI_INTEGER,0,MPI_COMM_WORLD,ierr)
      call MPI_Bcast(nbc,mnode,MPI_INTEGER,0,MPI_COMM_WORLD,ierr)
      call MPI_Bcast(error,1,MPI_DOUBLE_PRECISION,0,MPI_COMM_WORLD,ierr)
c Laplace calculation - tasks divided on all processors
    1 call MPI_Bcast(t,mnode,MPI_DOUBLE_PRECISION,0,MPI_COMM_WORLD,ierr)
      do 70 nt=1,mt
c the master (rank 0) works on partition slot mp; worker ranks
c 1..mp-1 work on their own slot
      if (my_rank.eq.0) then
      np=mp
      else
      np=my_rank
      endif
      nts=ntasks(np,nt)
      nte=ntaske(np,nt)
      do 55 m=nts,nte
      nod14=nod14t(m,nt)
      i=nim(nod14)
      j=njm(nod14)
      nbcv=nbc(i,j)
      if (nbcv.eq.0) then
      te=t(i+1,j)
      tw=t(i-1,j)
      tn=t(i,j+1)
      ts=t(i,j-1)
      t(i,j)=0.25d0*(te+tw+tn+ts)
      go to 50
      endif
      if (nbcv.eq.2) then
      t(i,j)=t(i-1,j)
      endif
   50 if (my_rank.eq.0) go to 55
      call mpi_send(t(i,j),1,MPI_DOUBLE_PRECISION,0,1,
     *              MPI_COMM_WORLD,ierr)              ! send to master
   55 continue
      if (my_rank.eq.0) then
      do 60 nproc=1,nsize-1
      nts=ntasks(nproc,nt)
      nte=ntaske(nproc,nt)
      do 60 m=nts,nte
      nod14=nod14t(m,nt)
      i=nim(nod14)
      j=njm(nod14)
      call mpi_recv(t(i,j),1,MPI_DOUBLE_PRECISION,nproc,1,
     *              MPI_COMM_WORLD,status,ierr)       ! master receive
   60 continue
      endif
      call MPI_BARRIER(MPI_COMM_WORLD,ierr)
   70 continue
c Convergence check on the master: log10 of the L1 change
      if (my_rank.eq.0) then
      res=0.0d0
      do 92 i=2,mn
      do 92 j=2,mn-1
      tnewv=t(i,j)
      toldv=told(i,j)
      res=res+dabs(tnewv-toldv)
      told(i,j)=t(i,j)
   92 continue
      if (res.eq.0.0d0) res=1.0d-99
      res=dlog10(res)
      write(*,94) nc,res
   94 format(2x,i7,2x,f8.3)
      nc=nc+1
      if (res.le.error) then
      e=etime(et)
c Output
      write(*,*) e,et(1),et(2)
      write(1,*) mn,mn
      y=-dx
      do 110 j=1,mn
      y=y+dx
      x=-dx
      do 110 i=1,mn
      x=x+dx
      write(1,102) x,y,t(i,j)
  110 continue
  102 format(3(2x,d15.8))
      endif
      endif
c continue to the next iteration ...
      call MPI_Bcast(res,1,MPI_DOUBLE_PRECISION,0,MPI_COMM_WORLD,ierr)
      if (res.gt.error) go to 1
c ... otherwise done
      call mpi_finalize(ierr)
      end program laplace
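A minimal build-and-run sketch, assuming the listing is saved as laplace.f and an MPI wrapper compiler such as mpif90 is on the path (the exact wrapper name and any fixed-form or line-length flags depend on the local compiler and MPI installation):

      mpif90 -o laplace laplace.f
      mpiexec -np 12 ./laplace

The process count passed to mpiexec must equal the parameter mp (12 here), as the header comment notes.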