1      program main
2C$Id: jacobi.f,v 1.2 1995-02-02 23:24:15 d3g681 Exp $
3      implicit double precision (a-h, o-z)
4      dimension q(1)
5      include 'msgtypesf.h'
6      parameter (niter = 30, maxp = 2047)
7      dimension jlo(0:maxp), nj(0:maxp)
8      data iqoff, iqaddr/0, 0/
9c
10c     Simple iterative (Jacobi) linear equation solver (Ax = b)
11c
12c     Initialize the message passing environment
13c
14      call pbeginf
15c     call evon
16c
17c     Read in the dimension  of the matrix on process 0 and
18c     broadcast its value to everyone else
19c
20      if (nodeid() .eq. 0) then
21         write(6,1)
22 1       format(' Input dimension of matrix: '$)
23         read(5,*) n
24      endif
25      call brdcst(1+MSGINT, n, mitob(1), 0)
26c
27c     makjlo assigns a range of columns to each process.
28c     ncolp is the number that this process is to do.
29c
30      call makjlo(n, jlo, nj)
31      ncolp = nj(nodeid())
32c
33c     allocate memory
34c
35      need = n*ncolp + 2*n + 2*ncolp
36      call getmem(need, q, iqaddr, iqoff)
37      if (iqaddr.eq.0) call parerr(999)
38      ia = iqoff + 1
39      ib = ia + n*ncolp
40      ix = ib + ncolp
41      is = ix + ncolp
42      iw = is + n
43c
44c     Make matrix (a), rhs vector (b) and initial guess (x)
45c
46      call makabx(n, jlo(nodeid()), ncolp, q(ia), q(ib), q(ix))
47c
48c     Do niter iterations of the Jacobi algorithm.
49c     Synchronize first for accurate timings.
50c
51      call synch(13)
52      rjunk = timer()
53      call jacobi(n, jlo, nj, q(ia), q(ib), q(ix), q(is), q(iw), niter)
54      used = timer()
55c
56c     Print out results
57c
58      rmflop = dble(niter*2*n)*dble(n) / (used*1.0d6)
59      if (nodeid() .eq. 0) write(6,2) n, used, nnodes(), rmflop
60 2    format(' N=',i4,' used ',f6.2,' secs with ',i3,' processes',
61     $     ', mflop=', f8.2)
62c
63      call pend
64      call fexit
65      end
66      subroutine jacobi(n, jlo, nj, a, b, x, s, work, niter)
67      implicit double precision (a-h, o-z)
68      dimension a(n, *), b(*), x(*), s(n), work(n),
69     $     jlo(0:*), nj(0:*)
70c
71c     Apply niter iterations of the Jacobi algorithm.
72c
73      me = nodeid()
74      do 10 iter = 1, niter
75c
76c     Compute matrix vector product Ax ... this is the real work
77c
78c     Do the part that we have
79c
80         call mxv(a, n, x, nj(me), s)
81c
82c     Now we have to add up the result over all the processors
83c     Call dgop for simple but inefficient version. Call mxvadd
84c     for a much more efficient version
85c
86c         call dgop(2, s, n, '+')
87c
88         call mxvadd(s, work, jlo, nj)
89c
90c     Compute our part of the update vector and compute
91c     the residual error (the error requires a global sum)
92c
93         err = 0.0d0
94         do 20 j = jlo(me), jlo(me) + nj(me) - 1
95            jj = j - jlo(me) + 1
96            x(jj) = x(jj) + (b(jj)-s(j))/a(j,jj)
97            err = err + abs((b(jj)-s(j)))
98 20      continue
99c
100c     Write out results every now and again
101c
102         if (mod(iter,10).eq.0) then
103            call dgop(3, err, 1, '+')
104            if (nodeid().eq.0) write(6,1) iter, err
105 1          format(' Iteration',i3,', Error',d9.2)
106         endif
107 10   continue
108c
109      end
110      subroutine makabx(n, jlo, nj, a, b, x)
111      implicit double precision (a-h, o-z)
112      dimension a(n, nj), b(nj), x(nj)
113c
114      jhi = jlo + nj - 1
115      do 10 j = jlo, jhi
116         jj = j - jlo + 1
117cvd$  novector
118         do 20 i = 1, n
119            a(i,jj) = dble(i+j) / dble(abs(i-j)*n+n/50+1)
120 20      continue
121         b(jj) = dble(mod(j,3))
122         x(jj) = b(jj) / a(j,jj)
123 10   continue
124c
125      end
126      subroutine makjlo(n, jlo, nj)
127      dimension jlo(0:*), nj(0:*)
128c
129      ncolp = n / nnodes()
130      next  = n - (ncolp*nnodes())
131      jjlo = 1
132      do 10 iproc = 0, nnodes()-1
133         jlo(iproc) = jjlo
134         jjlo = jjlo + ncolp
135         if (iproc.lt.next) jjlo = jjlo + 1
136         nj(iproc) = jjlo - jlo(iproc)
137 10   continue
138c
139      end
140      subroutine mxvadd(s, work, jlo, nj)
141      implicit real*8 (a-h, o-z)
142      include 'msgtypesf.h'
143      dimension s(*), work(*), jlo(0:*), nj(0:*)
144      logical synch
145      parameter (synch=.true.)
146c
147c     We have in s(1:n) this process's contribution to the
148c     matrix vector product A*x where we had nj(me) elements
149c     of x starting at element jlo(me). Each process needs
150c     to end up with the same elements of the full result
151c     vector s.
152c
153c     Thus we need to send to each process (ip) the elements
154c     s(jlo(ip)+k-1), k=1,nj(ip). And we need to receive from
155c     each process our piece of s which we add onto our result
156c     vector.
157c
158c     If communication is synchronous then we must explictly pair up
159c     send/receive requests on this process with the matching
160c     receive/send operations on other processes.
161c
162c     There is potential for much more asynch stuff but the damned
163c     iPSC-i860 hangs (irreproducibly) if we send off too many
164c     unresolved asynchronous messages (how many is too much?).
165c
166      me = nodeid()
167      nproc = nnodes()
168      nn = nproc + mod(nproc,2)
169c
170      if (synch) then
171        do 10 iter = 1, nn-1
172          call pairup(nn, me, iter, ip)
173          if (ip.lt.nproc) then
174            if (me. lt. ip) then
175              call snd(3+MSGDBL, s(jlo(ip)), mdtob(nj(ip)), ip, 1)
176              call rcv(3+MSGDBL, work, mdtob(nj(me)), lenmes, ip,
177     &                 node, 1)
178            else if (me.gt.ip) then
179               call rcv(3+MSGDBL, work, mdtob(nj(me)), lenmes, ip,
180     &                  node, 1)
181               call snd(3+MSGDBL, s(jlo(ip)), mdtob(nj(ip)), ip, 1)
182            endif
183            call vvadd(nj(me), s(jlo(me)), work)
184          endif
185 10     continue
186      else
187        do 20 iter = 1, nn-1
188          call pairup(nn, me, iter, ip)
189          if (ip.lt.nproc) then
190            call snd(3+MSGDBL, s(jlo(ip)), mdtob(nj(ip)), ip, 0)
191            call rcv(3+MSGDBL, work, mdtob(nj(me)), lenmes, ip, node, 1)
192            call vvadd(nj(me), s(jlo(me)), work)
193          endif
194 20     continue
195        call waitcom(-1)
196      endif
197c
198      end
199      subroutine pairup(n, me, iter, ipair)
200c
201c     one of many ways of generating maximally overlapped pairs
202c     (not all that good on a hypercube though!)
203c
204      if (iter.eq.1) then
205        ipair = mod(n+1-me,n)
206      else if (me.eq.0) then
207        ipair = iter
208      else if (me.eq.iter) then
209        ipair = 0
210      else
211        if (ipair.eq.0) ipair = me
212        ipair = ipair + 2
213        if (ipair.ge.n) ipair = ipair + 1 - n
214      endif
215      end
216      subroutine vvadd(n, a, b)
217      implicit real*8 (a-h, o-z)
218      dimension a(*), b(*)
219c
220      do 10 i = 1,n
221        a(i) = a(i) + b(i)
222 10   continue
223c
224      end
225