1 program main 2C$Id: jacobi.f,v 1.2 1995-02-02 23:24:15 d3g681 Exp $ 3 implicit double precision (a-h, o-z) 4 dimension q(1) 5 include 'msgtypesf.h' 6 parameter (niter = 30, maxp = 2047) 7 dimension jlo(0:maxp), nj(0:maxp) 8 data iqoff, iqaddr/0, 0/ 9c 10c Simple iterative (Jacobi) linear equation solver (Ax = b) 11c 12c Initialize the message passing environment 13c 14 call pbeginf 15c call evon 16c 17c Read in the dimension of the matrix on process 0 and 18c broadcast its value to everyone else 19c 20 if (nodeid() .eq. 0) then 21 write(6,1) 22 1 format(' Input dimension of matrix: '$) 23 read(5,*) n 24 endif 25 call brdcst(1+MSGINT, n, mitob(1), 0) 26c 27c makjlo assigns a range of columns to each process. 28c ncolp is the number that this process is to do. 29c 30 call makjlo(n, jlo, nj) 31 ncolp = nj(nodeid()) 32c 33c allocate memory 34c 35 need = n*ncolp + 2*n + 2*ncolp 36 call getmem(need, q, iqaddr, iqoff) 37 if (iqaddr.eq.0) call parerr(999) 38 ia = iqoff + 1 39 ib = ia + n*ncolp 40 ix = ib + ncolp 41 is = ix + ncolp 42 iw = is + n 43c 44c Make matrix (a), rhs vector (b) and initial guess (x) 45c 46 call makabx(n, jlo(nodeid()), ncolp, q(ia), q(ib), q(ix)) 47c 48c Do niter iterations of the Jacobi algorithm. 49c Synchronize first for accurate timings. 50c 51 call synch(13) 52 rjunk = timer() 53 call jacobi(n, jlo, nj, q(ia), q(ib), q(ix), q(is), q(iw), niter) 54 used = timer() 55c 56c Print out results 57c 58 rmflop = dble(niter*2*n)*dble(n) / (used*1.0d6) 59 if (nodeid() .eq. 0) write(6,2) n, used, nnodes(), rmflop 60 2 format(' N=',i4,' used ',f6.2,' secs with ',i3,' processes', 61 $ ', mflop=', f8.2) 62c 63 call pend 64 call fexit 65 end 66 subroutine jacobi(n, jlo, nj, a, b, x, s, work, niter) 67 implicit double precision (a-h, o-z) 68 dimension a(n, *), b(*), x(*), s(n), work(n), 69 $ jlo(0:*), nj(0:*) 70c 71c Apply niter iterations of the Jacobi algorithm. 72c 73 me = nodeid() 74 do 10 iter = 1, niter 75c 76c Compute matrix vector product Ax ... this is the real work 77c 78c Do the part that we have 79c 80 call mxv(a, n, x, nj(me), s) 81c 82c Now we have to add up the result over all the processors 83c Call dgop for simple but inefficient version. Call mxvadd 84c for a much more efficient version 85c 86c call dgop(2, s, n, '+') 87c 88 call mxvadd(s, work, jlo, nj) 89c 90c Compute our part of the update vector and compute 91c the residual error (the error requires a global sum) 92c 93 err = 0.0d0 94 do 20 j = jlo(me), jlo(me) + nj(me) - 1 95 jj = j - jlo(me) + 1 96 x(jj) = x(jj) + (b(jj)-s(j))/a(j,jj) 97 err = err + abs((b(jj)-s(j))) 98 20 continue 99c 100c Write out results every now and again 101c 102 if (mod(iter,10).eq.0) then 103 call dgop(3, err, 1, '+') 104 if (nodeid().eq.0) write(6,1) iter, err 105 1 format(' Iteration',i3,', Error',d9.2) 106 endif 107 10 continue 108c 109 end 110 subroutine makabx(n, jlo, nj, a, b, x) 111 implicit double precision (a-h, o-z) 112 dimension a(n, nj), b(nj), x(nj) 113c 114 jhi = jlo + nj - 1 115 do 10 j = jlo, jhi 116 jj = j - jlo + 1 117cvd$ novector 118 do 20 i = 1, n 119 a(i,jj) = dble(i+j) / dble(abs(i-j)*n+n/50+1) 120 20 continue 121 b(jj) = dble(mod(j,3)) 122 x(jj) = b(jj) / a(j,jj) 123 10 continue 124c 125 end 126 subroutine makjlo(n, jlo, nj) 127 dimension jlo(0:*), nj(0:*) 128c 129 ncolp = n / nnodes() 130 next = n - (ncolp*nnodes()) 131 jjlo = 1 132 do 10 iproc = 0, nnodes()-1 133 jlo(iproc) = jjlo 134 jjlo = jjlo + ncolp 135 if (iproc.lt.next) jjlo = jjlo + 1 136 nj(iproc) = jjlo - jlo(iproc) 137 10 continue 138c 139 end 140 subroutine mxvadd(s, work, jlo, nj) 141 implicit real*8 (a-h, o-z) 142 include 'msgtypesf.h' 143 dimension s(*), work(*), jlo(0:*), nj(0:*) 144 logical synch 145 parameter (synch=.true.) 146c 147c We have in s(1:n) this process's contribution to the 148c matrix vector product A*x where we had nj(me) elements 149c of x starting at element jlo(me). Each process needs 150c to end up with the same elements of the full result 151c vector s. 152c 153c Thus we need to send to each process (ip) the elements 154c s(jlo(ip)+k-1), k=1,nj(ip). And we need to receive from 155c each process our piece of s which we add onto our result 156c vector. 157c 158c If communication is synchronous then we must explictly pair up 159c send/receive requests on this process with the matching 160c receive/send operations on other processes. 161c 162c There is potential for much more asynch stuff but the damned 163c iPSC-i860 hangs (irreproducibly) if we send off too many 164c unresolved asynchronous messages (how many is too much?). 165c 166 me = nodeid() 167 nproc = nnodes() 168 nn = nproc + mod(nproc,2) 169c 170 if (synch) then 171 do 10 iter = 1, nn-1 172 call pairup(nn, me, iter, ip) 173 if (ip.lt.nproc) then 174 if (me. lt. ip) then 175 call snd(3+MSGDBL, s(jlo(ip)), mdtob(nj(ip)), ip, 1) 176 call rcv(3+MSGDBL, work, mdtob(nj(me)), lenmes, ip, 177 & node, 1) 178 else if (me.gt.ip) then 179 call rcv(3+MSGDBL, work, mdtob(nj(me)), lenmes, ip, 180 & node, 1) 181 call snd(3+MSGDBL, s(jlo(ip)), mdtob(nj(ip)), ip, 1) 182 endif 183 call vvadd(nj(me), s(jlo(me)), work) 184 endif 185 10 continue 186 else 187 do 20 iter = 1, nn-1 188 call pairup(nn, me, iter, ip) 189 if (ip.lt.nproc) then 190 call snd(3+MSGDBL, s(jlo(ip)), mdtob(nj(ip)), ip, 0) 191 call rcv(3+MSGDBL, work, mdtob(nj(me)), lenmes, ip, node, 1) 192 call vvadd(nj(me), s(jlo(me)), work) 193 endif 194 20 continue 195 call waitcom(-1) 196 endif 197c 198 end 199 subroutine pairup(n, me, iter, ipair) 200c 201c one of many ways of generating maximally overlapped pairs 202c (not all that good on a hypercube though!) 203c 204 if (iter.eq.1) then 205 ipair = mod(n+1-me,n) 206 else if (me.eq.0) then 207 ipair = iter 208 else if (me.eq.iter) then 209 ipair = 0 210 else 211 if (ipair.eq.0) ipair = me 212 ipair = ipair + 2 213 if (ipair.ge.n) ipair = ipair + 1 - n 214 endif 215 end 216 subroutine vvadd(n, a, b) 217 implicit real*8 (a-h, o-z) 218 dimension a(*), b(*) 219c 220 do 10 i = 1,n 221 a(i) = a(i) + b(i) 222 10 continue 223c 224 end 225