1#if HAVE_CONFIG_H
2#   include "config.fh"
3#endif
4c $Id: nbtest.F,v 1.64.2.11 2007-04-06 22:37:35 d3g293 Exp $
5
6#define BLOCK_DIM 200
7
8      program main
9      implicit none
10#include "mafdecls.fh"
11#include "global.fh"
12#include "testutil.fh"
13      integer heap, stack, fudge, ma_heap, me, nproc
14      integer ndim, dims(2), pdims(2), lo(2), hi(2), ld
15      integer g_a, two, chunk(2)
16      integer i, j, ii, jj, n, ibuf
17      integer buf(BLOCK_DIM,BLOCK_DIM)
18      integer nbbuf(BLOCK_DIM,BLOCK_DIM,2)
19      integer nbhdl(2)
20      GA_ACCESS_INDEX_TYPE g_ptr, ioff
21      logical status, ok, ok2
22      integer i_put, i_get, i_acc, i_nbput, i_nbget, i_nbacc
23      integer i_putw, i_getw, i_accw
24      double precision t_put, t_get, t_acc, t_nbput, t_nbget, t_nbacc
25      double precision t_wput, t_wget, t_wacc, t_putw, t_getw, t_accw
26      double precision tst
27      parameter (heap=200*200*4, fudge=100, stack=200*200)
28c
29c***  Intitialize a message passing library
30c
31#include "mp3.fh"
32c
33c***  Initialize GA
34c
35c     There are 2 choices: ga_initialize or ga_initialize_ltd.
36c     In the first case, there is no explicit limit on memory usage.
37c     In the second, user can set limit (per processor) in bytes.
38c
39      call ga_initialize()
40      nproc = ga_nnodes()
41      me = ga_nodeid()
42c     we can also use GA_set_memory_limit BEFORE first ga_create call
43c
44      ma_heap = heap + fudge
45      ma_heap = 2*ma_heap
46      if (me.eq.0) then
47        print *, 'util_mdtob(', ma_heap, ')=', util_mdtob(ma_heap)
48        print *
49      endif
50      call GA_set_memory_limit(util_mdtob(ma_heap))
51c
52      if(ga_nodeid().eq.0)then
53         print *,'GA initialized on ',nproc,' processors'
54         call ffflush(6)
55      endif
56c
57c***  Initialize the MA package
58c     MA must be initialized before any global array is allocated
59c
60      status = ma_init(MT_DCPL, stack, ma_heap)
61      if (.not. status) call ga_error('ma_init failed',-1)
62c
63c   Initialize all timing variables
64c
65      i_put = 0
66      i_get = 0
67      i_acc = 0
68      i_nbput = 0
69      i_nbget = 0
70      i_nbacc = 0
71      i_putw = 0
72      i_getw = 0
73      i_accw = 0
74      t_put = 0.0d00
75      t_get = 0.0d00
76      t_acc = 0.0d00
77      t_nbput = 0.0d00
78      t_nbget = 0.0d00
79      t_nbacc = 0.0d00
80      t_wput = 0.0d00
81      t_wget = 0.0d00
82      t_wacc = 0.0d00
83      t_putw = 0.0d00
84      t_getw = 0.0d00
85      t_accw = 0.0d00
86c
87c     Uncomment the below line to register external memory allocator
88c     for dynamic arrays inside GA routines.
89c      call register_ext_memory()
90c
91c   Factor processors into a roughly square grid
92c
93      call factor(nproc,pdims)
94      if (me.eq.0) then
95        write(6,*)
96        write(6,'(a,i4,a,i4,a)') ' Processors configured on ',pdims(1),
97     +              ' x ',pdims(2),' grid'
98      endif
99c
100c   Create an array with each processor holding BLOCK_DIM*BLOCK_DIM
101c   elements and initialize it to zero
102c
103      dims(1) = BLOCK_DIM*pdims(1)
104      dims(2) = BLOCK_DIM*pdims(2)
105      ld =  BLOCK_DIM
106      g_a = ga_create_handle()
107      two = 2
108      call nga_set_data(g_a, two, dims, MT_INT)
109      chunk(1) = BLOCK_DIM
110      chunk(2) = BLOCK_DIM
111      call nga_set_chunk(g_a, chunk)
112      if (.not.ga_allocate(g_a)) then
113        n = dims(1)*dims(2)
114        write(6,'(a,i16)')
115     +    'Failure to initialize global array of size ',n
116        call ga_error('GA_Allocate failure',-1)
117      endif
118      call ga_zero(g_a)
119c
120c   Initialize entire GA from process 0 using blocking puts
121c
122      if (me.eq.0) then
123c
124c   Loop over all processors
125c
126        do n = 0, nproc-1
127c
128c   Calculate process grid coordinates
129c
130          ii = mod(n,pdims(1))
131          jj = (n-ii)/pdims(1)
132c
133c   Calculate lower and upper indices of the block held by process n
134c
135          lo(1) = ii*BLOCK_DIM + 1
136          lo(2) = jj*BLOCK_DIM + 1
137          hi(1) = (ii+1)*BLOCK_DIM
138          hi(2) = (jj+1)*BLOCK_DIM
139c
140c   Fill buffer with correct values
141c
142          do j = lo(2), hi(2)
143            jj = j-lo(2)+1
144            do i = lo(1), hi(1)
145              ii = i-lo(1)+1
146              buf(ii,jj) = (j-1)*dims(1)+(i-1)
147            end do
148          end do
149c
150c   Copy buffer to GA
151c
152          tst = ga_wtime()
153          call nga_put(g_a,lo,hi,buf,ld)
154          t_put = t_put + ga_wtime() - tst
155          i_put = i_put + 1
156        end do
157      endif
158      call ga_sync
159c
160c   Check values on each process for correctness
161c
162      ok = .true.
163      call nga_distribution(g_a,me,lo,hi)
164      call nga_access(g_a,lo,hi,g_ptr,ld)
165      ioff = 0
166      do j = lo(2), hi(2)
167        do i = lo(1), hi(1)
168          n = int_mb(g_ptr+ioff)
169          if (n.ne.(j-1)*dims(1)+(i-1)) ok = .false.
170          ioff = ioff+1
171        end do
172      end do
173      call nga_release(g_a, lo, hi)
174c
175      if (ok) then
176        if (me.eq.0) then
177          write(6,*)
178          write(6,'(a)') 'OK for simple PUT test'
179          write(6,*)
180        endif
181      else
182        write(6,'(a,i4)') 'Error found in PUT test on process ',me
183      endif
184c
185c  Collect all values on process 0 from GA
186c
187      ok = .true.
188      if (me.eq.0) then
189        do n = 0, nproc-1
190c
191c   Calculate process grid coordinates
192c
193          ii = mod(n,pdims(1))
194          jj = (n-ii)/pdims(1)
195c
196c   Calculate lower and upper indices of the block held by process n
197c
198          lo(1) = ii*BLOCK_DIM + 1
199          lo(2) = jj*BLOCK_DIM + 1
200          hi(1) = (ii+1)*BLOCK_DIM
201          hi(2) = (jj+1)*BLOCK_DIM
202c          write(6,'(a,i4,a,i4,a,i4,a,i4,a,i4)') 'Accessing block (',
203c     +      lo(1),',',hi(1),') (',lo(2),',',hi(2),') on process ',n
204c
205c   Copy contents of GA to local buffer
206c
207          tst = ga_wtime()
208          call nga_get(g_a,lo,hi,buf,ld)
209          t_get = t_get + ga_wtime() - tst
210          i_get = i_get + 1
211c
212c   check buffer for correct values
213c
214          ok2 = .true.
215          do j = lo(2), hi(2)
216            jj = j-lo(2)+1
217            do i = lo(1), hi(1)
218              ii = i-lo(1)+1
219              if (buf(ii,jj).ne.(j-1)*dims(1)+(i-1)) ok2 = .false.
220            end do
221          end do
222          if (.not.ok2) then
223            write(6,'(a,i4)') 'Error found in GET test on process ',n
224            ok = .false.
225          endif
226        end do
227        if (ok) then
228          if (me.eq.0) then
229            write(6,*)
230            write(6,'(a)') 'OK for simple GET test'
231            write(6,*)
232          endif
233        endif
234      endif
235c
236c   Double all values in the GA using accumulate
237c
238      if (me.eq.0) then
239c
240c   Loop over all processors
241c
242        do n = 0, nproc-1
243c
244c   Calculate process grid coordinates
245c
246          ii = mod(n,pdims(1))
247          jj = (n-ii)/pdims(1)
248c
249c   Calculate lower and upper indices of the block held by process n
250c
251          lo(1) = ii*BLOCK_DIM + 1
252          lo(2) = jj*BLOCK_DIM + 1
253          hi(1) = (ii+1)*BLOCK_DIM
254          hi(2) = (jj+1)*BLOCK_DIM
255c
256c   Fill buffer with correct values
257c
258          do j = lo(2), hi(2)
259            jj = j-lo(2)+1
260            do i = lo(1), hi(1)
261              ii = i-lo(1)+1
262              buf(ii,jj) = (j-1)*dims(1)+(i-1)
263            end do
264          end do
265c
266c   Accumulate buffer to GA
267c
268          tst = ga_wtime()
269          call nga_acc(g_a,lo,hi,buf,ld,1)
270          t_acc = t_acc + ga_wtime() - tst
271          i_acc = i_acc + 1
272        end do
273      endif
274      call ga_sync
275c
276c   Check values on each process for correctness
277c
278      ok = .true.
279      call nga_distribution(g_a,me,lo,hi)
280      call nga_access(g_a,lo,hi,g_ptr,ld)
281      ioff = 0
282      do j = lo(2), hi(2)
283        do i = lo(1), hi(1)
284          n = int_mb(g_ptr+ioff)
285          if (n.ne.2*((j-1)*dims(1)+(i-1))) ok = .false.
286          ioff = ioff+1
287        end do
288      end do
289      call nga_release(g_a, lo, hi)
290c
291      if (ok) then
292        if (me.eq.0) then
293          write(6,*)
294          write(6,'(a)') 'OK for simple ACC test'
295          write(6,*)
296        endif
297      else
298        write(6,'(a,i4)') 'Error found in ACC test on process ',me
299      endif
300c
301c   Redo test using non-blocking puts, gets and accumulates
302c
303      call ga_zero(g_a)
304      if (me.eq.0) then
305c
306c   Initialize puts by doing calculation for process 0
307c
308        tst = ga_wtime()
309        lo(1) = 1
310        lo(2) = 1
311        hi(1) = BLOCK_DIM
312        hi(2) = BLOCK_DIM
313c
314c   Fill first buffer with data
315c
316        do j = lo(2), hi(2)
317          jj = j-lo(2)+1
318          do i = lo(1), hi(1)
319            ii = i-lo(1)+1
320            nbbuf(ii,jj,1) = (j-1)*dims(1)+(i-1)
321          end do
322        end do
323        t_putw = t_putw + ga_wtime() - tst
324        i_putw = i_putw + 1
325c
326c   Start the first put
327c
328        tst = ga_wtime()
329        call nga_nbput(g_a,lo,hi,nbbuf(1,1,1),ld,nbhdl(1))
330        t_nbput = t_nbput + ga_wtime() - tst
331        i_nbput = i_nbput + 1
332c
333c   Loop over all processors
334c
335        do n = 1, nproc-1
336c
337c   Calculate process grid coordinates
338c
339          ii = mod(n,pdims(1))
340          jj = (n-ii)/pdims(1)
341c
342c   Calculate lower and upper indices of the block held by process n
343c
344          tst = ga_wtime()
345          lo(1) = ii*BLOCK_DIM + 1
346          lo(2) = jj*BLOCK_DIM + 1
347          hi(1) = (ii+1)*BLOCK_DIM
348          hi(2) = (jj+1)*BLOCK_DIM
349c
350c   Fill buffer with correct values
351c
352          ibuf = mod(n,2)+1
353          do j = lo(2), hi(2)
354            jj = j-lo(2)+1
355            do i = lo(1), hi(1)
356              ii = i-lo(1)+1
357              nbbuf(ii,jj,ibuf) = (j-1)*dims(1)+(i-1)
358            end do
359          end do
360          t_putw = t_putw + ga_wtime() - tst
361          i_putw = i_putw + 1
362c
363c   Copy buffer to GA
364c
365          tst = ga_wtime()
366          call nga_nbput(g_a,lo,hi,nbbuf(1,1,ibuf),ld,nbhdl(ibuf))
367          t_nbput = t_nbput + ga_wtime() - tst
368          i_nbput = i_nbput + 1
369          tst = ga_wtime()
370          call nga_nbwait(nbhdl(mod(n-1,2)+1))
371          t_wput = t_wput + ga_wtime() - tst
372        end do
373        tst = ga_wtime()
374        call nga_nbwait(nbhdl(mod(nproc-1,2)+1))
375        t_wput = t_wput + ga_wtime() - tst
376      endif
377      call ga_sync
378c
379c   Check values on each process for correctness
380c
381      ok = .true.
382      call nga_distribution(g_a,me,lo,hi)
383      call nga_access(g_a,lo,hi,g_ptr,ld)
384      ioff = 0
385      do j = lo(2), hi(2)
386        do i = lo(1), hi(1)
387          n = int_mb(g_ptr+ioff)
388          if (n.ne.(j-1)*dims(1)+(i-1)) ok = .false.
389          ioff = ioff+1
390        end do
391      end do
392      call nga_release(g_a, lo, hi)
393c
394      if (ok) then
395        if (me.eq.0) then
396          write(6,*)
397          write(6,'(a)') 'OK for Non-Blocking PUT test'
398          write(6,*)
399        endif
400      else
401        write(6,'(a,i4)')
402     +      'Error found in Non-Blocking PUT test on process ',me
403      endif
404c
405c  Collect all values on process 0 from GA using non-blocking calls
406c
407      ok = .true.
408      if (me.eq.0) then
409c
410c  Start with get on process 0
411c
412        lo(1) = 1
413        lo(2) = 1
414        hi(1) = BLOCK_DIM
415        hi(2) = BLOCK_DIM
416c
417c   Fill first buffer with data
418c
419        do j = lo(2), hi(2)
420          jj = j-lo(2)+1
421          do i = lo(1), hi(1)
422            ii = i-lo(1)+1
423            nbbuf(ii,jj,1) = (j-1)*dims(1)+(i-1)
424          end do
425        end do
426c
427c   Start the first get
428c
429        tst = ga_wtime()
430        call nga_nbget(g_a,lo,hi,nbbuf(1,1,1),ld,nbhdl(1))
431        t_nbget = t_nbget + ga_wtime() - tst
432        i_nbget = i_nbget + 1
433        do n = 1, nproc-1
434c
435c   Calculate process grid coordinates
436c
437          ii = mod(n,pdims(1))
438          jj = (n-ii)/pdims(1)
439c
440c   Calculate lower and upper indices of the block held by process n
441c
442          lo(1) = ii*BLOCK_DIM + 1
443          lo(2) = jj*BLOCK_DIM + 1
444          hi(1) = (ii+1)*BLOCK_DIM
445          hi(2) = (jj+1)*BLOCK_DIM
446c          write(6,'(a,i4,a,i4,a,i4,a,i4,a,i4)') 'Accessing block (',
447c     +      lo(1),',',hi(1),') (',lo(2),',',hi(2),') on process ',n
448c
449c   Copy contents of GA to local buffer
450c
451          ibuf = mod(n,2)+1
452          tst = ga_wtime()
453          call nga_nbget(g_a,lo,hi,nbbuf(1,1,ibuf),ld,nbhdl(ibuf))
454          t_nbget = t_nbget + ga_wtime() - tst
455          i_nbget = i_nbget + 1
456          tst = ga_wtime()
457          call nga_nbwait(nbhdl(mod(n-1,2)+1))
458          t_wget = t_wget + ga_wtime() - tst
459c
460c   check buffer for correct values
461c
462          tst = ga_wtime()
463          ok2 = .true.
464          ibuf = mod(n-1,2)+1
465          ii = mod(n-1,pdims(1))
466          jj = (n-1-ii)/pdims(1)
467          lo(1) = ii*BLOCK_DIM + 1
468          lo(2) = jj*BLOCK_DIM + 1
469          hi(1) = (ii+1)*BLOCK_DIM
470          hi(2) = (jj+1)*BLOCK_DIM
471          do j = lo(2), hi(2)
472            jj = j-lo(2)+1
473            do i = lo(1), hi(1)
474              ii = i-lo(1)+1
475              if (nbbuf(ii,jj,ibuf).ne.(j-1)*dims(1)+(i-1))
476     +             ok2 = .false.
477            end do
478          end do
479          if (.not.ok2) then
480            write(6,'(a,i4)') 'Error found in GET test on process ',n
481            ok = .false.
482          endif
483          t_getw = t_getw + ga_wtime() - tst
484          i_getw = i_getw + 1
485        end do
486        tst = ga_wtime()
487        call nga_nbwait(nbhdl(mod(n-1,2)+1))
488        t_wget = t_wget + ga_wtime() - tst
489        tst = ga_wtime()
490        ok2 = .true.
491        ibuf = mod(nproc-1,2)+1
492        ii = mod(n-1,pdims(1))
493        jj = (n-1-ii)/pdims(1)
494        lo(1) = ii*BLOCK_DIM + 1
495        lo(2) = jj*BLOCK_DIM + 1
496        hi(1) = (ii+1)*BLOCK_DIM
497        hi(2) = (jj+1)*BLOCK_DIM
498        do j = lo(2), hi(2)
499          jj = j-lo(2)+1
500          do i = lo(1), hi(1)
501            ii = i-lo(1)+1
502            if (nbbuf(ii,jj,ibuf).ne.(j-1)*dims(1)+(i-1)) ok2 = .false.
503          end do
504        end do
505        if (.not.ok2) then
506          write(6,'(a,i4)') 'Error found in GET test on process ',n
507          ok = .false.
508        endif
509        if (ok) then
510          if (me.eq.0) then
511            write(6,*)
512            write(6,'(a)') 'OK for Non-Blocking GET test'
513            write(6,*)
514          endif
515        endif
516        t_getw = t_getw + ga_wtime() - tst
517        i_getw = i_getw + 1
518      endif
519c
520c   Double all values in the GA using accumulate
521c
522      if (me.eq.0) then
523c
524c   Initialize puts by doing calculation for process 0
525c
526        tst = ga_wtime()
527        lo(1) = 1
528        lo(2) = 1
529        hi(1) = BLOCK_DIM
530        hi(2) = BLOCK_DIM
531c
532c   Fill first buffer with data
533c
534        do j = lo(2), hi(2)
535          jj = j-lo(2)+1
536          do i = lo(1), hi(1)
537            ii = i-lo(1)+1
538            nbbuf(ii,jj,1) = (j-1)*dims(1)+(i-1)
539          end do
540        end do
541        t_accw = t_accw + ga_wtime() - tst
542        i_accw = i_accw + 1
543c
544c   Start the first put
545c
546        tst = ga_wtime()
547        call nga_nbacc(g_a,lo,hi,nbbuf(1,1,1),ld,1,nbhdl(1))
548        t_nbacc = t_nbacc + ga_wtime() - tst
549        i_nbacc = i_nbacc + 1
550c
551c   Loop over all processors
552c
553        do n = 1, nproc-1
554c
555c   Calculate process grid coordinates
556c
557          ii = mod(n,pdims(1))
558          jj = (n-ii)/pdims(1)
559c
560c   Calculate lower and upper indices of the block held by process n
561c
562          tst = ga_wtime()
563          lo(1) = ii*BLOCK_DIM + 1
564          lo(2) = jj*BLOCK_DIM + 1
565          hi(1) = (ii+1)*BLOCK_DIM
566          hi(2) = (jj+1)*BLOCK_DIM
567c
568c   Fill buffer with correct values
569c
570          ibuf = mod(n,2)+1
571          do j = lo(2), hi(2)
572            jj = j-lo(2)+1
573            do i = lo(1), hi(1)
574              ii = i-lo(1)+1
575              nbbuf(ii,jj,ibuf) = (j-1)*dims(1)+(i-1)
576            end do
577          end do
578          t_accw = t_accw + ga_wtime() - tst
579          i_accw = i_accw + 1
580c
581c   Copy buffer to GA
582c
583          tst = ga_wtime()
584          call nga_nbacc(g_a,lo,hi,nbbuf(1,1,ibuf),ld,1,nbhdl(ibuf))
585          t_nbacc = t_nbacc + ga_wtime() - tst
586          i_nbacc = i_nbacc + 1
587          tst = ga_wtime()
588          call nga_nbwait(nbhdl(mod(n-1,2)+1))
589          t_wacc = t_wacc + ga_wtime() - tst
590        end do
591        tst = ga_wtime()
592        call nga_nbwait(nbhdl(mod(nproc-1,2)+1))
593        t_wacc = t_wacc + ga_wtime() - tst
594      endif
595      call ga_sync
596c
597c   Check values on each process for correctness
598c
599      ok = .true.
600      call nga_distribution(g_a,me,lo,hi)
601      call nga_access(g_a,lo,hi,g_ptr,ld)
602      ioff = 0
603      do j = lo(2), hi(2)
604        do i = lo(1), hi(1)
605          n = int_mb(g_ptr+ioff)
606          if (n.ne.2*((j-1)*dims(1)+(i-1))) ok = .false.
607          ioff = ioff+1
608        end do
609      end do
610      call nga_release(g_a, lo, hi)
611c
612      if (ok) then
613        if (me.eq.0) then
614          write(6,*)
615          write(6,'(a)') 'OK for Non-Blocking ACC test'
616          write(6,*)
617        endif
618      else
619        write(6,'(a,i4)')
620     +      'Error found in Non-Blocking ACC test on process ',me
621      endif
622c
623c   Print out timing numbers. Start by summing timers over all
624c   processors
625c
626      call ga_igop(1,i_put,1,'+')
627      call ga_igop(2,i_get,1,'+')
628      call ga_igop(3,i_acc,1,'+')
629      call ga_igop(4,i_nbput,1,'+')
630      call ga_igop(5,i_nbget,1,'+')
631      call ga_igop(6,i_nbacc,1,'+')
632      call ga_igop(7,i_putw,1,'+')
633      call ga_igop(8,i_getw,1,'+')
634      call ga_igop(9,i_accw,1,'+')
635      call ga_dgop(1,t_put,1,'+')
636      call ga_dgop(2,t_get,1,'+')
637      call ga_dgop(3,t_acc,1,'+')
638      call ga_dgop(4,t_nbput,1,'+')
639      call ga_dgop(5,t_nbget,1,'+')
640      call ga_dgop(6,t_nbacc,1,'+')
641      call ga_dgop(7,t_wput,1,'+')
642      call ga_dgop(8,t_wget,1,'+')
643      call ga_dgop(9,t_wacc,1,'+')
644      call ga_dgop(1,t_putw,1,'+')
645      call ga_dgop(2,t_getw,1,'+')
646      call ga_dgop(3,t_accw,1,'+')
647      if (me.eq.0) then
648        write(6,'(a,f16.8)') 'Average time for blocking PUT:      ',
649     +                     t_put/dble(i_put)
650        write(6,'(a,f16.8)') 'Average time for blocking GET:      ',
651     +                     t_get/dble(i_get)
652        write(6,'(a,f16.8)') 'Average time for blocking ACC:      ',
653     +                     t_acc/dble(i_acc)
654        write(6,'(a,f16.8)') 'Average time for non-blocking PUT:  ',
655     +                     t_nbput/dble(i_nbput)
656        write(6,'(a,f16.8)') 'Average time for non-blocking GET:  ',
657     +                     t_nbget/dble(i_nbget)
658        write(6,'(a,f16.8)') 'Average time for non-blocking ACC:  ',
659     +                     t_nbacc/dble(i_nbacc)
660        write(6,'(a,f16.8)') 'Average time for wait on PUT:       ',
661     +                     t_wput/dble(i_nbput)
662        write(6,'(a,f16.8)') 'Average time for wait on GET:       ',
663     +                     t_wget/dble(i_nbget)
664        write(6,'(a,f16.8)') 'Average time for wait on ACC:       ',
665     +                     t_wacc/dble(i_nbacc)
666        write(6,'(a,f16.8)') 'Aggregate time on non-blocking PUT: ',
667     +                     (t_nbput+t_wput)/dble(i_nbput)
668        write(6,'(a,f16.8)') 'Aggregate time on non-blocking GET: ',
669     +                     (t_nbget+t_wget)/dble(i_nbget)
670        write(6,'(a,f16.8)') 'Aggregate time on non-blocking ACC: ',
671     +                     (t_nbacc+t_wacc)/dble(i_nbacc)
672        write(6,'(a,f16.8)') 'Average time in PUT work:           ',
673     +                     t_putw/dble(i_putw)
674        write(6,'(a,f16.8)') 'Average time in GET work:           ',
675     +                     t_getw/dble(i_getw)
676        write(6,'(a,f16.8)') 'Average time in ACC work:           ',
677     +                     t_accw/dble(i_accw)
678      endif
679
680c
681c***  Tidy up the GA package
682c
683      call ga_terminate()
684c
685c***  Tidy up after message-passing library
686c
687      call MP_FINALIZE()
688c
689      end
690c
691c  Obtain a 2x2 processor grid for p processors
692c
693      subroutine factor(p,dims)
694      implicit none
695      integer i,j,p,dims(2),imin,mdim
696      integer ip,ifac,pmax,prime(1000)
697      integer fac(1000)
698c
699      i = 1
700      ip = p
701      do i = 1, 2
702        dims(i) = 1
703      end do
704c
705c    factor p completely
706c    first, find all prime numbers less than or equal to p
707c
708      pmax = 0
709      do i = 2, p
710        do j = 1, pmax
711          if (mod(i,prime(j)).eq.0) go to 100
712        end do
713        pmax = pmax + 1
714        prime(pmax) = i
715  100   continue
716      end do
717c
718c    find all prime factors of p
719c
720      ifac = 0
721      do i = 1, pmax
722  200   if (mod(ip,prime(i)).eq.0) then
723          ifac = ifac + 1
724          fac(ifac) = prime(i)
725          ip = ip/prime(i)
726          go to 200
727        endif
728      end do
729c
730c    determine dimensions of processor grid
731c
732      do i = ifac, 1, -1
733c
734c    find dimension with minimum value
735c
736        imin = dims(1)
737        mdim = 1
738        if (dims(2).lt.imin) then
739          imin = dims(2)
740          mdim = 2
741        endif
742        dims(mdim) = dims(mdim)*fac(i)
743      end do
744c
745      return
746      end
747
748