1c $Id$
2c------------------------------------------------------------------------
3c Program perf.x is used to test performance of GA put, get, accumulate  |
4c It has to be executed on four processors.                              |
5c remote operations access data on processes 1,2,3 in the round-robin way|
6c------------------------------------------------------------------------
7c
8      subroutine util_perf_test
9      implicit none
10#include "mafdecls.fh"
11#include "global.fh"
12      integer heap
13c
14c***  Intitialize a message passing library
15c
16#ifdef MPI
17c     integer ierr
18c     call mpi_init(ierr)
19#else
20c     call pbeginf
21#endif
22c
23c***  Intitialize the GA package
24c     call ga_initialize()
25      if(ga_nnodes().ne.4) then
26         if (ga_nodeid().eq.0) then
27         write(6,*) '[This test requires 4 processors]'
28         write(6,*) '[This test will be skipped]'
29         call util_flush(6)
30         endif
31         return
32      endif
33c
34c***  Initialize the MA package
35      heap = 900000
36c     if (.not. ma_init(MT_DBL, heap,heap))
37c    $     call ga_error('ma init failed',2*heap)
38c
39      call test2D()
40      call test1D()
41c     call ga_terminate()
42c
43#ifdef MPI
44c     call mpi_finalize(ierr)
45#else
46c     call pend()
47#endif
48      end
49
50
51      subroutine test1D()
52      implicit none
53#include "mafdecls.fh"
54#include "global.fh"
55c
56c
57      integer n, nn, num_chunks
58      parameter (n = 1024*1024, nn = n/4, num_chunks=16)
59      double precision buf(nn)
60c
61      integer g_a
62      integer ilo, ihi, jlo, jhi
63      integer nproc, me, loop
64      integer chunk(num_chunks)
65      data    chunk /1,9,16,81,256,576,900,2304,4096,8281,
66     $               16384,29241,65536,124609,193600,262144/
67c
68      nproc = ga_nnodes()
69      me = ga_nodeid()
70c
71c***  Create global array
72      if (.not. ga_create(MT_DBL, n, 1, 'a', 0, 0, g_a))
73     $     call ga_error(' ga_create failed ',1)
74c
75      do loop=1,nn
76         buf(loop) = .01d0
77      enddo
78      call ga_zero(g_a)
79c
80c      if (me .eq. 0) then
81c        write(*,*)' '
82c        write(*,*)' '
83c        write(*,*)' '
84c        write(*,55)n
85c55      format(' Performance of GA get, put & acc',
86c     $           ' for 1-dimensional sections of array[',i7,']')
87c        print *,' '
88c      endif
89c
90c     do loop=1,2
91c
92c***  local ops
93c
94      call ga_distribution(g_a, me, ilo, ihi, jlo, jhi)
95      call TestPutGetAcc1
96     &     (g_a, n, chunk, num_chunks, buf, ilo, ihi, jlo, jhi, .true.)
97c
98c***  remote ops
99c
100      call TestPutGetAcc1
101     &     (g_a, n, chunk, num_chunks, buf, ilo, ihi, jlo, jhi,.false.)
102
103c     enddo
104      if (.not. ga_destroy(g_a))
105     $     call ga_error(' ga_destroy failed ',1)
106      end
107
108
109      subroutine TestPutGetAcc1
110     &      (g_a, n, chunk, num_chunks, buf, ilo, ihi, jlo,jhi, local)
111      implicit none
112#include "global.fh"
113#include "testutil.fh"
114c
115      integer num_chunks, chunk(num_chunks)
116      integer n, ilo, ihi, jlo,jhi,g_a
117      double precision buf(*), tg, tp, ta
118      double precision time_acc1, time_get1, time_put1
119      logical local
120c
121      integer me
122      integer loop, jump, count, bytes
123c
124      me = ga_nodeid()
125      if (me .eq. 0) then
126        write(6,*)' '
127        if(local) then
128          write(6,'(26X, 25HLocal 1-D Array Section    )')
129        else
130          write(6,'(26X, 25HRemote 1-D Array Section   )')
131        endif
132
133        write(6,*)'    section           get               put',
134     &           '           accumulate'
135        write(6,*)' bytes    dim     sec      MB/s     sec      MB/s',
136     &           '     sec      MB/s'
137        call util_flush(6)
138      endif
139      call ga_sync()
140c
141      do loop = 1, num_chunks
142        bytes = util_mdtob(1)*chunk(loop) ! how much data is accessed
143        jump  =  n/(6000*loop) ! jump distance between consecutive patches
144        if(loop.eq.num_chunks)jump=0
145c
146c       everybody touches own data
147        call ga_fill_patch(g_a, 1, n, 1, 1 , 1d0*me*loop)
148        if (me .eq. 0) then
149        tg=time_get1(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count,
150     $               local)
151        else
152          call util_sleep(1)
153        endif
154c
155c       everybody touches own data
156        call ga_fill_patch(g_a, 1, n, 1, 1 , 1d0*me*loop)
157        if (me .eq. 0) then
158        tp=time_put1(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count,
159     $               local)
160        else
161          call util_sleep(1)
162        endif
163c
164c       everybody touches own data
165        call ga_fill_patch(g_a, 1, n, 1, 1 , 1d0*me*loop)
166        if (me .eq. 0) then
167        ta=time_acc1(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count,
168     $               local)
169        else
170          call util_sleep(1)
171        endif
172c
173        if (me .eq. 0) then
174          write(6,77)bytes, chunk(loop), tg,
175     &          1d-6*bytes/tg,tp, 1d-6*bytes/tp, ta, 1d-6*bytes/ta
176          call util_flush(6)
177        endif
178      enddo
179c
18077    format(i7, i7,1x, 3(1x,d8.3,1x,d8.3))
181      end
182
183
184
185      double precision function
186     &   time_acc1(g_a, is, ie, js, je, buf, chunk, jump, count, local)
187c
188      implicit none
189#include "global.fh"
190#include "testutil.fh"
191c
192      integer g_a, chunk, jump, count, is, js, ie, je
193      logical local
194      integer rows, indx, shifti(3)
195c
196      integer ilo, ihi, jlo, jhi
197      double precision seconds, buf(*)
198c
199      count = 0
200      rows = ie - is + 1
201      shifti(1) = rows
202      shifti(2) = 2*rows
203      shifti(3) = 3*rows
204      jlo = js
205      jhi = je
206
207      seconds = util_timer()
208c
209c       distance between consecutive patches increased by jump
210c       to destroy locality of reference
211        do ilo = is, ie -chunk-jump +1, chunk+jump
212           ihi = ilo + chunk -1
213           count = count + 1
214           if (local) then
215                 call ga_acc(g_a, ilo, ihi, jlo, jhi, buf, chunk, 1d0)
216           else
217                 indx = Mod(count,3) + 1
218                 call ga_acc(g_a, ilo+shifti(indx), ihi+shifti(indx),
219     $                       jlo, jhi,  buf, chunk, 1d0)
220           endif
221        enddo
222      seconds = util_timer() - seconds
223c
224      time_acc1 = seconds/count
225      end
226
227
228      double precision function
229     &    time_get1(g_a, is, ie, js, je, buf, chunk, jump, count, local)
230c
231      implicit none
232#include "global.fh"
233#include "testutil.fh"
234c
235      integer g_a, chunk, jump, count, is, js, ie, je
236      integer rows, indx, shifti(3)
237      logical local
238c
239      integer ilo, ihi, jlo, jhi
240      double precision seconds, buf(*)
241c
242      count = 0
243      rows = ie - is + 1
244      shifti(1) = 3*rows
245      shifti(2) = 2*rows
246      shifti(3) = rows
247      jlo = js
248      jhi = je
249
250      seconds = util_timer()
251c
252c       distance between consecutive patches increased by jump
253c       to destroy locality of reference
254        do ilo = is, ie -chunk-jump +1, chunk+jump
255              ihi = ilo + chunk -1
256              count = count + 1
257              if (local) then
258                 call ga_get(g_a, ilo, ihi, jlo, jhi, buf, chunk)
259              else
260                 indx = Mod(count,3) + 1
261                 call ga_get(g_a, ilo+shifti(indx), ihi+shifti(indx),
262     $                       jlo, jhi,  buf, chunk)
263              endif
264        enddo
265      seconds = util_timer() - seconds
266c
267      time_get1 = seconds/count
268      end
269
270
271
272      double precision function
273     &   time_put1(g_a, is, ie, js, je, buf, chunk, jump, count, local)
274c
275      implicit none
276#include "global.fh"
277#include "testutil.fh"
278c
279      integer g_a, chunk, jump, count, is, js, ie, je
280      integer rows, indx, shifti(3), shiftj(3)
281      logical local
282c
283      integer ilo, ihi, jlo, jhi
284      double precision  seconds, buf(*)
285c
286      count = 0
287      rows = ie - is + 1
288      shifti(1) = rows
289      shifti(2) = 2*rows
290      shifti(3) = 3*rows
291      jlo = js
292      jhi = je
293
294      seconds = util_timer()
295c
296c       distance between consecutive patches increased by jump
297c       to destroy locality of reference
298        do ilo = is, ie -chunk-jump +1, chunk+jump
299              ihi = ilo + chunk -1
300              count = count + 1
301              if (local) then
302                 call ga_put(g_a, ilo, ihi, jlo, jhi, buf, chunk)
303              else
304                 indx = Mod(count,3) + 1
305                 call ga_put(g_a, ilo+shifti(indx), ihi+shifti(indx),
306     $                       jlo, jhi,  buf, chunk)
307              endif
308        enddo
309      seconds = util_timer() - seconds
310c
311      time_put1 = seconds/count
312      end
313
314
315
316c
317c     test for square patches
318c
319      subroutine test2D()
320      implicit none
321#include "mafdecls.fh"
322#include "global.fh"
323c
324      integer n, nn, num_chunks
325      parameter (n = 1024, nn = n*n/4, num_chunks=16)
326      double precision buf(nn)
327c
328      integer g_a
329      integer ilo, ihi, jlo, jhi
330      integer nproc, me, loop
331      integer chunk(num_chunks)
332      data    chunk /1,3,4,9,16,24,30,48,64,91,128,171,256,353,440,512/
333c
334      nproc = ga_nnodes()
335      me = ga_nodeid()
336c
337c***  Create global array
338      if (.not. ga_create(MT_DBL, n, n, 'a', 0, 0, g_a))
339     $     call ga_error(' ga_create failed ',1)
340c
341      do loop=1,nn
342         buf(loop) = .01d0
343      enddo
344      call ga_zero(g_a)
345c
346c      if (me .eq. 0) then
347c        write(*,*)' '
348c        write(*,55)n,n
349c55      format(' Performance of GA get, put & acc',
350c     $           ' for square sections of array[',i4,',',i4,']')
351c        print *,' '
352c      endif
353c
354c     do loop=1,2
355c
356c***  local ops
357c
358      call ga_distribution(g_a, me, ilo, ihi, jlo, jhi)
359      call TestPutGetAcc
360     &     (g_a, n, chunk, num_chunks, buf, ilo, ihi, jlo, jhi, .true.)
361c
362c***  remote ops
363c
364      call TestPutGetAcc
365     &     (g_a, n, chunk, num_chunks, buf, ilo, ihi, jlo, jhi,.false.)
366
367c     enddo
368      if (.not. ga_destroy(g_a))
369     $     call ga_error(' ga_destroy failed ',1)
370      end
371
372
373      subroutine TestPutGetAcc
374     &      (g_a, n, chunk, num_chunks, buf, ilo, ihi, jlo,jhi, local)
375      implicit none
376#include "global.fh"
377#include "testutil.fh"
378c
379      integer num_chunks, chunk(num_chunks)
380      integer n, ilo, ihi, jlo,jhi,g_a
381      double precision buf(*), tg, tp, ta
382      double precision time_acc, time_get, time_put
383      logical local
384c
385      integer me
386      integer loop, jump, count, bytes
387c
388      me = ga_nodeid()
389      if (me .eq. 0) then
390        if(local) then
391          write(6,'(26X, 25HLocal 2-D Array Section    )')
392        else
393          write(6,'(26X, 25HRemote 2-D Array Section   )')
394        endif
395
396        write(6,*)'    section           get               put',
397     &           '           accumulate'
398        write(6,*)' bytes    dim     sec      MB/s     sec      MB/s',
399     &           '     sec      MB/s'
400        call util_flush(6)
401      endif
402      call ga_sync()
403c
404      do loop = 1, num_chunks
405        bytes = util_mdtob(1)*chunk(loop)*chunk(loop) !how much data is accessed
406        jump  =  n/(60*loop) ! jump distance between consecutive patches
407        if(loop.eq.num_chunks)jump=0
408c
409c       everybody touches own data
410        call ga_fill_patch(g_a, 1, n, 1, n , 1d0*me*loop)
411        if (me .eq. 0) then
412        tg=time_get(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count,
413     $               local)
414        else
415          call util_sleep(1)
416        endif
417c
418c       everybody touches own data
419        call ga_fill_patch(g_a, 1, n, 1, n , 1d0*me*loop)
420        if (me .eq. 0) then
421        tp=time_put(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count,
422     $               local)
423        else
424          call util_sleep(1)
425        endif
426c
427c       everybody touches own data
428        call ga_fill_patch(g_a, 1, n, 1, n , 1d0*me*loop)
429        if (me .eq. 0) then
430        ta=time_acc(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count,
431     $               local)
432        else
433          call util_sleep(1)
434        endif
435c
436        if (me .eq. 0) then
437          write(6,77)bytes, chunk(loop), tg,
438     &          1d-6*bytes/tg,tp, 1d-6*bytes/tp, ta, 1d-6*bytes/ta
439          call util_flush(6)
440        endif
441      enddo
442c
44377    format(i7, i7,1x, 3(1x,d8.3,1x,d8.3))
444      end
445
446
447
448      double precision function
449     &   time_acc(g_a, is, ie, js, je, buf, chunk, jump, count, local)
450c
451      implicit none
452#include "global.fh"
453#include "testutil.fh"
454c
455      integer g_a, chunk, jump, count, is, js, ie, je
456      logical local
457      integer rows, cols, indx, shifti(3), shiftj(3)
458c
459      integer ilo, ihi, jlo, jhi
460      double precision  seconds, buf(*)
461c
462      count = 0
463      rows = ie - is + 1
464      cols = je - js + 1
465      shifti(1) = rows
466      shifti(2) = 0
467      shifti(3) = rows
468      shiftj(1) = 0
469      shiftj(2) = cols
470      shiftj(3) = cols
471
472      seconds = util_timer()
473c
474c       distance between consecutive patches increased by jump
475c       to destroy locality of reference
476        do ilo = is, ie -chunk-jump +1, chunk+jump
477           ihi = ilo + chunk -1
478           do jlo = js, je -chunk-jump +1, chunk+jump
479              jhi = jlo + chunk -1
480              count = count + 1
481              if (local) then
482                 call ga_acc(g_a, ilo, ihi, jlo, jhi, buf, chunk, 1d0)
483              else
484                 indx = Mod(count,3) + 1
485                 call ga_acc(g_a, ilo+shifti(indx), ihi+shifti(indx),
486     $                       jlo+shiftj(indx), jhi+shiftj(indx),
487     $                       buf, chunk, 1d0)
488              endif
489           enddo
490        enddo
491      seconds = util_timer() - seconds
492c
493      time_acc = seconds/count
494      end
495
496
497      double precision function
498     &    time_get(g_a, is, ie, js, je, buf, chunk, jump, count, local)
499c
500      implicit none
501#include "global.fh"
502#include "testutil.fh"
503c
504      integer g_a, chunk, jump, count, is, js, ie, je
505      integer rows, cols, indx, shifti(3), shiftj(3)
506      logical local
507c
508      integer ilo, ihi, jlo, jhi
509      double precision  seconds, buf(*)
510c
511      count = 0
512      rows = ie - is + 1
513      cols = je - js + 1
514      shifti(1) = rows
515      shifti(2) = 0
516      shifti(3) = rows
517      shiftj(1) = 0
518      shiftj(2) = cols
519      shiftj(3) = cols
520
521      seconds = util_timer()
522c
523c       distance between consecutive patches increased by jump
524c       to destroy locality of reference
525        do ilo = is, ie -chunk-jump +1, chunk+jump
526           ihi = ilo + chunk -1
527           do jlo = js, je -chunk-jump +1, chunk+jump
528              jhi = jlo + chunk -1
529              count = count + 1
530              if (local) then
531                 call ga_get(g_a, ilo, ihi, jlo, jhi, buf, chunk)
532              else
533                 indx = Mod(count,3) + 1
534                 call ga_get(g_a, ilo+shifti(indx), ihi+shifti(indx),
535     $                       jlo+shiftj(indx), jhi+shiftj(indx),
536     $                       buf, chunk)
537              endif
538           enddo
539        enddo
540      seconds = util_timer() - seconds
541c
542      time_get = seconds/count
543      end
544
545
546
547      double precision function
548     &   time_put(g_a, is, ie, js, je, buf, chunk, jump, count, local)
549c
550      implicit none
551#include "global.fh"
552#include "testutil.fh"
553c
554      integer g_a, chunk, jump, count, is, js, ie, je
555      integer rows, cols, indx, shifti(3), shiftj(3)
556      logical local
557c
558      integer ilo, ihi, jlo, jhi
559      double precision  seconds, buf(*)
560c
561      count = 0
562      rows = ie - is + 1
563      cols = je - js + 1
564      shifti(1) = rows
565      shifti(2) = 0
566      shifti(3) = rows
567      shiftj(1) = 0
568      shiftj(2) = cols
569      shiftj(3) = cols
570
571      seconds = util_timer()
572c
573c       distance between consecutive patches increased by jump
574c       to destroy locality of reference
575        do ilo = is, ie -chunk-jump +1, chunk+jump
576           ihi = ilo + chunk -1
577           do jlo = js, je -chunk-jump +1, chunk+jump
578              jhi = jlo + chunk -1
579              count = count + 1
580              if (local) then
581                 call ga_put(g_a, ilo, ihi, jlo, jhi, buf, chunk)
582              else
583                 indx = Mod(count,3) + 1
584                 call ga_put(g_a, ilo+shifti(indx), ihi+shifti(indx),
585     $                       jlo+shiftj(indx), jhi+shiftj(indx),
586     $                       buf, chunk)
587              endif
588           enddo
589        enddo
590      seconds = util_timer() - seconds
591c
592      time_put = seconds/count
593      end
594
595
596