1c $Id$ 2c------------------------------------------------------------------------ 3c Program perf.x is used to test performance of GA put, get, accumulate | 4c It has to be executed on four processors. | 5c remote operations access data on processes 1,2,3 in the round-robin way| 6c------------------------------------------------------------------------ 7c 8 subroutine util_perf_test 9 implicit none 10#include "mafdecls.fh" 11#include "global.fh" 12 integer heap 13c 14c*** Intitialize a message passing library 15c 16#ifdef MPI 17c integer ierr 18c call mpi_init(ierr) 19#else 20c call pbeginf 21#endif 22c 23c*** Intitialize the GA package 24c call ga_initialize() 25 if(ga_nnodes().ne.4) then 26 if (ga_nodeid().eq.0) then 27 write(6,*) '[This test requires 4 processors]' 28 write(6,*) '[This test will be skipped]' 29 call util_flush(6) 30 endif 31 return 32 endif 33c 34c*** Initialize the MA package 35 heap = 900000 36c if (.not. ma_init(MT_DBL, heap,heap)) 37c $ call ga_error('ma init failed',2*heap) 38c 39 call test2D() 40 call test1D() 41c call ga_terminate() 42c 43#ifdef MPI 44c call mpi_finalize(ierr) 45#else 46c call pend() 47#endif 48 end 49 50 51 subroutine test1D() 52 implicit none 53#include "mafdecls.fh" 54#include "global.fh" 55c 56c 57 integer n, nn, num_chunks 58 parameter (n = 1024*1024, nn = n/4, num_chunks=16) 59 double precision buf(nn) 60c 61 integer g_a 62 integer ilo, ihi, jlo, jhi 63 integer nproc, me, loop 64 integer chunk(num_chunks) 65 data chunk /1,9,16,81,256,576,900,2304,4096,8281, 66 $ 16384,29241,65536,124609,193600,262144/ 67c 68 nproc = ga_nnodes() 69 me = ga_nodeid() 70c 71c*** Create global array 72 if (.not. ga_create(MT_DBL, n, 1, 'a', 0, 0, g_a)) 73 $ call ga_error(' ga_create failed ',1) 74c 75 do loop=1,nn 76 buf(loop) = .01d0 77 enddo 78 call ga_zero(g_a) 79c 80c if (me .eq. 0) then 81c write(*,*)' ' 82c write(*,*)' ' 83c write(*,*)' ' 84c write(*,55)n 85c55 format(' Performance of GA get, put & acc', 86c $ ' for 1-dimensional sections of array[',i7,']') 87c print *,' ' 88c endif 89c 90c do loop=1,2 91c 92c*** local ops 93c 94 call ga_distribution(g_a, me, ilo, ihi, jlo, jhi) 95 call TestPutGetAcc1 96 & (g_a, n, chunk, num_chunks, buf, ilo, ihi, jlo, jhi, .true.) 97c 98c*** remote ops 99c 100 call TestPutGetAcc1 101 & (g_a, n, chunk, num_chunks, buf, ilo, ihi, jlo, jhi,.false.) 102 103c enddo 104 if (.not. ga_destroy(g_a)) 105 $ call ga_error(' ga_destroy failed ',1) 106 end 107 108 109 subroutine TestPutGetAcc1 110 & (g_a, n, chunk, num_chunks, buf, ilo, ihi, jlo,jhi, local) 111 implicit none 112#include "global.fh" 113#include "testutil.fh" 114c 115 integer num_chunks, chunk(num_chunks) 116 integer n, ilo, ihi, jlo,jhi,g_a 117 double precision buf(*), tg, tp, ta 118 double precision time_acc1, time_get1, time_put1 119 logical local 120c 121 integer me 122 integer loop, jump, count, bytes 123c 124 me = ga_nodeid() 125 if (me .eq. 0) then 126 write(6,*)' ' 127 if(local) then 128 write(6,'(26X, 25HLocal 1-D Array Section )') 129 else 130 write(6,'(26X, 25HRemote 1-D Array Section )') 131 endif 132 133 write(6,*)' section get put', 134 & ' accumulate' 135 write(6,*)' bytes dim sec MB/s sec MB/s', 136 & ' sec MB/s' 137 call util_flush(6) 138 endif 139 call ga_sync() 140c 141 do loop = 1, num_chunks 142 bytes = util_mdtob(1)*chunk(loop) ! how much data is accessed 143 jump = n/(6000*loop) ! jump distance between consecutive patches 144 if(loop.eq.num_chunks)jump=0 145c 146c everybody touches own data 147 call ga_fill_patch(g_a, 1, n, 1, 1 , 1d0*me*loop) 148 if (me .eq. 0) then 149 tg=time_get1(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count, 150 $ local) 151 else 152 call util_sleep(1) 153 endif 154c 155c everybody touches own data 156 call ga_fill_patch(g_a, 1, n, 1, 1 , 1d0*me*loop) 157 if (me .eq. 0) then 158 tp=time_put1(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count, 159 $ local) 160 else 161 call util_sleep(1) 162 endif 163c 164c everybody touches own data 165 call ga_fill_patch(g_a, 1, n, 1, 1 , 1d0*me*loop) 166 if (me .eq. 0) then 167 ta=time_acc1(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count, 168 $ local) 169 else 170 call util_sleep(1) 171 endif 172c 173 if (me .eq. 0) then 174 write(6,77)bytes, chunk(loop), tg, 175 & 1d-6*bytes/tg,tp, 1d-6*bytes/tp, ta, 1d-6*bytes/ta 176 call util_flush(6) 177 endif 178 enddo 179c 18077 format(i7, i7,1x, 3(1x,d8.3,1x,d8.3)) 181 end 182 183 184 185 double precision function 186 & time_acc1(g_a, is, ie, js, je, buf, chunk, jump, count, local) 187c 188 implicit none 189#include "global.fh" 190#include "testutil.fh" 191c 192 integer g_a, chunk, jump, count, is, js, ie, je 193 logical local 194 integer rows, indx, shifti(3) 195c 196 integer ilo, ihi, jlo, jhi 197 double precision seconds, buf(*) 198c 199 count = 0 200 rows = ie - is + 1 201 shifti(1) = rows 202 shifti(2) = 2*rows 203 shifti(3) = 3*rows 204 jlo = js 205 jhi = je 206 207 seconds = util_timer() 208c 209c distance between consecutive patches increased by jump 210c to destroy locality of reference 211 do ilo = is, ie -chunk-jump +1, chunk+jump 212 ihi = ilo + chunk -1 213 count = count + 1 214 if (local) then 215 call ga_acc(g_a, ilo, ihi, jlo, jhi, buf, chunk, 1d0) 216 else 217 indx = Mod(count,3) + 1 218 call ga_acc(g_a, ilo+shifti(indx), ihi+shifti(indx), 219 $ jlo, jhi, buf, chunk, 1d0) 220 endif 221 enddo 222 seconds = util_timer() - seconds 223c 224 time_acc1 = seconds/count 225 end 226 227 228 double precision function 229 & time_get1(g_a, is, ie, js, je, buf, chunk, jump, count, local) 230c 231 implicit none 232#include "global.fh" 233#include "testutil.fh" 234c 235 integer g_a, chunk, jump, count, is, js, ie, je 236 integer rows, indx, shifti(3) 237 logical local 238c 239 integer ilo, ihi, jlo, jhi 240 double precision seconds, buf(*) 241c 242 count = 0 243 rows = ie - is + 1 244 shifti(1) = 3*rows 245 shifti(2) = 2*rows 246 shifti(3) = rows 247 jlo = js 248 jhi = je 249 250 seconds = util_timer() 251c 252c distance between consecutive patches increased by jump 253c to destroy locality of reference 254 do ilo = is, ie -chunk-jump +1, chunk+jump 255 ihi = ilo + chunk -1 256 count = count + 1 257 if (local) then 258 call ga_get(g_a, ilo, ihi, jlo, jhi, buf, chunk) 259 else 260 indx = Mod(count,3) + 1 261 call ga_get(g_a, ilo+shifti(indx), ihi+shifti(indx), 262 $ jlo, jhi, buf, chunk) 263 endif 264 enddo 265 seconds = util_timer() - seconds 266c 267 time_get1 = seconds/count 268 end 269 270 271 272 double precision function 273 & time_put1(g_a, is, ie, js, je, buf, chunk, jump, count, local) 274c 275 implicit none 276#include "global.fh" 277#include "testutil.fh" 278c 279 integer g_a, chunk, jump, count, is, js, ie, je 280 integer rows, indx, shifti(3), shiftj(3) 281 logical local 282c 283 integer ilo, ihi, jlo, jhi 284 double precision seconds, buf(*) 285c 286 count = 0 287 rows = ie - is + 1 288 shifti(1) = rows 289 shifti(2) = 2*rows 290 shifti(3) = 3*rows 291 jlo = js 292 jhi = je 293 294 seconds = util_timer() 295c 296c distance between consecutive patches increased by jump 297c to destroy locality of reference 298 do ilo = is, ie -chunk-jump +1, chunk+jump 299 ihi = ilo + chunk -1 300 count = count + 1 301 if (local) then 302 call ga_put(g_a, ilo, ihi, jlo, jhi, buf, chunk) 303 else 304 indx = Mod(count,3) + 1 305 call ga_put(g_a, ilo+shifti(indx), ihi+shifti(indx), 306 $ jlo, jhi, buf, chunk) 307 endif 308 enddo 309 seconds = util_timer() - seconds 310c 311 time_put1 = seconds/count 312 end 313 314 315 316c 317c test for square patches 318c 319 subroutine test2D() 320 implicit none 321#include "mafdecls.fh" 322#include "global.fh" 323c 324 integer n, nn, num_chunks 325 parameter (n = 1024, nn = n*n/4, num_chunks=16) 326 double precision buf(nn) 327c 328 integer g_a 329 integer ilo, ihi, jlo, jhi 330 integer nproc, me, loop 331 integer chunk(num_chunks) 332 data chunk /1,3,4,9,16,24,30,48,64,91,128,171,256,353,440,512/ 333c 334 nproc = ga_nnodes() 335 me = ga_nodeid() 336c 337c*** Create global array 338 if (.not. ga_create(MT_DBL, n, n, 'a', 0, 0, g_a)) 339 $ call ga_error(' ga_create failed ',1) 340c 341 do loop=1,nn 342 buf(loop) = .01d0 343 enddo 344 call ga_zero(g_a) 345c 346c if (me .eq. 0) then 347c write(*,*)' ' 348c write(*,55)n,n 349c55 format(' Performance of GA get, put & acc', 350c $ ' for square sections of array[',i4,',',i4,']') 351c print *,' ' 352c endif 353c 354c do loop=1,2 355c 356c*** local ops 357c 358 call ga_distribution(g_a, me, ilo, ihi, jlo, jhi) 359 call TestPutGetAcc 360 & (g_a, n, chunk, num_chunks, buf, ilo, ihi, jlo, jhi, .true.) 361c 362c*** remote ops 363c 364 call TestPutGetAcc 365 & (g_a, n, chunk, num_chunks, buf, ilo, ihi, jlo, jhi,.false.) 366 367c enddo 368 if (.not. ga_destroy(g_a)) 369 $ call ga_error(' ga_destroy failed ',1) 370 end 371 372 373 subroutine TestPutGetAcc 374 & (g_a, n, chunk, num_chunks, buf, ilo, ihi, jlo,jhi, local) 375 implicit none 376#include "global.fh" 377#include "testutil.fh" 378c 379 integer num_chunks, chunk(num_chunks) 380 integer n, ilo, ihi, jlo,jhi,g_a 381 double precision buf(*), tg, tp, ta 382 double precision time_acc, time_get, time_put 383 logical local 384c 385 integer me 386 integer loop, jump, count, bytes 387c 388 me = ga_nodeid() 389 if (me .eq. 0) then 390 if(local) then 391 write(6,'(26X, 25HLocal 2-D Array Section )') 392 else 393 write(6,'(26X, 25HRemote 2-D Array Section )') 394 endif 395 396 write(6,*)' section get put', 397 & ' accumulate' 398 write(6,*)' bytes dim sec MB/s sec MB/s', 399 & ' sec MB/s' 400 call util_flush(6) 401 endif 402 call ga_sync() 403c 404 do loop = 1, num_chunks 405 bytes = util_mdtob(1)*chunk(loop)*chunk(loop) !how much data is accessed 406 jump = n/(60*loop) ! jump distance between consecutive patches 407 if(loop.eq.num_chunks)jump=0 408c 409c everybody touches own data 410 call ga_fill_patch(g_a, 1, n, 1, n , 1d0*me*loop) 411 if (me .eq. 0) then 412 tg=time_get(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count, 413 $ local) 414 else 415 call util_sleep(1) 416 endif 417c 418c everybody touches own data 419 call ga_fill_patch(g_a, 1, n, 1, n , 1d0*me*loop) 420 if (me .eq. 0) then 421 tp=time_put(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count, 422 $ local) 423 else 424 call util_sleep(1) 425 endif 426c 427c everybody touches own data 428 call ga_fill_patch(g_a, 1, n, 1, n , 1d0*me*loop) 429 if (me .eq. 0) then 430 ta=time_acc(g_a,ilo,ihi,jlo,jhi,buf,chunk(loop),jump,count, 431 $ local) 432 else 433 call util_sleep(1) 434 endif 435c 436 if (me .eq. 0) then 437 write(6,77)bytes, chunk(loop), tg, 438 & 1d-6*bytes/tg,tp, 1d-6*bytes/tp, ta, 1d-6*bytes/ta 439 call util_flush(6) 440 endif 441 enddo 442c 44377 format(i7, i7,1x, 3(1x,d8.3,1x,d8.3)) 444 end 445 446 447 448 double precision function 449 & time_acc(g_a, is, ie, js, je, buf, chunk, jump, count, local) 450c 451 implicit none 452#include "global.fh" 453#include "testutil.fh" 454c 455 integer g_a, chunk, jump, count, is, js, ie, je 456 logical local 457 integer rows, cols, indx, shifti(3), shiftj(3) 458c 459 integer ilo, ihi, jlo, jhi 460 double precision seconds, buf(*) 461c 462 count = 0 463 rows = ie - is + 1 464 cols = je - js + 1 465 shifti(1) = rows 466 shifti(2) = 0 467 shifti(3) = rows 468 shiftj(1) = 0 469 shiftj(2) = cols 470 shiftj(3) = cols 471 472 seconds = util_timer() 473c 474c distance between consecutive patches increased by jump 475c to destroy locality of reference 476 do ilo = is, ie -chunk-jump +1, chunk+jump 477 ihi = ilo + chunk -1 478 do jlo = js, je -chunk-jump +1, chunk+jump 479 jhi = jlo + chunk -1 480 count = count + 1 481 if (local) then 482 call ga_acc(g_a, ilo, ihi, jlo, jhi, buf, chunk, 1d0) 483 else 484 indx = Mod(count,3) + 1 485 call ga_acc(g_a, ilo+shifti(indx), ihi+shifti(indx), 486 $ jlo+shiftj(indx), jhi+shiftj(indx), 487 $ buf, chunk, 1d0) 488 endif 489 enddo 490 enddo 491 seconds = util_timer() - seconds 492c 493 time_acc = seconds/count 494 end 495 496 497 double precision function 498 & time_get(g_a, is, ie, js, je, buf, chunk, jump, count, local) 499c 500 implicit none 501#include "global.fh" 502#include "testutil.fh" 503c 504 integer g_a, chunk, jump, count, is, js, ie, je 505 integer rows, cols, indx, shifti(3), shiftj(3) 506 logical local 507c 508 integer ilo, ihi, jlo, jhi 509 double precision seconds, buf(*) 510c 511 count = 0 512 rows = ie - is + 1 513 cols = je - js + 1 514 shifti(1) = rows 515 shifti(2) = 0 516 shifti(3) = rows 517 shiftj(1) = 0 518 shiftj(2) = cols 519 shiftj(3) = cols 520 521 seconds = util_timer() 522c 523c distance between consecutive patches increased by jump 524c to destroy locality of reference 525 do ilo = is, ie -chunk-jump +1, chunk+jump 526 ihi = ilo + chunk -1 527 do jlo = js, je -chunk-jump +1, chunk+jump 528 jhi = jlo + chunk -1 529 count = count + 1 530 if (local) then 531 call ga_get(g_a, ilo, ihi, jlo, jhi, buf, chunk) 532 else 533 indx = Mod(count,3) + 1 534 call ga_get(g_a, ilo+shifti(indx), ihi+shifti(indx), 535 $ jlo+shiftj(indx), jhi+shiftj(indx), 536 $ buf, chunk) 537 endif 538 enddo 539 enddo 540 seconds = util_timer() - seconds 541c 542 time_get = seconds/count 543 end 544 545 546 547 double precision function 548 & time_put(g_a, is, ie, js, je, buf, chunk, jump, count, local) 549c 550 implicit none 551#include "global.fh" 552#include "testutil.fh" 553c 554 integer g_a, chunk, jump, count, is, js, ie, je 555 integer rows, cols, indx, shifti(3), shiftj(3) 556 logical local 557c 558 integer ilo, ihi, jlo, jhi 559 double precision seconds, buf(*) 560c 561 count = 0 562 rows = ie - is + 1 563 cols = je - js + 1 564 shifti(1) = rows 565 shifti(2) = 0 566 shifti(3) = rows 567 shiftj(1) = 0 568 shiftj(2) = cols 569 shiftj(3) = cols 570 571 seconds = util_timer() 572c 573c distance between consecutive patches increased by jump 574c to destroy locality of reference 575 do ilo = is, ie -chunk-jump +1, chunk+jump 576 ihi = ilo + chunk -1 577 do jlo = js, je -chunk-jump +1, chunk+jump 578 jhi = jlo + chunk -1 579 count = count + 1 580 if (local) then 581 call ga_put(g_a, ilo, ihi, jlo, jhi, buf, chunk) 582 else 583 indx = Mod(count,3) + 1 584 call ga_put(g_a, ilo+shifti(indx), ihi+shifti(indx), 585 $ jlo+shiftj(indx), jhi+shiftj(indx), 586 $ buf, chunk) 587 endif 588 enddo 589 enddo 590 seconds = util_timer() - seconds 591c 592 time_put = seconds/count 593 end 594 595 596