1! 2! test program for LIBXSMM function calls 3! 4! uses SPECFEM3D_GLOBE routine compute_forces_crust_mantle_Dev() with dummy example 5! 6 7! we switch between vectorized and non-vectorized version by using pre-processor flag FORCE_VECTORIZATION 8! and macros INDEX_IJK, DO_LOOP_IJK, ENDDO_LOOP_IJK defined in config.fh 9#include "config.fh" 10 11!------------------------------------------------------------------- 12! 13! modules 14! 15!------------------------------------------------------------------- 16 17module my_libxsmm 18 19 use libxsmm !,only: LIBXSMM_SMMFUNCTION,libxsmm_dispatch,libxsmm_mmcall,libxsmm_init,libxsmm_finalize 20 21 implicit none 22 23 ! function pointers 24 ! (note: defined for single precision, thus needs CUSTOM_REAL to be SIZE_REAL) 25 type(LIBXSMM_SMMFUNCTION) :: xmm1, xmm2, xmm3 26 27 ! prefetch versions 28 type(LIBXSMM_SMMFUNCTION) :: xmm1p, xmm2p, xmm3p 29 30 logical :: USE_XSMM_FUNCTION,USE_XSMM_FUNCTION_PREFETCH 31 32end module my_libxsmm 33 34! 35!------------------------------------------------------------------- 36! 37 38module constants 39 40 implicit none 41 42 integer, parameter :: SIZE_REAL = 4, SIZE_DOUBLE = 8 43 integer, parameter :: CUSTOM_REAL = SIZE_REAL 44 45 integer, parameter :: ISTANDARD_OUTPUT = 6 46 integer, parameter :: IMAIN = ISTANDARD_OUTPUT 47 48 ! number of GLL points in each direction of an element (degree plus one) 49 integer, parameter :: NGLLX = 5 50 integer, parameter :: NGLLY = NGLLX 51 integer, parameter :: NGLLZ = NGLLX 52 integer, parameter :: NGLLCUBE = NGLLX * NGLLY * NGLLZ 53 54 ! Deville routines optimized for NGLLX = NGLLY = NGLLZ = 5 55 integer, parameter :: m1 = NGLLX, m2 = NGLLX * NGLLY 56 57 ! 3-D simulation 58 integer, parameter :: NDIM = 3 59 60 ! some useful constants 61 double precision, parameter :: PI = 3.141592653589793d0 62 63 integer, parameter :: IFLAG_IN_FICTITIOUS_CUBE = 11 64 65end module constants 66 67! 68!------------------------------------------------------------------- 69! 70 71module specfem_par 72 73! main parameter module for specfem simulations 74 75 use constants 76 use libxsmm,only: LIBXSMM_ALIGNMENT 77 78 implicit none 79 80 !------------------------------------------------ 81 82 ! number of spectral elements in x/y/z-directions 83 integer,parameter :: NEX = 40 84 integer,parameter :: NEY = 40 85 integer,parameter :: NEZ = 25 86 87 !------------------------------------------------ 88 89 ! MPI rank (dummy, no MPI for this test needed) 90 integer :: myrank 91 92 ! array with derivatives of Lagrange polynomials and precalculated products 93 real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx 94 real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xxT,hprimewgll_xxT 95 96 real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy 97 real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz 98 real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz 99 100 ! arrays for Deville and force_vectorization 101 real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: wgllwgll_xy_3D,wgllwgll_xz_3D,wgllwgll_yz_3D 102 103 ! mesh parameters 104 ! number of spectral elements 105 integer :: NSPEC 106 107 ! number of global nodes 108 integer :: NGLOB 109 110 ! local to global indexing 111 integer, dimension(:,:,:,:),allocatable :: ibool 112 113 ! displacement, velocity, acceleration 114 real(kind=CUSTOM_REAL), dimension(:,:),allocatable :: displ,accel 115 116 ! for verification 117 real(kind=CUSTOM_REAL), dimension(:,:),allocatable :: accel_default 118 119 !slow-down: please don't use unless you're sure... !dir$ ATTRIBUTES align:LIBXSMM_ALIGNMENT :: displ,accel,ibool,accel_default 120 121 ! gravity 122 logical,parameter :: GRAVITY_VAL = .true. 123 124 ! optimized arrays 125 integer, dimension(:,:),allocatable :: ibool_inv_tbl 126 integer, dimension(:,:),allocatable :: ibool_inv_st 127 integer, dimension(:,:),allocatable :: phase_iglob 128 integer, dimension(2) :: num_globs 129 130 ! work array with contributions 131 real(kind=CUSTOM_REAL), dimension(:,:,:,:,:),allocatable :: sum_terms 132 133 ! inner / outer elements crust/mantle region 134 integer :: num_phase_ispec 135 integer :: nspec_inner,nspec_outer 136 integer, dimension(:,:), allocatable :: phase_ispec_inner 137 integer :: iphase 138 139end module specfem_par 140 141 142 143!------------------------------------------------------------------- 144! 145! main program 146! 147!------------------------------------------------------------------- 148 149program test 150 151 use specfem_par 152 use my_libxsmm 153 154 implicit none 155 156 ! timing 157 double precision :: duration,duration_default 158 integer(kind=8) :: start 159 160 ! verification 161 real :: diff 162 integer, dimension(2) :: iloc_max 163 logical, parameter :: DEBUG_VERIFICATION = .false. 164 165 ! repetitions (time steps) 166 integer :: it 167 integer,parameter :: NSTEP = 20 ! should be > NSTEP_JITTER because of average timing 168 integer,parameter :: NSTEP_JITTER = 5 ! skip first few steps when timing the kernels (the first steps exhibit runtime jitter) 169 170 ! different versions 171 integer,parameter :: num_versions = 5 172 character(len=20) :: str_version(num_versions) = (/character(len=20) :: & 173 "Deville loops", & 174 "unrolled loops", & 175 "LIBXSMM dispatch" , & 176 "LIBXSMM prefetch", & 177 "LIBXSMM static" & 178 /) 179 double precision :: avg_time(num_versions) 180 integer :: iversion 181 182 print *,'--------------------------------------' 183 print *,'specfem example' 184 print *,'--------------------------------------' 185 print * 186 187 ! creates test mesh 188 call setup_mesh() 189 190 ! prepares arrays for time iteration loop 191 call prepare_timerun() 192 193 ! OpenMP output info 194 call prepare_openmp() 195 196 ! prepares libxsmm functions 197 call prepare_xsmm() 198 199 ! timing averages 200 avg_time(:) = 0.d0 201 iphase = 2 202 203 do it = 1,NSTEP 204 205 print * 206 print *,'step ',it 207 208 do iversion = 1,num_versions 209 ! initializes 210 accel(:,:) = 0._CUSTOM_REAL 211 212 ! timing 213 start = libxsmm_timer_tick() 214 215 ! computes forces 216 select case (iversion) 217 case (1) 218 ! Deville loops 219 call compute_forces_Dev() 220 221 case (2) 222 ! unrolled loops 223 call compute_forces_noDev() 224 225 case (3) 226 ! LIBXSMM with dispatch functions 227 if (USE_XSMM_FUNCTION) then 228 call compute_forces_with_xsmm() 229 else 230 cycle 231 endif 232 233 case (4) 234 ! LIBXSMM with prefetch function calls 235 if (USE_XSMM_FUNCTION_PREFETCH) then 236 call compute_forces_with_xsmm_prefetch() 237 else 238 cycle 239 endif 240 241 case (5) 242 ! LIBXSMM with static function calls 243 call compute_forces_with_xsmm_static() 244 245 end select 246 247 ! timing 248 duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) 249 if (iversion == 1) duration_default = duration 250 251 ! average time 252 if (it > NSTEP_JITTER) avg_time(iversion) = avg_time(iversion) + duration 253 254 ! for verification 255 if (iversion == 1) then 256 accel_default(:,:) = accel(:,:) 257 endif 258 diff = maxval(abs(accel(:,:) - accel_default(:,:))) 259 260 ! user output 261 if (iversion == 1) then 262 write(*,'(a30,a,f8.4,a)') 'duration with '//str_version(iversion),' = ',sngl(duration),' (s)' 263 else 264 write(*,'(a30,a,f8.4,a,f8.2,a,e12.4)') 'duration with '//str_version(iversion),' = ', & 265 sngl(duration),' (s) / speedup = ', & 266 sngl(100.0 * (duration_default-duration)/duration_default),' % / maximum diff = ',diff 267 endif 268 269 ! check 270 if (DEBUG_VERIFICATION) then 271 iloc_max = maxloc(abs(accel(:,:) - accel_default(:,:))) 272 print *,'verification: max diff = ',diff 273 print *,' iglob loc = ',iloc_max(1),iloc_max(2) 274 print *,'maximum difference: #current vs. #default value' 275 print *,' ',accel(1,iloc_max(2)),accel_default(1,iloc_max(2)) 276 print *,' ',accel(2,iloc_max(2)),accel_default(2,iloc_max(2)) 277 print *,' ',accel(3,iloc_max(2)),accel_default(3,iloc_max(2)) 278 print *,'min/max accel values = ',minval(accel(:,:)),maxval(accel(:,:)) 279 print * 280 endif 281 282 enddo ! iversion 283 enddo ! it 284 285 ! average timing (avoiding the first 5 steps which fluctuate quite a bit...) 286 avg_time(:) = avg_time(:) / dble(NSTEP - NSTEP_JITTER) 287 288 print * 289 print *,'==============================================' 290 print *,'average over ',NSTEP - NSTEP_JITTER,'repetitions' 291 write(*,'(a30,a,f8.4)') ' timing with '//str_version(1),' = ',avg_time(1) 292 do iversion = 2,num_versions 293 ! skip unused tests 294 if (iversion == 3 .and. .not. USE_XSMM_FUNCTION) cycle 295 if (iversion == 4 .and. .not. USE_XSMM_FUNCTION_PREFETCH) cycle 296 297 write(*,'(a30,a,f8.4,a,f8.2,a)') ' timing with '//str_version(iversion),' = ', & 298 avg_time(iversion),' / speedup = ', & 299 sngl(100.0 * (avg_time(1)-avg_time(iversion))/avg_time(1)),' %' 300 enddo 301 print *,'==============================================' 302 print * 303 304 ! frees memory 305 deallocate(displ,accel) 306 deallocate(accel_default,ibool) 307 deallocate(sum_terms) 308 deallocate(ibool_inv_st,ibool_inv_tbl,phase_iglob) 309 deallocate(phase_ispec_inner) 310 311 ! finalizes LIBXSMM 312 call libxsmm_finalize() 313 314end program test 315 316! 317!------------------------------------------------------------------- 318! 319 320 subroutine setup_mesh() 321 322 use constants 323 use specfem_par 324 325 implicit none 326 327 integer :: i1,i2 328 integer :: ix,iy,iz 329 integer :: i,j,k 330 integer :: iglob,ispec,inumber 331 332 integer, dimension(:), allocatable :: mask_ibool 333 integer, dimension(:,:,:,:), allocatable :: copy_ibool_ori 334 335 logical, dimension(:), allocatable :: mask_ibool_flag 336 337 ! total number of elements 338 NSPEC = NEX * NEY * NEZ 339 340 ! set up local to global numbering 341 allocate(ibool(NGLLX,NGLLY,NGLLZ,NSPEC)) 342 ibool(:,:,:,:) = 0 343 ispec = 0 344 iglob = 0 345 346 ! arranges a three-dimensional block, elements are collated side-by-side. mimicks a very simple unstructured grid. 347 do iz = 1,NEZ 348 do iy = 1,NEY 349 do ix = 1,NEX 350 ispec = ispec + 1 351 ! GLL point indexing 352 do k = 1,NGLLZ 353 do j = 1,NGLLY 354 do i = 1,NGLLX 355 ! set up local to global numbering 356 if ((i == 1) .and. (ix > 1)) then 357 ! previous element along x-direction 358 ibool(i,j,k,ispec) = ibool(NGLLX,j,k,ispec - 1) 359 else if ((j == 1) .and. (iy > 1)) then 360 ! previous element along y-direction 361 ibool(i,j,k,ispec) = ibool(i,NGLLY,k,ispec - NEX) 362 else if ((k == 1) .and. (iz > 1)) then 363 ! previous element along z-direction 364 ibool(i,j,k,ispec) = ibool(i,j,NGLLZ,ispec - NEX * NEY) 365 else 366 ! new point 367 iglob = iglob + 1 368 ibool(i,j,k,ispec) = iglob 369 endif 370 enddo 371 enddo 372 enddo 373 374 enddo ! NEX 375 enddo ! NEY 376 enddo ! NEZ 377 378 ! sets total numbers of nodes 379 NGLOB = iglob 380 381 print *,'mesh:' 382 print *,' total number of elements = ',NSPEC 383 print *,' total number of global nodes = ',NGLOB 384 !print *,' ibool min/max = ',minval(ibool),maxval(ibool) 385 386 ! checks 387 if (ispec /= NSPEC) stop 'Invalid ispec count' 388 if (minval(ibool(:,:,:,:)) < 1) stop 'Invalid ibool minimum value' 389 if (maxval(ibool(:,:,:,:)) > NSPEC * NGLLX * NGLLY * NGLLZ) stop 'Invalid ibool maximum value' 390 391 ! we can create a new indirect addressing to reduce cache misses 392 allocate(copy_ibool_ori(NGLLX,NGLLY,NGLLZ,NSPEC),mask_ibool(nglob)) 393 mask_ibool(:) = -1 394 copy_ibool_ori(:,:,:,:) = ibool(:,:,:,:) 395 inumber = 0 396 do ispec = 1,NSPEC 397 do k = 1,NGLLZ 398 do j = 1,NGLLY 399 do i = 1,NGLLX 400 if (mask_ibool(copy_ibool_ori(i,j,k,ispec)) == -1) then 401 ! creates a new point 402 inumber = inumber + 1 403 ibool(i,j,k,ispec) = inumber 404 mask_ibool(copy_ibool_ori(i,j,k,ispec)) = inumber 405 else 406 ! uses an existing point created previously 407 ibool(i,j,k,ispec) = mask_ibool(copy_ibool_ori(i,j,k,ispec)) 408 endif 409 enddo 410 enddo 411 enddo 412 enddo 413 if (inumber /= NGLOB) stop 'Invalid inumber count' 414 deallocate(copy_ibool_ori,mask_ibool) 415 416 ! define polynomial derivatives & weights 417 ! (dummy values) 418 do i1 = 1,NGLLX 419 do i2 = 1,NGLLX 420 hprime_xx(i2,i1) = i1 * 0.1 + i2 * 0.2 ! original: real(lagrange_deriv_GLL(i1-1,i2-1,xigll,NGLLX), kind=CUSTOM_REAL) 421 hprimewgll_xx(i2,i1) = hprime_xx(i2,i1) * (i2 * 1.0/NGLLX) ! real(lagrange_deriv_GLL(i1-1,i2-1,xigll,NGLLX)*wxgll(i2), kind=CUSTOM_REAL) 422 enddo 423 enddo 424 do i = 1,NGLLX 425 do j = 1,NGLLY 426 wgllwgll_xy(i,j) = (i * 1.0/NGLLX) * (j * 1.0/NGLLY) ! original: real(wxgll(i)*wygll(j), kind=CUSTOM_REAL) 427 enddo 428 enddo 429 do i = 1,NGLLX 430 do k = 1,NGLLZ 431 wgllwgll_xz(i,k) = (i * 1.0/NGLLX) * (k * 1.0/NGLLZ) ! original: real(wxgll(i)*wzgll(k), kind=CUSTOM_REAL) 432 enddo 433 enddo 434 do j = 1,NGLLY 435 do k = 1,NGLLZ 436 wgllwgll_yz(j,k) = (j * 1.0/NGLLY) * (k * 1.0/NGLLZ) ! original: real(wygll(j)*wzgll(k), kind=CUSTOM_REAL) 437 enddo 438 enddo 439 440 ! define a 3D extension in order to be able to force vectorization in the compute_forces_**_Dev routines 441 do k = 1,NGLLZ 442 do j = 1,NGLLY 443 do i = 1,NGLLX 444 wgllwgll_yz_3D(i,j,k) = wgllwgll_yz(j,k) 445 wgllwgll_xz_3D(i,j,k) = wgllwgll_xz(i,k) 446 wgllwgll_xy_3D(i,j,k) = wgllwgll_xy(i,j) 447 enddo 448 enddo 449 enddo 450 451 ! check that optimized routines from Deville et al. (2002) can be used 452 if (NGLLX /= 5 .or. NGLLY /= 5 .or. NGLLZ /= 5) & 453 stop 'Deville et al. (2002) routines can only be used if NGLLX = NGLLY = NGLLZ = 5' 454 455 ! define transpose of derivation matrix 456 do j = 1,NGLLX 457 do i = 1,NGLLX 458 hprime_xxT(j,i) = hprime_xx(i,j) 459 hprimewgll_xxT(j,i) = hprimewgll_xx(i,j) 460 enddo 461 enddo 462 463 ! displacement and acceleration (dummy fields) 464 allocate(displ(NDIM,NGLOB),accel(NDIM,NGLOB)) 465 accel(:,:) = 0._CUSTOM_REAL 466 467 ! sets initial dummy values (to avoid getting only zero multiplications later on) 468 if (.true.) then 469 ! arbitrary linear function 470 do iglob = 1,NGLOB 471 displ(:,iglob) = dble(iglob - 1) / dble(NGLOB - 1) 472 enddo 473 else 474 ! arbitrary sine function 475 allocate(mask_ibool_flag(NGLOB)) 476 mask_ibool_flag(:) = .false. 477 ispec = 0 478 do iz = 1,NEZ 479 do iy = 1,NEY 480 do ix = 1,NEX 481 ispec = ispec + 1 482 ! GLL point indexing 483 do k = 1,NGLLZ 484 do j = 1,NGLLY 485 do i = 1,NGLLX 486 iglob = ibool(i,j,k,ispec) 487 if (.not. mask_ibool_flag(iglob)) then 488 ! only assigns global value once 489 mask_ibool_flag(iglob) = .true. 490 displ(:,iglob) = sin(PI * dble(ix - 1) / dble(NEX - 1)) & 491 * sin(PI * dble(iy - 1) / dble(NEY - 1)) & 492 * sin(PI * dble(iz - 1) / dble(NEZ - 1)) 493 endif 494 enddo 495 enddo 496 enddo 497 enddo 498 enddo 499 enddo 500 deallocate(mask_ibool_flag) 501 endif 502 503 ! for verification 504 allocate(accel_default(NDIM,NGLOB)) 505 accel_default(:,:) = 0._CUSTOM_REAL 506 507 end subroutine setup_mesh 508 509! 510!------------------------------------------------------------------- 511! 512 513 subroutine prepare_timerun() 514 515 use specfem_par 516 517 implicit none 518 519 ! local parameters 520 integer :: num_elements,ispec,ier 521 522 ! setup inner/outer elements (single slice only, no outer elements for halo) 523 myrank = 0 524 ! no MPI over-lapping communication in this example 525 nspec_inner = NSPEC 526 nspec_outer = 0 527 num_phase_ispec = NSPEC 528 allocate(phase_ispec_inner(num_phase_ispec,2),stat=ier) 529 if (ier /= 0 ) call exit_mpi(myrank,'Error allocating array phase_ispec_inner_crust_mantle') 530 phase_ispec_inner(:,:) = 0 531 do ispec = 1,NSPEC 532 phase_ispec_inner(ispec,2) = ispec 533 enddo 534 535 ! from original routine prepare_timerun_ibool_inv_tbl() 536 537 ! note: we use allocate for sum_terms arrays rather than defining within subroutine compute_forces_**_Dev() itself 538 ! as it will crash when using OpenMP and operating systems with small stack sizes 539 ! e.g. see http://stackoverflow.com/questions/22649827/illegal-instruction-error-when-running-openmp-in-gfortran-mac 540 allocate(sum_terms(NDIM,NGLLX,NGLLY,NGLLZ,NSPEC),stat=ier) 541 if (ier /= 0) stop 'Error allocating sum_terms arrays' 542 sum_terms(:,:,:,:,:) = 0._CUSTOM_REAL 543 544 ! inverse table 545 ! this helps to speedup the assembly, especially with OpenMP (or on MIC) threading 546 ! allocating arrays 547 allocate(ibool_inv_tbl(NGLLX*NGLLY*NGLLZ*NSPEC,2),stat=ier) 548 if (ier /= 0) stop 'Error allocating ibool_inv_tbl arrays' 549 550 allocate(ibool_inv_st(NGLOB+1,2),stat=ier) 551 if (ier /= 0) stop 'Error allocating ibool_inv_st arrays' 552 553 allocate(phase_iglob(NGLOB,2),stat=ier) 554 if (ier /= 0) stop 'Error allocating phase_iglob arrays' 555 556 ! initializing 557 num_globs(:) = 0 558 ibool_inv_tbl(:,:) = 0 559 ibool_inv_st(:,:) = 0 560 phase_iglob(:,:) = 0 561 562 !---- make inv. table ---------------------- 563 ! loops over phases 564 ! (1 == outer elements / 2 == inner elements) 565 do iphase = 1,2 566 ! crust mantle 567 if (iphase == 1) then 568 ! outer elements (iphase=1) 569 num_elements = nspec_outer 570 else 571 ! inner elements (iphase=2) 572 num_elements = nspec_inner 573 endif 574 call make_inv_table(iphase,NGLOB,NSPEC, & 575 num_elements,phase_ispec_inner, & 576 ibool,phase_iglob, & 577 ibool_inv_tbl, ibool_inv_st, & 578 num_globs) 579 enddo 580 581 ! user output 582 if (myrank == 0) then 583 write(IMAIN,*) " inverse table of ibool done" 584 call flush_IMAIN() 585 endif 586 587 ! synchronizes processes 588 call synchronize_all() 589 590 contains 591 592 subroutine make_inv_table(iphase,nglob,nspec, & 593 phase_nspec,phase_ispec,ibool,phase_iglob, & 594 ibool_inv_tbl,ibool_inv_st,num_globs,idoubling) 595 596 implicit none 597 598 ! arguments 599 integer,intent(in) :: iphase 600 integer,intent(in) :: nglob 601 integer,intent(in) :: nspec 602 integer,intent(in) :: phase_nspec 603 integer, dimension(:,:),intent(in) :: phase_ispec 604 integer, dimension(:,:,:,:),intent(in) :: ibool 605 606 integer, dimension(:,:),intent(inout) :: phase_iglob 607 integer, dimension(:,:),intent(inout) :: ibool_inv_tbl 608 integer, dimension(:,:),intent(inout) :: ibool_inv_st 609 integer, dimension(:),intent(inout) :: num_globs 610 611 integer,dimension(:),optional :: idoubling 612 613 ! local parameters 614 integer, dimension(:), allocatable :: ibool_inv_num 615 integer, dimension(:,:), allocatable :: ibool_inv_tbl_tmp 616 integer :: num_alloc_ibool_inv_tbl,num_alloc_ibool_inv_tbl_theor 617 integer :: num_used_ibool_inv_tbl 618 integer :: ip, iglob, ispec_p, ispec, iglob_p, ier 619 integer :: inum 620#ifdef FORCE_VECTORIZATION 621 integer :: ijk 622#else 623 integer :: i,j,k 624#endif 625 logical :: is_inner_core 626 627 ! tolerance number of shared degrees per node 628 integer, parameter :: N_TOL = 20 629 630 ! checks if anything to do (e.g., no outer elements for single process simulations) 631 if (phase_nspec == 0) return 632 633 ! checks if inner core region 634 if (present(idoubling)) then 635 is_inner_core = .true. 636 else 637 is_inner_core = .false. 638 endif 639 640 ! allocates temporary arrays 641 allocate(ibool_inv_num(nglob),stat=ier) 642 if (ier /= 0) stop 'Error allocating ibool_inv_num array' 643 644 ! gets valence of global degrees of freedom for current phase (inner/outer) elements 645 ibool_inv_num(:) = 0 646 do ispec_p = 1,phase_nspec 647 ispec = phase_ispec(ispec_p,iphase) 648 649 ! exclude fictitious elements in central cube 650 if (is_inner_core) then 651 if (idoubling(ispec) == IFLAG_IN_FICTITIOUS_CUBE) cycle 652 endif 653 654 DO_LOOP_IJK 655 iglob = ibool(INDEX_IJK,ispec) 656 ! increases valence counter 657 ibool_inv_num(iglob) = ibool_inv_num(iglob) + 1 658 ENDDO_LOOP_IJK 659 enddo 660 661 ! gets maximum valence value 662 num_alloc_ibool_inv_tbl = maxval(ibool_inv_num(:)) 663 664 ! theoretical number of maximum shared degrees per node 665 num_alloc_ibool_inv_tbl_theor = N_TOL*(NGLLX*NGLLY*NGLLZ*nspec/nglob+1) 666 667 ! checks valence 668 if (num_alloc_ibool_inv_tbl < 1 .or. num_alloc_ibool_inv_tbl > num_alloc_ibool_inv_tbl_theor) then 669 print *,'Error invalid maximum valence:' 670 print *,'valence value = ',num_alloc_ibool_inv_tbl,' - theoretical maximum = ',num_alloc_ibool_inv_tbl_theor 671 stop 'Error invalid maximum valence value' 672 endif 673 ! debug 674 !print *,myrank,'maximum shared degrees theoretical = ',num_alloc_ibool_inv_tbl_theor ! regional_Greece_small example: 40 675 !print *,myrank,'maximum shared degrees from array = ',maxval(ibool_inv_num(:)) ! regional_Greece_small example: 8 and 16 676 677 allocate(ibool_inv_tbl_tmp(num_alloc_ibool_inv_tbl,nglob),stat=ier) 678 if (ier /= 0) stop 'Error allocating ibool_inv_tbl_tmp array' 679 680 !---- make temporary array of inv. table : ibool_inv_tbl_tmp 681 ibool_inv_tbl_tmp(:,:) = 0 682 ibool_inv_num(:) = 0 683 do ispec_p = 1,phase_nspec 684 ispec = phase_ispec(ispec_p,iphase) 685 686 ! exclude fictitious elements in central cube 687 if (is_inner_core) then 688 if (idoubling(ispec) == IFLAG_IN_FICTITIOUS_CUBE) cycle 689 endif 690 691 DO_LOOP_IJK 692 693 iglob = ibool(INDEX_IJK,ispec) 694 695 ! increases counter 696 ibool_inv_num(iglob) = ibool_inv_num(iglob) + 1 697 698 ! inverse table 699 ! sets 1D index of local GLL point (between 1 and NGLLCUBE) 700#ifdef FORCE_VECTORIZATION 701 inum = ijk 702#else 703 inum = i + (j-1)*NGLLY + (k-1)*NGLLY*NGLLZ 704#endif 705 ! sets 1D index in local ibool array 706 ibool_inv_tbl_tmp(ibool_inv_num(iglob),iglob) = inum + NGLLX*NGLLY*NGLLZ*(ispec-1) 707 708 ENDDO_LOOP_IJK 709 710 enddo 711 712 !---- packing : ibool_inv_tbl_tmp -> ibool_inv_tbl 713 ip = 0 714 iglob_p = 0 715 num_used_ibool_inv_tbl = 0 716 do iglob = 1, nglob 717 if (ibool_inv_num(iglob) /= 0) then 718 iglob_p = iglob_p + 1 719 720 phase_iglob(iglob_p,iphase) = iglob 721 722 ! sets start index of table entry for this global node 723 ibool_inv_st(iglob_p,iphase) = ip + 1 724 725 ! sets maximum of used valence 726 if (ibool_inv_num(iglob) > num_used_ibool_inv_tbl) num_used_ibool_inv_tbl = ibool_inv_num(iglob) 727 728 ! loops over valence 729 do inum = 1, ibool_inv_num(iglob) 730 ! increases total counter 731 ip = ip + 1 732 ! maps local 1D index in ibool array 733 ibool_inv_tbl(ip,iphase) = ibool_inv_tbl_tmp(inum,iglob) 734 enddo 735 endif 736 enddo 737 ! sets last entry in start index table 738 ibool_inv_st(iglob_p+1,iphase) = ip + 1 739 740 ! total number global nodes in this phase (inner/outer) 741 num_globs(iphase) = iglob_p 742 743 ! checks 744 if (num_used_ibool_inv_tbl > num_alloc_ibool_inv_tbl) then 745 print *,"Error invalid inverse table setting:" 746 print *," num_alloc_ibool_inv_tbl = ",num_alloc_ibool_inv_tbl 747 print *," num_used_ibool_inv_tbl = ",num_used_ibool_inv_tbl 748 print *,"invalid value encountered: num_used_ibool_inv_tbl > num_alloc_ibool_inv_tbl" 749 print *,"#### Program exits... ##########" 750 call exit_MPI(myrank,'Error making inverse table for optimized arrays') 751 endif 752 753 ! debug 754 !if (myrank == 0) then 755 ! print *,'ibool_inv_tbl: ' 756 ! do iglob_p = 1,200 757 ! print *,' ',iglob_p,'table = ',(ibool_inv_tbl(ip,iphase), & 758 ! ip = ibool_inv_st(iglob_p,iphase),ibool_inv_st(iglob_p+1,iphase)-1) 759 ! enddo 760 !endif 761 762 ! frees memory 763 deallocate(ibool_inv_num) 764 deallocate(ibool_inv_tbl_tmp) 765 766 end subroutine make_inv_table 767 768 end subroutine prepare_timerun 769 770! 771!------------------------------------------------------------------- 772! 773 774 subroutine prepare_openmp() 775 776! outputs OpenMP support info 777 778#ifdef USE_OPENMP 779 use specfem_par,only: myrank,IMAIN 780#endif 781 782 implicit none 783 784#ifdef USE_OPENMP 785 ! local parameters 786 integer :: thread_id,num_threads 787 integer :: num_procs,max_threads 788 logical :: is_dynamic,is_nested 789 ! OpenMP functions 790 integer,external :: OMP_GET_NUM_THREADS,OMP_GET_THREAD_NUM 791 integer,external :: OMP_GET_NUM_PROCS,OMP_GET_MAX_THREADS 792 logical,external :: OMP_GET_DYNAMIC,OMP_GET_NESTED 793 794 ! OpenMP only supported for Deville routine 795 796!$OMP PARALLEL DEFAULT(NONE) & 797!$OMP SHARED(myrank) & 798!$OMP PRIVATE(thread_id,num_threads,num_procs,max_threads,is_dynamic,is_nested) 799 ! gets thread number 800 thread_id = OMP_GET_THREAD_NUM() 801 802 ! gets total number of threads for this MPI process 803 num_threads = OMP_GET_NUM_THREADS() 804 805 ! OpenMP master thread only 806 if (thread_id == 0) then 807 ! gets additional environment info 808 num_procs = OMP_GET_NUM_PROCS() 809 max_threads = OMP_GET_MAX_THREADS() 810 is_dynamic = OMP_GET_DYNAMIC() 811 is_nested = OMP_GET_NESTED() 812 813 ! user output 814 if (myrank == 0) then 815 write(IMAIN,*) '' 816 write(IMAIN,*) 'OpenMP information:' 817 write(IMAIN,*) ' number of threads = ', num_threads 818 write(IMAIN,*) '' 819 write(IMAIN,*) ' number of processors available = ', num_procs 820 write(IMAIN,*) ' maximum number of threads available = ', num_procs 821 write(IMAIN,*) ' dynamic thread adjustement = ', is_dynamic 822 write(IMAIN,*) ' nested parallelism = ', is_nested 823 write(IMAIN,*) '' 824 call flush_IMAIN() 825 endif 826 endif 827!$OMP END PARALLEL 828#else 829 ! nothing to do.. 830 return 831#endif 832 833 end subroutine prepare_openmp 834 835! 836!------------------------------------------------------------------- 837! 838 839 subroutine prepare_xsmm() 840 841 use constants,only: CUSTOM_REAL,SIZE_DOUBLE,m1,m2,IMAIN 842 843 use specfem_par,only: myrank 844 845 use my_libxsmm,only: libxsmm_init,libxsmm_dispatch,libxsmm_available,xmm1,xmm2,xmm3,USE_XSMM_FUNCTION 846 ! prefetch versions 847 use my_libxsmm,only: xmm1p,xmm2p,xmm3p,LIBXSMM_PREFETCH,USE_XSMM_FUNCTION_PREFETCH 848 849 implicit none 850 851 ! quick check 852 if (m1 /= 5) stop 'LibXSMM with invalid m1 constant (must have m1 == 5)' 853 if (m2 /= 5*5) stop 'LibXSMM with invalid m2 constant (must have m2 == 5*5)' 854 if (CUSTOM_REAL == SIZE_DOUBLE) stop 'LibXSMM optimization only for single precision functions' 855 856 ! initializes LIBXSMM 857 call libxsmm_init() 858 859 ! dispatch functions for matrix multiplications 860 ! (see in compute_forces_**Dev.F90 routines for actual function call) 861 ! example: a(n1,n2),b(n2,n3),c(n1,n3) -> c = a * b then libxsmm_dispatch(xmm,m=n1,n=n3,k=n2,alpha=1,beta=0) 862 863 ! with A(n1,n2) 5x5-matrix, B(n2,n3) 5x25-matrix and C(n1,n3) 5x25-matrix 864 call libxsmm_dispatch(xmm1, m=5, n=25, k=5, alpha=1.0_CUSTOM_REAL, beta=0.0_CUSTOM_REAL) 865 866 ! with A(n1,n2) 25x5-matrix, B(n2,n3) 5x5-matrix and C(n1,n3) 25x5-matrix 867 call libxsmm_dispatch(xmm2, m=25, n=5, k=5, alpha=1.0_CUSTOM_REAL, beta=0.0_CUSTOM_REAL) 868 869 ! with A(n1,n2,n4) 5x5x5-matrix, B(n2,n3) 5x5-matrix and C(n1,n3,n4) 5x5x5-matrix 870 call libxsmm_dispatch(xmm3, m=5, n=5, k=5, alpha=1.0_CUSTOM_REAL, beta=0.0_CUSTOM_REAL) 871 872 !directly: call libxsmm_smm_5_5_5(A,B,C) 873 if (libxsmm_available(xmm1) .and. libxsmm_available(xmm2) .and. libxsmm_available(xmm3)) then 874 USE_XSMM_FUNCTION = .true. 875 ! user output 876 if (myrank == 0) then 877 write(IMAIN,*) 878 write(IMAIN,*) "LIBXSMM dispatch functions ready for small matrix-matrix multiplications" 879 call flush_IMAIN() 880 endif 881 else 882 USE_XSMM_FUNCTION = .false. 883 print *,'LIBXSMM invalid dispatch function pointers:', & 884 libxsmm_available(xmm1),libxsmm_available(xmm2),libxsmm_available(xmm3) 885 ! hard stop 886 !call exit_MPI(myrank,'LIBXSMM functions not ready, please check configuration & compilation') 887 endif 888 889 ! synchronizes processes 890 call synchronize_all() 891 892 ! prefetch versions 893 call libxsmm_dispatch(xmm1p, m=5, n=25, k=5, alpha=1.0_CUSTOM_REAL, beta=0.0_CUSTOM_REAL,prefetch=LIBXSMM_PREFETCH) 894 call libxsmm_dispatch(xmm2p, m=25, n=5, k=5, alpha=1.0_CUSTOM_REAL, beta=0.0_CUSTOM_REAL,prefetch=LIBXSMM_PREFETCH) 895 call libxsmm_dispatch(xmm3p, m=5, n=5, k=5, alpha=1.0_CUSTOM_REAL, beta=0.0_CUSTOM_REAL,prefetch=LIBXSMM_PREFETCH) 896 897 if (libxsmm_available(xmm1p) .and. libxsmm_available(xmm2p) .and. libxsmm_available(xmm3p)) then 898 USE_XSMM_FUNCTION_PREFETCH = .true. 899 ! user output 900 if (myrank == 0) then 901 write(IMAIN,*) "LIBXSMM prefetch functions ready for small matrix-matrix multiplications" 902 write(IMAIN,*) 903 call flush_IMAIN() 904 endif 905 else 906 USE_XSMM_FUNCTION_PREFETCH = .false. 907 print *,'LIBXSMM invalid prefetch function pointers:', & 908 libxsmm_available(xmm1p),libxsmm_available(xmm2p),libxsmm_available(xmm3p) 909 ! hard stop 910 !call exit_MPI(myrank,'LIBXSMM prefetch functions not ready, please check configuration & compilation') 911 endif 912 913 ! force no dispatch 914 !USE_XSMM_FUNCTION = .false. 915 !USE_XSMM_FUNCTION_PREFETCH = .false. 916 917 end subroutine prepare_xsmm 918 919 920 921!------------------------------------------------------------------- 922! 923! dummy routines 924! 925!------------------------------------------------------------------- 926 927 928 929 subroutine compute_element_dummy(ispec,ibool,tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, & 930 dummyx_loc,dummyy_loc,dummyz_loc,rho_s_H) 931 932! dummy example (original: isotropic element in crust/mantle region) 933! 934! it is mostly used to avoid over-simplification of the compute_forces routine: if we omit it, then compilers can do 935! much more aggressive optimizations and the timing results would be misleading. the original routines for computing 936! stresses on elements are more expensive and complicated. the dummy here will be much faster to compute, but should 937! give similar relative performance results 938 939 use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM 940#ifdef FORCE_VECTORIZATION 941 use constants,only: NGLLCUBE 942#endif 943 use specfem_par,only: NSPEC,GRAVITY_VAL 944 implicit none 945 946 ! element id 947 integer,intent(in) :: ispec 948 949 ! arrays with mesh parameters per slice 950 integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC),intent(in) :: ibool 951 952 ! element info 953 real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ),intent(inout) :: & 954 tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3 955 956 real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ),intent(in) :: dummyx_loc,dummyy_loc,dummyz_loc 957 958 real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NDIM),intent(out) :: rho_s_H 959 960 ! local parameters 961 real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: sigma_xx,sigma_yy,sigma_zz 962 real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy 963 real(kind=CUSTOM_REAL) :: xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl 964 real(kind=CUSTOM_REAL) :: fac,factor 965 integer :: idummy 966 967#ifdef FORCE_VECTORIZATION 968! in this vectorized version we have to assume that N_SLS == 3 in order to be able to unroll and thus suppress 969! an inner loop that would otherwise prevent vectorization; this is safe in practice in all cases because N_SLS == 3 970! in all known applications, and in the main program we check that N_SLS == 3 if FORCE_VECTORIZATION is used and we stop 971 integer :: ijk 972#else 973 integer :: i,j,k 974#endif 975! note: profiling shows that this routine takes about 60% of the total time, another 30% is spend in the tiso routine below.. 976 977 DO_LOOP_IJK 978 ! compute stress sigma 979 ! (dummy values) 980 sigma_xx(INDEX_IJK) = 0.1 * dummyx_loc(INDEX_IJK) 981 sigma_yy(INDEX_IJK) = 0.1 * dummyy_loc(INDEX_IJK) 982 sigma_zz(INDEX_IJK) = 0.1 * dummyz_loc(INDEX_IJK) 983 984 sigma_xy(INDEX_IJK) = 0.3 * sigma_xx(INDEX_IJK) 985 sigma_xz(INDEX_IJK) = 0.3 * sigma_yy(INDEX_IJK) 986 sigma_yz(INDEX_IJK) = 0.3 * sigma_zz(INDEX_IJK) 987 ENDDO_LOOP_IJK 988 989 ! define symmetric components of sigma (to be general in case of gravity) 990 DO_LOOP_IJK 991 sigma_yx(INDEX_IJK) = sigma_xy(INDEX_IJK) 992 sigma_zx(INDEX_IJK) = sigma_xz(INDEX_IJK) 993 sigma_zy(INDEX_IJK) = sigma_yz(INDEX_IJK) 994 ENDDO_LOOP_IJK 995 996 ! compute non-symmetric terms for gravity 997 if (GRAVITY_VAL) then 998 ! dummy example, originally calls more complicated subroutine compute_element_gravity(..) 999 DO_LOOP_IJK 1000 ! compute G tensor from s . g and add to sigma (not symmetric) 1001 ! (dummy values) 1002 sigma_xx(INDEX_IJK) = sigma_xx(INDEX_IJK) + 1.1 ! real(sy_l*gyl + sz_l*gzl, kind=CUSTOM_REAL) 1003 sigma_yy(INDEX_IJK) = sigma_yy(INDEX_IJK) + 1.1 ! real(sx_l*gxl + sz_l*gzl, kind=CUSTOM_REAL) 1004 sigma_zz(INDEX_IJK) = sigma_zz(INDEX_IJK) + 1.1 ! real(sx_l*gxl + sy_l*gyl, kind=CUSTOM_REAL) 1005 1006 sigma_xy(INDEX_IJK) = sigma_xy(INDEX_IJK) - 0.3 ! real(sx_l * gyl, kind=CUSTOM_REAL) 1007 sigma_yx(INDEX_IJK) = sigma_yx(INDEX_IJK) - 0.3 ! real(sy_l * gxl, kind=CUSTOM_REAL) 1008 1009 sigma_xz(INDEX_IJK) = sigma_xz(INDEX_IJK) - 0.5 ! real(sx_l * gzl, kind=CUSTOM_REAL) 1010 sigma_zx(INDEX_IJK) = sigma_zx(INDEX_IJK) - 0.5 ! real(sz_l * gxl, kind=CUSTOM_REAL) 1011 1012 sigma_yz(INDEX_IJK) = sigma_yz(INDEX_IJK) - 0.7 ! real(sy_l * gzl, kind=CUSTOM_REAL) 1013 sigma_zy(INDEX_IJK) = sigma_zy(INDEX_IJK) - 0.7 ! real(sz_l * gyl, kind=CUSTOM_REAL) 1014 1015 ! precompute vector 1016 factor = 0.5 ! 0.5 * dummyz_loc(INDEX_IJK) ! dble(jacobianl(INDEX_IJK)) * wgll_cube(INDEX_IJK) 1017 1018 rho_s_H(INDEX_IJK,1) = factor * 1.5 ! real(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl), kind=CUSTOM_REAL) 1019 rho_s_H(INDEX_IJK,2) = factor * 1.5 ! real(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl), kind=CUSTOM_REAL) 1020 rho_s_H(INDEX_IJK,3) = factor * 1.5 ! real(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl), kind=CUSTOM_REAL) 1021 ENDDO_LOOP_IJK 1022 endif 1023 1024 ! dot product of stress tensor with test vector, non-symmetric form 1025 DO_LOOP_IJK 1026 ! reloads derivatives of ux, uy and uz with respect to x, y and z 1027 ! (dummy) 1028 xixl = 1.1 1029 xiyl = 1.2 1030 xizl = 1.3 1031 etaxl = 1.4 1032 etayl = 1.5 1033 etazl = 1.6 1034 gammaxl = 1.7 1035 gammayl = 1.8 1036 gammazl = 1.9 1037 1038 ! common factor (dummy) 1039 fac = 0.5 1040 1041 ! form dot product with test vector, non-symmetric form 1042 ! this goes to accel_x 1043 tempx1(INDEX_IJK) = fac * (sigma_xx(INDEX_IJK)*xixl + sigma_yx(INDEX_IJK)*xiyl + sigma_zx(INDEX_IJK)*xizl) 1044 ! this goes to accel_y 1045 tempy1(INDEX_IJK) = fac * (sigma_xy(INDEX_IJK)*xixl + sigma_yy(INDEX_IJK)*xiyl + sigma_zy(INDEX_IJK)*xizl) 1046 ! this goes to accel_z 1047 tempz1(INDEX_IJK) = fac * (sigma_xz(INDEX_IJK)*xixl + sigma_yz(INDEX_IJK)*xiyl + sigma_zz(INDEX_IJK)*xizl) 1048 1049 ! this goes to accel_x 1050 tempx2(INDEX_IJK) = fac * (sigma_xx(INDEX_IJK)*etaxl + sigma_yx(INDEX_IJK)*etayl + sigma_zx(INDEX_IJK)*etazl) 1051 ! this goes to accel_y 1052 tempy2(INDEX_IJK) = fac * (sigma_xy(INDEX_IJK)*etaxl + sigma_yy(INDEX_IJK)*etayl + sigma_zy(INDEX_IJK)*etazl) 1053 ! this goes to accel_z 1054 tempz2(INDEX_IJK) = fac * (sigma_xz(INDEX_IJK)*etaxl + sigma_yz(INDEX_IJK)*etayl + sigma_zz(INDEX_IJK)*etazl) 1055 1056 ! this goes to accel_x 1057 tempx3(INDEX_IJK) = fac * (sigma_xx(INDEX_IJK)*gammaxl + sigma_yx(INDEX_IJK)*gammayl + sigma_zx(INDEX_IJK)*gammazl) 1058 ! this goes to accel_y 1059 tempy3(INDEX_IJK) = fac * (sigma_xy(INDEX_IJK)*gammaxl + sigma_yy(INDEX_IJK)*gammayl + sigma_zy(INDEX_IJK)*gammazl) 1060 ! this goes to accel_z 1061 tempz3(INDEX_IJK) = fac * (sigma_xz(INDEX_IJK)*gammaxl + sigma_yz(INDEX_IJK)*gammayl + sigma_zz(INDEX_IJK)*gammazl) 1062 1063 ENDDO_LOOP_IJK 1064 1065 ! avoid compiler warning 1066 idummy = ispec 1067 idummy = ibool(1,1,1,1) 1068 1069 end subroutine compute_element_dummy 1070 1071! 1072!------------------------------------------------------------------- 1073! 1074 1075 subroutine synchronize_all() 1076 1077! dummy routine to make it easier for copy-paste from the original code 1078 1079 implicit none 1080 1081 continue 1082 1083 end subroutine synchronize_all 1084 1085! 1086!------------------------------------------------------------------- 1087! 1088 1089 1090 subroutine exit_MPI(myrank,error_msg) 1091 1092! dummy routine to make it easier for copy-paste from the original code 1093 1094 use constants 1095 1096 implicit none 1097 1098 ! identifier for error message file 1099 integer, parameter :: IERROR = 30 1100 1101 integer :: myrank 1102 character(len=*) :: error_msg 1103 1104 ! write error message to screen 1105 write(*,*) error_msg(1:len(error_msg)) 1106 write(*,*) 'Error detected, aborting MPI... proc ',myrank 1107 1108 ! or just exit with message: 1109 stop 'Error, program ended in exit_MPI' 1110 1111 end subroutine exit_MPI 1112 1113! 1114!------------------------------------------------------------------- 1115! 1116 1117 subroutine flush_IMAIN() 1118 1119! dummy routine to make it easier for copy-paste from the original code 1120 1121 implicit none 1122 1123 continue 1124 1125 end subroutine flush_IMAIN 1126 1127