1!
2! test program for LIBXSMM function calls
3!
4! uses SPECFEM3D_GLOBE routine compute_forces_crust_mantle_Dev() with dummy example
5!
6
7! we switch between vectorized and non-vectorized version by using pre-processor flag FORCE_VECTORIZATION
8! and macros INDEX_IJK, DO_LOOP_IJK, ENDDO_LOOP_IJK defined in config.fh
9#include "config.fh"
10
11!-------------------------------------------------------------------
12!
13! modules
14!
15!-------------------------------------------------------------------
16
17module my_libxsmm
18
19  use libxsmm !,only: LIBXSMM_SMMFUNCTION,libxsmm_dispatch,libxsmm_mmcall,libxsmm_init,libxsmm_finalize
20
21  implicit none
22
23  ! function pointers
24  ! (note: defined for single precision, thus needs CUSTOM_REAL to be SIZE_REAL)
25  type(LIBXSMM_SMMFUNCTION) :: xmm1, xmm2, xmm3
26
27  ! prefetch versions
28  type(LIBXSMM_SMMFUNCTION) :: xmm1p, xmm2p, xmm3p
29
30  logical :: USE_XSMM_FUNCTION,USE_XSMM_FUNCTION_PREFETCH
31
32end module my_libxsmm
33
34!
35!-------------------------------------------------------------------
36!
37
38module constants
39
40  implicit none
41
42  integer, parameter :: SIZE_REAL = 4, SIZE_DOUBLE = 8
43  integer, parameter :: CUSTOM_REAL = SIZE_REAL
44
45  integer, parameter :: ISTANDARD_OUTPUT = 6
46  integer, parameter :: IMAIN = ISTANDARD_OUTPUT
47
48  ! number of GLL points in each direction of an element (degree plus one)
49  integer, parameter :: NGLLX = 5
50  integer, parameter :: NGLLY = NGLLX
51  integer, parameter :: NGLLZ = NGLLX
52  integer, parameter :: NGLLCUBE = NGLLX * NGLLY * NGLLZ
53
54  ! Deville routines optimized for NGLLX = NGLLY = NGLLZ = 5
55  integer, parameter :: m1 = NGLLX, m2 = NGLLX * NGLLY
56
57  ! 3-D simulation
58  integer, parameter :: NDIM = 3
59
60  ! some useful constants
61  double precision, parameter :: PI = 3.141592653589793d0
62
63  integer, parameter :: IFLAG_IN_FICTITIOUS_CUBE = 11
64
65end module constants
66
67!
68!-------------------------------------------------------------------
69!
70
71module specfem_par
72
73! main parameter module for specfem simulations
74
75  use constants
76  use libxsmm,only: LIBXSMM_ALIGNMENT
77
78  implicit none
79
80  !------------------------------------------------
81
82  ! number of spectral elements in x/y/z-directions
83  integer,parameter :: NEX = 40
84  integer,parameter :: NEY = 40
85  integer,parameter :: NEZ = 25
86
87  !------------------------------------------------
88
89  ! MPI rank (dummy, no MPI for this test needed)
90  integer :: myrank
91
92  ! array with derivatives of Lagrange polynomials and precalculated products
93  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx
94  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xxT,hprimewgll_xxT
95
96  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
97  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
98  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
99
100  ! arrays for Deville and force_vectorization
101  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: wgllwgll_xy_3D,wgllwgll_xz_3D,wgllwgll_yz_3D
102
103  ! mesh parameters
104  ! number of spectral elements
105  integer :: NSPEC
106
107  ! number of global nodes
108  integer :: NGLOB
109
110  ! local to global indexing
111  integer, dimension(:,:,:,:),allocatable :: ibool
112
113  ! displacement, velocity, acceleration
114  real(kind=CUSTOM_REAL), dimension(:,:),allocatable :: displ,accel
115
116  ! for verification
117  real(kind=CUSTOM_REAL), dimension(:,:),allocatable :: accel_default
118
119  !slow-down: please don't use unless you're sure... !dir$ ATTRIBUTES align:LIBXSMM_ALIGNMENT :: displ,accel,ibool,accel_default
120
121  ! gravity
122  logical,parameter :: GRAVITY_VAL = .true.
123
124  ! optimized arrays
125  integer, dimension(:,:),allocatable :: ibool_inv_tbl
126  integer, dimension(:,:),allocatable :: ibool_inv_st
127  integer, dimension(:,:),allocatable :: phase_iglob
128  integer, dimension(2) :: num_globs
129
130  ! work array with contributions
131  real(kind=CUSTOM_REAL), dimension(:,:,:,:,:),allocatable :: sum_terms
132
133  ! inner / outer elements crust/mantle region
134  integer :: num_phase_ispec
135  integer :: nspec_inner,nspec_outer
136  integer, dimension(:,:), allocatable :: phase_ispec_inner
137  integer :: iphase
138
139end module specfem_par
140
141
142
143!-------------------------------------------------------------------
144!
145! main program
146!
147!-------------------------------------------------------------------
148
149program test
150
151  use specfem_par
152  use my_libxsmm
153
154  implicit none
155
156  ! timing
157  double precision :: duration,duration_default
158  integer(kind=8) :: start
159
160  ! verification
161  real :: diff
162  integer, dimension(2) :: iloc_max
163  logical, parameter :: DEBUG_VERIFICATION = .false.
164
165  ! repetitions (time steps)
166  integer :: it
167  integer,parameter :: NSTEP = 20 ! should be > NSTEP_JITTER because of average timing
168  integer,parameter :: NSTEP_JITTER = 5 ! skip first few steps when timing the kernels (the first steps exhibit runtime jitter)
169
170  ! different versions
171  integer,parameter :: num_versions = 5
172  character(len=20) :: str_version(num_versions) = (/character(len=20) :: &
173                                                     "Deville loops", &
174                                                     "unrolled loops", &
175                                                     "LIBXSMM dispatch" , &
176                                                     "LIBXSMM prefetch", &
177                                                     "LIBXSMM static" &
178                                                    /)
179  double precision :: avg_time(num_versions)
180  integer :: iversion
181
182  print *,'--------------------------------------'
183  print *,'specfem example'
184  print *,'--------------------------------------'
185  print *
186
187  ! creates test mesh
188  call setup_mesh()
189
190  ! prepares arrays for time iteration loop
191  call prepare_timerun()
192
193  ! OpenMP output info
194  call prepare_openmp()
195
196  ! prepares libxsmm functions
197  call prepare_xsmm()
198
199  ! timing averages
200  avg_time(:) = 0.d0
201  iphase = 2
202
203  do it = 1,NSTEP
204
205    print *
206    print *,'step ',it
207
208    do iversion = 1,num_versions
209      ! initializes
210      accel(:,:) = 0._CUSTOM_REAL
211
212      ! timing
213      start = libxsmm_timer_tick()
214
215      ! computes forces
216      select case (iversion)
217      case (1)
218        ! Deville loops
219        call compute_forces_Dev()
220
221      case (2)
222        ! unrolled loops
223        call compute_forces_noDev()
224
225      case (3)
226        ! LIBXSMM with dispatch functions
227        if (USE_XSMM_FUNCTION) then
228          call compute_forces_with_xsmm()
229        else
230          cycle
231        endif
232
233      case (4)
234        ! LIBXSMM with prefetch function calls
235        if (USE_XSMM_FUNCTION_PREFETCH) then
236          call compute_forces_with_xsmm_prefetch()
237        else
238          cycle
239        endif
240
241      case (5)
242        ! LIBXSMM with static function calls
243        call compute_forces_with_xsmm_static()
244
245      end select
246
247      ! timing
248      duration = libxsmm_timer_duration(start, libxsmm_timer_tick())
249      if (iversion == 1) duration_default = duration
250
251      ! average time
252      if (it > NSTEP_JITTER) avg_time(iversion) = avg_time(iversion) + duration
253
254      ! for verification
255      if (iversion == 1) then
256        accel_default(:,:) = accel(:,:)
257      endif
258      diff = maxval(abs(accel(:,:) - accel_default(:,:)))
259
260      ! user output
261      if (iversion == 1) then
262        write(*,'(a30,a,f8.4,a)') 'duration with '//str_version(iversion),' = ',sngl(duration),' (s)'
263      else
264        write(*,'(a30,a,f8.4,a,f8.2,a,e12.4)') 'duration with '//str_version(iversion),' = ', &
265                                    sngl(duration),' (s) / speedup = ', &
266                                    sngl(100.0 * (duration_default-duration)/duration_default),' %  / maximum diff = ',diff
267      endif
268
269      ! check
270      if (DEBUG_VERIFICATION) then
271        iloc_max = maxloc(abs(accel(:,:) - accel_default(:,:)))
272        print *,'verification: max diff  = ',diff
273        print *,'              iglob loc = ',iloc_max(1),iloc_max(2)
274        print *,'maximum difference: #current vs. #default value'
275        print *,'  ',accel(1,iloc_max(2)),accel_default(1,iloc_max(2))
276        print *,'  ',accel(2,iloc_max(2)),accel_default(2,iloc_max(2))
277        print *,'  ',accel(3,iloc_max(2)),accel_default(3,iloc_max(2))
278        print *,'min/max accel values = ',minval(accel(:,:)),maxval(accel(:,:))
279        print *
280      endif
281
282    enddo ! iversion
283  enddo ! it
284
285  ! average timing (avoiding the first 5 steps which fluctuate quite a bit...)
286  avg_time(:) = avg_time(:) / dble(NSTEP - NSTEP_JITTER)
287
288  print *
289  print *,'=============================================='
290  print *,'average over ',NSTEP - NSTEP_JITTER,'repetitions'
291  write(*,'(a30,a,f8.4)') '  timing with '//str_version(1),' = ',avg_time(1)
292  do iversion = 2,num_versions
293    ! skip unused tests
294    if (iversion == 3 .and. .not. USE_XSMM_FUNCTION) cycle
295    if (iversion == 4 .and. .not. USE_XSMM_FUNCTION_PREFETCH) cycle
296
297    write(*,'(a30,a,f8.4,a,f8.2,a)') '  timing with '//str_version(iversion),' = ', &
298                                     avg_time(iversion),' / speedup = ', &
299                                     sngl(100.0 * (avg_time(1)-avg_time(iversion))/avg_time(1)),' %'
300  enddo
301  print *,'=============================================='
302  print *
303
304  ! frees memory
305  deallocate(displ,accel)
306  deallocate(accel_default,ibool)
307  deallocate(sum_terms)
308  deallocate(ibool_inv_st,ibool_inv_tbl,phase_iglob)
309  deallocate(phase_ispec_inner)
310
311  ! finalizes LIBXSMM
312  call libxsmm_finalize()
313
314end program test
315
316!
317!-------------------------------------------------------------------
318!
319
320  subroutine setup_mesh()
321
322  use constants
323  use specfem_par
324
325  implicit none
326
327  integer :: i1,i2
328  integer :: ix,iy,iz
329  integer :: i,j,k
330  integer :: iglob,ispec,inumber
331
332  integer, dimension(:), allocatable :: mask_ibool
333  integer, dimension(:,:,:,:), allocatable :: copy_ibool_ori
334
335  logical, dimension(:), allocatable :: mask_ibool_flag
336
337  ! total number of elements
338  NSPEC = NEX * NEY * NEZ
339
340  ! set up local to global numbering
341  allocate(ibool(NGLLX,NGLLY,NGLLZ,NSPEC))
342  ibool(:,:,:,:) = 0
343  ispec = 0
344  iglob = 0
345
346  ! arranges a three-dimensional block, elements are collated side-by-side. mimicks a very simple unstructured grid.
347  do iz = 1,NEZ
348    do iy = 1,NEY
349      do ix = 1,NEX
350        ispec = ispec + 1
351        ! GLL point indexing
352        do k = 1,NGLLZ
353          do j = 1,NGLLY
354            do i = 1,NGLLX
355              ! set up local to global numbering
356              if ((i == 1) .and. (ix > 1)) then
357                ! previous element along x-direction
358                ibool(i,j,k,ispec) = ibool(NGLLX,j,k,ispec - 1)
359              else if ((j == 1) .and. (iy > 1)) then
360                ! previous element along y-direction
361                ibool(i,j,k,ispec) = ibool(i,NGLLY,k,ispec - NEX)
362              else if ((k == 1) .and. (iz > 1)) then
363                ! previous element along z-direction
364                ibool(i,j,k,ispec) = ibool(i,j,NGLLZ,ispec - NEX * NEY)
365              else
366                ! new point
367                iglob = iglob + 1
368                ibool(i,j,k,ispec) = iglob
369              endif
370            enddo
371          enddo
372        enddo
373
374      enddo ! NEX
375    enddo ! NEY
376  enddo ! NEZ
377
378  ! sets total numbers of nodes
379  NGLOB = iglob
380
381  print *,'mesh:'
382  print *,' total number of elements      = ',NSPEC
383  print *,' total number of global nodes  = ',NGLOB
384  !print *,' ibool min/max = ',minval(ibool),maxval(ibool)
385
386  ! checks
387  if (ispec /= NSPEC) stop 'Invalid ispec count'
388  if (minval(ibool(:,:,:,:)) < 1) stop 'Invalid ibool minimum value'
389  if (maxval(ibool(:,:,:,:)) > NSPEC * NGLLX * NGLLY * NGLLZ) stop 'Invalid ibool maximum value'
390
391  ! we can create a new indirect addressing to reduce cache misses
392  allocate(copy_ibool_ori(NGLLX,NGLLY,NGLLZ,NSPEC),mask_ibool(nglob))
393  mask_ibool(:) = -1
394  copy_ibool_ori(:,:,:,:) = ibool(:,:,:,:)
395  inumber = 0
396  do ispec = 1,NSPEC
397    do k = 1,NGLLZ
398      do j = 1,NGLLY
399        do i = 1,NGLLX
400          if (mask_ibool(copy_ibool_ori(i,j,k,ispec)) == -1) then
401            ! creates a new point
402            inumber = inumber + 1
403            ibool(i,j,k,ispec) = inumber
404            mask_ibool(copy_ibool_ori(i,j,k,ispec)) = inumber
405          else
406            ! uses an existing point created previously
407            ibool(i,j,k,ispec) = mask_ibool(copy_ibool_ori(i,j,k,ispec))
408          endif
409        enddo
410      enddo
411    enddo
412  enddo
413  if (inumber /= NGLOB) stop 'Invalid inumber count'
414  deallocate(copy_ibool_ori,mask_ibool)
415
416  ! define polynomial derivatives & weights
417  ! (dummy values)
418  do i1 = 1,NGLLX
419    do i2 = 1,NGLLX
420      hprime_xx(i2,i1) = i1 * 0.1 + i2 * 0.2  ! original: real(lagrange_deriv_GLL(i1-1,i2-1,xigll,NGLLX), kind=CUSTOM_REAL)
421      hprimewgll_xx(i2,i1) = hprime_xx(i2,i1) * (i2 * 1.0/NGLLX) ! real(lagrange_deriv_GLL(i1-1,i2-1,xigll,NGLLX)*wxgll(i2), kind=CUSTOM_REAL)
422    enddo
423  enddo
424  do i = 1,NGLLX
425    do j = 1,NGLLY
426      wgllwgll_xy(i,j) = (i * 1.0/NGLLX) * (j * 1.0/NGLLY) ! original: real(wxgll(i)*wygll(j), kind=CUSTOM_REAL)
427    enddo
428  enddo
429  do i = 1,NGLLX
430    do k = 1,NGLLZ
431      wgllwgll_xz(i,k) = (i * 1.0/NGLLX) * (k * 1.0/NGLLZ) ! original: real(wxgll(i)*wzgll(k), kind=CUSTOM_REAL)
432    enddo
433  enddo
434  do j = 1,NGLLY
435    do k = 1,NGLLZ
436      wgllwgll_yz(j,k) = (j * 1.0/NGLLY) * (k * 1.0/NGLLZ) ! original: real(wygll(j)*wzgll(k), kind=CUSTOM_REAL)
437    enddo
438  enddo
439
440  ! define a 3D extension in order to be able to force vectorization in the compute_forces_**_Dev routines
441  do k = 1,NGLLZ
442    do j = 1,NGLLY
443      do i = 1,NGLLX
444        wgllwgll_yz_3D(i,j,k) = wgllwgll_yz(j,k)
445        wgllwgll_xz_3D(i,j,k) = wgllwgll_xz(i,k)
446        wgllwgll_xy_3D(i,j,k) = wgllwgll_xy(i,j)
447      enddo
448    enddo
449  enddo
450
451  ! check that optimized routines from Deville et al. (2002) can be used
452  if (NGLLX /= 5 .or. NGLLY /= 5 .or. NGLLZ /= 5) &
453    stop 'Deville et al. (2002) routines can only be used if NGLLX = NGLLY = NGLLZ = 5'
454
455  ! define transpose of derivation matrix
456  do j = 1,NGLLX
457    do i = 1,NGLLX
458      hprime_xxT(j,i) = hprime_xx(i,j)
459      hprimewgll_xxT(j,i) = hprimewgll_xx(i,j)
460    enddo
461  enddo
462
463  ! displacement and acceleration (dummy fields)
464  allocate(displ(NDIM,NGLOB),accel(NDIM,NGLOB))
465  accel(:,:) = 0._CUSTOM_REAL
466
467  ! sets initial dummy values (to avoid getting only zero multiplications later on)
468  if (.true.) then
469    ! arbitrary linear function
470    do iglob = 1,NGLOB
471      displ(:,iglob) = dble(iglob - 1) / dble(NGLOB - 1)
472    enddo
473  else
474    ! arbitrary sine function
475    allocate(mask_ibool_flag(NGLOB))
476    mask_ibool_flag(:) = .false.
477    ispec = 0
478    do iz = 1,NEZ
479      do iy = 1,NEY
480        do ix = 1,NEX
481          ispec = ispec + 1
482          ! GLL point indexing
483          do k = 1,NGLLZ
484            do j = 1,NGLLY
485              do i = 1,NGLLX
486                iglob = ibool(i,j,k,ispec)
487                if (.not. mask_ibool_flag(iglob)) then
488                  ! only assigns global value once
489                  mask_ibool_flag(iglob) = .true.
490                  displ(:,iglob) = sin(PI * dble(ix - 1) / dble(NEX - 1)) &
491                                 * sin(PI * dble(iy - 1) / dble(NEY - 1)) &
492                                 * sin(PI * dble(iz - 1) / dble(NEZ - 1))
493                endif
494              enddo
495            enddo
496          enddo
497        enddo
498      enddo
499    enddo
500    deallocate(mask_ibool_flag)
501  endif
502
503  ! for verification
504  allocate(accel_default(NDIM,NGLOB))
505  accel_default(:,:) = 0._CUSTOM_REAL
506
507  end subroutine setup_mesh
508
509!
510!-------------------------------------------------------------------
511!
512
513  subroutine prepare_timerun()
514
515  use specfem_par
516
517  implicit none
518
519  ! local parameters
520  integer :: num_elements,ispec,ier
521
522  ! setup inner/outer elements (single slice only, no outer elements for halo)
523  myrank = 0
524  ! no MPI over-lapping communication in this example
525  nspec_inner = NSPEC
526  nspec_outer = 0
527  num_phase_ispec = NSPEC
528  allocate(phase_ispec_inner(num_phase_ispec,2),stat=ier)
529  if (ier /= 0 ) call exit_mpi(myrank,'Error allocating array phase_ispec_inner_crust_mantle')
530  phase_ispec_inner(:,:) = 0
531  do ispec = 1,NSPEC
532    phase_ispec_inner(ispec,2) = ispec
533  enddo
534
535  ! from original routine prepare_timerun_ibool_inv_tbl()
536
537  ! note: we use allocate for sum_terms arrays rather than defining within subroutine compute_forces_**_Dev() itself
538  !       as it will crash when using OpenMP and operating systems with small stack sizes
539  !       e.g. see http://stackoverflow.com/questions/22649827/illegal-instruction-error-when-running-openmp-in-gfortran-mac
540  allocate(sum_terms(NDIM,NGLLX,NGLLY,NGLLZ,NSPEC),stat=ier)
541  if (ier /= 0) stop 'Error allocating sum_terms arrays'
542  sum_terms(:,:,:,:,:) = 0._CUSTOM_REAL
543
544  ! inverse table
545  ! this helps to speedup the assembly, especially with OpenMP (or on MIC) threading
546  ! allocating arrays
547  allocate(ibool_inv_tbl(NGLLX*NGLLY*NGLLZ*NSPEC,2),stat=ier)
548  if (ier /= 0) stop 'Error allocating ibool_inv_tbl arrays'
549
550  allocate(ibool_inv_st(NGLOB+1,2),stat=ier)
551  if (ier /= 0) stop 'Error allocating ibool_inv_st arrays'
552
553  allocate(phase_iglob(NGLOB,2),stat=ier)
554  if (ier /= 0) stop 'Error allocating phase_iglob arrays'
555
556  ! initializing
557  num_globs(:) = 0
558  ibool_inv_tbl(:,:) = 0
559  ibool_inv_st(:,:) = 0
560  phase_iglob(:,:) = 0
561
562  !---- make inv. table ----------------------
563  ! loops over phases
564  ! (1 == outer elements / 2 == inner elements)
565  do iphase = 1,2
566    ! crust mantle
567    if (iphase == 1) then
568      ! outer elements (iphase=1)
569      num_elements = nspec_outer
570    else
571      ! inner elements (iphase=2)
572      num_elements = nspec_inner
573    endif
574    call make_inv_table(iphase,NGLOB,NSPEC, &
575                        num_elements,phase_ispec_inner, &
576                        ibool,phase_iglob, &
577                        ibool_inv_tbl, ibool_inv_st, &
578                        num_globs)
579  enddo
580
581  ! user output
582  if (myrank == 0) then
583    write(IMAIN,*) " inverse table of ibool done"
584    call flush_IMAIN()
585  endif
586
587  ! synchronizes processes
588  call synchronize_all()
589
590  contains
591
592    subroutine make_inv_table(iphase,nglob,nspec, &
593                              phase_nspec,phase_ispec,ibool,phase_iglob, &
594                              ibool_inv_tbl,ibool_inv_st,num_globs,idoubling)
595
596    implicit none
597
598    ! arguments
599    integer,intent(in) :: iphase
600    integer,intent(in) :: nglob
601    integer,intent(in) :: nspec
602    integer,intent(in) :: phase_nspec
603    integer, dimension(:,:),intent(in) :: phase_ispec
604    integer, dimension(:,:,:,:),intent(in) :: ibool
605
606    integer, dimension(:,:),intent(inout) :: phase_iglob
607    integer, dimension(:,:),intent(inout) :: ibool_inv_tbl
608    integer, dimension(:,:),intent(inout) :: ibool_inv_st
609    integer, dimension(:),intent(inout) :: num_globs
610
611    integer,dimension(:),optional :: idoubling
612
613    ! local parameters
614    integer, dimension(:),   allocatable :: ibool_inv_num
615    integer, dimension(:,:), allocatable :: ibool_inv_tbl_tmp
616    integer :: num_alloc_ibool_inv_tbl,num_alloc_ibool_inv_tbl_theor
617    integer :: num_used_ibool_inv_tbl
618    integer :: ip, iglob, ispec_p, ispec, iglob_p, ier
619    integer :: inum
620#ifdef FORCE_VECTORIZATION
621    integer :: ijk
622#else
623    integer :: i,j,k
624#endif
625    logical :: is_inner_core
626
627    ! tolerance number of shared degrees per node
628    integer, parameter :: N_TOL = 20
629
630    ! checks if anything to do (e.g., no outer elements for single process simulations)
631    if (phase_nspec == 0) return
632
633    ! checks if inner core region
634    if (present(idoubling)) then
635      is_inner_core = .true.
636    else
637      is_inner_core = .false.
638    endif
639
640    ! allocates temporary arrays
641    allocate(ibool_inv_num(nglob),stat=ier)
642    if (ier /= 0) stop 'Error allocating ibool_inv_num array'
643
644    ! gets valence of global degrees of freedom for current phase (inner/outer) elements
645    ibool_inv_num(:) = 0
646    do ispec_p = 1,phase_nspec
647      ispec = phase_ispec(ispec_p,iphase)
648
649      ! exclude fictitious elements in central cube
650      if (is_inner_core) then
651        if (idoubling(ispec) == IFLAG_IN_FICTITIOUS_CUBE) cycle
652      endif
653
654      DO_LOOP_IJK
655        iglob = ibool(INDEX_IJK,ispec)
656        ! increases valence counter
657        ibool_inv_num(iglob) = ibool_inv_num(iglob) + 1
658      ENDDO_LOOP_IJK
659    enddo
660
661    ! gets maximum valence value
662    num_alloc_ibool_inv_tbl = maxval(ibool_inv_num(:))
663
664    ! theoretical number of maximum shared degrees per node
665    num_alloc_ibool_inv_tbl_theor = N_TOL*(NGLLX*NGLLY*NGLLZ*nspec/nglob+1)
666
667    ! checks valence
668    if (num_alloc_ibool_inv_tbl < 1 .or. num_alloc_ibool_inv_tbl > num_alloc_ibool_inv_tbl_theor) then
669      print *,'Error invalid maximum valence:'
670      print *,'valence value = ',num_alloc_ibool_inv_tbl,' - theoretical maximum = ',num_alloc_ibool_inv_tbl_theor
671      stop 'Error invalid maximum valence value'
672    endif
673    ! debug
674    !print *,myrank,'maximum shared degrees theoretical = ',num_alloc_ibool_inv_tbl_theor ! regional_Greece_small example: 40
675    !print *,myrank,'maximum shared degrees from array  = ',maxval(ibool_inv_num(:))      ! regional_Greece_small example: 8 and 16
676
677    allocate(ibool_inv_tbl_tmp(num_alloc_ibool_inv_tbl,nglob),stat=ier)
678    if (ier /= 0) stop 'Error allocating ibool_inv_tbl_tmp array'
679
680    !---- make temporary array of inv. table : ibool_inv_tbl_tmp
681    ibool_inv_tbl_tmp(:,:) = 0
682    ibool_inv_num(:) = 0
683    do ispec_p = 1,phase_nspec
684      ispec = phase_ispec(ispec_p,iphase)
685
686      ! exclude fictitious elements in central cube
687      if (is_inner_core) then
688        if (idoubling(ispec) == IFLAG_IN_FICTITIOUS_CUBE) cycle
689      endif
690
691      DO_LOOP_IJK
692
693        iglob = ibool(INDEX_IJK,ispec)
694
695        ! increases counter
696        ibool_inv_num(iglob) = ibool_inv_num(iglob) + 1
697
698        ! inverse table
699        ! sets 1D index of local GLL point (between 1 and NGLLCUBE)
700#ifdef FORCE_VECTORIZATION
701        inum = ijk
702#else
703        inum = i + (j-1)*NGLLY + (k-1)*NGLLY*NGLLZ
704#endif
705        ! sets 1D index in local ibool array
706        ibool_inv_tbl_tmp(ibool_inv_num(iglob),iglob) = inum + NGLLX*NGLLY*NGLLZ*(ispec-1)
707
708      ENDDO_LOOP_IJK
709
710    enddo
711
712    !---- packing : ibool_inv_tbl_tmp -> ibool_inv_tbl
713    ip = 0
714    iglob_p = 0
715    num_used_ibool_inv_tbl = 0
716    do iglob = 1, nglob
717      if (ibool_inv_num(iglob) /= 0) then
718        iglob_p = iglob_p + 1
719
720        phase_iglob(iglob_p,iphase) = iglob
721
722        ! sets start index of table entry for this global node
723        ibool_inv_st(iglob_p,iphase) = ip + 1
724
725        ! sets maximum of used valence
726        if (ibool_inv_num(iglob) > num_used_ibool_inv_tbl) num_used_ibool_inv_tbl = ibool_inv_num(iglob)
727
728        ! loops over valence
729        do inum = 1, ibool_inv_num(iglob)
730          ! increases total counter
731          ip = ip + 1
732          ! maps local 1D index in ibool array
733          ibool_inv_tbl(ip,iphase) = ibool_inv_tbl_tmp(inum,iglob)
734        enddo
735      endif
736    enddo
737    ! sets last entry in start index table
738    ibool_inv_st(iglob_p+1,iphase) = ip + 1
739
740    ! total number global nodes in this phase (inner/outer)
741    num_globs(iphase) = iglob_p
742
743    ! checks
744    if (num_used_ibool_inv_tbl > num_alloc_ibool_inv_tbl)  then
745      print *,"Error invalid inverse table setting:"
746      print *,"  num_alloc_ibool_inv_tbl = ",num_alloc_ibool_inv_tbl
747      print *,"  num_used_ibool_inv_tbl  = ",num_used_ibool_inv_tbl
748      print *,"invalid value encountered: num_used_ibool_inv_tbl > num_alloc_ibool_inv_tbl"
749      print *,"#### Program exits... ##########"
750      call exit_MPI(myrank,'Error making inverse table for optimized arrays')
751    endif
752
753    ! debug
754    !if (myrank == 0) then
755    !  print *,'ibool_inv_tbl: '
756    !  do iglob_p = 1,200
757    !    print *,'  ',iglob_p,'table = ',(ibool_inv_tbl(ip,iphase), &
758    !                                     ip = ibool_inv_st(iglob_p,iphase),ibool_inv_st(iglob_p+1,iphase)-1)
759    !  enddo
760    !endif
761
762    ! frees memory
763    deallocate(ibool_inv_num)
764    deallocate(ibool_inv_tbl_tmp)
765
766    end subroutine make_inv_table
767
768  end subroutine prepare_timerun
769
770!
771!-------------------------------------------------------------------
772!
773
774  subroutine prepare_openmp()
775
776! outputs OpenMP support info
777
778#ifdef USE_OPENMP
779  use specfem_par,only: myrank,IMAIN
780#endif
781
782  implicit none
783
784#ifdef USE_OPENMP
785  ! local parameters
786  integer :: thread_id,num_threads
787  integer :: num_procs,max_threads
788  logical :: is_dynamic,is_nested
789  ! OpenMP functions
790  integer,external :: OMP_GET_NUM_THREADS,OMP_GET_THREAD_NUM
791  integer,external :: OMP_GET_NUM_PROCS,OMP_GET_MAX_THREADS
792  logical,external :: OMP_GET_DYNAMIC,OMP_GET_NESTED
793
794  ! OpenMP only supported for Deville routine
795
796!$OMP PARALLEL DEFAULT(NONE) &
797!$OMP SHARED(myrank) &
798!$OMP PRIVATE(thread_id,num_threads,num_procs,max_threads,is_dynamic,is_nested)
799  ! gets thread number
800  thread_id = OMP_GET_THREAD_NUM()
801
802  ! gets total number of threads for this MPI process
803  num_threads = OMP_GET_NUM_THREADS()
804
805  ! OpenMP master thread only
806  if (thread_id == 0) then
807    ! gets additional environment info
808    num_procs = OMP_GET_NUM_PROCS()
809    max_threads = OMP_GET_MAX_THREADS()
810    is_dynamic = OMP_GET_DYNAMIC()
811    is_nested = OMP_GET_NESTED()
812
813    ! user output
814    if (myrank == 0) then
815      write(IMAIN,*) ''
816      write(IMAIN,*) 'OpenMP information:'
817      write(IMAIN,*) '  number of threads = ', num_threads
818      write(IMAIN,*) ''
819      write(IMAIN,*) '  number of processors available      = ', num_procs
820      write(IMAIN,*) '  maximum number of threads available = ', num_procs
821      write(IMAIN,*) '  dynamic thread adjustement          = ', is_dynamic
822      write(IMAIN,*) '  nested parallelism                  = ', is_nested
823      write(IMAIN,*) ''
824      call flush_IMAIN()
825    endif
826  endif
827!$OMP END PARALLEL
828#else
829  ! nothing to do..
830  return
831#endif
832
833  end subroutine prepare_openmp
834
835!
836!-------------------------------------------------------------------
837!
838
839  subroutine prepare_xsmm()
840
841  use constants,only: CUSTOM_REAL,SIZE_DOUBLE,m1,m2,IMAIN
842
843  use specfem_par,only: myrank
844
845  use my_libxsmm,only: libxsmm_init,libxsmm_dispatch,libxsmm_available,xmm1,xmm2,xmm3,USE_XSMM_FUNCTION
846  ! prefetch versions
847  use my_libxsmm,only: xmm1p,xmm2p,xmm3p,LIBXSMM_PREFETCH,USE_XSMM_FUNCTION_PREFETCH
848
849  implicit none
850
851  ! quick check
852  if (m1 /= 5) stop 'LibXSMM with invalid m1 constant (must have m1 == 5)'
853  if (m2 /= 5*5) stop 'LibXSMM with invalid m2 constant (must have m2 == 5*5)'
854  if (CUSTOM_REAL == SIZE_DOUBLE) stop 'LibXSMM optimization only for single precision functions'
855
856  ! initializes LIBXSMM
857  call libxsmm_init()
858
859  ! dispatch functions for matrix multiplications
860  ! (see in compute_forces_**Dev.F90 routines for actual function call)
861  ! example: a(n1,n2),b(n2,n3),c(n1,n3) -> c = a * b then libxsmm_dispatch(xmm,m=n1,n=n3,k=n2,alpha=1,beta=0)
862
863  ! with A(n1,n2) 5x5-matrix, B(n2,n3) 5x25-matrix and C(n1,n3) 5x25-matrix
864  call libxsmm_dispatch(xmm1, m=5, n=25, k=5, alpha=1.0_CUSTOM_REAL, beta=0.0_CUSTOM_REAL)
865
866  ! with A(n1,n2) 25x5-matrix, B(n2,n3) 5x5-matrix and C(n1,n3) 25x5-matrix
867  call libxsmm_dispatch(xmm2, m=25, n=5, k=5, alpha=1.0_CUSTOM_REAL, beta=0.0_CUSTOM_REAL)
868
869  ! with A(n1,n2,n4) 5x5x5-matrix, B(n2,n3) 5x5-matrix and C(n1,n3,n4) 5x5x5-matrix
870  call libxsmm_dispatch(xmm3, m=5, n=5, k=5, alpha=1.0_CUSTOM_REAL, beta=0.0_CUSTOM_REAL)
871
872  !directly: call libxsmm_smm_5_5_5(A,B,C)
873  if (libxsmm_available(xmm1) .and. libxsmm_available(xmm2) .and. libxsmm_available(xmm3)) then
874    USE_XSMM_FUNCTION = .true.
875    ! user output
876    if (myrank == 0) then
877      write(IMAIN,*)
878      write(IMAIN,*) "LIBXSMM dispatch functions ready for small matrix-matrix multiplications"
879      call flush_IMAIN()
880    endif
881  else
882    USE_XSMM_FUNCTION = .false.
883    print *,'LIBXSMM invalid dispatch function pointers:', &
884            libxsmm_available(xmm1),libxsmm_available(xmm2),libxsmm_available(xmm3)
885    ! hard stop
886    !call exit_MPI(myrank,'LIBXSMM functions not ready, please check configuration & compilation')
887  endif
888
889  ! synchronizes processes
890  call synchronize_all()
891
892  ! prefetch versions
893  call libxsmm_dispatch(xmm1p, m=5, n=25, k=5, alpha=1.0_CUSTOM_REAL, beta=0.0_CUSTOM_REAL,prefetch=LIBXSMM_PREFETCH)
894  call libxsmm_dispatch(xmm2p, m=25, n=5, k=5, alpha=1.0_CUSTOM_REAL, beta=0.0_CUSTOM_REAL,prefetch=LIBXSMM_PREFETCH)
895  call libxsmm_dispatch(xmm3p, m=5, n=5, k=5, alpha=1.0_CUSTOM_REAL, beta=0.0_CUSTOM_REAL,prefetch=LIBXSMM_PREFETCH)
896
897  if (libxsmm_available(xmm1p) .and. libxsmm_available(xmm2p) .and. libxsmm_available(xmm3p)) then
898    USE_XSMM_FUNCTION_PREFETCH = .true.
899    ! user output
900    if (myrank == 0) then
901      write(IMAIN,*) "LIBXSMM prefetch functions ready for small matrix-matrix multiplications"
902      write(IMAIN,*)
903      call flush_IMAIN()
904    endif
905  else
906    USE_XSMM_FUNCTION_PREFETCH = .false.
907    print *,'LIBXSMM invalid prefetch function pointers:', &
908            libxsmm_available(xmm1p),libxsmm_available(xmm2p),libxsmm_available(xmm3p)
909    ! hard stop
910    !call exit_MPI(myrank,'LIBXSMM prefetch functions not ready, please check configuration & compilation')
911  endif
912
913  ! force no dispatch
914  !USE_XSMM_FUNCTION = .false.
915  !USE_XSMM_FUNCTION_PREFETCH = .false.
916
917  end subroutine prepare_xsmm
918
919
920
921!-------------------------------------------------------------------
922!
923! dummy routines
924!
925!-------------------------------------------------------------------
926
927
928
929  subroutine compute_element_dummy(ispec,ibool,tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, &
930                                   dummyx_loc,dummyy_loc,dummyz_loc,rho_s_H)
931
932! dummy example (original: isotropic element in crust/mantle region)
933!
934! it is mostly used to avoid over-simplification of the compute_forces routine: if we omit it, then compilers can do
935! much more aggressive optimizations and the timing results would be misleading. the original routines for computing
936! stresses on elements are more expensive and complicated. the dummy here will be much faster to compute, but should
937! give similar relative performance results
938
939  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM
940#ifdef FORCE_VECTORIZATION
941  use constants,only: NGLLCUBE
942#endif
943  use specfem_par,only: NSPEC,GRAVITY_VAL
944  implicit none
945
946  ! element id
947  integer,intent(in) :: ispec
948
949  ! arrays with mesh parameters per slice
950  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC),intent(in) :: ibool
951
952  ! element info
953  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ),intent(inout) :: &
954    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
955
956  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ),intent(in) :: dummyx_loc,dummyy_loc,dummyz_loc
957
958  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NDIM),intent(out) :: rho_s_H
959
960  ! local parameters
961  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: sigma_xx,sigma_yy,sigma_zz
962  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
963  real(kind=CUSTOM_REAL) :: xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl
964  real(kind=CUSTOM_REAL) :: fac,factor
965  integer :: idummy
966
967#ifdef FORCE_VECTORIZATION
968! in this vectorized version we have to assume that N_SLS == 3 in order to be able to unroll and thus suppress
969! an inner loop that would otherwise prevent vectorization; this is safe in practice in all cases because N_SLS == 3
970! in all known applications, and in the main program we check that N_SLS == 3 if FORCE_VECTORIZATION is used and we stop
971  integer :: ijk
972#else
973  integer :: i,j,k
974#endif
975! note: profiling shows that this routine takes about 60% of the total time, another 30% is spend in the tiso routine below..
976
977  DO_LOOP_IJK
978    ! compute stress sigma
979    ! (dummy values)
980    sigma_xx(INDEX_IJK) = 0.1 * dummyx_loc(INDEX_IJK)
981    sigma_yy(INDEX_IJK) = 0.1 * dummyy_loc(INDEX_IJK)
982    sigma_zz(INDEX_IJK) = 0.1 * dummyz_loc(INDEX_IJK)
983
984    sigma_xy(INDEX_IJK) = 0.3 * sigma_xx(INDEX_IJK)
985    sigma_xz(INDEX_IJK) = 0.3 * sigma_yy(INDEX_IJK)
986    sigma_yz(INDEX_IJK) = 0.3 * sigma_zz(INDEX_IJK)
987  ENDDO_LOOP_IJK
988
989  ! define symmetric components of sigma (to be general in case of gravity)
990  DO_LOOP_IJK
991    sigma_yx(INDEX_IJK) = sigma_xy(INDEX_IJK)
992    sigma_zx(INDEX_IJK) = sigma_xz(INDEX_IJK)
993    sigma_zy(INDEX_IJK) = sigma_yz(INDEX_IJK)
994  ENDDO_LOOP_IJK
995
996  ! compute non-symmetric terms for gravity
997  if (GRAVITY_VAL) then
998    ! dummy example, originally calls more complicated subroutine compute_element_gravity(..)
999    DO_LOOP_IJK
1000      ! compute G tensor from s . g and add to sigma (not symmetric)
1001      ! (dummy values)
1002      sigma_xx(INDEX_IJK) = sigma_xx(INDEX_IJK) + 1.1 ! real(sy_l*gyl + sz_l*gzl, kind=CUSTOM_REAL)
1003      sigma_yy(INDEX_IJK) = sigma_yy(INDEX_IJK) + 1.1 ! real(sx_l*gxl + sz_l*gzl, kind=CUSTOM_REAL)
1004      sigma_zz(INDEX_IJK) = sigma_zz(INDEX_IJK) + 1.1 ! real(sx_l*gxl + sy_l*gyl, kind=CUSTOM_REAL)
1005
1006      sigma_xy(INDEX_IJK) = sigma_xy(INDEX_IJK) - 0.3 ! real(sx_l * gyl, kind=CUSTOM_REAL)
1007      sigma_yx(INDEX_IJK) = sigma_yx(INDEX_IJK) - 0.3 ! real(sy_l * gxl, kind=CUSTOM_REAL)
1008
1009      sigma_xz(INDEX_IJK) = sigma_xz(INDEX_IJK) - 0.5 ! real(sx_l * gzl, kind=CUSTOM_REAL)
1010      sigma_zx(INDEX_IJK) = sigma_zx(INDEX_IJK) - 0.5 ! real(sz_l * gxl, kind=CUSTOM_REAL)
1011
1012      sigma_yz(INDEX_IJK) = sigma_yz(INDEX_IJK) - 0.7 ! real(sy_l * gzl, kind=CUSTOM_REAL)
1013      sigma_zy(INDEX_IJK) = sigma_zy(INDEX_IJK) - 0.7 ! real(sz_l * gyl, kind=CUSTOM_REAL)
1014
1015      ! precompute vector
1016      factor = 0.5 ! 0.5 * dummyz_loc(INDEX_IJK) ! dble(jacobianl(INDEX_IJK)) * wgll_cube(INDEX_IJK)
1017
1018      rho_s_H(INDEX_IJK,1) = factor * 1.5 ! real(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl), kind=CUSTOM_REAL)
1019      rho_s_H(INDEX_IJK,2) = factor * 1.5 ! real(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl), kind=CUSTOM_REAL)
1020      rho_s_H(INDEX_IJK,3) = factor * 1.5 ! real(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl), kind=CUSTOM_REAL)
1021    ENDDO_LOOP_IJK
1022  endif
1023
1024  ! dot product of stress tensor with test vector, non-symmetric form
1025  DO_LOOP_IJK
1026    ! reloads derivatives of ux, uy and uz with respect to x, y and z
1027    ! (dummy)
1028    xixl = 1.1
1029    xiyl = 1.2
1030    xizl = 1.3
1031    etaxl = 1.4
1032    etayl = 1.5
1033    etazl = 1.6
1034    gammaxl = 1.7
1035    gammayl = 1.8
1036    gammazl = 1.9
1037
1038    ! common factor (dummy)
1039    fac = 0.5
1040
1041    ! form dot product with test vector, non-symmetric form
1042    ! this goes to accel_x
1043    tempx1(INDEX_IJK) = fac * (sigma_xx(INDEX_IJK)*xixl + sigma_yx(INDEX_IJK)*xiyl + sigma_zx(INDEX_IJK)*xizl)
1044    ! this goes to accel_y
1045    tempy1(INDEX_IJK) = fac * (sigma_xy(INDEX_IJK)*xixl + sigma_yy(INDEX_IJK)*xiyl + sigma_zy(INDEX_IJK)*xizl)
1046    ! this goes to accel_z
1047    tempz1(INDEX_IJK) = fac * (sigma_xz(INDEX_IJK)*xixl + sigma_yz(INDEX_IJK)*xiyl + sigma_zz(INDEX_IJK)*xizl)
1048
1049    ! this goes to accel_x
1050    tempx2(INDEX_IJK) = fac * (sigma_xx(INDEX_IJK)*etaxl + sigma_yx(INDEX_IJK)*etayl + sigma_zx(INDEX_IJK)*etazl)
1051    ! this goes to accel_y
1052    tempy2(INDEX_IJK) = fac * (sigma_xy(INDEX_IJK)*etaxl + sigma_yy(INDEX_IJK)*etayl + sigma_zy(INDEX_IJK)*etazl)
1053    ! this goes to accel_z
1054    tempz2(INDEX_IJK) = fac * (sigma_xz(INDEX_IJK)*etaxl + sigma_yz(INDEX_IJK)*etayl + sigma_zz(INDEX_IJK)*etazl)
1055
1056    ! this goes to accel_x
1057    tempx3(INDEX_IJK) = fac * (sigma_xx(INDEX_IJK)*gammaxl + sigma_yx(INDEX_IJK)*gammayl + sigma_zx(INDEX_IJK)*gammazl)
1058    ! this goes to accel_y
1059    tempy3(INDEX_IJK) = fac * (sigma_xy(INDEX_IJK)*gammaxl + sigma_yy(INDEX_IJK)*gammayl + sigma_zy(INDEX_IJK)*gammazl)
1060    ! this goes to accel_z
1061    tempz3(INDEX_IJK) = fac * (sigma_xz(INDEX_IJK)*gammaxl + sigma_yz(INDEX_IJK)*gammayl + sigma_zz(INDEX_IJK)*gammazl)
1062
1063  ENDDO_LOOP_IJK
1064
1065  ! avoid compiler warning
1066  idummy = ispec
1067  idummy = ibool(1,1,1,1)
1068
1069  end subroutine compute_element_dummy
1070
1071!
1072!-------------------------------------------------------------------
1073!
1074
1075  subroutine synchronize_all()
1076
1077! dummy routine to make it easier for copy-paste from the original code
1078
1079  implicit none
1080
1081  continue
1082
1083  end subroutine synchronize_all
1084
1085!
1086!-------------------------------------------------------------------
1087!
1088
1089
1090  subroutine exit_MPI(myrank,error_msg)
1091
1092! dummy routine to make it easier for copy-paste from the original code
1093
1094  use constants
1095
1096  implicit none
1097
1098  ! identifier for error message file
1099  integer, parameter :: IERROR = 30
1100
1101  integer :: myrank
1102  character(len=*) :: error_msg
1103
1104  ! write error message to screen
1105  write(*,*) error_msg(1:len(error_msg))
1106  write(*,*) 'Error detected, aborting MPI... proc ',myrank
1107
1108  ! or just exit with message:
1109  stop 'Error, program ended in exit_MPI'
1110
1111  end subroutine exit_MPI
1112
1113!
1114!-------------------------------------------------------------------
1115!
1116
1117  subroutine flush_IMAIN()
1118
1119! dummy routine to make it easier for copy-paste from the original code
1120
1121  implicit none
1122
1123  continue
1124
1125  end subroutine flush_IMAIN
1126
1127