1!--------------------------------------------------------------------------------------------------!
2!   CP2K: A general program to perform molecular dynamics simulations                              !
3!   Copyright (C) 2000 - 2020  CP2K developers group                                               !
4!--------------------------------------------------------------------------------------------------!
5
6! **************************************************************************************************
7!> \brief Wrapper for ELPA
8!> \author Ole Schuett
9! **************************************************************************************************
10MODULE cp_fm_elpa
11   USE cp_blacs_env, ONLY: cp_blacs_env_type
12   USE cp_fm_basic_linalg, ONLY: cp_fm_upper_to_full
13   USE cp_fm_diag_utils, ONLY: cp_fm_redistribute_start, &
14                               cp_fm_redistribute_end, &
15                               cp_fm_redistribute_info
16   USE cp_fm_struct, ONLY: cp_fm_struct_get
17   USE cp_fm_types, ONLY: cp_fm_type, &
18                          cp_fm_to_fm, &
19                          cp_fm_release, &
20                          cp_fm_create, &
21                          cp_fm_write_info
22   USE cp_log_handling, ONLY: cp_get_default_logger, &
23                              cp_logger_get_default_io_unit, &
24                              cp_logger_type
25   USE kinds, ONLY: default_string_length, &
26                    dp
27   USE message_passing, ONLY: mp_comm_free, &
28                              mp_comm_split_direct, &
29                              mp_bcast
30!$ USE OMP_LIB, ONLY: omp_get_num_threads
31
32#include "../base/base_uses.f90"
33
34#if defined (__ELPA)
35   USE elpa_constants, ONLY: ELPA_2STAGE_REAL_GENERIC, &
36                             ELPA_2STAGE_REAL_GENERIC_SIMPLE, &
37                             ELPA_2STAGE_REAL_BGP, &
38                             ELPA_2STAGE_REAL_BGQ, &
39                             ELPA_2STAGE_REAL_SSE_ASSEMBLY, &
40                             ELPA_2STAGE_REAL_SSE_BLOCK2, &
41                             ELPA_2STAGE_REAL_SSE_BLOCK4, &
42                             ELPA_2STAGE_REAL_SSE_BLOCK6, &
43                             ELPA_2STAGE_REAL_AVX_BLOCK2, &
44                             ELPA_2STAGE_REAL_AVX_BLOCK4, &
45                             ELPA_2STAGE_REAL_AVX_BLOCK6, &
46                             ELPA_2STAGE_REAL_AVX2_BLOCK2, &
47                             ELPA_2STAGE_REAL_AVX2_BLOCK4, &
48                             ELPA_2STAGE_REAL_AVX2_BLOCK6, &
49                             ELPA_2STAGE_REAL_AVX512_BLOCK2, &
50                             ELPA_2STAGE_REAL_AVX512_BLOCK4, &
51                             ELPA_2STAGE_REAL_AVX512_BLOCK6, &
52                             ELPA_2STAGE_REAL_GPU
53   USE elpa, ONLY: elpa_t, elpa_solver_2stage, &
54                   elpa_init, elpa_uninit, &
55                   elpa_allocate, elpa_deallocate, elpa_ok
56#endif
57
58   IMPLICIT NONE
59
60   PRIVATE
61
62   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'cp_fm_elpa'
63
64   CHARACTER(len=14), DIMENSION(*), PARAMETER :: elpa_kernel_names = [CHARACTER(len=14) :: &
65                                                                      "AUTO", &
66                                                                      "GENERIC", &
67                                                                      "GENERIC_SIMPLE", &
68                                                                      "BGP", &
69                                                                      "BGQ", &
70                                                                      "SSE", &
71                                                                      "SSE_BLOCK2", &
72                                                                      "SSE_BLOCK4", &
73                                                                      "SSE_BLOCK6", &
74                                                                      "AVX_BLOCK2", &
75                                                                      "AVX_BLOCK4", &
76                                                                      "AVX_BLOCK6", &
77                                                                      "AVX2_BLOCK2", &
78                                                                      "AVX2_BLOCK4", &
79                                                                      "AVX2_BLOCK6", &
80                                                                      "AVX512_BLOCK2", &
81                                                                      "AVX512_BLOCK4", &
82                                                                      "AVX512_BLOCK6", &
83                                                                      "GPU"]
84
85   CHARACTER(len=44), DIMENSION(*), PARAMETER :: elpa_kernel_descriptions = [CHARACTER(len=44) :: &
86                                                                             "Automatically selected kernel", &
87                                                                             "Generic kernel", &
88                                                                             "Simplified generic kernel", &
89                                                                             "Kernel optimized for IBM BGP", &
90                                                                             "Kernel optimized for IBM BGQ", &
91                                                                             "Kernel optimized for x86_64/SSE", &
92                                                                             "Kernel optimized for x86_64/SSE (block=2)", &
93                                                                             "Kernel optimized for x86_64/SSE (block=4)", &
94                                                                             "Kernel optimized for x86_64/SSE (block=6)", &
95                                                                             "Kernel optimized for Intel AVX (block=2)", &
96                                                                             "Kernel optimized for Intel AVX (block=4)", &
97                                                                             "Kernel optimized for Intel AVX (block=6)", &
98                                                                             "Kernel optimized for Intel AVX2 (block=2)", &
99                                                                             "Kernel optimized for Intel AVX2 (block=4)", &
100                                                                             "Kernel optimized for Intel AVX2 (block=6)", &
101                                                                             "Kernel optimized for Intel AVX-512 (block=2)", &
102                                                                             "Kernel optimized for Intel AVX-512 (block=4)", &
103                                                                             "Kernel optimized for Intel AVX-512 (block=6)", &
104                                                                             "Kernel targeting GPUs"]
105
106   INTEGER, SAVE :: elpa_kernel = -1 ! auto
107   LOGICAL, SAVE :: elpa_qr = .FALSE., &
108                    elpa_qr_unsafe = .FALSE., &
109                    elpa_should_print = .FALSE., &
110                    elpa_use_gpu = .FALSE.
111
112   PUBLIC :: cp_fm_diag_elpa, &
113             get_elpa_number_kernels, &
114             get_elpa_kernel_index, &
115             set_elpa_kernel, &
116             set_elpa_qr, &
117             set_elpa_print, &
118             elpa_kernel_names, &
119             elpa_kernel_descriptions, &
120             initialize_elpa_library, &
121             finalize_elpa_library
122
123CONTAINS
124
125! **************************************************************************************************
126!> \brief Return the number of available ELPA kernels
127!> \return ...
128! **************************************************************************************************
129   PURE FUNCTION get_elpa_number_kernels() RESULT(num)
130      INTEGER :: num
131
132      num = SIZE(elpa_kernel_names)
133   END FUNCTION get_elpa_number_kernels
134
135! **************************************************************************************************
136!> \brief Returns the index of an ELPA kernel suitable for the given CPUID
137!> \return ...
138! **************************************************************************************************
139   PURE FUNCTION get_elpa_kernel_index(cpuid) RESULT(index)
140      USE machine, ONLY: MACHINE_X86, &
141                         MACHINE_CPU_GENERIC, &
142                         MACHINE_X86_SSE4, &
143                         MACHINE_X86_AVX, &
144                         MACHINE_X86_AVX2
145      INTEGER, INTENT(IN) :: cpuid
146      INTEGER :: index
147
148      index = 1 ! AUTO
149      IF ((MACHINE_CPU_GENERIC .LT. cpuid) .AND. (cpuid .LE. MACHINE_X86)) THEN
150         ! the following indizes correspond to OUR list of kernels, not the ones from ELPA
151         SELECT CASE (cpuid)
152         CASE (MACHINE_X86_SSE4)
153            index = 8 ! ELPA_2STAGE_REAL_SSE_BLOCK4
154         CASE (MACHINE_X86_AVX)
155            index = 11 ! ELPA_2STAGE_REAL_AVX_BLOCK4
156         CASE (MACHINE_X86_AVX2)
157            index = 14 ! ELPA_2STAGE_REAL_AVX2_BLOCK4
158         CASE DEFAULT
159            index = 17 ! ELPA_2STAGE_REAL_AVX512_BLOCK4
160         END SELECT
161      END IF
162
163   END FUNCTION get_elpa_kernel_index
164
165! **************************************************************************************************
166!> \brief Initialize the ELPA library
167! **************************************************************************************************
168   SUBROUTINE initialize_elpa_library()
169#if defined(__ELPA)
170      IF (elpa_init(20180525) /= elpa_ok) &
171         CPABORT("The linked ELPA library does not support the required API version")
172#else
173      CPABORT("Initialization of ELPA library requested but not enabled during build")
174#endif
175   END SUBROUTINE
176
177! **************************************************************************************************
178!> \brief Finalize the ELPA library
179! **************************************************************************************************
180   SUBROUTINE finalize_elpa_library()
181#if defined(__ELPA)
182      CALL elpa_uninit()
183#else
184      CPABORT("Finalization of ELPA library requested but not enabled during build")
185#endif
186   END SUBROUTINE
187
188! **************************************************************************************************
189!> \brief Sets the active ELPA kernel.
190!> \param kernel Integer between 1 and get_elpa_number_kernels()
191! **************************************************************************************************
192   SUBROUTINE set_elpa_kernel(kernel)
193      INTEGER, INTENT(IN) :: kernel
194
195#:def pick_macro(kernel_const)
196#! use Fypp's eval directive to hide ELPA flags from convention checker
197$: "#if defined(__ELPA)"
198$: "   elpa_kernel = ELPA_2STAGE_REAL_" + kernel_const
199$: "#else"
200$: "   CPABORT('ELPA is not available')"
201$: "#endif"
202#:enddef
203
204      SELECT CASE (kernel)
205      CASE (1)
206         elpa_kernel = -1 ! auto
207      CASE (2)
208@:        pick_macro(GENERIC)
209      CASE (3)
210@:        pick_macro(GENERIC_SIMPLE)
211      CASE (4)
212@:        pick_macro(BGP)
213      CASE (5)
214@:        pick_macro(BGQ)
215      CASE (6)
216@:        pick_macro(SSE_ASSEMBLY)
217      CASE (7)
218@:        pick_macro(SSE_BLOCK2)
219      CASE (8)
220@:        pick_macro(SSE_BLOCK4)
221      CASE (9)
222@:        pick_macro(SSE_BLOCK6)
223      CASE (10)
224@:        pick_macro(AVX_BLOCK2)
225      CASE (11)
226@:        pick_macro(AVX_BLOCK4)
227      CASE (12)
228@:        pick_macro(AVX_BLOCK6)
229      CASE (13)
230@:        pick_macro(AVX2_BLOCK2)
231      CASE (14)
232@:        pick_macro(AVX2_BLOCK4)
233      CASE (15)
234@:        pick_macro(AVX2_BLOCK6)
235      CASE (16)
236@:        pick_macro(AVX512_BLOCK2)
237      CASE (17)
238@:        pick_macro(AVX512_BLOCK4)
239      CASE (18)
240@:        pick_macro(AVX512_BLOCK6)
241      CASE (19)
242@:        pick_macro(GPU)
243         elpa_use_gpu = .TRUE.
244      CASE DEFAULT
245         CPABORT("Invalid ELPA kernel selected")
246      END SELECT
247
248   END SUBROUTINE set_elpa_kernel
249
250! **************************************************************************************************
251!> \brief Sets flags that determines if ELPA should try to use QR during diagonalization
252!>        If use_qr = .TRUE., the QR step is performed only if the size of the input matrix is
253!>        suitable. Check cp_fm_diag_elpa for further details.
254!> \param use_qr the logical flag
255!> \param use_qr_unsafe logical which determines if block size checks should be bypassed for some
256!>                      ELPA versions, potentially leading to incorrect eigenvalues
257! **************************************************************************************************
258   SUBROUTINE set_elpa_qr(use_qr, use_qr_unsafe)
259      LOGICAL, INTENT(IN) :: use_qr, use_qr_unsafe
260
261      elpa_qr = use_qr
262      elpa_qr_unsafe = use_qr_unsafe
263   END SUBROUTINE set_elpa_qr
264
265! **************************************************************************************************
266!> \brief Sets a flag that determines if additional information about the ELPA diagonalization
267!>        should be printed when the diagonalization routine is called.
268!> \param flag the logical flag
269! **************************************************************************************************
270   SUBROUTINE set_elpa_print(flag)
271      LOGICAL, INTENT(IN) :: flag
272
273      elpa_should_print = flag
274   END SUBROUTINE set_elpa_print
275
276! **************************************************************************************************
277!> \brief Driver routine to diagonalize a FM matrix with the ELPA library.
278!> \param matrix the matrix that is diagonalized
279!> \param eigenvectors eigenvectors of the input matrix
280!> \param eigenvalues eigenvalues of the input matrix
281! **************************************************************************************************
282   SUBROUTINE cp_fm_diag_elpa(matrix, eigenvectors, eigenvalues)
283      TYPE(cp_fm_type), POINTER                :: matrix, eigenvectors
284      REAL(KIND=dp), DIMENSION(:)              :: eigenvalues
285
286      CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_diag_elpa'
287
288#if defined(__ELPA)
289      INTEGER                                  :: handle
290      TYPE(cp_fm_type), POINTER                :: eigenvectors_new, matrix_new
291      TYPE(cp_fm_redistribute_info)            :: rdinfo
292
293      CALL timeset(routineN, handle)
294
295      ! Determine if the input matrix needs to be redistributed before diagonalization.
296      ! Heuristics are used to determine the optimal number of CPUs for diagonalization.
297      ! The redistributed matrix is stored in matrix_new, which is just a pointer
298      ! to the original matrix if no redistribution is required.
299      ! With ELPA, we have to make sure that all processor columns have nonzero width
300      CALL cp_fm_redistribute_start(matrix, eigenvectors, matrix_new, eigenvectors_new, &
301                                    caller_is_elpa=.TRUE., redist_info=rdinfo)
302
303      ! Call ELPA on CPUs that hold the new matrix
304      IF (ASSOCIATED(matrix_new)) &
305         CALL cp_fm_diag_elpa_base(matrix_new, eigenvectors_new, eigenvalues, rdinfo)
306
307      ! Redistribute results and clean up
308      CALL cp_fm_redistribute_end(matrix, eigenvectors, eigenvalues, matrix_new, eigenvectors_new)
309
310      CALL timestop(handle)
311#else
312      MARK_USED(matrix)
313      MARK_USED(eigenvectors)
314      MARK_USED(eigenvalues)
315
316      CPABORT("CP2K compiled without the ELPA library.")
317#endif
318   END SUBROUTINE cp_fm_diag_elpa
319
320#if defined(__ELPA)
321! **************************************************************************************************
322!> \brief Actual routine that calls ELPA to diagonalize a FM matrix.
323!> \param matrix the matrix that is diagonalized
324!> \param eigenvectors eigenvectors of the input matrix
325!> \param eigenvalues eigenvalues of the input matrix
326! **************************************************************************************************
327   SUBROUTINE cp_fm_diag_elpa_base(matrix, eigenvectors, eigenvalues, rdinfo)
328
329      TYPE(cp_fm_type), POINTER, INTENT(INOUT) :: matrix, eigenvectors
330      REAL(KIND=dp), DIMENSION(:), INTENT(INOUT) :: eigenvalues
331      TYPE(cp_fm_redistribute_info), INTENT(IN) :: rdinfo
332
333      CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_diag_elpa_base'
334
335      INTEGER                                  :: handle
336      CLASS(elpa_t), POINTER                   :: elpa_obj
337      CHARACTER(len=14)                        :: kernel
338      INTEGER                                  :: group, &
339                                                  mypcol, myprow, n, &
340                                                  n_rows, n_cols, &
341                                                  nblk, neig, io_unit, &
342                                                  success
343      LOGICAL                                  :: use_qr, check_eigenvalues
344      REAL(KIND=dp), DIMENSION(:), ALLOCATABLE :: eval, eval_noqr
345      TYPE(cp_blacs_env_type), POINTER         :: context
346      TYPE(cp_fm_type), POINTER                :: matrix_noqr, eigenvectors_noqr
347      TYPE(cp_logger_type), POINTER            :: logger
348      REAL(KIND=dp), PARAMETER                 :: th = 1.0E-14_dp
349      INTEGER, DIMENSION(:), POINTER           :: ncol_locals
350
351      CALL timeset(routineN, handle)
352      NULLIFY (logger)
353      NULLIFY (ncol_locals)
354
355      check_eigenvalues = .FALSE.
356
357      logger => cp_get_default_logger()
358      io_unit = cp_logger_get_default_io_unit(logger)
359
360      n = matrix%matrix_struct%nrow_global
361      context => matrix%matrix_struct%context
362      group = matrix%matrix_struct%para_env%group
363
364      myprow = context%mepos(1)
365      mypcol = context%mepos(2)
366
367      ! elpa needs the full matrix
368      CALL cp_fm_upper_to_full(matrix, eigenvectors)
369
370      CALL cp_fm_struct_get(matrix%matrix_struct, &
371                            local_leading_dimension=n_rows, &
372                            ncol_local=n_cols, &
373                            nrow_block=nblk, &
374                            ncol_locals=ncol_locals)
375
376      ! ELPA will fail in 'solve_tridi', with no useful error message, fail earlier
377      IF (io_unit > 0 .AND. ANY(ncol_locals == 0)) THEN
378         CALL rdinfo%write(io_unit)
379         CALL cp_fm_write_info(matrix, io_unit)
380         CPABORT("ELPA [pre-fail]: Problem contains processor column with zero width.")
381      END IF
382
383      neig = SIZE(eigenvalues, 1)
384      ! Decide if matrix is suitable for ELPA to use QR
385      ! The definition of what is considered a suitable matrix depends on the ELPA version
386      ! The relevant ELPA files to check are
387      !     - Proper matrix order:  src/elpa2/elpa2_template.F90
388      !     - Proper block size:    test/Fortran/test.F90
389      ! Note that the names of these files might change in different ELPA versions
390      ! Matrix order must be even
391      use_qr = elpa_qr .AND. (MOD(n, 2) .EQ. 0)
392      ! Matrix order and block size must be greater than or equal to 64
393      IF (.NOT. elpa_qr_unsafe) &
394         use_qr = use_qr .AND. (n .GE. 64) .AND. (nblk .GE. 64)
395
396      ! Check if eigenvalues computed with ELPA_QR_UNSAFE should be verified
397      IF (use_qr .AND. elpa_qr_unsafe .AND. elpa_should_print) &
398         check_eigenvalues = .TRUE.
399
400      CALL mp_bcast(check_eigenvalues, matrix%matrix_struct%para_env%source, matrix%matrix_struct%para_env%group)
401
402      IF (check_eigenvalues) THEN
403         ! Allocate and initialize needed temporaries to compute eigenvalues without ELPA QR
404         ALLOCATE (eval_noqr(n))
405         CALL cp_fm_create(matrix=matrix_noqr, matrix_struct=matrix%matrix_struct)
406         CALL cp_fm_to_fm(matrix, matrix_noqr)
407         CALL cp_fm_create(matrix=eigenvectors_noqr, matrix_struct=eigenvectors%matrix_struct)
408         CALL cp_fm_upper_to_full(matrix_noqr, eigenvectors_noqr)
409      END IF
410
411      IF (io_unit > 0 .AND. elpa_should_print) THEN
412         WRITE (io_unit, '(/,A)') "ELPA| Matrix diagonalization information"
413
414         IF (elpa_kernel == -1) THEN
415            kernel = TRIM(elpa_kernel_names(1))
416         ELSE
417            kernel = TRIM(elpa_kernel_names(elpa_kernel + 1)) ! this assumes that our indizes differ from the ELPA idx by -1
418         END IF
419
420         WRITE (io_unit, '(A,I14)') "ELPA| Matrix order             : ", n
421         WRITE (io_unit, '(A,I14)') "ELPA| Matrix block size        : ", nblk
422         WRITE (io_unit, '(A,A14)') "ELPA| Kernel                   : ", ADJUSTR(kernel)
423         WRITE (io_unit, '(A,L14)') "ELPA| QR step requested        : ", elpa_qr
424
425         IF (elpa_qr) THEN
426            WRITE (io_unit, '(A,L14)') "ELPA| Use potentially unsafe QR: ", elpa_qr_unsafe
427            WRITE (io_unit, '(A,L14)') "ELPA| Matrix is suitable for QR: ", use_qr
428
429            IF (.NOT. use_qr) THEN
430               IF (MOD(n, 2) .NE. 0) &
431                  WRITE (io_unit, '(A)') "ELPA| Matrix order is NOT even"
432               IF (nblk .LT. 64 .AND. .NOT. elpa_qr_unsafe) &
433                  WRITE (io_unit, '(A)') "ELPA| Matrix block size is NOT 64 or greater"
434            ELSE
435               IF (nblk .LT. 64 .AND. elpa_qr_unsafe) &
436                  WRITE (io_unit, '(A,L14)') "ELPA| Matrix block size check was bypassed"
437            END IF
438         END IF
439      END IF
440
441      ! the full eigenvalues vector is needed
442      ALLOCATE (eval(n))
443
444      elpa_obj => elpa_allocate()
445
446      CALL elpa_obj%set("na", n, success)
447      CALL elpa_obj%set("nev", neig, success)
448      CALL elpa_obj%set("local_nrows", n_rows, success)
449      CALL elpa_obj%set("local_ncols", n_cols, success)
450      CALL elpa_obj%set("nblk", nblk, success)
451      CALL elpa_obj%set("mpi_comm_parent", group, success)
452      CALL elpa_obj%set("process_row", myprow, success)
453      CALL elpa_obj%set("process_col", mypcol, success)
454
455      success = elpa_obj%setup()
456      IF (success /= elpa_ok) &
457         CPABORT("Setting up the ELPA object failed")
458
459      CALL elpa_obj%set("solver", elpa_solver_2stage, success)
460
461      IF (elpa_kernel .GE. 1) &
462         CALL elpa_obj%set("real_kernel", elpa_kernel, success)
463
464      IF (elpa_use_gpu) &
465         CALL elpa_obj%set("gpu", 1, success)
466
467      IF (use_qr) &
468         CALL elpa_obj%set("qr", 1, success)
469
470!$    CALL elpa_obj%set("omp_threads", omp_get_num_threads(), success)
471
472      CALL elpa_obj%eigenvectors(matrix%local_data, eval, eigenvectors%local_data, success)
473
474      IF (success /= elpa_ok) &
475         CPABORT("ELPA failed to diagonalize a matrix")
476
477      IF (check_eigenvalues) THEN
478         ! run again without QR
479         CALL elpa_obj%set("qr", 0, success)
480         CALL elpa_obj%eigenvectors(matrix_noqr%local_data, eval_noqr, eigenvectors_noqr%local_data, success)
481
482         IF (success /= elpa_ok) &
483            CPABORT("ELPA failed to diagonalize a matrix even without QR decomposition")
484
485         IF (ANY(ABS(eval(1:neig) - eval_noqr(1:neig)) .GT. th)) &
486            CPABORT("Eigenvalues calculated with QR decomp. in ELPA are wrong. Disable ELPA_QR_UNSAFE.")
487
488         DEALLOCATE (eval_noqr)
489         CALL cp_fm_release(matrix_noqr)
490         CALL cp_fm_release(eigenvectors_noqr)
491      END IF
492
493      CALL elpa_deallocate(elpa_obj)
494
495      eigenvalues(1:neig) = eval(1:neig)
496      DEALLOCATE (eval)
497
498      CALL timestop(handle)
499
500   END SUBROUTINE cp_fm_diag_elpa_base
501#endif
502
503END MODULE cp_fm_elpa
504