1!--------------------------------------------------------------------------------------------------! 2! CP2K: A general program to perform molecular dynamics simulations ! 3! Copyright (C) 2000 - 2020 CP2K developers group ! 4!--------------------------------------------------------------------------------------------------! 5 6! ************************************************************************************************** 7!> \brief Wrapper for ELPA 8!> \author Ole Schuett 9! ************************************************************************************************** 10MODULE cp_fm_elpa 11 USE cp_blacs_env, ONLY: cp_blacs_env_type 12 USE cp_fm_basic_linalg, ONLY: cp_fm_upper_to_full 13 USE cp_fm_diag_utils, ONLY: cp_fm_redistribute_start, & 14 cp_fm_redistribute_end, & 15 cp_fm_redistribute_info 16 USE cp_fm_struct, ONLY: cp_fm_struct_get 17 USE cp_fm_types, ONLY: cp_fm_type, & 18 cp_fm_to_fm, & 19 cp_fm_release, & 20 cp_fm_create, & 21 cp_fm_write_info 22 USE cp_log_handling, ONLY: cp_get_default_logger, & 23 cp_logger_get_default_io_unit, & 24 cp_logger_type 25 USE kinds, ONLY: default_string_length, & 26 dp 27 USE message_passing, ONLY: mp_comm_free, & 28 mp_comm_split_direct, & 29 mp_bcast 30!$ USE OMP_LIB, ONLY: omp_get_num_threads 31 32#include "../base/base_uses.f90" 33 34#if defined (__ELPA) 35 USE elpa_constants, ONLY: ELPA_2STAGE_REAL_GENERIC, & 36 ELPA_2STAGE_REAL_GENERIC_SIMPLE, & 37 ELPA_2STAGE_REAL_BGP, & 38 ELPA_2STAGE_REAL_BGQ, & 39 ELPA_2STAGE_REAL_SSE_ASSEMBLY, & 40 ELPA_2STAGE_REAL_SSE_BLOCK2, & 41 ELPA_2STAGE_REAL_SSE_BLOCK4, & 42 ELPA_2STAGE_REAL_SSE_BLOCK6, & 43 ELPA_2STAGE_REAL_AVX_BLOCK2, & 44 ELPA_2STAGE_REAL_AVX_BLOCK4, & 45 ELPA_2STAGE_REAL_AVX_BLOCK6, & 46 ELPA_2STAGE_REAL_AVX2_BLOCK2, & 47 ELPA_2STAGE_REAL_AVX2_BLOCK4, & 48 ELPA_2STAGE_REAL_AVX2_BLOCK6, & 49 ELPA_2STAGE_REAL_AVX512_BLOCK2, & 50 ELPA_2STAGE_REAL_AVX512_BLOCK4, & 51 ELPA_2STAGE_REAL_AVX512_BLOCK6, & 52 ELPA_2STAGE_REAL_GPU 53 USE elpa, ONLY: elpa_t, elpa_solver_2stage, & 54 elpa_init, elpa_uninit, & 55 elpa_allocate, elpa_deallocate, elpa_ok 56#endif 57 58 IMPLICIT NONE 59 60 PRIVATE 61 62 CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'cp_fm_elpa' 63 64 CHARACTER(len=14), DIMENSION(*), PARAMETER :: elpa_kernel_names = [CHARACTER(len=14) :: & 65 "AUTO", & 66 "GENERIC", & 67 "GENERIC_SIMPLE", & 68 "BGP", & 69 "BGQ", & 70 "SSE", & 71 "SSE_BLOCK2", & 72 "SSE_BLOCK4", & 73 "SSE_BLOCK6", & 74 "AVX_BLOCK2", & 75 "AVX_BLOCK4", & 76 "AVX_BLOCK6", & 77 "AVX2_BLOCK2", & 78 "AVX2_BLOCK4", & 79 "AVX2_BLOCK6", & 80 "AVX512_BLOCK2", & 81 "AVX512_BLOCK4", & 82 "AVX512_BLOCK6", & 83 "GPU"] 84 85 CHARACTER(len=44), DIMENSION(*), PARAMETER :: elpa_kernel_descriptions = [CHARACTER(len=44) :: & 86 "Automatically selected kernel", & 87 "Generic kernel", & 88 "Simplified generic kernel", & 89 "Kernel optimized for IBM BGP", & 90 "Kernel optimized for IBM BGQ", & 91 "Kernel optimized for x86_64/SSE", & 92 "Kernel optimized for x86_64/SSE (block=2)", & 93 "Kernel optimized for x86_64/SSE (block=4)", & 94 "Kernel optimized for x86_64/SSE (block=6)", & 95 "Kernel optimized for Intel AVX (block=2)", & 96 "Kernel optimized for Intel AVX (block=4)", & 97 "Kernel optimized for Intel AVX (block=6)", & 98 "Kernel optimized for Intel AVX2 (block=2)", & 99 "Kernel optimized for Intel AVX2 (block=4)", & 100 "Kernel optimized for Intel AVX2 (block=6)", & 101 "Kernel optimized for Intel AVX-512 (block=2)", & 102 "Kernel optimized for Intel AVX-512 (block=4)", & 103 "Kernel optimized for Intel AVX-512 (block=6)", & 104 "Kernel targeting GPUs"] 105 106 INTEGER, SAVE :: elpa_kernel = -1 ! auto 107 LOGICAL, SAVE :: elpa_qr = .FALSE., & 108 elpa_qr_unsafe = .FALSE., & 109 elpa_should_print = .FALSE., & 110 elpa_use_gpu = .FALSE. 111 112 PUBLIC :: cp_fm_diag_elpa, & 113 get_elpa_number_kernels, & 114 get_elpa_kernel_index, & 115 set_elpa_kernel, & 116 set_elpa_qr, & 117 set_elpa_print, & 118 elpa_kernel_names, & 119 elpa_kernel_descriptions, & 120 initialize_elpa_library, & 121 finalize_elpa_library 122 123CONTAINS 124 125! ************************************************************************************************** 126!> \brief Return the number of available ELPA kernels 127!> \return ... 128! ************************************************************************************************** 129 PURE FUNCTION get_elpa_number_kernels() RESULT(num) 130 INTEGER :: num 131 132 num = SIZE(elpa_kernel_names) 133 END FUNCTION get_elpa_number_kernels 134 135! ************************************************************************************************** 136!> \brief Returns the index of an ELPA kernel suitable for the given CPUID 137!> \return ... 138! ************************************************************************************************** 139 PURE FUNCTION get_elpa_kernel_index(cpuid) RESULT(index) 140 USE machine, ONLY: MACHINE_X86, & 141 MACHINE_CPU_GENERIC, & 142 MACHINE_X86_SSE4, & 143 MACHINE_X86_AVX, & 144 MACHINE_X86_AVX2 145 INTEGER, INTENT(IN) :: cpuid 146 INTEGER :: index 147 148 index = 1 ! AUTO 149 IF ((MACHINE_CPU_GENERIC .LT. cpuid) .AND. (cpuid .LE. MACHINE_X86)) THEN 150 ! the following indizes correspond to OUR list of kernels, not the ones from ELPA 151 SELECT CASE (cpuid) 152 CASE (MACHINE_X86_SSE4) 153 index = 8 ! ELPA_2STAGE_REAL_SSE_BLOCK4 154 CASE (MACHINE_X86_AVX) 155 index = 11 ! ELPA_2STAGE_REAL_AVX_BLOCK4 156 CASE (MACHINE_X86_AVX2) 157 index = 14 ! ELPA_2STAGE_REAL_AVX2_BLOCK4 158 CASE DEFAULT 159 index = 17 ! ELPA_2STAGE_REAL_AVX512_BLOCK4 160 END SELECT 161 END IF 162 163 END FUNCTION get_elpa_kernel_index 164 165! ************************************************************************************************** 166!> \brief Initialize the ELPA library 167! ************************************************************************************************** 168 SUBROUTINE initialize_elpa_library() 169#if defined(__ELPA) 170 IF (elpa_init(20180525) /= elpa_ok) & 171 CPABORT("The linked ELPA library does not support the required API version") 172#else 173 CPABORT("Initialization of ELPA library requested but not enabled during build") 174#endif 175 END SUBROUTINE 176 177! ************************************************************************************************** 178!> \brief Finalize the ELPA library 179! ************************************************************************************************** 180 SUBROUTINE finalize_elpa_library() 181#if defined(__ELPA) 182 CALL elpa_uninit() 183#else 184 CPABORT("Finalization of ELPA library requested but not enabled during build") 185#endif 186 END SUBROUTINE 187 188! ************************************************************************************************** 189!> \brief Sets the active ELPA kernel. 190!> \param kernel Integer between 1 and get_elpa_number_kernels() 191! ************************************************************************************************** 192 SUBROUTINE set_elpa_kernel(kernel) 193 INTEGER, INTENT(IN) :: kernel 194 195#:def pick_macro(kernel_const) 196#! use Fypp's eval directive to hide ELPA flags from convention checker 197$: "#if defined(__ELPA)" 198$: " elpa_kernel = ELPA_2STAGE_REAL_" + kernel_const 199$: "#else" 200$: " CPABORT('ELPA is not available')" 201$: "#endif" 202#:enddef 203 204 SELECT CASE (kernel) 205 CASE (1) 206 elpa_kernel = -1 ! auto 207 CASE (2) 208@: pick_macro(GENERIC) 209 CASE (3) 210@: pick_macro(GENERIC_SIMPLE) 211 CASE (4) 212@: pick_macro(BGP) 213 CASE (5) 214@: pick_macro(BGQ) 215 CASE (6) 216@: pick_macro(SSE_ASSEMBLY) 217 CASE (7) 218@: pick_macro(SSE_BLOCK2) 219 CASE (8) 220@: pick_macro(SSE_BLOCK4) 221 CASE (9) 222@: pick_macro(SSE_BLOCK6) 223 CASE (10) 224@: pick_macro(AVX_BLOCK2) 225 CASE (11) 226@: pick_macro(AVX_BLOCK4) 227 CASE (12) 228@: pick_macro(AVX_BLOCK6) 229 CASE (13) 230@: pick_macro(AVX2_BLOCK2) 231 CASE (14) 232@: pick_macro(AVX2_BLOCK4) 233 CASE (15) 234@: pick_macro(AVX2_BLOCK6) 235 CASE (16) 236@: pick_macro(AVX512_BLOCK2) 237 CASE (17) 238@: pick_macro(AVX512_BLOCK4) 239 CASE (18) 240@: pick_macro(AVX512_BLOCK6) 241 CASE (19) 242@: pick_macro(GPU) 243 elpa_use_gpu = .TRUE. 244 CASE DEFAULT 245 CPABORT("Invalid ELPA kernel selected") 246 END SELECT 247 248 END SUBROUTINE set_elpa_kernel 249 250! ************************************************************************************************** 251!> \brief Sets flags that determines if ELPA should try to use QR during diagonalization 252!> If use_qr = .TRUE., the QR step is performed only if the size of the input matrix is 253!> suitable. Check cp_fm_diag_elpa for further details. 254!> \param use_qr the logical flag 255!> \param use_qr_unsafe logical which determines if block size checks should be bypassed for some 256!> ELPA versions, potentially leading to incorrect eigenvalues 257! ************************************************************************************************** 258 SUBROUTINE set_elpa_qr(use_qr, use_qr_unsafe) 259 LOGICAL, INTENT(IN) :: use_qr, use_qr_unsafe 260 261 elpa_qr = use_qr 262 elpa_qr_unsafe = use_qr_unsafe 263 END SUBROUTINE set_elpa_qr 264 265! ************************************************************************************************** 266!> \brief Sets a flag that determines if additional information about the ELPA diagonalization 267!> should be printed when the diagonalization routine is called. 268!> \param flag the logical flag 269! ************************************************************************************************** 270 SUBROUTINE set_elpa_print(flag) 271 LOGICAL, INTENT(IN) :: flag 272 273 elpa_should_print = flag 274 END SUBROUTINE set_elpa_print 275 276! ************************************************************************************************** 277!> \brief Driver routine to diagonalize a FM matrix with the ELPA library. 278!> \param matrix the matrix that is diagonalized 279!> \param eigenvectors eigenvectors of the input matrix 280!> \param eigenvalues eigenvalues of the input matrix 281! ************************************************************************************************** 282 SUBROUTINE cp_fm_diag_elpa(matrix, eigenvectors, eigenvalues) 283 TYPE(cp_fm_type), POINTER :: matrix, eigenvectors 284 REAL(KIND=dp), DIMENSION(:) :: eigenvalues 285 286 CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_diag_elpa' 287 288#if defined(__ELPA) 289 INTEGER :: handle 290 TYPE(cp_fm_type), POINTER :: eigenvectors_new, matrix_new 291 TYPE(cp_fm_redistribute_info) :: rdinfo 292 293 CALL timeset(routineN, handle) 294 295 ! Determine if the input matrix needs to be redistributed before diagonalization. 296 ! Heuristics are used to determine the optimal number of CPUs for diagonalization. 297 ! The redistributed matrix is stored in matrix_new, which is just a pointer 298 ! to the original matrix if no redistribution is required. 299 ! With ELPA, we have to make sure that all processor columns have nonzero width 300 CALL cp_fm_redistribute_start(matrix, eigenvectors, matrix_new, eigenvectors_new, & 301 caller_is_elpa=.TRUE., redist_info=rdinfo) 302 303 ! Call ELPA on CPUs that hold the new matrix 304 IF (ASSOCIATED(matrix_new)) & 305 CALL cp_fm_diag_elpa_base(matrix_new, eigenvectors_new, eigenvalues, rdinfo) 306 307 ! Redistribute results and clean up 308 CALL cp_fm_redistribute_end(matrix, eigenvectors, eigenvalues, matrix_new, eigenvectors_new) 309 310 CALL timestop(handle) 311#else 312 MARK_USED(matrix) 313 MARK_USED(eigenvectors) 314 MARK_USED(eigenvalues) 315 316 CPABORT("CP2K compiled without the ELPA library.") 317#endif 318 END SUBROUTINE cp_fm_diag_elpa 319 320#if defined(__ELPA) 321! ************************************************************************************************** 322!> \brief Actual routine that calls ELPA to diagonalize a FM matrix. 323!> \param matrix the matrix that is diagonalized 324!> \param eigenvectors eigenvectors of the input matrix 325!> \param eigenvalues eigenvalues of the input matrix 326! ************************************************************************************************** 327 SUBROUTINE cp_fm_diag_elpa_base(matrix, eigenvectors, eigenvalues, rdinfo) 328 329 TYPE(cp_fm_type), POINTER, INTENT(INOUT) :: matrix, eigenvectors 330 REAL(KIND=dp), DIMENSION(:), INTENT(INOUT) :: eigenvalues 331 TYPE(cp_fm_redistribute_info), INTENT(IN) :: rdinfo 332 333 CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_diag_elpa_base' 334 335 INTEGER :: handle 336 CLASS(elpa_t), POINTER :: elpa_obj 337 CHARACTER(len=14) :: kernel 338 INTEGER :: group, & 339 mypcol, myprow, n, & 340 n_rows, n_cols, & 341 nblk, neig, io_unit, & 342 success 343 LOGICAL :: use_qr, check_eigenvalues 344 REAL(KIND=dp), DIMENSION(:), ALLOCATABLE :: eval, eval_noqr 345 TYPE(cp_blacs_env_type), POINTER :: context 346 TYPE(cp_fm_type), POINTER :: matrix_noqr, eigenvectors_noqr 347 TYPE(cp_logger_type), POINTER :: logger 348 REAL(KIND=dp), PARAMETER :: th = 1.0E-14_dp 349 INTEGER, DIMENSION(:), POINTER :: ncol_locals 350 351 CALL timeset(routineN, handle) 352 NULLIFY (logger) 353 NULLIFY (ncol_locals) 354 355 check_eigenvalues = .FALSE. 356 357 logger => cp_get_default_logger() 358 io_unit = cp_logger_get_default_io_unit(logger) 359 360 n = matrix%matrix_struct%nrow_global 361 context => matrix%matrix_struct%context 362 group = matrix%matrix_struct%para_env%group 363 364 myprow = context%mepos(1) 365 mypcol = context%mepos(2) 366 367 ! elpa needs the full matrix 368 CALL cp_fm_upper_to_full(matrix, eigenvectors) 369 370 CALL cp_fm_struct_get(matrix%matrix_struct, & 371 local_leading_dimension=n_rows, & 372 ncol_local=n_cols, & 373 nrow_block=nblk, & 374 ncol_locals=ncol_locals) 375 376 ! ELPA will fail in 'solve_tridi', with no useful error message, fail earlier 377 IF (io_unit > 0 .AND. ANY(ncol_locals == 0)) THEN 378 CALL rdinfo%write(io_unit) 379 CALL cp_fm_write_info(matrix, io_unit) 380 CPABORT("ELPA [pre-fail]: Problem contains processor column with zero width.") 381 END IF 382 383 neig = SIZE(eigenvalues, 1) 384 ! Decide if matrix is suitable for ELPA to use QR 385 ! The definition of what is considered a suitable matrix depends on the ELPA version 386 ! The relevant ELPA files to check are 387 ! - Proper matrix order: src/elpa2/elpa2_template.F90 388 ! - Proper block size: test/Fortran/test.F90 389 ! Note that the names of these files might change in different ELPA versions 390 ! Matrix order must be even 391 use_qr = elpa_qr .AND. (MOD(n, 2) .EQ. 0) 392 ! Matrix order and block size must be greater than or equal to 64 393 IF (.NOT. elpa_qr_unsafe) & 394 use_qr = use_qr .AND. (n .GE. 64) .AND. (nblk .GE. 64) 395 396 ! Check if eigenvalues computed with ELPA_QR_UNSAFE should be verified 397 IF (use_qr .AND. elpa_qr_unsafe .AND. elpa_should_print) & 398 check_eigenvalues = .TRUE. 399 400 CALL mp_bcast(check_eigenvalues, matrix%matrix_struct%para_env%source, matrix%matrix_struct%para_env%group) 401 402 IF (check_eigenvalues) THEN 403 ! Allocate and initialize needed temporaries to compute eigenvalues without ELPA QR 404 ALLOCATE (eval_noqr(n)) 405 CALL cp_fm_create(matrix=matrix_noqr, matrix_struct=matrix%matrix_struct) 406 CALL cp_fm_to_fm(matrix, matrix_noqr) 407 CALL cp_fm_create(matrix=eigenvectors_noqr, matrix_struct=eigenvectors%matrix_struct) 408 CALL cp_fm_upper_to_full(matrix_noqr, eigenvectors_noqr) 409 END IF 410 411 IF (io_unit > 0 .AND. elpa_should_print) THEN 412 WRITE (io_unit, '(/,A)') "ELPA| Matrix diagonalization information" 413 414 IF (elpa_kernel == -1) THEN 415 kernel = TRIM(elpa_kernel_names(1)) 416 ELSE 417 kernel = TRIM(elpa_kernel_names(elpa_kernel + 1)) ! this assumes that our indizes differ from the ELPA idx by -1 418 END IF 419 420 WRITE (io_unit, '(A,I14)') "ELPA| Matrix order : ", n 421 WRITE (io_unit, '(A,I14)') "ELPA| Matrix block size : ", nblk 422 WRITE (io_unit, '(A,A14)') "ELPA| Kernel : ", ADJUSTR(kernel) 423 WRITE (io_unit, '(A,L14)') "ELPA| QR step requested : ", elpa_qr 424 425 IF (elpa_qr) THEN 426 WRITE (io_unit, '(A,L14)') "ELPA| Use potentially unsafe QR: ", elpa_qr_unsafe 427 WRITE (io_unit, '(A,L14)') "ELPA| Matrix is suitable for QR: ", use_qr 428 429 IF (.NOT. use_qr) THEN 430 IF (MOD(n, 2) .NE. 0) & 431 WRITE (io_unit, '(A)') "ELPA| Matrix order is NOT even" 432 IF (nblk .LT. 64 .AND. .NOT. elpa_qr_unsafe) & 433 WRITE (io_unit, '(A)') "ELPA| Matrix block size is NOT 64 or greater" 434 ELSE 435 IF (nblk .LT. 64 .AND. elpa_qr_unsafe) & 436 WRITE (io_unit, '(A,L14)') "ELPA| Matrix block size check was bypassed" 437 END IF 438 END IF 439 END IF 440 441 ! the full eigenvalues vector is needed 442 ALLOCATE (eval(n)) 443 444 elpa_obj => elpa_allocate() 445 446 CALL elpa_obj%set("na", n, success) 447 CALL elpa_obj%set("nev", neig, success) 448 CALL elpa_obj%set("local_nrows", n_rows, success) 449 CALL elpa_obj%set("local_ncols", n_cols, success) 450 CALL elpa_obj%set("nblk", nblk, success) 451 CALL elpa_obj%set("mpi_comm_parent", group, success) 452 CALL elpa_obj%set("process_row", myprow, success) 453 CALL elpa_obj%set("process_col", mypcol, success) 454 455 success = elpa_obj%setup() 456 IF (success /= elpa_ok) & 457 CPABORT("Setting up the ELPA object failed") 458 459 CALL elpa_obj%set("solver", elpa_solver_2stage, success) 460 461 IF (elpa_kernel .GE. 1) & 462 CALL elpa_obj%set("real_kernel", elpa_kernel, success) 463 464 IF (elpa_use_gpu) & 465 CALL elpa_obj%set("gpu", 1, success) 466 467 IF (use_qr) & 468 CALL elpa_obj%set("qr", 1, success) 469 470!$ CALL elpa_obj%set("omp_threads", omp_get_num_threads(), success) 471 472 CALL elpa_obj%eigenvectors(matrix%local_data, eval, eigenvectors%local_data, success) 473 474 IF (success /= elpa_ok) & 475 CPABORT("ELPA failed to diagonalize a matrix") 476 477 IF (check_eigenvalues) THEN 478 ! run again without QR 479 CALL elpa_obj%set("qr", 0, success) 480 CALL elpa_obj%eigenvectors(matrix_noqr%local_data, eval_noqr, eigenvectors_noqr%local_data, success) 481 482 IF (success /= elpa_ok) & 483 CPABORT("ELPA failed to diagonalize a matrix even without QR decomposition") 484 485 IF (ANY(ABS(eval(1:neig) - eval_noqr(1:neig)) .GT. th)) & 486 CPABORT("Eigenvalues calculated with QR decomp. in ELPA are wrong. Disable ELPA_QR_UNSAFE.") 487 488 DEALLOCATE (eval_noqr) 489 CALL cp_fm_release(matrix_noqr) 490 CALL cp_fm_release(eigenvectors_noqr) 491 END IF 492 493 CALL elpa_deallocate(elpa_obj) 494 495 eigenvalues(1:neig) = eval(1:neig) 496 DEALLOCATE (eval) 497 498 CALL timestop(handle) 499 500 END SUBROUTINE cp_fm_diag_elpa_base 501#endif 502 503END MODULE cp_fm_elpa 504