1!--------------------------------------------------------------------------------------------------! 2! CP2K: A general program to perform molecular dynamics simulations ! 3! Copyright (C) 2000 - 2019 CP2K developers group ! 4!--------------------------------------------------------------------------------------------------! 5 6! ************************************************************************************************** 7!> \brief operations for skinny matrices/vectors expressed in dbcsr form 8!> \par History 9!> 2014.10 created [Florian Schiffmann] 10!> \author Florian Schiffmann 11! ************************************************************************************************** 12 13MODULE dbcsr_vector 14 USE dbcsr_api, ONLY: dbcsr_copy,& 15 dbcsr_create,& 16 dbcsr_distribution_get,& 17 dbcsr_distribution_new,& 18 dbcsr_distribution_release,& 19 dbcsr_distribution_type,& 20 dbcsr_get_info,& 21 dbcsr_iterator_blocks_left,& 22 dbcsr_iterator_next_block,& 23 dbcsr_iterator_start,& 24 dbcsr_iterator_stop,& 25 dbcsr_iterator_type,& 26 dbcsr_release,& 27 dbcsr_reserve_all_blocks,& 28 dbcsr_set,dbcsr_get_data_p,& 29 dbcsr_type,& 30 dbcsr_type_antisymmetric,& 31 dbcsr_type_complex_4,& 32 dbcsr_type_complex_4,& 33 dbcsr_type_complex_8,& 34 dbcsr_type_complex_8,& 35 dbcsr_type_no_symmetry,& 36 dbcsr_type_real_4,& 37 dbcsr_type_real_4,& 38 dbcsr_type_real_8,& 39 dbcsr_type_real_8,& 40 dbcsr_type_symmetric 41 USE kinds, ONLY: real_4,& 42 real_8 43 USE message_passing, ONLY: mp_bcast,& 44 mp_sum 45 46 47#include "../base/base_uses.f90" 48 49!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads 50 51 IMPLICIT NONE 52 53 PRIVATE 54 55 CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'dbcsr_vector_operations' 56 57! ! the following types provide fast access to distributed dbcsr vectors 58#include "hash_table_types.f90" 59 60 TYPE block_ptr_d 61 REAL(real_8), DIMENSION(:, :), POINTER :: ptr => NULL() 62 INTEGER :: assigned_thread 63 END TYPE 64 TYPE block_ptr_s 65 REAL(real_4), DIMENSION(:, :), POINTER :: ptr => NULL() 66 INTEGER :: assigned_thread 67 END TYPE 68 TYPE block_ptr_c 69 COMPLEX(real_4), DIMENSION(:, :), POINTER :: ptr => NULL() 70 INTEGER :: assigned_thread 71 END TYPE 72 TYPE block_ptr_z 73 COMPLEX(real_8), DIMENSION(:, :), POINTER :: ptr => NULL() 74 INTEGER :: assigned_thread 75 END TYPE 76 77 TYPE fast_vec_access_type 78 TYPE(hash_table_type) :: hash_table 79 TYPE(block_ptr_d), DIMENSION(:), ALLOCATABLE :: blk_map_d 80 TYPE(block_ptr_s), DIMENSION(:), ALLOCATABLE :: blk_map_s 81 TYPE(block_ptr_c), DIMENSION(:), ALLOCATABLE :: blk_map_c 82 TYPE(block_ptr_z), DIMENSION(:), ALLOCATABLE :: blk_map_z 83 END TYPE 84 85 PUBLIC :: dbcsr_matrix_colvec_multiply, & 86 create_col_vec_from_matrix, & 87 create_row_vec_from_matrix, & 88 create_replicated_col_vec_from_matrix, & 89 create_replicated_row_vec_from_matrix 90 91 INTERFACE dbcsr_matrix_colvec_multiply 92 MODULE PROCEDURE dbcsr_matrix_colvec_multiply_d 93 MODULE PROCEDURE dbcsr_matrix_colvec_multiply_s 94 MODULE PROCEDURE dbcsr_matrix_colvec_multiply_z 95 MODULE PROCEDURE dbcsr_matrix_colvec_multiply_c 96 END INTERFACE 97 98CONTAINS 99 100! ************************************************************************************************** 101!> \brief creates a dbcsr col vector like object which lives on proc_col 0 102!> and has the same row dist as the template matrix 103!> the returned matrix is fully allocated and all blocks are set to 0 104!> this is not a sparse object (and must never be) 105!> \param dbcsr_vec the vector object to create must be allocated but not initialized 106!> \param matrix a dbcsr matrix used as template 107!> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix) 108! ************************************************************************************************** 109 SUBROUTINE create_col_vec_from_matrix(dbcsr_vec, matrix, ncol) 110 TYPE(dbcsr_type) :: dbcsr_vec, matrix 111 INTEGER :: ncol 112 113 CHARACTER(LEN=*), PARAMETER :: routineN = 'create_col_vec_from_matrix' 114 115 INTEGER :: handle, npcols, data_type 116 INTEGER, DIMENSION(:), POINTER :: row_blk_size, col_blk_size, row_dist, col_dist 117 TYPE(dbcsr_distribution_type) :: dist_col_vec, dist 118 119 CALL timeset(routineN, handle) 120 121 CALL dbcsr_get_info(matrix, data_type=data_type, row_blk_size=row_blk_size, distribution=dist) 122 CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist) 123 124 ALLOCATE (col_dist(1), col_blk_size(1)) 125 col_dist(1) = 0 126 col_blk_size(1) = ncol 127 CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist) 128 129 CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec,& 130 matrix_type=dbcsr_type_no_symmetry, & 131 row_blk_size=row_blk_size,& 132 col_blk_size=col_blk_size, & 133 nze=0, data_type=data_type) 134 CALL dbcsr_reserve_all_blocks(dbcsr_vec) 135 136 CALL dbcsr_distribution_release(dist_col_vec) 137 DEALLOCATE (col_dist, col_blk_size) 138 CALL timestop(handle) 139 140 END SUBROUTINE create_col_vec_from_matrix 141 142! ************************************************************************************************** 143!> \brief creates a dbcsr row vector like object which lives on proc_row 0 144!> and has the same row dist as the template matrix 145!> the returned matrix is fully allocated and all blocks are set to 0 146!> this is not a sparse object (and must never be) 147!> \param dbcsr_vec ... 148!> \param matrix a dbcsr matrix used as template 149!> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix) 150! ************************************************************************************************** 151SUBROUTINE create_row_vec_from_matrix(dbcsr_vec, matrix, nrow) 152 TYPE(dbcsr_type) :: dbcsr_vec, matrix 153 INTEGER :: nrow 154 155 CHARACTER(LEN=*), PARAMETER :: routineN = 'create_row_vec_from_matrix' 156 157 INTEGER :: handle, nprows, data_type 158 INTEGER, DIMENSION(:), POINTER :: row_blk_size, col_blk_size, row_dist, col_dist 159 TYPE(dbcsr_distribution_type) :: dist_row_vec, dist 160 161 CALL timeset(routineN, handle) 162 163 CALL dbcsr_get_info(matrix, data_type=data_type, col_blk_size=col_blk_size, distribution=dist) 164 CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist) 165 166 ALLOCATE (row_dist(1), row_blk_size(1)) 167 row_dist(1) = 0 168 row_blk_size(1) = nrow 169 CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist) 170 171 CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec,& 172 matrix_type=dbcsr_type_no_symmetry, & 173 row_blk_size=row_blk_size,& 174 col_blk_size=col_blk_size, & 175 nze=0, data_type=data_type) 176 CALL dbcsr_reserve_all_blocks(dbcsr_vec) 177 178 CALL dbcsr_distribution_release(dist_row_vec) 179 DEALLOCATE (row_dist, row_blk_size) 180 CALL timestop(handle) 181 182 END SUBROUTINE create_row_vec_from_matrix 183 184 185! ************************************************************************************************** 186!> \brief creates a col vector like object whose blocks can be replicated 187!> along the processor row and has the same row dist as the template matrix 188!> the returned matrix is fully allocated and all blocks are set to 0 189!> this is not a sparse object (and must never be) 190!> \param dbcsr_vec the vector object to create must be allocated but not initialized 191!> \param matrix a dbcsr matrix used as template 192!> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix) 193! ************************************************************************************************** 194 SUBROUTINE create_replicated_col_vec_from_matrix(dbcsr_vec, matrix, ncol) 195 TYPE(dbcsr_type) :: dbcsr_vec, matrix 196 INTEGER :: ncol 197 198 CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_col_vec_from_matrix' 199 200 INTEGER :: handle, npcols, data_type, i 201 INTEGER, DIMENSION(:), POINTER :: row_blk_size, col_blk_size, row_dist, col_dist 202 TYPE(dbcsr_distribution_type) :: dist_col_vec, dist 203 CALL timeset(routineN, handle) 204 205 CALL dbcsr_get_info(matrix, data_type=data_type, row_blk_size=row_blk_size, distribution=dist) 206 CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist) 207 208 ALLOCATE (col_dist(npcols), col_blk_size(npcols)) 209 col_blk_size(:) = ncol 210 DO i = 0, npcols-1 211 col_dist(i+1) = i 212 END DO 213 CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist) 214 215 CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, & 216 matrix_type=dbcsr_type_no_symmetry, & 217 row_blk_size=row_blk_size,& 218 col_blk_size=col_blk_size, & 219 nze=0, data_type=data_type) 220 CALL dbcsr_reserve_all_blocks(dbcsr_vec) 221 222 CALL dbcsr_distribution_release(dist_col_vec) 223 DEALLOCATE (col_dist, col_blk_size) 224 CALL timestop(handle) 225 226 END SUBROUTINE create_replicated_col_vec_from_matrix 227 228! ************************************************************************************************** 229!> \brief creates a row vector like object whose blocks can be replicated 230!> along the processor col and has the same col dist as the template matrix 231!> the returned matrix is fully allocated and all blocks are set to 0 232!> this is not a sparse object (and must never be) 233!> \param dbcsr_vec the vector object to create must be allocated but not initialized 234!> \param matrix a dbcsr matrix used as template 235!> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix) 236! ************************************************************************************************** 237 SUBROUTINE create_replicated_row_vec_from_matrix(dbcsr_vec, matrix, nrow) 238 TYPE(dbcsr_type) :: dbcsr_vec 239 TYPE(dbcsr_type) :: matrix 240 INTEGER :: nrow 241 242 CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_row_vec_from_matrix' 243 244 INTEGER :: handle, i, nprows, data_type 245 INTEGER, DIMENSION(:), POINTER :: row_dist, col_dist, row_blk_size, col_blk_size 246 TYPE(dbcsr_distribution_type) :: dist_row_vec, dist 247 248 CALL timeset(routineN, handle) 249 250 CALL dbcsr_get_info(matrix, distribution=dist, col_blk_size=col_blk_size, data_type=data_type) 251 CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist) 252 253 ALLOCATE (row_dist(nprows), row_blk_size(nprows)) 254 row_blk_size(:) = nrow 255 DO i = 0, nprows-1 256 row_dist(i+1) = i 257 END DO 258 CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist) 259 260 CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, dbcsr_type_no_symmetry, & 261 row_blk_size=row_blk_size, col_blk_size=col_blk_size, & 262 nze=0, data_type=data_type) 263 CALL dbcsr_reserve_all_blocks(dbcsr_vec) 264 265 CALL dbcsr_distribution_release(dist_row_vec) 266 DEALLOCATE (row_dist, row_blk_Size) 267 CALL timestop(handle) 268 269 END SUBROUTINE create_replicated_row_vec_from_matrix 270 271! ************************************************************************************************** 272!> \brief given a column vector, prepare the fast_vec_access container 273!> \param vec ... 274!> \param fast_vec_access ... 275! ************************************************************************************************** 276 SUBROUTINE create_fast_col_vec_access(vec, fast_vec_access) 277 TYPE(dbcsr_type) :: vec 278 TYPE(fast_vec_access_type) :: fast_vec_access 279 280 CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_col_vec_access' 281 282 INTEGER :: handle, data_type 283 284 CALL timeset(routineN, handle) 285 286 CALL dbcsr_get_info(vec, data_type=data_type) 287 288 SELECT CASE (data_type) 289 CASE (dbcsr_type_real_8) 290 CALL create_fast_col_vec_access_d(vec, fast_vec_access) 291 CASE (dbcsr_type_real_4) 292 CALL create_fast_col_vec_access_s(vec, fast_vec_access) 293 CASE (dbcsr_type_complex_8) 294 CALL create_fast_col_vec_access_z(vec, fast_vec_access) 295 CASE (dbcsr_type_complex_4) 296 CALL create_fast_col_vec_access_c(vec, fast_vec_access) 297 END SELECT 298 299 CALL timestop(handle) 300 301 END SUBROUTINE create_fast_col_vec_access 302 303! ************************************************************************************************** 304!> \brief given a row vector, prepare the fast_vec_access_container 305!> \param vec ... 306!> \param fast_vec_access ... 307! ************************************************************************************************** 308 SUBROUTINE create_fast_row_vec_access(vec, fast_vec_access) 309 TYPE(dbcsr_type) :: vec 310 TYPE(fast_vec_access_type) :: fast_vec_access 311 312 CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_row_vec_access' 313 314 INTEGER :: handle, data_type 315 316 CALL timeset(routineN, handle) 317 318 CALL dbcsr_get_info(vec, data_type=data_type) 319 320 SELECT CASE (data_type) 321 CASE (dbcsr_type_real_8) 322 CALL create_fast_row_vec_access_d(vec, fast_vec_access) 323 CASE (dbcsr_type_real_4) 324 CALL create_fast_row_vec_access_s(vec, fast_vec_access) 325 CASE (dbcsr_type_complex_8) 326 CALL create_fast_row_vec_access_c(vec, fast_vec_access) 327 CASE (dbcsr_type_complex_4) 328 CALL create_fast_row_vec_access_z(vec, fast_vec_access) 329 END SELECT 330 331 CALL timestop(handle) 332 333 END SUBROUTINE create_fast_row_vec_access 334 335! ************************************************************************************************** 336!> \brief release all memory associated with the fast_vec_access type 337!> \param fast_vec_access ... 338! ************************************************************************************************** 339 SUBROUTINE release_fast_vec_access(fast_vec_access) 340 TYPE(fast_vec_access_type) :: fast_vec_access 341 342 CHARACTER(LEN=*), PARAMETER :: routineN = 'release_fast_vec_access' 343 344 INTEGER :: handle 345 346 CALL timeset(routineN, handle) 347 348 CALL hash_table_release(fast_vec_access%hash_table) 349 IF (ALLOCATED(fast_vec_access%blk_map_d)) DEALLOCATE (fast_vec_access%blk_map_d) 350 IF (ALLOCATED(fast_vec_access%blk_map_s)) DEALLOCATE (fast_vec_access%blk_map_s) 351 IF (ALLOCATED(fast_vec_access%blk_map_c)) DEALLOCATE (fast_vec_access%blk_map_c) 352 IF (ALLOCATED(fast_vec_access%blk_map_z)) DEALLOCATE (fast_vec_access%blk_map_z) 353 354 CALL timestop(handle) 355 356 END SUBROUTINE release_fast_vec_access 357 358#include "hash_table.f90" 359 360#:set instances = [ ('s', 'REAL(kind=real_4)', '0.0_real_4'), & 361 ('d', 'REAL(kind=real_8)', '0.0_real_8'), & 362 ('c', 'COMPLEX(kind=real_4)', 'CMPLX(0.0, 0.0, real_4)'), & 363 ('z', 'COMPLEX(kind=real_8)', 'CMPLX(0.0, 0.0, real_8)') ] 364 365#:for nametype, type, zero in instances 366 367! ************************************************************************************************** 368!> \brief the real driver routine for the multiply, not all symmetries implemented yet 369!> \param matrix ... 370!> \param vec_in ... 371!> \param vec_out ... 372!> \param alpha ... 373!> \param beta ... 374!> \param work_row ... 375!> \param work_col ... 376! ************************************************************************************************** 377 SUBROUTINE dbcsr_matrix_colvec_multiply_${nametype}$(matrix, vec_in, vec_out, alpha, beta, work_row, work_col) 378 TYPE(dbcsr_type) :: matrix, vec_in, vec_out 379 ${type}$ :: alpha, beta 380 TYPE(dbcsr_type) :: work_row, work_col 381 382 CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrix_colvec_multiply_low', & 383 routineP = moduleN//':'//routineN 384 385 CHARACTER :: matrix_type 386 387 CALL dbcsr_get_info(matrix, matrix_type=matrix_type) 388 389 SELECT CASE(matrix_type) 390 CASE(dbcsr_type_no_symmetry) 391 CALL dbcsr_matrix_vector_mult_${nametype}$(matrix, vec_in, vec_out, alpha, beta, work_row, work_col) 392 CASE(dbcsr_type_symmetric) 393 CALL dbcsr_sym_matrix_vector_mult_${nametype}$(matrix, vec_in, vec_out, alpha, beta, work_row, work_col) 394 CASE(dbcsr_type_antisymmetric) 395 ! Not yet implemented, should mainly be some prefactor magic, but who knows how antisymmetric matrices are stored??? 396 CPABORT("NYI, antisymmetric matrix not permitted") 397 CASE DEFAULT 398 CPABORT("Unknown matrix type, ...") 399 END SELECT 400 401 END SUBROUTINE dbcsr_matrix_colvec_multiply_${nametype}$ 402 403! ************************************************************************************************** 404!> \brief low level routines for matrix vector multiplies 405!> \param matrix ... 406!> \param vec_in ... 407!> \param vec_out ... 408!> \param alpha ... 409!> \param beta ... 410!> \param work_row ... 411!> \param work_col ... 412! ************************************************************************************************** 413 SUBROUTINE dbcsr_matrix_vector_mult_${nametype}$(matrix, vec_in, vec_out, alpha, beta, work_row, work_col) 414 TYPE(dbcsr_type) :: matrix, vec_in, vec_out 415 ${type}$ :: alpha, beta 416 TYPE(dbcsr_type) :: work_row, work_col 417 418 CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrix_vector_mult' 419 420 INTEGER :: col, mypcol, & 421 myprow, & 422 ncols, pcol_group, nrows, & 423 prow_group, row, & 424 handle, handle1, ithread 425 ${type}$, DIMENSION(:), POINTER :: data_vec 426 ${type}$, DIMENSION(:, :), POINTER :: data_d, vec_res 427 TYPE(dbcsr_distribution_type) :: dist 428 TYPE(dbcsr_iterator_type) :: iter 429 TYPE(fast_vec_access_type) :: fast_vec_row, fast_vec_col 430 INTEGER :: prow, pcol 431 432 CALL timeset(routineN, handle) 433 ithread=0 434 435! Collect some data about the parallel environment. We will use them later to move the vector around 436 CALL dbcsr_get_info(matrix, distribution=dist) 437 CALL dbcsr_distribution_get(dist, prow_group=prow_group, pcol_group=pcol_group, myprow=myprow, mypcol=mypcol) 438 439 CALL create_fast_row_vec_access(work_row, fast_vec_row) 440 CALL create_fast_col_vec_access(work_col, fast_vec_col) 441 442! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply 443 CALL dbcsr_col_vec_to_rep_row_${nametype}$(vec_in, work_col, work_row, fast_vec_col) 444 445! Set the work vector for the results to 0 446 CALL dbcsr_set(work_col, ${zero}$) 447 448! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes 449! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively) 450 CALL timeset(routineN//"_local_mm", handle1) 451 452!$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow) & 453!$OMP SHARED(matrix,fast_vec_col,fast_vec_row) 454 !$ ithread = omp_get_thread_num () 455 CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.) 456 DO WHILE (dbcsr_iterator_blocks_left(iter)) 457 CALL dbcsr_iterator_next_block(iter, row, col, data_d) 458 prow=hash_table_get(fast_vec_col%hash_table,row) 459 IF(fast_vec_col%blk_map_${nametype}$(prow)%assigned_thread .NE. ithread ) CYCLE 460 pcol=hash_table_get(fast_vec_row%hash_table,col) 461 fast_vec_col%blk_map_${nametype}$(prow)%ptr=fast_vec_col%blk_map_${nametype}$(prow)%ptr+& 462 MATMUL(data_d,TRANSPOSE(fast_vec_row%blk_map_${nametype}$(pcol)%ptr)) 463 END DO 464 CALL dbcsr_iterator_stop(iter) 465!$OMP END PARALLEL 466 467 CALL timestop(handle1) 468 469! sum all the data onto the first processor col where the original vector is stored 470 data_vec => dbcsr_get_data_p(work_col, select_data_type=${zero}$) 471 CALL dbcsr_get_info(matrix=work_col, nfullrows_local=nrows, nfullcols_local=ncols) 472 CALL mp_sum(data_vec(1:nrows*ncols), prow_group) 473 474! Local copy on the first mpi col (as this is the localtion of the vec_res blocks) of the result vector 475! from the replicated to the original vector. Let's play it safe and use the iterator 476 CALL dbcsr_iterator_start(iter, vec_out) 477 DO WHILE (dbcsr_iterator_blocks_left(iter)) 478 CALL dbcsr_iterator_next_block(iter, row, col, vec_res) 479 prow=hash_table_get(fast_vec_col%hash_table,row) 480 IF(ASSOCIATED(fast_vec_col%blk_map_${nametype}$(prow)%ptr)) THEN 481 vec_res(:, :)= beta*vec_res(:, :)+alpha*fast_vec_col%blk_map_${nametype}$(prow)%ptr(:,:) 482 ELSE 483 vec_res(:, :)= beta*vec_res(:, :) 484 END IF 485 END DO 486 CALL dbcsr_iterator_stop(iter) 487 488 CALL release_fast_vec_access(fast_vec_row) 489 CALL release_fast_vec_access(fast_vec_col) 490 491 CALL timestop(handle) 492 493 END SUBROUTINE dbcsr_matrix_vector_mult_${nametype}$ 494 495! ************************************************************************************************** 496!> \brief ... 497!> \param matrix ... 498!> \param vec_in ... 499!> \param vec_out ... 500!> \param alpha ... 501!> \param beta ... 502!> \param work_row ... 503!> \param work_col ... 504!> \param skip_diag ... 505! ************************************************************************************************** 506 SUBROUTINE dbcsr_matrixT_vector_mult_${nametype}$(matrix, vec_in, vec_out, alpha, beta, work_row, work_col, skip_diag) 507 TYPE(dbcsr_type) :: matrix, vec_in, vec_out 508 ${type}$ :: alpha, beta 509 TYPE(dbcsr_type) :: work_row, work_col 510 LOGICAL :: skip_diag 511 512 CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrixT_vector_mult' 513 514 INTEGER :: col, col_size, mypcol, & 515 myprow, & 516 ncols, pcol_group, nrows, & 517 prow_group, row, row_size, & 518 handle, handle1, ithread 519 ${type}$, DIMENSION(:), POINTER :: data_vec 520 ${type}$, DIMENSION(:, :), POINTER :: data_d, vec_bl, vec_res 521 TYPE(dbcsr_distribution_type) :: dist 522 TYPE(dbcsr_iterator_type) :: iter 523 524 TYPE(fast_vec_access_type) :: fast_vec_row, fast_vec_col 525 INTEGER :: prow, pcol 526 527 CALL timeset(routineN, handle) 528 ithread=0 529 530! Collect some data about the parallel environment. We will use them later to move the vector around 531 CALL dbcsr_get_info(matrix, distribution=dist) 532 CALL dbcsr_distribution_get(dist, prow_group=prow_group, pcol_group=pcol_group, myprow=myprow, mypcol=mypcol) 533 534 CALL create_fast_row_vec_access(work_row, fast_vec_row) 535 CALL create_fast_col_vec_access(work_col, fast_vec_col) 536 537! Set the work vector for the results to 0 538 CALL dbcsr_set(work_row, ${zero}$) 539 540! Transfer the correct parts of the input vector to the replicated vector on proc_col 0 541 CALL dbcsr_iterator_start(iter, vec_in) 542 DO WHILE (dbcsr_iterator_blocks_left(iter)) 543 CALL dbcsr_iterator_next_block(iter, row, col, vec_bl, row_size=row_size, col_size=col_size) 544 prow=hash_table_get(fast_vec_col%hash_table,row) 545 fast_vec_col%blk_map_${nametype}$(prow)%ptr(1:row_size, 1:col_size)= vec_bl(1:row_size, 1:col_size) 546 END DO 547 CALL dbcsr_iterator_stop(iter) 548! Replicate the data on all processore in the row 549 data_vec => dbcsr_get_data_p(work_col, select_data_type=${zero}$) 550 CALL mp_bcast(data_vec, 0, prow_group) 551 552! Perform the local multiply. Here it is obvious why the vectors are replicated on the mpi rows and cols 553 CALL timeset(routineN//"local_mm", handle1) 554 CALL dbcsr_get_info(matrix=work_col, nfullcols_local=ncols) 555!$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,row_size,col_size,ithread,prow,pcol) & 556!$OMP SHARED(matrix,fast_vec_row,fast_vec_col,skip_diag,ncols) 557 !$ ithread = omp_get_thread_num () 558 CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.) 559 DO WHILE (dbcsr_iterator_blocks_left(iter)) 560 CALL dbcsr_iterator_next_block(iter, row, col, data_d, row_size=row_size, col_size=col_size) 561 IF(skip_diag.AND.col==row)CYCLE 562 prow=hash_table_get(fast_vec_col%hash_table,row) 563 pcol=hash_table_get(fast_vec_row%hash_table,col) 564 IF ( ASSOCIATED(fast_vec_row%blk_map_${nametype}$(pcol)%ptr) .AND. & 565 ASSOCIATED(fast_vec_col%blk_map_${nametype}$(prow)%ptr) )THEN 566 IF(fast_vec_row%blk_map_${nametype}$(pcol)%assigned_thread .NE. ithread ) CYCLE 567 fast_vec_row%blk_map_${nametype}$(pcol)%ptr=fast_vec_row%blk_map_${nametype}$(pcol)%ptr+& 568 MATMUL(TRANSPOSE(fast_vec_col%blk_map_${nametype}$(prow)%ptr),data_d) 569 ELSE 570 prow=hash_table_get(fast_vec_row%hash_table,row) 571 pcol=hash_table_get(fast_vec_col%hash_table,col) 572 IF(fast_vec_row%blk_map_${nametype}$(prow)%assigned_thread .NE. ithread ) CYCLE 573 fast_vec_row%blk_map_${nametype}$(prow)%ptr=fast_vec_row%blk_map_${nametype}$(prow)%ptr+& 574 MATMUL(TRANSPOSE(fast_vec_col%blk_map_${nametype}$(pcol)%ptr),TRANSPOSE(data_d)) 575 END IF 576 END DO 577 CALL dbcsr_iterator_stop(iter) 578!$OMP END PARALLEL 579 580 CALL timestop(handle1) 581 582! sum all the data within a processor column to obtain the replicated result 583 data_vec => dbcsr_get_data_p(work_row, select_data_type=${zero}$) 584! we use the replicated vector but the final answer is only summed to proc_col 0 for efficiency 585 CALL dbcsr_get_info(matrix=work_row, nfullrows_local=nrows, nfullcols_local=ncols) 586 CALL mp_sum(data_vec(1:nrows*ncols), pcol_group) 587 588! Convert the result to a column wise distribution 589 CALL dbcsr_rep_row_to_rep_col_vec_${nametype}$(work_col, work_row, fast_vec_row) 590 591! Create_the final vector by summing it to the result vector which lives on proc_col 0 592 CALL dbcsr_iterator_start(iter, vec_out) 593 DO WHILE (dbcsr_iterator_blocks_left(iter)) 594 CALL dbcsr_iterator_next_block(iter, row, col, vec_res, row_size=row_size) 595 prow=hash_table_get(fast_vec_col%hash_table,row) 596 IF(ASSOCIATED(fast_vec_col%blk_map_${nametype}$(prow)%ptr)) THEN 597 vec_res(:, :)= beta*vec_res(:, :)+alpha*fast_vec_col%blk_map_${nametype}$(prow)%ptr(:,:) 598 ELSE 599 vec_res(:, :)= beta*vec_res(:, :) 600 END IF 601 END DO 602 CALL dbcsr_iterator_stop(iter) 603 604 CALL timestop(handle) 605 606 END SUBROUTINE dbcsr_matrixT_vector_mult_${nametype}$ 607 608! ************************************************************************************************** 609!> \brief ... 610!> \param vec_in ... 611!> \param rep_col_vec ... 612!> \param rep_row_vec ... 613!> \param fast_vec_col ... 614! ************************************************************************************************** 615 SUBROUTINE dbcsr_col_vec_to_rep_row_${nametype}$(vec_in, rep_col_vec, rep_row_vec, fast_vec_col) 616 TYPE(dbcsr_type) :: vec_in, rep_col_vec, & 617 rep_row_vec 618 TYPE(fast_vec_access_type), INTENT(IN) :: fast_vec_col 619 620 CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_col_vec_to_rep_row' 621 622 INTEGER :: col, mypcol, myprow, ncols, & 623 nrows, pcol_group, & 624 prow_group, row, handle 625 INTEGER, DIMENSION(:), POINTER :: local_cols, row_dist 626 ${type}$, DIMENSION(:), POINTER :: data_vec, data_vec_rep 627 ${type}$, DIMENSION(:, :), POINTER :: vec_row 628 TYPE(dbcsr_distribution_type) :: dist_in, dist_rep_col 629 TYPE(dbcsr_iterator_type) :: iter 630 631 CALL timeset(routineN, handle) 632 633! get information about the parallel environment 634 CALL dbcsr_get_info(vec_in, distribution=dist_in) 635 CALL dbcsr_distribution_get(dist_in, & 636 prow_group=prow_group, & 637 pcol_group=pcol_group, & 638 myprow=myprow, & 639 mypcol=mypcol) 640 641! Get the vector which tells us which blocks are local to which processor row in the col vec 642 CALL dbcsr_get_info(rep_col_vec, distribution=dist_rep_col) 643 CALL dbcsr_distribution_get(dist_rep_col, row_dist=row_dist) 644 645! Copy the local vector to the replicated on the first processor column (this is where vec_in lives) 646 CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols) 647 data_vec_rep => dbcsr_get_data_p(rep_col_vec, select_data_type=${zero}$) 648 data_vec => dbcsr_get_data_p(vec_in, select_data_type=${zero}$) 649 IF(mypcol==0)data_vec_rep(1:nrows*ncols)=data_vec(1:nrows*ncols) 650! Replicate the data along the row 651 CALL mp_bcast(data_vec_rep(1:nrows*ncols), 0, prow_group) 652 653! Here it gets a bit tricky as we are dealing with two different parallel layouts: 654! The rep_col_vec contains all blocks local to the row distribution of the vector. 655! The rep_row_vec only needs the fraction which is local to the col distribution. 656! However in most cases this won't the complete set of block which can be obtained from col_vector p_row i 657! Anyway, as the blocks don't repeat in the col_vec, a different fraction of the row vec will be available 658! on every replica in the processor column, by summing along the column we end up with the complete vector everywhere 659! Hope this clarifies the idea 660 CALL dbcsr_set(rep_row_vec, ${zero}$) 661 CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, local_cols=local_cols, nfullcols_local=ncols) 662 CALL dbcsr_iterator_start(iter, rep_row_vec) 663 DO WHILE (dbcsr_iterator_blocks_left(iter)) 664 CALL dbcsr_iterator_next_block(iter, row, col, vec_row) 665 IF(row_dist(col)==myprow)THEN 666 vec_row=TRANSPOSE(fast_vec_col%blk_map_${nametype}$(hash_table_get(fast_vec_col%hash_table,col))%ptr) 667 END IF 668 END DO 669 CALL dbcsr_iterator_stop(iter) 670 CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, nfullcols_local=ncols) 671 data_vec_rep => dbcsr_get_data_p(rep_row_vec, select_data_type=${zero}$) 672 CALL mp_sum(data_vec_rep(1:ncols*nrows), pcol_group) 673 674 CALL timestop(handle) 675 676 END SUBROUTINE dbcsr_col_vec_to_rep_row_${nametype}$ 677 678! ************************************************************************************************** 679!> \brief ... 680!> \param rep_col_vec ... 681!> \param rep_row_vec ... 682!> \param fast_vec_row ... 683!> \param fast_vec_col_add ... 684! ************************************************************************************************** 685 SUBROUTINE dbcsr_rep_row_to_rep_col_vec_${nametype}$(rep_col_vec, rep_row_vec, fast_vec_row, fast_vec_col_add) 686 TYPE(dbcsr_type) :: rep_col_vec, rep_row_vec 687 TYPE(fast_vec_access_type), OPTIONAL :: fast_vec_col_add 688 TYPE(fast_vec_access_type) :: fast_vec_row 689 690 CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_rep_row_to_rep_col_vec' 691 692 INTEGER :: col, mypcol, myprow, ncols, & 693 nrows, pcol_group, & 694 prow_group, row, handle 695 INTEGER, DIMENSION(:), POINTER :: col_dist 696 ${type}$, DIMENSION(:), POINTER :: data_vec_rep 697 ${type}$, DIMENSION(:, :), POINTER :: vec_col 698 TYPE(dbcsr_distribution_type) :: dist_rep_row, dist_rep_col 699 TYPE(dbcsr_iterator_type) :: iter 700 701 CALL timeset(routineN, handle) 702 703! get information about the parallel environment 704 CALL dbcsr_get_info(matrix=rep_col_vec, distribution=dist_rep_col) 705 CALL dbcsr_distribution_get(dist_rep_col, & 706 prow_group=prow_group, & 707 pcol_group=pcol_group, & 708 myprow=myprow, & 709 mypcol=mypcol) 710 711! Get the vector which tells us which blocks are local to which processor col in the row vec 712 CALL dbcsr_get_info(matrix=rep_row_vec, distribution=dist_rep_row) 713 CALL dbcsr_distribution_get(dist_rep_row, col_dist=col_dist) 714 715 716! The same trick as described above with opposite direction 717 CALL dbcsr_set(rep_col_vec, ${zero}$) 718 CALL dbcsr_iterator_start(iter, rep_col_vec) 719 DO WHILE (dbcsr_iterator_blocks_left(iter)) 720 CALL dbcsr_iterator_next_block(iter, row, col, vec_col) 721 IF(col_dist(row)==mypcol)THEN 722 vec_col=TRANSPOSE(fast_vec_row%blk_map_${nametype}$(hash_table_get(fast_vec_row%hash_table,row))%ptr) 723 END IF 724 ! this one is special and allows to add the elements of a not yet summed replicated 725 ! column vector as it appears in M*V(row_rep) as result. Save an mp_sum in the symmetric case 726 IF(PRESENT(fast_vec_col_add))vec_col=vec_col+& 727 fast_vec_col_add%blk_map_${nametype}$(hash_table_get(fast_vec_col_add%hash_table,row))%ptr(:,:) 728 END DO 729 CALL dbcsr_iterator_stop(iter) 730 CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols) 731 data_vec_rep => dbcsr_get_data_p(rep_col_vec, select_data_type=${zero}$) 732 CALL mp_sum(data_vec_rep(1:nrows*ncols), prow_group) 733 734 CALL timestop(handle) 735 736 END SUBROUTINE dbcsr_rep_row_to_rep_col_vec_${nametype}$ 737 738 739! ************************************************************************************************** 740!> \brief given a column vector, prepare the fast_vec_access container 741!> \param vec ... 742!> \param fast_vec_access ... 743! ************************************************************************************************** 744 SUBROUTINE create_fast_col_vec_access_${nametype}$(vec, fast_vec_access) 745 TYPE(dbcsr_type) :: vec 746 TYPE(fast_vec_access_type) :: fast_vec_access 747 748 CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_col_vec_access_${nametype}$' 749 750 INTEGER :: handle, nblk_local 751 INTEGER :: col, row, iblock, nthreads 752 ${type}$, DIMENSION(:, :), POINTER :: vec_bl 753 TYPE(dbcsr_iterator_type) :: iter 754 755 CALL timeset(routineN, handle) 756 757 ! figure out the number of threads 758 nthreads = 1 759!$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads) 760!$OMP MASTER 761 !$ nthreads = OMP_GET_NUM_THREADS() 762!$OMP END MASTER 763!$OMP END PARALLEL 764 765 CALL dbcsr_get_info(matrix=vec, nblkrows_local=nblk_local) 766 ! 4 times makes sure the table is big enough to limit collisions. 767 CALL hash_table_create(fast_vec_access%hash_table,4*nblk_local) 768 ! include zero for effective dealing with values not in the hash table (will return 0) 769 ALLOCATE(fast_vec_access%blk_map_${nametype}$(0:nblk_local)) 770 771 CALL dbcsr_get_info(matrix=vec, nblkcols_local=col) 772 IF (col.GT.1) CPABORT("BUG") 773 774 ! go through the blocks of the vector 775 iblock=0 776 CALL dbcsr_iterator_start(iter, vec) 777 DO WHILE (dbcsr_iterator_blocks_left(iter)) 778 CALL dbcsr_iterator_next_block(iter, row, col, vec_bl) 779 iblock=iblock+1 780 CALL hash_table_add(fast_vec_access%hash_table,row,iblock) 781 fast_vec_access%blk_map_${nametype}$(iblock)%ptr=>vec_bl 782 fast_vec_access%blk_map_${nametype}$(iblock)%assigned_thread=MOD(iblock,nthreads) 783 END DO 784 CALL dbcsr_iterator_stop(iter) 785 786 CALL timestop(handle) 787 788 END SUBROUTINE create_fast_col_vec_access_${nametype}$ 789 790! ************************************************************************************************** 791!> \brief given a row vector, prepare the fast_vec_access_container 792!> \param vec ... 793!> \param fast_vec_access ... 794! ************************************************************************************************** 795 SUBROUTINE create_fast_row_vec_access_${nametype}$(vec, fast_vec_access) 796 TYPE(dbcsr_type) :: vec 797 TYPE(fast_vec_access_type) :: fast_vec_access 798 799 CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_row_vec_access_${nametype}$' 800 801 INTEGER :: handle, nblk_local 802 INTEGER :: col, row, iblock, nthreads 803 ${type}$, DIMENSION(:, :), POINTER :: vec_bl 804 TYPE(dbcsr_iterator_type) :: iter 805 806 CALL timeset(routineN, handle) 807 808 ! figure out the number of threads 809 nthreads = 1 810!$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads) 811!$OMP MASTER 812 !$ nthreads = OMP_GET_NUM_THREADS() 813!$OMP END MASTER 814!$OMP END PARALLEL 815 816 CALL dbcsr_get_info(matrix=vec, nblkcols_local=nblk_local) 817 ! 4 times makes sure the table is big enough to limit collisions. 818 CALL hash_table_create(fast_vec_access%hash_table,4*nblk_local) 819 ! include zero for effective dealing with values not in the hash table (will return 0) 820 ALLOCATE(fast_vec_access%blk_map_${nametype}$(0:nblk_local)) 821 822 ! sanity check 823 CALL dbcsr_get_info(matrix=vec, nblkrows_local=row) 824 IF (row.GT.1) CPABORT("BUG") 825 826 ! go through the blocks of the vector 827 iblock=0 828 CALL dbcsr_iterator_start(iter, vec) 829 DO WHILE (dbcsr_iterator_blocks_left(iter)) 830 CALL dbcsr_iterator_next_block(iter, row, col, vec_bl) 831 iblock=iblock+1 832 CALL hash_table_add(fast_vec_access%hash_table,col,iblock) 833 fast_vec_access%blk_map_${nametype}$(iblock)%ptr=>vec_bl 834 fast_vec_access%blk_map_${nametype}$(iblock)%assigned_thread=MOD(iblock,nthreads) 835 END DO 836 CALL dbcsr_iterator_stop(iter) 837 838 CALL timestop(handle) 839 840 END SUBROUTINE create_fast_row_vec_access_${nametype}$ 841 842! ************************************************************************************************** 843!> \brief ... 844!> \param matrix ... 845!> \param vec_in ... 846!> \param vec_out ... 847!> \param alpha ... 848!> \param beta ... 849!> \param work_row ... 850!> \param work_col ... 851! ************************************************************************************************** 852 SUBROUTINE dbcsr_sym_matrix_vector_mult_${nametype}$(matrix, vec_in, vec_out, alpha, beta, work_row, work_col) 853 TYPE(dbcsr_type) :: matrix, vec_in, vec_out 854 ${type}$ :: alpha, beta 855 TYPE(dbcsr_type) :: work_row, work_col 856 857 CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_sym_m_v_mult' 858 859 INTEGER :: col, mypcol, & 860 myprow, & 861 pcol_group, nrows, ncols,& 862 prow_group, row, & 863 handle, handle1, ithread, vec_dim 864 ${type}$, DIMENSION(:), POINTER :: data_vec 865 ${type}$, DIMENSION(:, :), POINTER :: data_d, vec_res 866 TYPE(dbcsr_distribution_type) :: dist 867 TYPE(dbcsr_iterator_type) :: iter 868 TYPE(dbcsr_type) :: result_row, result_col 869 870 TYPE(fast_vec_access_type) :: fast_vec_row, fast_vec_col, res_fast_vec_row, res_fast_vec_col 871 INTEGER :: prow, pcol, rprow, rpcol 872 873 CALL timeset(routineN, handle) 874 ithread=0 875! We need some work matrices as we try to exploit operations on the replicated vectors which are duplicated otherwise 876 CALL dbcsr_get_info(matrix=vec_in,nfullcols_total=vec_dim) 877! This is a performance hack as the new creation of a replicated vector is a fair bit more expensive 878 CALL dbcsr_set(work_col, ${zero}$) 879 CALL dbcsr_copy(result_col, work_col) 880 CALL dbcsr_set(work_row, ${zero}$) 881 CALL dbcsr_copy(result_row, work_row) 882 883! Collect some data about the parallel environment. We will use them later to move the vector around 884 CALL dbcsr_get_info(matrix=matrix, distribution=dist) 885 CALL dbcsr_distribution_get(dist, prow_group=prow_group, pcol_group=pcol_group, myprow=myprow, mypcol=mypcol) 886 887 CALL create_fast_row_vec_access(work_row, fast_vec_row) 888 CALL create_fast_col_vec_access(work_col, fast_vec_col) 889 CALL create_fast_row_vec_access(result_row, res_fast_vec_row) 890 CALL create_fast_col_vec_access(result_col, res_fast_vec_col) 891 892! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply 893 CALL dbcsr_col_vec_to_rep_row_${nametype}$(vec_in, work_col, work_row, fast_vec_col) 894 895! Probably I should rename the routine above as it delivers both the replicated row and column vector 896 897! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes 898! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively) 899 CALL timeset(routineN//"_local_mm", handle1) 900 901!------ perform the multiplication, we have to take car to take the correct blocks ---------- 902 903!$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow,rpcol,rprow) & 904!$OMP SHARED(matrix,fast_vec_row,res_fast_vec_col,res_fast_vec_row,fast_vec_col) 905 !$ ithread = omp_get_thread_num () 906 CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.) 907 DO WHILE (dbcsr_iterator_blocks_left(iter)) 908 CALL dbcsr_iterator_next_block(iter, row, col, data_d) 909 pcol=hash_table_get(fast_vec_row%hash_table,col) 910 rprow=hash_table_get(res_fast_vec_col%hash_table,row) 911 IF(ASSOCIATED(fast_vec_row%blk_map_${nametype}$(pcol)%ptr) .AND.& 912 ASSOCIATED(res_fast_vec_col%blk_map_${nametype}$(rprow)%ptr))THEN 913 IF(res_fast_vec_col%blk_map_${nametype}$(rprow)%assigned_thread .EQ. ithread ) THEN 914 res_fast_vec_col%blk_map_${nametype}$(rprow)%ptr=res_fast_vec_col%blk_map_${nametype}$(rprow)%ptr+& 915 MATMUL(data_d,TRANSPOSE(fast_vec_row%blk_map_${nametype}$(pcol)%ptr)) 916 END IF 917 prow=hash_table_get(fast_vec_col%hash_table,row) 918 rpcol=hash_table_get(res_fast_vec_row%hash_table,col) 919 IF(res_fast_vec_row%blk_map_${nametype}$(rpcol)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN 920 res_fast_vec_row%blk_map_${nametype}$(rpcol)%ptr=res_fast_vec_row%blk_map_${nametype}$(rpcol)%ptr+& 921 MATMUL(TRANSPOSE(fast_vec_col%blk_map_${nametype}$(prow)%ptr),data_d) 922 END IF 923 ELSE 924 rpcol=hash_table_get(res_fast_vec_col%hash_table,col) 925 prow=hash_table_get(fast_vec_row%hash_table,row) 926 IF(res_fast_vec_col%blk_map_${nametype}$(rpcol)%assigned_thread .EQ. ithread ) THEN 927 res_fast_vec_col%blk_map_${nametype}$(rpcol)%ptr=res_fast_vec_col%blk_map_${nametype}$(rpcol)%ptr+& 928 TRANSPOSE(MATMUL(fast_vec_row%blk_map_${nametype}$(prow)%ptr,data_d)) 929 END IF 930 rprow=hash_table_get(res_fast_vec_row%hash_table,row) 931 pcol=hash_table_get(fast_vec_col%hash_table,col) 932 IF(res_fast_vec_row%blk_map_${nametype}$(rprow)%assigned_thread .EQ. ithread .AND. row .NE. col ) THEN 933 res_fast_vec_row%blk_map_${nametype}$(rprow)%ptr=res_fast_vec_row%blk_map_${nametype}$(rprow)%ptr+& 934 TRANSPOSE(MATMUL(data_d,fast_vec_col%blk_map_${nametype}$(pcol)%ptr)) 935 END IF 936 END IF 937 END DO 938 CALL dbcsr_iterator_stop(iter) 939!$OMP END PARALLEL 940 941 CALL timestop(handle1) 942 943 ! sum all the data within a processor column to obtain the replicated result from lower 944 data_vec => dbcsr_get_data_p(result_row, select_data_type=${zero}$) 945 CALL dbcsr_get_info(matrix=result_row, nfullrows_local=nrows, nfullcols_local=ncols) 946 947 CALL mp_sum(data_vec(1:nrows*ncols), pcol_group) 948! 949!! Convert the results to a column wise distribution, this is a bit involved as the result_row is fully replicated 950!! While the result_col still has the partial results in parallel. The routine below takes care of that and saves an 951!! mp_sum. Of the res_row vectors are created only taking the approriate element (0 otherwise) while the res_col 952!! parallel bits are locally added. The mp_sum magically creates the correct vector 953 CALL dbcsr_rep_row_to_rep_col_vec_${nametype}$(work_col, result_row, res_fast_vec_row, res_fast_vec_col) 954 955! ! Create_the final vector by summing it to the result vector which lives on proc_col 0 lower 956 CALL dbcsr_iterator_start(iter, vec_out) 957 DO WHILE (dbcsr_iterator_blocks_left(iter)) 958 CALL dbcsr_iterator_next_block(iter, row, col, vec_res) 959 prow=hash_table_get(fast_vec_col%hash_table,row) 960 IF(ASSOCIATED(fast_vec_col%blk_map_${nametype}$(prow)%ptr))THEN 961 vec_res(:, :)= beta*vec_res(:, :)+alpha*(fast_vec_col%blk_map_${nametype}$(prow)%ptr(:, :)) 962 ELSE 963 vec_res(:, :)= beta*vec_res(:, :) 964 END IF 965 END DO 966 CALL dbcsr_iterator_stop(iter) 967 968 CALL release_fast_vec_access(fast_vec_row) 969 CALL release_fast_vec_access(fast_vec_col) 970 CALL release_fast_vec_access(res_fast_vec_row) 971 CALL release_fast_vec_access(res_fast_vec_col) 972 973 CALL dbcsr_release(result_row); CALL dbcsr_release(result_col) 974 975 CALL timestop(handle) 976 977 END SUBROUTINE dbcsr_sym_matrix_vector_mult_${nametype}$ 978 979#:endfor 980 981END MODULE dbcsr_vector 982