!----------------------------------------------------------------------------------------------------!
!   CP2K: A general program to perform molecular dynamics simulations                                 !
!   Copyright (C) 2000 - 2019  CP2K developers group                                                  !
!----------------------------------------------------------------------------------------------------!

! **************************************************************************************************
!> \brief operations for skinny matrices/vectors expressed in dbcsr form
!> \par History
!>       2014.10 created [Florian Schiffmann]
!> \author Florian Schiffmann
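!> \par Typical usage
!>      A minimal sketch (placeholder names, assuming a REAL(real_8) matrix) of how the public
!>      routines below are meant to be combined to evaluate y = beta*y + alpha*matrix*x:
!> \code
!>      TYPE(dbcsr_type) :: matrix, x, y, work_row, work_col
!>      REAL(real_8), PARAMETER :: alpha = 1.0_real_8, beta = 0.0_real_8
!>      ! x and y are column vectors living on proc_col 0, distributed like the rows of matrix
!>      CALL create_col_vec_from_matrix(x, matrix, 1)
!>      CALL create_col_vec_from_matrix(y, matrix, 1)
!>      ! replicated work vectors required by the multiply driver
!>      CALL create_replicated_row_vec_from_matrix(work_row, matrix, 1)
!>      CALL create_replicated_col_vec_from_matrix(work_col, matrix, 1)
!>      ! ... fill x ..., then
!>      CALL dbcsr_matrix_colvec_multiply(matrix, x, y, alpha, beta, work_row, work_col)
!> \endcode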
! **************************************************************************************************

MODULE dbcsr_vector
   USE dbcsr_api,                       ONLY: dbcsr_copy,&
                                              dbcsr_create,&
                                              dbcsr_distribution_get,&
                                              dbcsr_distribution_new,&
                                              dbcsr_distribution_release,&
                                              dbcsr_distribution_type,&
                                              dbcsr_get_data_p,&
                                              dbcsr_get_info,&
                                              dbcsr_iterator_blocks_left,&
                                              dbcsr_iterator_next_block,&
                                              dbcsr_iterator_start,&
                                              dbcsr_iterator_stop,&
                                              dbcsr_iterator_type,&
                                              dbcsr_release,&
                                              dbcsr_reserve_all_blocks,&
                                              dbcsr_set,&
                                              dbcsr_type,&
                                              dbcsr_type_antisymmetric,&
                                              dbcsr_type_complex_4,&
                                              dbcsr_type_complex_8,&
                                              dbcsr_type_no_symmetry,&
                                              dbcsr_type_real_4,&
                                              dbcsr_type_real_8,&
                                              dbcsr_type_symmetric
   USE kinds,                           ONLY: real_4,&
                                              real_8
   USE message_passing,                 ONLY: mp_bcast,&
                                              mp_sum


#include "../base/base_uses.f90"

!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads

   IMPLICIT NONE

   PRIVATE

   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'dbcsr_vector'

   ! the following types provide fast access to distributed dbcsr vectors
#include "hash_table_types.f90"

   TYPE block_ptr_d
      REAL(real_8), DIMENSION(:, :), POINTER          :: ptr => NULL()
      INTEGER                                         :: assigned_thread
   END TYPE
   TYPE block_ptr_s
      REAL(real_4), DIMENSION(:, :), POINTER          :: ptr => NULL()
      INTEGER                                         :: assigned_thread
   END TYPE
   TYPE block_ptr_c
      COMPLEX(real_4), DIMENSION(:, :), POINTER       :: ptr => NULL()
      INTEGER                                         :: assigned_thread
   END TYPE
   TYPE block_ptr_z
      COMPLEX(real_8), DIMENSION(:, :), POINTER       :: ptr => NULL()
      INTEGER                                         :: assigned_thread
   END TYPE

   TYPE fast_vec_access_type
      TYPE(hash_table_type) :: hash_table
      TYPE(block_ptr_d), DIMENSION(:), ALLOCATABLE :: blk_map_d
      TYPE(block_ptr_s), DIMENSION(:), ALLOCATABLE :: blk_map_s
      TYPE(block_ptr_c), DIMENSION(:), ALLOCATABLE :: blk_map_c
      TYPE(block_ptr_z), DIMENSION(:), ALLOCATABLE :: blk_map_z
   END TYPE
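   ! A fast_vec_access_type caches direct access to the blocks of a work vector: the hash table
   ! maps a block row index (for column vectors) or block column index (for row vectors) to a slot
   ! of the matching blk_map_* array, whose entries point straight into the dbcsr block data and
   ! record which OpenMP thread is assigned to that block. This avoids repeated block lookups in
   ! the inner loops of the multiply routines below.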

   PUBLIC :: dbcsr_matrix_colvec_multiply, &
             create_col_vec_from_matrix, &
             create_row_vec_from_matrix, &
             create_replicated_col_vec_from_matrix, &
             create_replicated_row_vec_from_matrix

   INTERFACE dbcsr_matrix_colvec_multiply
      MODULE PROCEDURE dbcsr_matrix_colvec_multiply_d
      MODULE PROCEDURE dbcsr_matrix_colvec_multiply_s
      MODULE PROCEDURE dbcsr_matrix_colvec_multiply_z
      MODULE PROCEDURE dbcsr_matrix_colvec_multiply_c
   END INTERFACE

CONTAINS

! **************************************************************************************************
!> \brief Creates a dbcsr column-vector-like object which lives on proc_col 0
!>        and has the same row distribution as the template matrix.
!>        The returned matrix is fully allocated and all blocks are set to 0;
!>        this is not a sparse object (and must never be).
!> \param dbcsr_vec  the vector object to create; must be allocated but not initialized
!> \param matrix a dbcsr matrix used as template
!> \param ncol number of vectors in the object (1 for a vector, n for a skinny matrix)
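!> \par Example
!>      A hypothetical call (S and v are placeholder names) creating a single column vector
!>      shaped after the rows of S:
!> \code
!>      TYPE(dbcsr_type) :: S, v
!>      CALL create_col_vec_from_matrix(v, S, 1)
!>      ! ... use v ..., then
!>      CALL dbcsr_release(v)
!> \endcode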
! **************************************************************************************************
   SUBROUTINE create_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
      TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
      INTEGER                                            :: ncol

      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_col_vec_from_matrix'

      INTEGER                                            :: handle, npcols, data_type
      INTEGER, DIMENSION(:), POINTER                     :: row_blk_size, col_blk_size, row_dist, col_dist
      TYPE(dbcsr_distribution_type)                      :: dist_col_vec, dist

      CALL timeset(routineN, handle)

      CALL dbcsr_get_info(matrix, data_type=data_type, row_blk_size=row_blk_size, distribution=dist)
      CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)

      ALLOCATE (col_dist(1), col_blk_size(1))
      col_dist(1) = 0
      col_blk_size(1) = ncol
      CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)

      CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, &
                        matrix_type=dbcsr_type_no_symmetry, &
                        row_blk_size=row_blk_size, &
                        col_blk_size=col_blk_size, &
                        nze=0, data_type=data_type)
      CALL dbcsr_reserve_all_blocks(dbcsr_vec)

      CALL dbcsr_distribution_release(dist_col_vec)
      DEALLOCATE (col_dist, col_blk_size)
      CALL timestop(handle)

   END SUBROUTINE create_col_vec_from_matrix

! **************************************************************************************************
!> \brief Creates a dbcsr row-vector-like object which lives on proc_row 0
!>        and has the same column distribution as the template matrix.
!>        The returned matrix is fully allocated and all blocks are set to 0;
!>        this is not a sparse object (and must never be).
!> \param dbcsr_vec  the vector object to create; must be allocated but not initialized
!> \param matrix a dbcsr matrix used as template
!> \param nrow number of vectors in the object (1 for a vector, n for a skinny matrix)
! **************************************************************************************************
   SUBROUTINE create_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
      TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
      INTEGER                                            :: nrow

      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_row_vec_from_matrix'

      INTEGER                                            :: handle, nprows, data_type
      INTEGER, DIMENSION(:), POINTER                     :: row_blk_size, col_blk_size, row_dist, col_dist
      TYPE(dbcsr_distribution_type)                      :: dist_row_vec, dist

      CALL timeset(routineN, handle)

      CALL dbcsr_get_info(matrix, data_type=data_type, col_blk_size=col_blk_size, distribution=dist)
      CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)

      ALLOCATE (row_dist(1), row_blk_size(1))
      row_dist(1) = 0
      row_blk_size(1) = nrow
      CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)

      CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, &
                        matrix_type=dbcsr_type_no_symmetry, &
                        row_blk_size=row_blk_size, &
                        col_blk_size=col_blk_size, &
                        nze=0, data_type=data_type)
      CALL dbcsr_reserve_all_blocks(dbcsr_vec)

      CALL dbcsr_distribution_release(dist_row_vec)
      DEALLOCATE (row_dist, row_blk_size)
      CALL timestop(handle)

   END SUBROUTINE create_row_vec_from_matrix


! **************************************************************************************************
!> \brief Creates a column-vector-like object whose blocks can be replicated
!>        along the processor row and which has the same row distribution as the template matrix.
!>        The returned matrix is fully allocated and all blocks are set to 0;
!>        this is not a sparse object (and must never be).
!> \param dbcsr_vec the vector object to create; must be allocated but not initialized
!> \param matrix a dbcsr matrix used as template
!> \param ncol number of vectors in the object (1 for a vector, n for a skinny matrix)
! **************************************************************************************************
   SUBROUTINE create_replicated_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
      TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
      INTEGER                                            :: ncol

      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_col_vec_from_matrix'

      INTEGER                                            :: handle, npcols, data_type, i
      INTEGER, DIMENSION(:), POINTER                     :: row_blk_size, col_blk_size, row_dist, col_dist
      TYPE(dbcsr_distribution_type)                      :: dist_col_vec, dist

      CALL timeset(routineN, handle)

      CALL dbcsr_get_info(matrix, data_type=data_type, row_blk_size=row_blk_size, distribution=dist)
      CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)

      ALLOCATE (col_dist(npcols), col_blk_size(npcols))
      col_blk_size(:) = ncol
      DO i = 0, npcols-1
         col_dist(i+1) = i
      END DO
      CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)

      CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, &
                        matrix_type=dbcsr_type_no_symmetry, &
                        row_blk_size=row_blk_size, &
                        col_blk_size=col_blk_size, &
                        nze=0, data_type=data_type)
      CALL dbcsr_reserve_all_blocks(dbcsr_vec)

      CALL dbcsr_distribution_release(dist_col_vec)
      DEALLOCATE (col_dist, col_blk_size)
      CALL timestop(handle)

   END SUBROUTINE create_replicated_col_vec_from_matrix

! **************************************************************************************************
!> \brief Creates a row-vector-like object whose blocks can be replicated
!>        along the processor column and which has the same column distribution as the template matrix.
!>        The returned matrix is fully allocated and all blocks are set to 0;
!>        this is not a sparse object (and must never be).
!> \param dbcsr_vec the vector object to create; must be allocated but not initialized
!> \param matrix a dbcsr matrix used as template
!> \param nrow number of vectors in the object (1 for a vector, n for a skinny matrix)
! **************************************************************************************************
   SUBROUTINE create_replicated_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
      TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
      INTEGER                                            :: nrow

      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_row_vec_from_matrix'

      INTEGER                                            :: handle, i, nprows, data_type
      INTEGER, DIMENSION(:), POINTER                     :: row_dist, col_dist, row_blk_size, col_blk_size
      TYPE(dbcsr_distribution_type)                      :: dist_row_vec, dist

      CALL timeset(routineN, handle)

      CALL dbcsr_get_info(matrix, distribution=dist, col_blk_size=col_blk_size, data_type=data_type)
      CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)

      ALLOCATE (row_dist(nprows), row_blk_size(nprows))
      row_blk_size(:) = nrow
      DO i = 0, nprows-1
         row_dist(i+1) = i
      END DO
      CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)

      CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, dbcsr_type_no_symmetry, &
                        row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
                        nze=0, data_type=data_type)
      CALL dbcsr_reserve_all_blocks(dbcsr_vec)

      CALL dbcsr_distribution_release(dist_row_vec)
      DEALLOCATE (row_dist, row_blk_size)
      CALL timestop(handle)

   END SUBROUTINE create_replicated_row_vec_from_matrix

! **************************************************************************************************
!> \brief given a column vector, prepare the fast_vec_access container
!> \param vec ...
!> \param fast_vec_access ...
! **************************************************************************************************
   SUBROUTINE create_fast_col_vec_access(vec, fast_vec_access)
      TYPE(dbcsr_type)                                   :: vec
      TYPE(fast_vec_access_type)                         :: fast_vec_access

      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_col_vec_access'

      INTEGER                                            :: handle, data_type

      CALL timeset(routineN, handle)

      CALL dbcsr_get_info(vec, data_type=data_type)

      SELECT CASE (data_type)
      CASE (dbcsr_type_real_8)
         CALL create_fast_col_vec_access_d(vec, fast_vec_access)
      CASE (dbcsr_type_real_4)
         CALL create_fast_col_vec_access_s(vec, fast_vec_access)
      CASE (dbcsr_type_complex_8)
         CALL create_fast_col_vec_access_z(vec, fast_vec_access)
      CASE (dbcsr_type_complex_4)
         CALL create_fast_col_vec_access_c(vec, fast_vec_access)
      END SELECT

      CALL timestop(handle)

   END SUBROUTINE create_fast_col_vec_access

! **************************************************************************************************
!> \brief given a row vector, prepare the fast_vec_access container
!> \param vec ...
!> \param fast_vec_access ...
! **************************************************************************************************
   SUBROUTINE create_fast_row_vec_access(vec, fast_vec_access)
      TYPE(dbcsr_type)                                   :: vec
      TYPE(fast_vec_access_type)                         :: fast_vec_access

      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_row_vec_access'

      INTEGER                                            :: handle, data_type

      CALL timeset(routineN, handle)

      CALL dbcsr_get_info(vec, data_type=data_type)

      SELECT CASE (data_type)
      CASE (dbcsr_type_real_8)
         CALL create_fast_row_vec_access_d(vec, fast_vec_access)
      CASE (dbcsr_type_real_4)
         CALL create_fast_row_vec_access_s(vec, fast_vec_access)
      CASE (dbcsr_type_complex_8)
         CALL create_fast_row_vec_access_z(vec, fast_vec_access)
      CASE (dbcsr_type_complex_4)
         CALL create_fast_row_vec_access_c(vec, fast_vec_access)
      END SELECT

      CALL timestop(handle)

   END SUBROUTINE create_fast_row_vec_access

! **************************************************************************************************
!> \brief release all memory associated with the fast_vec_access type
!> \param fast_vec_access ...
! **************************************************************************************************
   SUBROUTINE release_fast_vec_access(fast_vec_access)
      TYPE(fast_vec_access_type)                         :: fast_vec_access

      CHARACTER(LEN=*), PARAMETER :: routineN = 'release_fast_vec_access'

      INTEGER                                            :: handle

      CALL timeset(routineN, handle)

      CALL hash_table_release(fast_vec_access%hash_table)
      IF (ALLOCATED(fast_vec_access%blk_map_d)) DEALLOCATE (fast_vec_access%blk_map_d)
      IF (ALLOCATED(fast_vec_access%blk_map_s)) DEALLOCATE (fast_vec_access%blk_map_s)
      IF (ALLOCATED(fast_vec_access%blk_map_c)) DEALLOCATE (fast_vec_access%blk_map_c)
      IF (ALLOCATED(fast_vec_access%blk_map_z)) DEALLOCATE (fast_vec_access%blk_map_z)

      CALL timestop(handle)

   END SUBROUTINE release_fast_vec_access

#include "hash_table.f90"

#:set instances = [ ('s', 'REAL(kind=real_4)',    '0.0_real_4'), &
                    ('d', 'REAL(kind=real_8)',    '0.0_real_8'), &
                    ('c', 'COMPLEX(kind=real_4)', 'CMPLX(0.0, 0.0, real_4)'), &
                    ('z', 'COMPLEX(kind=real_8)', 'CMPLX(0.0, 0.0, real_8)') ]
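! The routines below are Fypp templates: the #:for loop that follows generates one instance per
! entry of the instances list above (s: REAL(real_4), d: REAL(real_8), c: COMPLEX(real_4),
! z: COMPLEX(real_8)).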

#:for nametype, type, zero in instances

! **************************************************************************************************
!> \brief The real driver routine for the multiply; not all symmetries are implemented yet.
!> \param matrix ...
!> \param vec_in ...
!> \param vec_out ...
!> \param alpha ...
!> \param beta ...
!> \param work_row ...
!> \param work_col ...
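!> \note All variants compute vec_out = beta*vec_out + alpha*matrix*vec_in. vec_in and vec_out are
!>       column vectors as produced by create_col_vec_from_matrix, while work_row and work_col are
!>       replicated work vectors, typically created with create_replicated_row_vec_from_matrix and
!>       create_replicated_col_vec_from_matrix from the same matrix.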
! **************************************************************************************************
  SUBROUTINE dbcsr_matrix_colvec_multiply_${nametype}$(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
    TYPE(dbcsr_type)                          :: matrix, vec_in, vec_out
    ${type}$                                  :: alpha, beta
    TYPE(dbcsr_type)                          :: work_row, work_col

    CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrix_colvec_multiply_low', &
      routineP = moduleN//':'//routineN

    CHARACTER                                :: matrix_type

    CALL dbcsr_get_info(matrix, matrix_type=matrix_type)

    SELECT CASE(matrix_type)
    CASE(dbcsr_type_no_symmetry)
       CALL dbcsr_matrix_vector_mult_${nametype}$(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
    CASE(dbcsr_type_symmetric)
       CALL dbcsr_sym_matrix_vector_mult_${nametype}$(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
    CASE(dbcsr_type_antisymmetric)
        ! Not yet implemented, should mainly be some prefactor magic, but who knows how antisymmetric matrices are stored???
       CPABORT("NYI, antisymmetric matrix not permitted")
    CASE DEFAULT
       CPABORT("Unknown matrix type, ...")
    END SELECT

  END SUBROUTINE dbcsr_matrix_colvec_multiply_${nametype}$

! **************************************************************************************************
!> \brief low level routines for matrix vector multiplies
!> \param matrix ...
!> \param vec_in ...
!> \param vec_out ...
!> \param alpha ...
!> \param beta ...
!> \param work_row ...
!> \param work_col ...
! **************************************************************************************************
  SUBROUTINE dbcsr_matrix_vector_mult_${nametype}$(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
    TYPE(dbcsr_type)                          :: matrix, vec_in, vec_out
    ${type}$                                  :: alpha, beta
    TYPE(dbcsr_type)                          :: work_row, work_col

    CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrix_vector_mult'

    INTEGER                                  :: col, mypcol, &
                                                myprow, &
                                                ncols, pcol_group, nrows, &
                                                prow_group, row, &
                                                handle, handle1, ithread
    ${type}$, DIMENSION(:), POINTER          :: data_vec
    ${type}$, DIMENSION(:, :), POINTER       :: data_d, vec_res
    TYPE(dbcsr_distribution_type)            :: dist
    TYPE(dbcsr_iterator_type)                :: iter
    TYPE(fast_vec_access_type)               :: fast_vec_row, fast_vec_col
    INTEGER                                  :: prow, pcol

    CALL timeset(routineN, handle)
    ithread=0

! Collect some data about the parallel environment. We will use it later to move the vector around
    CALL dbcsr_get_info(matrix, distribution=dist)
    CALL dbcsr_distribution_get(dist, prow_group=prow_group, pcol_group=pcol_group, myprow=myprow, mypcol=mypcol)

    CALL create_fast_row_vec_access(work_row, fast_vec_row)
    CALL create_fast_col_vec_access(work_col, fast_vec_col)

! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
    CALL dbcsr_col_vec_to_rep_row_${nametype}$(vec_in, work_col, work_row, fast_vec_col)

! Set the work vector for the results to 0
    CALL dbcsr_set(work_col, ${zero}$)

! Perform the local multiply. Here we exploit that the blocks are replicated on the mpi processes.
! Note that the input and result vectors are distributed differently (row wise and col wise, respectively)
    CALL timeset(routineN//"_local_mm", handle1)

!$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow) &
!$OMP          SHARED(matrix,fast_vec_col,fast_vec_row)
    !$ ithread = omp_get_thread_num ()
    CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
    DO WHILE (dbcsr_iterator_blocks_left(iter))
       CALL dbcsr_iterator_next_block(iter, row, col, data_d)
       prow=hash_table_get(fast_vec_col%hash_table,row)
       IF(fast_vec_col%blk_map_${nametype}$(prow)%assigned_thread .NE. ithread ) CYCLE
       pcol=hash_table_get(fast_vec_row%hash_table,col)
       fast_vec_col%blk_map_${nametype}$(prow)%ptr=fast_vec_col%blk_map_${nametype}$(prow)%ptr+&
            MATMUL(data_d,TRANSPOSE(fast_vec_row%blk_map_${nametype}$(pcol)%ptr))
    END DO
    CALL dbcsr_iterator_stop(iter)
!$OMP END PARALLEL

    CALL timestop(handle1)

! sum all the data onto the first processor col where the original vector is stored
    data_vec => dbcsr_get_data_p(work_col, select_data_type=${zero}$)
    CALL dbcsr_get_info(matrix=work_col, nfullrows_local=nrows, nfullcols_local=ncols)
    CALL mp_sum(data_vec(1:nrows*ncols), prow_group)

! Local copy on the first mpi col (as this is the location of the vec_res blocks) of the result vector
! from the replicated to the original vector. Let's play it safe and use the iterator
    CALL dbcsr_iterator_start(iter, vec_out)
    DO WHILE (dbcsr_iterator_blocks_left(iter))
       CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
       prow=hash_table_get(fast_vec_col%hash_table,row)
       IF(ASSOCIATED(fast_vec_col%blk_map_${nametype}$(prow)%ptr)) THEN
          vec_res(:, :)= beta*vec_res(:, :)+alpha*fast_vec_col%blk_map_${nametype}$(prow)%ptr(:,:)
       ELSE
          vec_res(:, :)= beta*vec_res(:, :)
       END IF
    END DO
    CALL dbcsr_iterator_stop(iter)

    CALL release_fast_vec_access(fast_vec_row)
    CALL release_fast_vec_access(fast_vec_col)

    CALL timestop(handle)

  END SUBROUTINE dbcsr_matrix_vector_mult_${nametype}$

! **************************************************************************************************
!> \brief low level routine for transposed matrix vector multiplies
!> \param matrix ...
!> \param vec_in ...
!> \param vec_out ...
!> \param alpha ...
!> \param beta ...
!> \param work_row ...
!> \param work_col ...
!> \param skip_diag ...
! **************************************************************************************************
  SUBROUTINE dbcsr_matrixT_vector_mult_${nametype}$(matrix, vec_in, vec_out, alpha, beta, work_row, work_col, skip_diag)
    TYPE(dbcsr_type)                          :: matrix, vec_in, vec_out
    ${type}$                                  :: alpha, beta
    TYPE(dbcsr_type)                          :: work_row, work_col
    LOGICAL                                   :: skip_diag

    CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrixT_vector_mult'

    INTEGER                                  :: col, col_size, mypcol, &
                                                myprow, &
                                                ncols, pcol_group, nrows, &
                                                prow_group, row, row_size, &
                                                handle, handle1, ithread
    ${type}$, DIMENSION(:), POINTER          :: data_vec
    ${type}$, DIMENSION(:, :), POINTER       :: data_d, vec_bl, vec_res
    TYPE(dbcsr_distribution_type)            :: dist
    TYPE(dbcsr_iterator_type)                :: iter

    TYPE(fast_vec_access_type)               :: fast_vec_row, fast_vec_col
    INTEGER                                  :: prow, pcol

    CALL timeset(routineN, handle)
    ithread=0

! Collect some data about the parallel environment. We will use it later to move the vector around
    CALL dbcsr_get_info(matrix, distribution=dist)
    CALL dbcsr_distribution_get(dist, prow_group=prow_group, pcol_group=pcol_group, myprow=myprow, mypcol=mypcol)

    CALL create_fast_row_vec_access(work_row, fast_vec_row)
    CALL create_fast_col_vec_access(work_col, fast_vec_col)

! Set the work vector for the results to 0
    CALL dbcsr_set(work_row, ${zero}$)

! Transfer the correct parts of the input vector to the replicated vector on proc_col 0
    CALL dbcsr_iterator_start(iter, vec_in)
    DO WHILE (dbcsr_iterator_blocks_left(iter))
       CALL dbcsr_iterator_next_block(iter, row, col, vec_bl, row_size=row_size, col_size=col_size)
       prow=hash_table_get(fast_vec_col%hash_table,row)
       fast_vec_col%blk_map_${nametype}$(prow)%ptr(1:row_size, 1:col_size)= vec_bl(1:row_size, 1:col_size)
    END DO
    CALL dbcsr_iterator_stop(iter)
! Replicate the data on all processors in the row
    data_vec => dbcsr_get_data_p(work_col, select_data_type=${zero}$)
    CALL mp_bcast(data_vec, 0, prow_group)

! Perform the local multiply. Here it is obvious why the vectors are replicated on the mpi rows and cols
    CALL timeset(routineN//"_local_mm", handle1)
    CALL dbcsr_get_info(matrix=work_col, nfullcols_local=ncols)
!$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,row_size,col_size,ithread,prow,pcol) &
!$OMP          SHARED(matrix,fast_vec_row,fast_vec_col,skip_diag,ncols)
    !$ ithread = omp_get_thread_num ()
    CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
    DO WHILE (dbcsr_iterator_blocks_left(iter))
       CALL dbcsr_iterator_next_block(iter, row, col, data_d, row_size=row_size, col_size=col_size)
       IF(skip_diag.AND.col==row)CYCLE
       prow=hash_table_get(fast_vec_col%hash_table,row)
       pcol=hash_table_get(fast_vec_row%hash_table,col)
       IF ( ASSOCIATED(fast_vec_row%blk_map_${nametype}$(pcol)%ptr) .AND. &
            ASSOCIATED(fast_vec_col%blk_map_${nametype}$(prow)%ptr) )THEN
          IF(fast_vec_row%blk_map_${nametype}$(pcol)%assigned_thread .NE. ithread ) CYCLE
          fast_vec_row%blk_map_${nametype}$(pcol)%ptr=fast_vec_row%blk_map_${nametype}$(pcol)%ptr+&
               MATMUL(TRANSPOSE(fast_vec_col%blk_map_${nametype}$(prow)%ptr),data_d)
       ELSE
          prow=hash_table_get(fast_vec_row%hash_table,row)
          pcol=hash_table_get(fast_vec_col%hash_table,col)
          IF(fast_vec_row%blk_map_${nametype}$(prow)%assigned_thread .NE. ithread ) CYCLE
          fast_vec_row%blk_map_${nametype}$(prow)%ptr=fast_vec_row%blk_map_${nametype}$(prow)%ptr+&
             MATMUL(TRANSPOSE(fast_vec_col%blk_map_${nametype}$(pcol)%ptr),TRANSPOSE(data_d))
       END IF
    END DO
    CALL dbcsr_iterator_stop(iter)
!$OMP END PARALLEL

    CALL timestop(handle1)

! sum all the data within a processor column to obtain the replicated result
    data_vec => dbcsr_get_data_p(work_row, select_data_type=${zero}$)
! we use the replicated vector but the final answer is only summed to proc_col 0 for efficiency
    CALL dbcsr_get_info(matrix=work_row, nfullrows_local=nrows, nfullcols_local=ncols)
    CALL mp_sum(data_vec(1:nrows*ncols), pcol_group)

! Convert the result to a column wise distribution
    CALL dbcsr_rep_row_to_rep_col_vec_${nametype}$(work_col, work_row, fast_vec_row)

! Create the final vector by summing it to the result vector which lives on proc_col 0
    CALL dbcsr_iterator_start(iter, vec_out)
    DO WHILE (dbcsr_iterator_blocks_left(iter))
       CALL dbcsr_iterator_next_block(iter, row, col, vec_res, row_size=row_size)
       prow=hash_table_get(fast_vec_col%hash_table,row)
       IF(ASSOCIATED(fast_vec_col%blk_map_${nametype}$(prow)%ptr)) THEN
          vec_res(:, :)= beta*vec_res(:, :)+alpha*fast_vec_col%blk_map_${nametype}$(prow)%ptr(:,:)
       ELSE
          vec_res(:, :)= beta*vec_res(:, :)
       END IF
    END DO
    CALL dbcsr_iterator_stop(iter)

    CALL release_fast_vec_access(fast_vec_row)
    CALL release_fast_vec_access(fast_vec_col)

    CALL timestop(handle)

  END SUBROUTINE dbcsr_matrixT_vector_mult_${nametype}$

! **************************************************************************************************
!> \brief transfers a column vector into its replicated column and replicated row counterparts
!> \param vec_in ...
!> \param rep_col_vec ...
!> \param rep_row_vec ...
!> \param fast_vec_col ...
! **************************************************************************************************
  SUBROUTINE dbcsr_col_vec_to_rep_row_${nametype}$(vec_in, rep_col_vec, rep_row_vec, fast_vec_col)
    TYPE(dbcsr_type)                          :: vec_in, rep_col_vec, &
                                                rep_row_vec
    TYPE(fast_vec_access_type), INTENT(IN)   :: fast_vec_col

    CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_col_vec_to_rep_row'

    INTEGER                                  :: col, mypcol, myprow, ncols, &
                                                nrows, pcol_group, &
                                                prow_group, row, handle
    INTEGER, DIMENSION(:), POINTER           :: local_cols, row_dist
    ${type}$, DIMENSION(:), POINTER          :: data_vec, data_vec_rep
    ${type}$, DIMENSION(:, :), POINTER       :: vec_row
    TYPE(dbcsr_distribution_type)            :: dist_in, dist_rep_col
    TYPE(dbcsr_iterator_type)                :: iter

    CALL timeset(routineN, handle)

! get information about the parallel environment
    CALL dbcsr_get_info(vec_in, distribution=dist_in)
    CALL dbcsr_distribution_get(dist_in, &
                                prow_group=prow_group, &
                                pcol_group=pcol_group, &
                                myprow=myprow, &
                                mypcol=mypcol)

! Get the vector which tells us which blocks are local to which processor row in the col vec
    CALL dbcsr_get_info(rep_col_vec, distribution=dist_rep_col)
    CALL dbcsr_distribution_get(dist_rep_col, row_dist=row_dist)

! Copy the local vector to the replicated one on the first processor column (this is where vec_in lives)
    CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
    data_vec_rep => dbcsr_get_data_p(rep_col_vec, select_data_type=${zero}$)
    data_vec => dbcsr_get_data_p(vec_in, select_data_type=${zero}$)
    IF(mypcol==0)data_vec_rep(1:nrows*ncols)=data_vec(1:nrows*ncols)
! Replicate the data along the row
    CALL mp_bcast(data_vec_rep(1:nrows*ncols), 0, prow_group)

! Here it gets a bit tricky as we are dealing with two different parallel layouts:
! The rep_col_vec contains all blocks local to the row distribution of the vector.
! The rep_row_vec only needs the fraction which is local to the col distribution.
! However, in most cases this won't be the complete set of blocks which can be obtained from col_vector p_row i.
! Anyway, as the blocks don't repeat in the col_vec, a different fraction of the row vec will be available
! on every replica in the processor column; by summing along the column we end up with the complete vector everywhere.
! Hope this clarifies the idea
    CALL dbcsr_set(rep_row_vec, ${zero}$)
    CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, local_cols=local_cols, nfullcols_local=ncols)
    CALL dbcsr_iterator_start(iter, rep_row_vec)
    DO WHILE (dbcsr_iterator_blocks_left(iter))
       CALL dbcsr_iterator_next_block(iter, row, col, vec_row)
       IF(row_dist(col)==myprow)THEN
          vec_row=TRANSPOSE(fast_vec_col%blk_map_${nametype}$(hash_table_get(fast_vec_col%hash_table,col))%ptr)
       END IF
    END DO
    CALL dbcsr_iterator_stop(iter)
    CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, nfullcols_local=ncols)
    data_vec_rep => dbcsr_get_data_p(rep_row_vec, select_data_type=${zero}$)
    CALL mp_sum(data_vec_rep(1:ncols*nrows), pcol_group)

    CALL timestop(handle)

  END SUBROUTINE dbcsr_col_vec_to_rep_row_${nametype}$

! **************************************************************************************************
!> \brief converts a replicated row vector into a replicated column vector
!> \param rep_col_vec ...
!> \param rep_row_vec ...
!> \param fast_vec_row ...
!> \param fast_vec_col_add ...
! **************************************************************************************************
  SUBROUTINE dbcsr_rep_row_to_rep_col_vec_${nametype}$(rep_col_vec, rep_row_vec, fast_vec_row, fast_vec_col_add)
    TYPE(dbcsr_type)                          :: rep_col_vec, rep_row_vec
    TYPE(fast_vec_access_type), OPTIONAL     :: fast_vec_col_add
    TYPE(fast_vec_access_type)               :: fast_vec_row

    CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_rep_row_to_rep_col_vec'

    INTEGER                                  :: col, mypcol, myprow, ncols, &
                                                nrows, pcol_group, &
                                                prow_group, row, handle
    INTEGER, DIMENSION(:), POINTER           :: col_dist
    ${type}$, DIMENSION(:), POINTER          :: data_vec_rep
    ${type}$, DIMENSION(:, :), POINTER       :: vec_col
    TYPE(dbcsr_distribution_type)            :: dist_rep_row, dist_rep_col
    TYPE(dbcsr_iterator_type)                :: iter

    CALL timeset(routineN, handle)

! get information about the parallel environment
    CALL dbcsr_get_info(matrix=rep_col_vec, distribution=dist_rep_col)
    CALL dbcsr_distribution_get(dist_rep_col, &
                                prow_group=prow_group, &
                                pcol_group=pcol_group, &
                                myprow=myprow, &
                                mypcol=mypcol)

! Get the vector which tells us which blocks are local to which processor col in the row vec
    CALL dbcsr_get_info(matrix=rep_row_vec, distribution=dist_rep_row)
    CALL dbcsr_distribution_get(dist_rep_row, col_dist=col_dist)

! The same trick as described above, in the opposite direction
    CALL dbcsr_set(rep_col_vec, ${zero}$)
    CALL dbcsr_iterator_start(iter, rep_col_vec)
    DO WHILE (dbcsr_iterator_blocks_left(iter))
       CALL dbcsr_iterator_next_block(iter, row, col, vec_col)
       IF(col_dist(row)==mypcol)THEN
          vec_col=TRANSPOSE(fast_vec_row%blk_map_${nametype}$(hash_table_get(fast_vec_row%hash_table,row))%ptr)
       END IF
       ! this one is special and allows adding the elements of a not yet summed replicated
       ! column vector as it appears in M*V(row_rep) as result. This saves an mp_sum in the symmetric case
       IF(PRESENT(fast_vec_col_add))vec_col=vec_col+&
            fast_vec_col_add%blk_map_${nametype}$(hash_table_get(fast_vec_col_add%hash_table,row))%ptr(:,:)
    END DO
    CALL dbcsr_iterator_stop(iter)
    CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
    data_vec_rep => dbcsr_get_data_p(rep_col_vec, select_data_type=${zero}$)
    CALL mp_sum(data_vec_rep(1:nrows*ncols), prow_group)

    CALL timestop(handle)

  END SUBROUTINE dbcsr_rep_row_to_rep_col_vec_${nametype}$


! **************************************************************************************************
!> \brief given a column vector, prepare the fast_vec_access container
!> \param vec ...
!> \param fast_vec_access ...
! **************************************************************************************************
  SUBROUTINE create_fast_col_vec_access_${nametype}$(vec, fast_vec_access)
    TYPE(dbcsr_type)                          :: vec
    TYPE(fast_vec_access_type)               :: fast_vec_access

    CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_col_vec_access_${nametype}$'

    INTEGER                                  :: handle, nblk_local
    INTEGER                                  :: col, row, iblock, nthreads
    ${type}$, DIMENSION(:, :), POINTER       :: vec_bl
    TYPE(dbcsr_iterator_type)                :: iter

    CALL timeset(routineN, handle)

    ! figure out the number of threads
    nthreads = 1
!$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
!$OMP MASTER
    !$ nthreads = OMP_GET_NUM_THREADS()
!$OMP END MASTER
!$OMP END PARALLEL

    CALL dbcsr_get_info(matrix=vec, nblkrows_local=nblk_local)
    ! 4 times makes sure the table is big enough to limit collisions.
    CALL hash_table_create(fast_vec_access%hash_table,4*nblk_local)
    ! include zero for effective dealing with values not in the hash table (will return 0)
    ALLOCATE(fast_vec_access%blk_map_${nametype}$(0:nblk_local))

    CALL dbcsr_get_info(matrix=vec, nblkcols_local=col)
    IF (col.GT.1) CPABORT("BUG")

    ! go through the blocks of the vector
    iblock=0
    CALL dbcsr_iterator_start(iter, vec)
    DO WHILE (dbcsr_iterator_blocks_left(iter))
       CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
       iblock=iblock+1
       CALL hash_table_add(fast_vec_access%hash_table,row,iblock)
       fast_vec_access%blk_map_${nametype}$(iblock)%ptr=>vec_bl
       fast_vec_access%blk_map_${nametype}$(iblock)%assigned_thread=MOD(iblock,nthreads)
    END DO
    CALL dbcsr_iterator_stop(iter)

    CALL timestop(handle)

  END SUBROUTINE create_fast_col_vec_access_${nametype}$

! **************************************************************************************************
!> \brief given a row vector, prepare the fast_vec_access container
!> \param vec ...
!> \param fast_vec_access ...
! **************************************************************************************************
  SUBROUTINE create_fast_row_vec_access_${nametype}$(vec, fast_vec_access)
    TYPE(dbcsr_type)                          :: vec
    TYPE(fast_vec_access_type)                :: fast_vec_access

    CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_row_vec_access_${nametype}$'

    INTEGER                                  :: handle, nblk_local
    INTEGER                                  :: col, row, iblock, nthreads
    ${type}$, DIMENSION(:, :), POINTER       :: vec_bl
    TYPE(dbcsr_iterator_type)                :: iter

    CALL timeset(routineN, handle)

    ! figure out the number of threads
    nthreads = 1
!$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
!$OMP MASTER
    !$ nthreads = OMP_GET_NUM_THREADS()
!$OMP END MASTER
!$OMP END PARALLEL

    CALL dbcsr_get_info(matrix=vec, nblkcols_local=nblk_local)
    ! 4 times makes sure the table is big enough to limit collisions.
    CALL hash_table_create(fast_vec_access%hash_table,4*nblk_local)
    ! include zero for effective dealing with values not in the hash table (will return 0)
    ALLOCATE(fast_vec_access%blk_map_${nametype}$(0:nblk_local))

    ! sanity check
    CALL dbcsr_get_info(matrix=vec, nblkrows_local=row)
    IF (row.GT.1) CPABORT("BUG")

    ! go through the blocks of the vector
    iblock=0
    CALL dbcsr_iterator_start(iter, vec)
    DO WHILE (dbcsr_iterator_blocks_left(iter))
       CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
       iblock=iblock+1
       CALL hash_table_add(fast_vec_access%hash_table,col,iblock)
       fast_vec_access%blk_map_${nametype}$(iblock)%ptr=>vec_bl
       fast_vec_access%blk_map_${nametype}$(iblock)%assigned_thread=MOD(iblock,nthreads)
    END DO
    CALL dbcsr_iterator_stop(iter)

    CALL timestop(handle)

  END SUBROUTINE create_fast_row_vec_access_${nametype}$

! **************************************************************************************************
!> \brief low level routine for symmetric matrix vector multiplies
!> \param matrix ...
!> \param vec_in ...
!> \param vec_out ...
!> \param alpha ...
!> \param beta ...
!> \param work_row ...
!> \param work_col ...
! **************************************************************************************************
  SUBROUTINE dbcsr_sym_matrix_vector_mult_${nametype}$(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
    TYPE(dbcsr_type)                          :: matrix, vec_in, vec_out
    ${type}$                                  :: alpha, beta
    TYPE(dbcsr_type)                          :: work_row, work_col

    CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_sym_m_v_mult'

    INTEGER                                  :: col, mypcol, &
                                                myprow, &
                                                pcol_group, nrows, ncols,&
                                                prow_group, row, &
                                                handle, handle1, ithread, vec_dim
    ${type}$, DIMENSION(:), POINTER          :: data_vec
    ${type}$, DIMENSION(:, :), POINTER       :: data_d, vec_res
    TYPE(dbcsr_distribution_type)            :: dist
    TYPE(dbcsr_iterator_type)                :: iter
    TYPE(dbcsr_type)                         :: result_row, result_col

    TYPE(fast_vec_access_type)               :: fast_vec_row, fast_vec_col, res_fast_vec_row, res_fast_vec_col
    INTEGER                                  :: prow, pcol, rprow, rpcol

    CALL timeset(routineN, handle)
    ithread=0
! We need some work matrices as we try to exploit operations on the replicated vectors which are duplicated otherwise
    CALL dbcsr_get_info(matrix=vec_in,nfullcols_total=vec_dim)
! This is a performance hack as the new creation of a replicated vector is a fair bit more expensive
    CALL dbcsr_set(work_col, ${zero}$)
    CALL dbcsr_copy(result_col, work_col)
    CALL dbcsr_set(work_row, ${zero}$)
    CALL dbcsr_copy(result_row, work_row)

! Collect some data about the parallel environment. We will use it later to move the vector around
    CALL dbcsr_get_info(matrix=matrix, distribution=dist)
    CALL dbcsr_distribution_get(dist, prow_group=prow_group, pcol_group=pcol_group, myprow=myprow, mypcol=mypcol)

    CALL create_fast_row_vec_access(work_row, fast_vec_row)
    CALL create_fast_col_vec_access(work_col, fast_vec_col)
    CALL create_fast_row_vec_access(result_row, res_fast_vec_row)
    CALL create_fast_col_vec_access(result_col, res_fast_vec_col)

! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
    CALL dbcsr_col_vec_to_rep_row_${nametype}$(vec_in, work_col, work_row, fast_vec_col)

! Probably I should rename the routine above as it delivers both the replicated row and column vector

! Perform the local multiply. Here we exploit that the blocks are replicated on the mpi processes.
! Note that the input and result vectors are distributed differently (row wise and col wise, respectively)
    CALL timeset(routineN//"_local_mm", handle1)

!------ perform the multiplication, we have to take care to use the correct blocks ----------

!$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow,rpcol,rprow) &
!$OMP          SHARED(matrix,fast_vec_row,res_fast_vec_col,res_fast_vec_row,fast_vec_col)
    !$ ithread = omp_get_thread_num ()
    CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
    DO WHILE (dbcsr_iterator_blocks_left(iter))
       CALL dbcsr_iterator_next_block(iter, row, col, data_d)
       pcol=hash_table_get(fast_vec_row%hash_table,col)
       rprow=hash_table_get(res_fast_vec_col%hash_table,row)
       IF(ASSOCIATED(fast_vec_row%blk_map_${nametype}$(pcol)%ptr) .AND.&
          ASSOCIATED(res_fast_vec_col%blk_map_${nametype}$(rprow)%ptr))THEN
          IF(res_fast_vec_col%blk_map_${nametype}$(rprow)%assigned_thread .EQ. ithread ) THEN
             res_fast_vec_col%blk_map_${nametype}$(rprow)%ptr=res_fast_vec_col%blk_map_${nametype}$(rprow)%ptr+&
               MATMUL(data_d,TRANSPOSE(fast_vec_row%blk_map_${nametype}$(pcol)%ptr))
          END IF
          prow=hash_table_get(fast_vec_col%hash_table,row)
          rpcol=hash_table_get(res_fast_vec_row%hash_table,col)
          IF(res_fast_vec_row%blk_map_${nametype}$(rpcol)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
             res_fast_vec_row%blk_map_${nametype}$(rpcol)%ptr=res_fast_vec_row%blk_map_${nametype}$(rpcol)%ptr+&
                MATMUL(TRANSPOSE(fast_vec_col%blk_map_${nametype}$(prow)%ptr),data_d)
          END IF
       ELSE
          rpcol=hash_table_get(res_fast_vec_col%hash_table,col)
          prow=hash_table_get(fast_vec_row%hash_table,row)
          IF(res_fast_vec_col%blk_map_${nametype}$(rpcol)%assigned_thread .EQ. ithread ) THEN
             res_fast_vec_col%blk_map_${nametype}$(rpcol)%ptr=res_fast_vec_col%blk_map_${nametype}$(rpcol)%ptr+&
                TRANSPOSE(MATMUL(fast_vec_row%blk_map_${nametype}$(prow)%ptr,data_d))
          END IF
          rprow=hash_table_get(res_fast_vec_row%hash_table,row)
          pcol=hash_table_get(fast_vec_col%hash_table,col)
          IF(res_fast_vec_row%blk_map_${nametype}$(rprow)%assigned_thread .EQ. ithread  .AND. row .NE. col ) THEN
             res_fast_vec_row%blk_map_${nametype}$(rprow)%ptr=res_fast_vec_row%blk_map_${nametype}$(rprow)%ptr+&
                TRANSPOSE(MATMUL(data_d,fast_vec_col%blk_map_${nametype}$(pcol)%ptr))
          END IF
       END IF
    END DO
    CALL dbcsr_iterator_stop(iter)
!$OMP END PARALLEL

    CALL timestop(handle1)

    ! sum all the data within a processor column to obtain the replicated result from the lower half
    data_vec => dbcsr_get_data_p(result_row, select_data_type=${zero}$)
    CALL dbcsr_get_info(matrix=result_row, nfullrows_local=nrows, nfullcols_local=ncols)

    CALL mp_sum(data_vec(1:nrows*ncols), pcol_group)

! Convert the results to a column wise distribution. This is a bit involved as the result_row is fully
! replicated, while the result_col still has the partial results in parallel. The routine below takes care
! of that and saves an mp_sum: the res_row vectors are copied only taking the appropriate elements
! (0 otherwise), while the res_col parallel bits are added locally. The mp_sum then creates the correct vector.
    CALL dbcsr_rep_row_to_rep_col_vec_${nametype}$(work_col, result_row, res_fast_vec_row, res_fast_vec_col)

! Create the final vector by summing it to the result vector which lives on proc_col 0
    CALL dbcsr_iterator_start(iter, vec_out)
    DO WHILE (dbcsr_iterator_blocks_left(iter))
       CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
       prow=hash_table_get(fast_vec_col%hash_table,row)
       IF(ASSOCIATED(fast_vec_col%blk_map_${nametype}$(prow)%ptr))THEN
          vec_res(:, :)= beta*vec_res(:, :)+alpha*(fast_vec_col%blk_map_${nametype}$(prow)%ptr(:, :))
       ELSE
          vec_res(:, :)= beta*vec_res(:, :)
       END IF
    END DO
    CALL dbcsr_iterator_stop(iter)

    CALL release_fast_vec_access(fast_vec_row)
    CALL release_fast_vec_access(fast_vec_col)
    CALL release_fast_vec_access(res_fast_vec_row)
    CALL release_fast_vec_access(res_fast_vec_col)

    CALL dbcsr_release(result_row); CALL dbcsr_release(result_col)

    CALL timestop(handle)

  END SUBROUTINE dbcsr_sym_matrix_vector_mult_${nametype}$

#:endfor

END MODULE dbcsr_vector