1!--------------------------------------------------------------------------------------------------!
2!   CP2K: A general program to perform molecular dynamics simulations                              !
3!   Copyright (C) 2000 - 2019  CP2K developers group                                               !
4!--------------------------------------------------------------------------------------------------!
5
6! **************************************************************************************************
7!> \brief Rountines to calculate gradients of RI-GPW-MP2 energy using pw
8!> \par History
9!>      10.2013 created [Mauro Del Ben]
10! **************************************************************************************************
11MODULE mp2_ri_grad
12   USE atomic_kind_types,               ONLY: atomic_kind_type,&
13                                              get_atomic_kind_set
14   USE basis_set_types,                 ONLY: gto_basis_set_type
15   USE cell_types,                      ONLY: cell_type,&
16                                              pbc
17   USE cp_blacs_env,                    ONLY: cp_blacs_env_type
18   USE cp_control_types,                ONLY: dft_control_type
19   USE cp_dbcsr_operations,             ONLY: copy_dbcsr_to_fm,&
20                                              dbcsr_deallocate_matrix_set
21   USE cp_eri_mme_interface,            ONLY: cp_eri_mme_param
22   USE cp_fm_struct,                    ONLY: cp_fm_struct_create,&
23                                              cp_fm_struct_release,&
24                                              cp_fm_struct_type
25   USE cp_fm_types,                     ONLY: cp_fm_create,&
26                                              cp_fm_get_info,&
27                                              cp_fm_indxg2l,&
28                                              cp_fm_indxg2p,&
29                                              cp_fm_indxl2g,&
30                                              cp_fm_release,&
31                                              cp_fm_set_all,&
32                                              cp_fm_type
33   USE cp_gemm_interface,               ONLY: cp_gemm
34   USE cp_para_env,                     ONLY: cp_para_env_create,&
35                                              cp_para_env_release
36   USE cp_para_types,                   ONLY: cp_para_env_type
37   USE dbcsr_api,                       ONLY: &
38        dbcsr_add, dbcsr_copy, dbcsr_copy_into_existing, dbcsr_create, dbcsr_multiply, &
39        dbcsr_p_type, dbcsr_release, dbcsr_set, dbcsr_transposed, dbcsr_type, &
40        dbcsr_type_no_symmetry, dbcsr_type_symmetric
41   USE gaussian_gridlevels,             ONLY: gaussian_gridlevel
42   USE input_constants,                 ONLY: do_eri_gpw,&
43                                              do_eri_mme
44   USE kinds,                           ONLY: dp
45   USE mathconstants,                   ONLY: fourpi
46   USE message_passing,                 ONLY: &
47        mp_alltoall, mp_comm_split_direct, mp_irecv, mp_isend, mp_request_null, mp_sendrecv, &
48        mp_sum, mp_wait, mp_waitall
49   USE mp2_eri,                         ONLY: mp2_eri_2c_integrate,&
50                                              mp2_eri_3c_integrate,&
51                                              mp2_eri_deallocate_forces,&
52                                              mp2_eri_force
53   USE mp2_eri_gpw,                     ONLY: calc_potential_gpw,&
54                                              cleanup_gpw,&
55                                              prepare_gpw
56   USE mp2_types,                       ONLY: integ_mat_buffer_type,&
57                                              integ_mat_buffer_type_2D,&
58                                              mp2_type
59   USE orbital_pointers,                ONLY: ncoset
60   USE particle_types,                  ONLY: particle_type
61   USE pw_env_types,                    ONLY: pw_env_get,&
62                                              pw_env_type
63   USE pw_methods,                      ONLY: pw_copy,&
64                                              pw_derive,&
65                                              pw_integral_ab,&
66                                              pw_scale,&
67                                              pw_transfer
68   USE pw_poisson_methods,              ONLY: pw_poisson_solve
69   USE pw_poisson_types,                ONLY: pw_poisson_type
70   USE pw_pool_types,                   ONLY: pw_pool_create_pw,&
71                                              pw_pool_give_back_pw,&
72                                              pw_pool_type
73   USE pw_types,                        ONLY: COMPLEXDATA1D,&
74                                              REALDATA3D,&
75                                              REALSPACE,&
76                                              RECIPROCALSPACE,&
77                                              pw_p_type
78   USE qs_collocate_density,            ONLY: calculate_rho_elec,&
79                                              calculate_wavefunction
80   USE qs_environment_types,            ONLY: get_qs_env,&
81                                              qs_environment_type
82   USE qs_force_types,                  ONLY: qs_force_type
83   USE qs_integrate_potential,          ONLY: integrate_pgf_product_rspace,&
84                                              integrate_v_rspace
85   USE qs_kind_types,                   ONLY: get_qs_kind,&
86                                              qs_kind_type
87   USE qs_ks_types,                     ONLY: qs_ks_env_type
88   USE qs_neighbor_list_types,          ONLY: neighbor_list_set_p_type
89   USE realspace_grid_types,            ONLY: realspace_grid_desc_p_type,&
90                                              realspace_grid_p_type,&
91                                              rs_grid_release,&
92                                              rs_grid_retain
93   USE rs_pw_interface,                 ONLY: potential_pw2rs
94   USE task_list_types,                 ONLY: task_list_type
95   USE util,                            ONLY: get_limit
96   USE virial_types,                    ONLY: virial_type
97#include "./base/base_uses.f90"
98
99   IMPLICIT NONE
100
101   PRIVATE
102
103   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'mp2_ri_grad'
104
105   PUBLIC :: calc_ri_mp2_nonsep
106
107CONTAINS
108
109! **************************************************************************************************
110!> \brief Calculate the non-separable part of the gradients and update the
111!>        Lagrangian
112!> \param qs_env ...
113!> \param mp2_env ...
114!> \param para_env ...
115!> \param para_env_sub ...
116!> \param cell ...
117!> \param particle_set ...
118!> \param atomic_kind_set ...
119!> \param qs_kind_set ...
120!> \param mo_coeff ...
121!> \param nmo ...
122!> \param homo ...
123!> \param dimen_RI ...
124!> \param Eigenval ...
125!> \param my_group_L_start ...
126!> \param my_group_L_end ...
127!> \param my_group_L_size ...
128!> \param sab_orb_sub ...
129!> \param mat_munu ...
130!> \param blacs_env_sub ...
131!> \param Eigenval_beta ...
132!> \param homo_beta ...
133!> \param mo_coeff_beta ...
134!> \author Mauro Del Ben
135! **************************************************************************************************
136   SUBROUTINE calc_ri_mp2_nonsep(qs_env, mp2_env, para_env, para_env_sub, cell, particle_set, &
137                                 atomic_kind_set, qs_kind_set, mo_coeff, nmo, homo, dimen_RI, Eigenval, &
138                                 my_group_L_start, my_group_L_end, my_group_L_size, sab_orb_sub, mat_munu, &
139                                 blacs_env_sub, Eigenval_beta, homo_beta, mo_coeff_beta)
140      TYPE(qs_environment_type), POINTER                 :: qs_env
141      TYPE(mp2_type), POINTER                            :: mp2_env
142      TYPE(cp_para_env_type), POINTER                    :: para_env, para_env_sub
143      TYPE(cell_type), POINTER                           :: cell
144      TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
145      TYPE(atomic_kind_type), DIMENSION(:), POINTER      :: atomic_kind_set
146      TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
147      TYPE(cp_fm_type), POINTER                          :: mo_coeff
148      INTEGER, INTENT(IN)                                :: nmo, homo, dimen_RI
149      REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: Eigenval
150      INTEGER, INTENT(IN)                                :: my_group_L_start, my_group_L_end, &
151                                                            my_group_L_size
152      TYPE(neighbor_list_set_p_type), DIMENSION(:), &
153         POINTER                                         :: sab_orb_sub
154      TYPE(dbcsr_p_type), INTENT(INOUT)                  :: mat_munu
155      TYPE(cp_blacs_env_type), POINTER                   :: blacs_env_sub
156      REAL(KIND=dp), DIMENSION(:), INTENT(IN), OPTIONAL  :: Eigenval_beta
157      INTEGER, INTENT(IN), OPTIONAL                      :: homo_beta
158      TYPE(cp_fm_type), OPTIONAL, POINTER                :: mo_coeff_beta
159
160      CHARACTER(LEN=*), PARAMETER :: routineN = 'calc_ri_mp2_nonsep', &
161         routineP = moduleN//':'//routineN
162
163      INTEGER :: alpha, atom_a, beta, dimen, dir, eri_method, handle, handle2, handle3, i, iatom, &
164         igrid_level, ikind, iorb, ipgf, iset, itmp(2), L_counter, lb(3), LLL, location(3), &
165         my_P_end, my_P_size, my_P_start, na1, na2, natom, ncoa, nseta, offset, potential_type, &
166         sgfa, tp(3), ub(3), virtual, virtual_beta
167      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: atom_of_kind, kind_of
168      INTEGER, DIMENSION(3)                              :: comp
169      INTEGER, DIMENSION(:), POINTER                     :: la_max, la_min, npgfa, nsgfa
170      INTEGER, DIMENSION(:, :), POINTER                  :: first_sgfa
171      LOGICAL                                            :: alpha_beta, map_it_here, skip_shell, &
172                                                            use_virial
173      REAL(KIND=dp)                                      :: cutoff_old, e_hartree, eps_filter, &
174                                                            factor, omega, pair_energy, rab2, &
175                                                            relative_cutoff_old, total_rho
176      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: e_cutoff_old, wf_vector
177      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: G_PQ_local, I_ab
178      REAL(KIND=dp), DIMENSION(3)                        :: force_a, force_b, ra, rab
179      REAL(KIND=dp), DIMENSION(3, 3)                     :: h_stress, my_virial_a, my_virial_b
180      REAL(KIND=dp), DIMENSION(:), POINTER               :: set_radius_a
181      REAL(KIND=dp), DIMENSION(:, :), POINTER            :: I_tmp2, pab, rpgfa, sphi_a, zeta
182      TYPE(cp_eri_mme_param), POINTER                    :: eri_param
183      TYPE(cp_fm_struct_type), POINTER                   :: fm_struct_tmp
184      TYPE(cp_fm_type), POINTER                          :: L1_mu_i, L1_mu_i_beta, L2_nu_a, &
185                                                            L2_nu_a_beta
186      TYPE(dbcsr_p_type) :: Lag_mu_i_1, Lag_mu_i_1_beta, Lag_nu_a_2, Lag_nu_a_2_beta, &
187         matrix_P_inu, matrix_P_inu_beta, matrix_P_munu, matrix_P_munu_nosym
188      TYPE(dbcsr_p_type), DIMENSION(:), POINTER          :: G_P_ia, G_P_ia_beta, mat_munu_local, &
189                                                            matrix_P_munu_local
190      TYPE(dbcsr_type), POINTER                          :: mo_coeff_o, mo_coeff_o_beta, mo_coeff_v, &
191                                                            mo_coeff_v_beta
192      TYPE(dft_control_type), POINTER                    :: dft_control
193      TYPE(gto_basis_set_type), POINTER                  :: basis_set_a
194      TYPE(mp2_eri_force), ALLOCATABLE, DIMENSION(:)     :: force_2c, force_3c_aux, force_3c_orb_mu, &
195                                                            force_3c_orb_nu
196      TYPE(pw_env_type), POINTER                         :: pw_env_sub
197      TYPE(pw_p_type)                                    :: dvg(3), pot_g, psi_L, psi_L_beta, rho_g, &
198                                                            rho_r, temp_pw_g
199      TYPE(pw_poisson_type), POINTER                     :: poisson_env
200      TYPE(pw_pool_type), POINTER                        :: auxbas_pw_pool
201      TYPE(qs_force_type), DIMENSION(:), POINTER         :: force
202      TYPE(qs_ks_env_type), POINTER                      :: ks_env
203      TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
204         POINTER                                         :: rs_descs
205      TYPE(realspace_grid_p_type), DIMENSION(:), POINTER :: rs_v
206      TYPE(task_list_type), POINTER                      :: task_list_sub
207      TYPE(virial_type), POINTER                         :: virial
208
209      CALL timeset(routineN, handle)
210
211      eri_method = mp2_env%eri_method
212      eri_param => mp2_env%eri_mme_param
213
214      ! Find out whether we have a closed or open shell
215      alpha_beta = .FALSE.
216      IF (PRESENT(homo_beta) .AND. PRESENT(Eigenval_beta)) alpha_beta = .TRUE.
217
218      dimen = nmo
219      virtual = dimen - homo
220      potential_type = mp2_env%potential_parameter%potential_type
221      omega = mp2_env%potential_parameter%omega
222      IF (alpha_beta) virtual_beta = dimen - homo_beta
223      eps_filter = mp2_env%mp2_gpw%eps_filter
224      NULLIFY (mo_coeff_o, mo_coeff_v, G_P_ia, ks_env)
225      mo_coeff_o => mp2_env%ri_grad%mo_coeff_o
226      mo_coeff_v => mp2_env%ri_grad%mo_coeff_v
227      G_P_ia => mp2_env%ri_grad%G_P_ia
228
229      IF (alpha_beta) THEN
230         NULLIFY (mo_coeff_o_beta, mo_coeff_v_beta, G_P_ia_beta)
231         mo_coeff_o_beta => mp2_env%ri_grad%mo_coeff_o_beta
232         mo_coeff_v_beta => mp2_env%ri_grad%mo_coeff_v_beta
233         G_P_ia_beta => mp2_env%ri_grad%G_P_ia_beta
234      ENDIF
235
236      itmp = get_limit(dimen_RI, para_env_sub%num_pe, para_env_sub%mepos)
237      my_P_start = itmp(1)
238      my_P_end = itmp(2)
239      my_P_size = itmp(2) - itmp(1) + 1
240      ALLOCATE (G_PQ_local(dimen_RI, my_group_L_size))
241
242      G_PQ_local = 0.0_dp
243      IF (.NOT. alpha_beta) THEN
244         G_PQ_local(my_P_start:my_P_end, 1:my_group_L_size) = mp2_env%ri_grad%Gamma_PQ
245      ELSE
246         G_PQ_local(my_P_start:my_P_end, 1:my_group_L_size) = &
247            0.50_dp*(mp2_env%ri_grad%Gamma_PQ + mp2_env%ri_grad%Gamma_PQ_beta)
248      ENDIF
249      DEALLOCATE (mp2_env%ri_grad%Gamma_PQ)
250      IF (alpha_beta) THEN
251         DEALLOCATE (mp2_env%ri_grad%Gamma_PQ_beta)
252      ENDIF
253      CALL mp_sum(G_PQ_local, para_env_sub%group)
254
255      ! deallocate here PQ_half, maybe usefull in the future
256      ! This is really bad style
257      DEALLOCATE (mp2_env%ri_grad%PQ_half)
258
259      ! create matrix holding the back transformation (G_P_inu)
260      ALLOCATE (matrix_P_inu%matrix)
261      CALL dbcsr_create(matrix_P_inu%matrix, template=mo_coeff_o)
262      IF (alpha_beta) THEN
263         ALLOCATE (matrix_P_inu_beta%matrix)
264         CALL dbcsr_create(matrix_P_inu_beta%matrix, template=mo_coeff_o_beta)
265      ENDIF
266
267      ! non symmetric matrix
268      ALLOCATE (matrix_P_munu_nosym%matrix)
269      CALL dbcsr_create(matrix_P_munu_nosym%matrix, template=mat_munu%matrix, &
270                        matrix_type=dbcsr_type_no_symmetry)
271
272      ! create Lagrangian matrices in mixed AO/MO formalism
273      ALLOCATE (Lag_mu_i_1%matrix)
274      CALL dbcsr_create(Lag_mu_i_1%matrix, template=mo_coeff_o)
275      CALL dbcsr_set(Lag_mu_i_1%matrix, 0.0_dp)
276      IF (alpha_beta) THEN
277         ALLOCATE (Lag_mu_i_1_beta%matrix)
278         CALL dbcsr_create(Lag_mu_i_1_beta%matrix, template=mo_coeff_o_beta)
279         CALL dbcsr_set(Lag_mu_i_1_beta%matrix, 0.0_dp)
280      ENDIF
281
282      ALLOCATE (Lag_nu_a_2%matrix)
283      CALL dbcsr_create(Lag_nu_a_2%matrix, template=mo_coeff_v)
284      CALL dbcsr_set(Lag_nu_a_2%matrix, 0.0_dp)
285      IF (alpha_beta) THEN
286         ALLOCATE (Lag_nu_a_2_beta%matrix)
287         CALL dbcsr_create(Lag_nu_a_2_beta%matrix, template=mo_coeff_v_beta)
288         CALL dbcsr_set(Lag_nu_a_2_beta%matrix, 0.0_dp)
289      ENDIF
290
291      ! get forces
292      NULLIFY (force, virial)
293      CALL get_qs_env(qs_env=qs_env, force=force, virial=virial)
294
295      ! prepare integral derivatives with mme method
296      IF (eri_method .EQ. do_eri_mme) THEN
297         ALLOCATE (matrix_P_munu_local(my_group_L_size))
298         ALLOCATE (mat_munu_local(my_group_L_size))
299         L_counter = 0
300         DO LLL = my_group_L_start, my_group_L_end
301            L_counter = L_counter + 1
302            ALLOCATE (matrix_P_munu_local(L_counter)%matrix)
303            ALLOCATE (mat_munu_local(L_counter)%matrix)
304            CALL dbcsr_create(matrix_P_munu_local(L_counter)%matrix, template=mat_munu%matrix, &
305                              matrix_type=dbcsr_type_symmetric)
306            CALL dbcsr_create(mat_munu_local(L_counter)%matrix, template=mat_munu%matrix, &
307                              matrix_type=dbcsr_type_symmetric)
308            CALL dbcsr_copy(mat_munu_local(L_counter)%matrix, mat_munu%matrix)
309            CALL dbcsr_set(mat_munu_local(L_counter)%matrix, 0.0_dp)
310
311            IF (alpha_beta) THEN
312               CALL G_P_transform_MO_to_AO(matrix_P_munu_local(L_counter), matrix_P_munu_nosym, mat_munu, &
313                                           G_P_ia(L_counter), matrix_P_inu, &
314                                           mo_coeff_v, mo_coeff_o, eps_filter, &
315                                           G_P_ia_beta(L_counter), matrix_P_inu_beta, mo_coeff_v_beta, mo_coeff_o_beta)
316            ELSE
317               CALL G_P_transform_MO_to_AO(matrix_P_munu_local(L_counter), matrix_P_munu_nosym, mat_munu, &
318                                           G_P_ia(L_counter), matrix_P_inu, &
319                                           mo_coeff_v, mo_coeff_o, eps_filter)
320            ENDIF
321         ENDDO
322
323         ALLOCATE (I_tmp2(dimen_RI, my_group_L_size))
324         I_tmp2(:, :) = 0.0_dp
325         CALL mp2_eri_2c_integrate(eri_param, potential_type, omega, para_env_sub, qs_env, &
326                                   basis_type_a="RI_AUX", basis_type_b="RI_AUX", &
327                                   hab=I_tmp2, first_b=my_group_L_start, last_b=my_group_L_end, &
328                                   eri_method=eri_method, pab=G_PQ_local, force_a=force_2c)
329
330         DEALLOCATE (I_tmp2)
331         CALL mp2_eri_3c_integrate(eri_param, potential_type, omega, para_env_sub, qs_env, &
332                                   first_c=my_group_L_start, last_c=my_group_L_end, mat_ab=mat_munu_local, &
333                                   basis_type_a="ORB", basis_type_b="ORB", basis_type_c="RI_AUX", &
334                                   sab_nl=sab_orb_sub, eri_method=eri_method, &
335                                   pabc=matrix_P_munu_local, &
336                                   force_a=force_3c_orb_mu, force_b=force_3c_orb_nu, force_c=force_3c_aux)
337
338         L_counter = 0
339         DO LLL = my_group_L_start, my_group_L_end
340            L_counter = L_counter + 1
341            ! we recompute matrix_P_inu
342            CALL dbcsr_multiply("N", "T", 1.0_dp, mo_coeff_v, G_P_ia(L_counter)%matrix, &
343                                0.0_dp, matrix_P_inu%matrix, filter_eps=eps_filter)
344            IF (alpha_beta) THEN
345               CALL dbcsr_multiply("N", "T", 1.0_dp, mo_coeff_v_beta, G_P_ia_beta(L_counter)%matrix, &
346                                   0.0_dp, matrix_P_inu_beta%matrix, filter_eps=eps_filter)
347            ENDIF
348
349            IF (alpha_beta) THEN
350               CALL update_lagrangian(mat_munu_local(L_counter), matrix_P_inu, Lag_mu_i_1, &
351                                      G_P_ia(L_counter), mo_coeff_o, Lag_nu_a_2, &
352                                      eps_filter, &
353                                      matrix_P_inu_beta, Lag_mu_i_1_beta, G_P_ia_beta(L_counter), mo_coeff_o_beta, Lag_nu_a_2_beta)
354            ELSE
355               CALL update_lagrangian(mat_munu_local(L_counter), matrix_P_inu, Lag_mu_i_1, &
356                                      G_P_ia(L_counter), mo_coeff_o, Lag_nu_a_2, &
357                                      eps_filter)
358
359            ENDIF
360         ENDDO
361
362         DO ikind = 1, SIZE(force)
363            force(ikind)%mp2_non_sep(:, :) = -4.0_dp*force_2c(ikind)%forces(:, :) + &
364                                             force_3c_orb_mu(ikind)%forces(:, :) + &
365                                             force_3c_orb_nu(ikind)%forces(:, :) + &
366                                             force_3c_aux(ikind)%forces(:, :)
367         END DO
368
369         CALL mp2_eri_deallocate_forces(force_2c)
370         CALL mp2_eri_deallocate_forces(force_3c_aux)
371         CALL mp2_eri_deallocate_forces(force_3c_orb_mu)
372         CALL mp2_eri_deallocate_forces(force_3c_orb_nu)
373         CALL dbcsr_deallocate_matrix_set(matrix_P_munu_local)
374         CALL dbcsr_deallocate_matrix_set(mat_munu_local)
375
376      ELSEIF (eri_method == do_eri_gpw) THEN
377         ALLOCATE (matrix_P_munu%matrix)
378         CALL dbcsr_create(matrix_P_munu%matrix, template=mat_munu%matrix, &
379                           matrix_type=dbcsr_type_symmetric)
380
381         CALL get_qs_env(qs_env, ks_env=ks_env)
382
383         natom = SIZE(particle_set)
384
385         ALLOCATE (kind_of(natom))
386         ALLOCATE (atom_of_kind(natom))
387         CALL get_atomic_kind_set(atomic_kind_set, kind_of=kind_of, atom_of_kind=atom_of_kind)
388
389         ! Supporting stuff for GPW
390         CALL prepare_gpw(qs_env, dft_control, e_cutoff_old, cutoff_old, relative_cutoff_old, para_env_sub, pw_env_sub, &
391                          auxbas_pw_pool, poisson_env, task_list_sub, rho_r, rho_g, pot_g, psi_L, sab_orb_sub)
392
393         ! wave function vector and supporting stuff
394         ALLOCATE (wf_vector(dimen_RI))
395         IF (alpha_beta) THEN
396            NULLIFY (psi_L_beta%pw)
397            CALL pw_pool_create_pw(auxbas_pw_pool, psi_L_beta%pw, &
398                                   use_data=REALDATA3D, &
399                                   in_space=REALSPACE)
400         ENDIF
401
402         ! check if we want to calculate the virial
403         use_virial = virial%pv_availability .AND. (.NOT. virial%pv_numer)
404
405         ! in case virial is required we need auxilliary pw
406         ! for calculate the MP2-volume contribution to the virial
407         ! (hartree potential derivatives)
408         IF (use_virial) THEN
409            NULLIFY (temp_pw_g%pw)
410            CALL pw_pool_create_pw(auxbas_pw_pool, temp_pw_g%pw, &
411                                   use_data=COMPLEXDATA1D, &
412                                   in_space=RECIPROCALSPACE)
413            DO i = 1, 3
414               NULLIFY (dvg(i)%pw)
415               CALL pw_pool_create_pw(auxbas_pw_pool, dvg(i)%pw, &
416                                      use_data=COMPLEXDATA1D, &
417                                      in_space=RECIPROCALSPACE)
418            END DO
419         END IF
420
421         ! start main loop over auxiliary basis functions
422         CALL timeset(routineN//"_loop", handle2)
423
424         L_counter = 0
425         DO LLL = my_group_L_start, my_group_L_end
426            L_counter = L_counter + 1
427
428            IF (alpha_beta) THEN
429               CALL G_P_transform_MO_to_AO(matrix_P_munu, matrix_P_munu_nosym, mat_munu, &
430                                           G_P_ia(L_counter), &
431                                           matrix_P_inu, &
432                                           mo_coeff_v, mo_coeff_o, &
433                                           eps_filter, &
434                                           G_P_ia_beta(L_counter), matrix_P_inu_beta, &
435                                           mo_coeff_v_beta, mo_coeff_o_beta)
436            ELSE
437               CALL G_P_transform_MO_to_AO(matrix_P_munu, matrix_P_munu_nosym, mat_munu, &
438                                           G_P_ia(L_counter), &
439                                           matrix_P_inu, &
440                                           mo_coeff_v, mo_coeff_o, &
441                                           eps_filter)
442            ENDIF
443
444            ! calculate potential associted to the single aux function
445            CALL timeset(routineN//"_wf_pot", handle3)
446            wf_vector = 0.0_dp
447            wf_vector(LLL) = 1.0_dp
448            ! pseudo psi_L
449            CALL calculate_wavefunction(mo_coeff, 1, psi_L, rho_g, atomic_kind_set, &
450                                        qs_kind_set, cell, dft_control, particle_set, pw_env_sub, &
451                                        basis_type="RI_AUX", &
452                                        external_vector=wf_vector)
453
454            rho_r%pw%cr3d = psi_L%pw%cr3d
455            IF (alpha_beta) THEN
456               CALL calculate_wavefunction(mo_coeff_beta, 1, psi_L_beta, rho_g, atomic_kind_set, &
457                                           qs_kind_set, cell, dft_control, particle_set, &
458                                           pw_env_sub, basis_type='RI_AUX', &
459                                           external_vector=wf_vector)
460               rho_r%pw%cr3d = 0.50_dp*(rho_r%pw%cr3d+psi_L_beta%pw%cr3d)
461            ENDIF
462
463            CALL calc_potential_gpw(rho_r, rho_g, poisson_env, pot_g, potential_type, omega)
464            CALL timestop(handle3)
465
466            IF (use_virial) THEN
467               ! make a copy of the density in G space
468               ! calculate the potential derivatives in G space
469               CALL timeset(routineN//"_Virial", handle3)
470               CALL pw_copy(rho_g%pw, temp_pw_g%pw)
471               DO i = 1, 3
472                  comp = 0
473                  comp(i) = 1
474                  CALL pw_copy(pot_g%pw, dvg(i)%pw)
475                  CALL pw_derive(dvg(i)%pw, comp)
476               END DO
477               CALL timestop(handle3)
478            END IF
479
480            ! integrate the potential of the single gaussian and update
481            ! 2-center forces with Gamma_PQ
482            CALL timeset(routineN//"_int_PQ", handle3)
483            NULLIFY (rs_v)
484            NULLIFY (rs_descs)
485            CALL pw_env_get(pw_env_sub, rs_descs=rs_descs, rs_grids=rs_v)
486            DO i = 1, SIZE(rs_v)
487               CALL rs_grid_retain(rs_v(i)%rs_grid)
488            END DO
489            CALL potential_pw2rs(rs_v, rho_r, pw_env_sub)
490
491            offset = 0
492            DO iatom = 1, natom
493               ikind = kind_of(iatom)
494               atom_a = atom_of_kind(iatom)
495               CALL get_qs_kind(qs_kind=qs_kind_set(ikind), basis_set=basis_set_a, &
496                                basis_type="RI_AUX")
497
498               first_sgfa => basis_set_a%first_sgf
499               la_max => basis_set_a%lmax
500               la_min => basis_set_a%lmin
501               npgfa => basis_set_a%npgf
502               nseta = basis_set_a%nset
503               nsgfa => basis_set_a%nsgf_set
504               rpgfa => basis_set_a%pgf_radius
505               set_radius_a => basis_set_a%set_radius
506               sphi_a => basis_set_a%sphi
507               zeta => basis_set_a%zet
508
509               ra(:) = pbc(particle_set(iatom)%r, cell)
510               rab = 0.0_dp
511               rab2 = 0.0_dp
512
513               force_a(:) = 0.0_dp
514               force_b(:) = 0.0_dp
515               IF (use_virial) THEN
516                  my_virial_a = 0.0_dp
517                  my_virial_b = 0.0_dp
518               END IF
519
520               DO iset = 1, nseta
521                  ncoa = npgfa(iset)*ncoset(la_max(iset))
522                  sgfa = first_sgfa(1, iset)
523
524                  ALLOCATE (I_tmp2(ncoa, 1))
525                  I_tmp2 = 0.0_dp
526                  ALLOCATE (I_ab(nsgfa(iset), 1))
527                  I_ab = 0.0_dp
528                  ALLOCATE (pab(ncoa, 1))
529                  pab = 0.0_dp
530
531                  I_ab(1:nsgfa(iset), 1) = -4.0_dp*G_PQ_local(offset + 1:offset + nsgfa(iset), L_counter)
532
533                  CALL dgemm("N", "N", ncoa, 1, nsgfa(iset), &
534                             1.0_dp, sphi_a(1, sgfa), SIZE(sphi_a, 1), &
535                             I_ab(1, 1), SIZE(I_ab, 1), &
536                             0.0_dp, pab(1, 1), SIZE(pab, 1))
537
538                  I_ab = 0.0_dp
539
540                  igrid_level = gaussian_gridlevel(pw_env_sub%gridlevel_info, MINVAL(zeta(:, iset)))
541                  map_it_here = .FALSE.
542                  IF (.NOT. ALL(rs_v(igrid_level)%rs_grid%desc%perd == 1)) THEN
543                     DO dir = 1, 3
544                        ! bounds of local grid (i.e. removing the 'wings'), if periodic
545                        tp(dir) = FLOOR(DOT_PRODUCT(cell%h_inv(dir, :), ra)*rs_v(igrid_level)%rs_grid%desc%npts(dir))
546                        tp(dir) = MODULO(tp(dir), rs_v(igrid_level)%rs_grid%desc%npts(dir))
547                        IF (rs_v(igrid_level)%rs_grid%desc%perd(dir) .NE. 1) THEN
548                           lb(dir) = rs_v(igrid_level)%rs_grid%lb_local(dir) + rs_v(igrid_level)%rs_grid%desc%border
549                           ub(dir) = rs_v(igrid_level)%rs_grid%ub_local(dir) - rs_v(igrid_level)%rs_grid%desc%border
550                        ELSE
551                           lb(dir) = rs_v(igrid_level)%rs_grid%lb_local(dir)
552                           ub(dir) = rs_v(igrid_level)%rs_grid%ub_local(dir)
553                        ENDIF
554                        ! distributed grid, only map if it is local to the grid
555                        location(dir) = tp(dir) + rs_v(igrid_level)%rs_grid%desc%lb(dir)
556                     ENDDO
557                     IF (lb(1) <= location(1) .AND. location(1) <= ub(1) .AND. &
558                         lb(2) <= location(2) .AND. location(2) <= ub(2) .AND. &
559                         lb(3) <= location(3) .AND. location(3) <= ub(3)) THEN
560                        map_it_here = .TRUE.
561                     ENDIF
562                  ELSE
563                     ! not distributed, just a round-robin distribution over the full set of CPUs
564                     IF (MODULO(offset, para_env_sub%num_pe) == para_env_sub%mepos) map_it_here = .TRUE.
565                  ENDIF
566
567                  offset = offset + nsgfa(iset)
568
569                  IF (map_it_here) THEN
570                     DO ipgf = 1, npgfa(iset)
571                        na1 = (ipgf - 1)*ncoset(la_max(iset)) + 1
572                        na2 = ipgf*ncoset(la_max(iset))
573                        igrid_level = gaussian_gridlevel(pw_env_sub%gridlevel_info, zeta(ipgf, iset))
574
575                        CALL integrate_pgf_product_rspace(la_max=la_max(iset), zeta=zeta(ipgf, iset)/2.0_dp, la_min=la_min(iset), &
576                                                          lb_max=0, zetb=zeta(ipgf, iset)/2.0_dp, lb_min=0, &
577                                                          ra=ra, rab=rab, rab2=rab2, &
578                                                          rsgrid=rs_v(igrid_level)%rs_grid, &
579                                                          cell=cell, &
580                                                          cube_info=pw_env_sub%cube_info(igrid_level), &
581                                                          hab=I_tmp2, &
582                                                          pab=pab, &
583                                                          o1=na1 - 1, &
584                                                          o2=0, &
585                                                          map_consistent=.TRUE., &
586                                                          eps_gvg_rspace=dft_control%qs_control%eps_gvg_rspace, &
587                                                          calculate_forces=.TRUE., &
588                                                          force_a=force_a, force_b=force_b, &
589                                                          use_virial=use_virial, my_virial_a=my_virial_a, my_virial_b=my_virial_b)
590
591                     END DO
592
593                     ! CALL dgemm("T","N",nsgfa(iset),1,ncoa,&
594                     !             1.0_dp,sphi_a(1,sgfa),SIZE(sphi_a,1),&
595                     !             I_tmp2(1,1),SIZE(I_tmp2,1),&
596                     !             1.0_dp,I_ab(1,1),SIZE(I_ab,1))
597                     ! L_local_col(offset-nsgfa(iset)+1:offset,i_counter)=I_ab(1:nsgfa(iset),1)
598                  END IF
599
600                  DEALLOCATE (I_tmp2)
601                  DEALLOCATE (I_ab)
602                  DEALLOCATE (pab)
603
604               END DO
605
606               force(ikind)%rho_elec(:, atom_a) = &
607                  force(ikind)%rho_elec(:, atom_a) + force_a(:) + force_b
608               IF (use_virial) THEN
609                  virial%pv_virial = virial%pv_virial + my_virial_a + my_virial_b
610               END IF
611            END DO
612
613            DO i = 1, SIZE(rs_v)
614               CALL rs_grid_release(rs_v(i)%rs_grid)
615            END DO
616            CALL timestop(handle3)
617            ! here we are done with the 2 centers
618
619            ! CALL cp_dbcsr_write_sparse_matrix(matrix_P_munu%matrix,4,12,qs_env,para_env_sub,&
620            !                                   output_unit=unit_nr)
621
622            ! integrate the potential of the single gaussian and update
623            ! 3-center forces
624            CALL timeset(routineN//"_int", handle3)
625            CALL dbcsr_set(mat_munu%matrix, 0.0_dp)
626            CALL integrate_v_rspace(rho_r, hmat=mat_munu, pmat=matrix_P_munu, &
627                                    qs_env=qs_env, calculate_forces=.TRUE., compute_tau=.FALSE., gapw=.FALSE., &
628                                    pw_env_external=pw_env_sub, &
629                                    task_list_external=task_list_sub)
630            CALL timestop(handle3)
631
632            IF (alpha_beta) THEN
633               CALL update_lagrangian(mat_munu, matrix_P_inu, Lag_mu_i_1, &
634                                      G_P_ia(L_counter), mo_coeff_o, Lag_nu_a_2, &
635                                      eps_filter, &
636                                      matrix_P_inu_beta, Lag_mu_i_1_beta, G_P_ia_beta(L_counter), mo_coeff_o_beta, Lag_nu_a_2_beta)
637            ELSE
638               CALL update_lagrangian(mat_munu, matrix_P_inu, Lag_mu_i_1, &
639                                      G_P_ia(L_counter), mo_coeff_o, Lag_nu_a_2, &
640                                      eps_filter)
641
642            ENDIF
643
644            IF (use_virial) THEN
645               ! add the volume contribution to the virial due to
646               ! the (P|Q) integrals, first we put the full gamme_PQ
647               ! pseudo wave-function into grid in order to calculate the
648               ! hartree potential derivatives
649               CALL timeset(routineN//"_Virial", handle3)
650               wf_vector = 0.0_dp
651               wf_vector(:) = -2.0_dp*G_PQ_local(:, L_counter)
652               CALL calculate_wavefunction(mo_coeff, 1, psi_L, rho_g, atomic_kind_set, &
653                                           qs_kind_set, cell, dft_control, particle_set, pw_env_sub, &
654                                           basis_type="RI_AUX", &
655                                           external_vector=wf_vector)
656               ! transfer to reciprocal space and calculate potential
657               rho_r%pw%cr3d = psi_L%pw%cr3d
658               CALL pw_transfer(rho_r%pw, rho_g%pw)
659               CALL pw_poisson_solve(poisson_env, rho_g%pw, pair_energy, pot_g%pw)
660               ! update virial with volume term (first calculate hartree like energy (diagonal part of the virial))
661               e_hartree = 0.0_dp
662               h_stress = 0.0_dp
663               e_hartree = pw_integral_ab(temp_pw_g%pw, pot_g%pw)
664               DO alpha = 1, 3
665                  comp = 0
666                  comp(alpha) = 1
667                  CALL pw_copy(pot_g%pw, rho_g%pw)
668                  CALL pw_derive(rho_g%pw, comp)
669                  h_stress(alpha, alpha) = -e_hartree
670                  DO beta = alpha, 3
671                     h_stress(alpha, beta) = h_stress(alpha, beta) &
672                                             - 2.0_dp*pw_integral_ab(rho_g%pw, dvg(beta)%pw)/fourpi
673                     h_stress(beta, alpha) = h_stress(alpha, beta)
674                  END DO
675               END DO
676               virial%pv_virial = virial%pv_virial + h_stress/REAL(para_env_sub%num_pe, dp)
677               CALL timestop(handle3)
678            END IF
679
680            ! put the gamma density on grid
681            CALL timeset(routineN//"_Gpot", handle3)
682            CALL calculate_rho_elec(matrix_p=matrix_P_munu%matrix, &
683                                    rho=rho_r, &
684                                    rho_gspace=rho_g, &
685                                    total_rho=total_rho, &
686                                    task_list_external=task_list_sub, &
687                                    pw_env_external=pw_env_sub, &
688                                    ks_env=ks_env)
689            ! calculate associated hartree potential
690            ! CALL pw_transfer(rho_r%pw, rho_g%pw)
691            !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!1
692            CALL pw_poisson_solve(poisson_env, rho_g%pw, pair_energy, pot_g%pw)
693            CALL pw_transfer(pot_g%pw, rho_r%pw)
694            CALL pw_scale(rho_r%pw, rho_r%pw%pw_grid%dvol)
695            CALL timestop(handle3)
696
697            IF (use_virial) THEN
698               ! add the volume contribution to the virial coming from
699               ! the 3-center integrals (mu nu|P)
700               CALL timeset(routineN//"_Virial", handle3)
701               e_hartree = 0.0_dp
702               h_stress = 0.0_dp
703               e_hartree = pw_integral_ab(temp_pw_g%pw, pot_g%pw)
704               DO alpha = 1, 3
705                  comp = 0
706                  comp(alpha) = 1
707                  CALL pw_copy(pot_g%pw, rho_g%pw)
708                  CALL pw_derive(rho_g%pw, comp)
709                  h_stress(alpha, alpha) = -e_hartree
710                  DO beta = alpha, 3
711                     h_stress(alpha, beta) = h_stress(alpha, beta) &
712                                             - 2.0_dp*pw_integral_ab(rho_g%pw, dvg(beta)%pw)/fourpi
713                     h_stress(beta, alpha) = h_stress(alpha, beta)
714                  END DO
715               END DO
716               virial%pv_virial = virial%pv_virial + h_stress/REAL(para_env_sub%num_pe, dp)
717               CALL timestop(handle3)
718            END IF
719
720            ! integrate potential with auxiliary basis function derivatives
721            NULLIFY (rs_v)
722            NULLIFY (rs_descs)
723            CALL pw_env_get(pw_env_sub, rs_descs=rs_descs, rs_grids=rs_v)
724            DO i = 1, SIZE(rs_v)
725               CALL rs_grid_retain(rs_v(i)%rs_grid)
726            END DO
727            CALL potential_pw2rs(rs_v, rho_r, pw_env_sub)
728
729            offset = 0
730            DO iatom = 1, natom
731               ikind = kind_of(iatom)
732               atom_a = atom_of_kind(iatom)
733               CALL get_qs_kind(qs_kind=qs_kind_set(ikind), basis_set=basis_set_a, &
734                                basis_type="RI_AUX")
735
736               first_sgfa => basis_set_a%first_sgf
737               la_max => basis_set_a%lmax
738               la_min => basis_set_a%lmin
739               npgfa => basis_set_a%npgf
740               nseta = basis_set_a%nset
741               nsgfa => basis_set_a%nsgf_set
742               rpgfa => basis_set_a%pgf_radius
743               set_radius_a => basis_set_a%set_radius
744               sphi_a => basis_set_a%sphi
745               zeta => basis_set_a%zet
746
747               ra(:) = pbc(particle_set(iatom)%r, cell)
748               rab = 0.0_dp
749               rab2 = 0.0_dp
750
751               force_a(:) = 0.0_dp
752               force_b(:) = 0.0_dp
753               IF (use_virial) THEN
754                  my_virial_a = 0.0_dp
755                  my_virial_b = 0.0_dp
756               END IF
757
758               DO iset = 1, nseta
759                  ncoa = npgfa(iset)*ncoset(la_max(iset))
760                  sgfa = first_sgfa(1, iset)
761
762                  ALLOCATE (I_tmp2(ncoa, 1))
763                  I_tmp2 = 0.0_dp
764                  ALLOCATE (I_ab(nsgfa(iset), 1))
765                  I_ab = 0.0_dp
766                  ALLOCATE (pab(ncoa, 1))
767                  pab = 0.0_dp
768
769                  skip_shell = .TRUE.
770                  DO iorb = 1, nsgfa(iset)
771                     IF (iorb + offset == LLL) THEN
772                        I_ab(iorb, 1) = 1.0_dp
773                        skip_shell = .FALSE.
774                     END IF
775                  END DO
776
777                  IF (skip_shell) THEN
778                     offset = offset + nsgfa(iset)
779                     DEALLOCATE (I_tmp2)
780                     DEALLOCATE (I_ab)
781                     DEALLOCATE (pab)
782                     CYCLE
783                  END IF
784
785                  CALL dgemm("N", "N", ncoa, 1, nsgfa(iset), &
786                             1.0_dp, sphi_a(1, sgfa), SIZE(sphi_a, 1), &
787                             I_ab(1, 1), SIZE(I_ab, 1), &
788                             0.0_dp, pab(1, 1), SIZE(pab, 1))
789                  I_ab = 0.0_dp
790
791                  igrid_level = gaussian_gridlevel(pw_env_sub%gridlevel_info, MINVAL(zeta(:, iset)))
792                  map_it_here = .FALSE.
793                  IF (.NOT. ALL(rs_v(igrid_level)%rs_grid%desc%perd == 1)) THEN
794                     DO dir = 1, 3
795                        ! bounds of local grid (i.e. removing the 'wings'), if periodic
796                        tp(dir) = FLOOR(DOT_PRODUCT(cell%h_inv(dir, :), ra)*rs_v(igrid_level)%rs_grid%desc%npts(dir))
797                        tp(dir) = MODULO(tp(dir), rs_v(igrid_level)%rs_grid%desc%npts(dir))
798                        IF (rs_v(igrid_level)%rs_grid%desc%perd(dir) .NE. 1) THEN
799                           lb(dir) = rs_v(igrid_level)%rs_grid%lb_local(dir) + rs_v(igrid_level)%rs_grid%desc%border
800                           ub(dir) = rs_v(igrid_level)%rs_grid%ub_local(dir) - rs_v(igrid_level)%rs_grid%desc%border
801                        ELSE
802                           lb(dir) = rs_v(igrid_level)%rs_grid%lb_local(dir)
803                           ub(dir) = rs_v(igrid_level)%rs_grid%ub_local(dir)
804                        ENDIF
805                        ! distributed grid, only map if it is local to the grid
806                        location(dir) = tp(dir) + rs_v(igrid_level)%rs_grid%desc%lb(dir)
807                     ENDDO
808                     IF (lb(1) <= location(1) .AND. location(1) <= ub(1) .AND. &
809                         lb(2) <= location(2) .AND. location(2) <= ub(2) .AND. &
810                         lb(3) <= location(3) .AND. location(3) <= ub(3)) THEN
811                        map_it_here = .TRUE.
812                     ENDIF
813                  ELSE
814                     ! not distributed, just a round-robin distribution over the full set of CPUs
815                     IF (MODULO(offset, para_env_sub%num_pe) == para_env_sub%mepos) map_it_here = .TRUE.
816                  ENDIF
817
818                  offset = offset + nsgfa(iset)
819
820                  IF (map_it_here) THEN
821                     DO ipgf = 1, npgfa(iset)
822                        na1 = (ipgf - 1)*ncoset(la_max(iset)) + 1
823                        na2 = ipgf*ncoset(la_max(iset))
824                        igrid_level = gaussian_gridlevel(pw_env_sub%gridlevel_info, zeta(ipgf, iset))
825
826                        CALL integrate_pgf_product_rspace(la_max=la_max(iset), zeta=zeta(ipgf, iset)/2.0_dp, la_min=la_min(iset), &
827                                                          lb_max=0, zetb=zeta(ipgf, iset)/2.0_dp, lb_min=0, &
828                                                          ra=ra, rab=rab, rab2=rab2, &
829                                                          rsgrid=rs_v(igrid_level)%rs_grid, &
830                                                          cell=cell, &
831                                                          cube_info=pw_env_sub%cube_info(igrid_level), &
832                                                          hab=I_tmp2, &
833                                                          pab=pab, &
834                                                          o1=na1 - 1, &
835                                                          o2=0, &
836                                                          map_consistent=.TRUE., &
837                                                          eps_gvg_rspace=dft_control%qs_control%eps_gvg_rspace, &
838                                                          calculate_forces=.TRUE., &
839                                                          force_a=force_a, force_b=force_b, &
840                                                          use_virial=use_virial, my_virial_a=my_virial_a, my_virial_b=my_virial_b)
841                     END DO
842                     ! CALL dgemm("T","N".0_dp,sphi_a(1,sgfa),SIZE(sphi_a,1),&
843                     !             I_tmp2(1,1),SIZE(I_tmp2,1),&
844                     !             1.0_dp,I_ab(1,1),SIZE(I_ab,1))
845                     ! L_local_col(offset-nsgfa(iset)+1:offset,i_counter)=I_ab(1:nsgfa(iset),1)
846                  END IF
847
848                  DEALLOCATE (I_tmp2)
849                  DEALLOCATE (I_ab)
850                  DEALLOCATE (pab)
851
852               END DO
853
854               force(ikind)%rho_elec(:, atom_a) = &
855                  force(ikind)%rho_elec(:, atom_a) + force_a(:) + force_b(:)
856               IF (use_virial) THEN
857                  virial%pv_virial = virial%pv_virial + my_virial_a + my_virial_b
858               END IF
859            END DO
860
861            DO i = 1, SIZE(rs_v)
862               CALL rs_grid_release(rs_v(i)%rs_grid)
863            END DO
864
865         END DO
866
867         CALL timestop(handle2)
868
869         DEALLOCATE (kind_of)
870         DEALLOCATE (atom_of_kind)
871         DEALLOCATE (wf_vector)
872
873         IF (alpha_beta) THEN
874            CALL pw_pool_give_back_pw(auxbas_pw_pool, psi_L_beta%pw)
875         ENDIF
876
877         IF (use_virial) THEN
878            CALL pw_pool_give_back_pw(auxbas_pw_pool, temp_pw_g%pw)
879            DO i = 1, 3
880               CALL pw_pool_give_back_pw(auxbas_pw_pool, dvg(i)%pw)
881            END DO
882         END IF
883
884         CALL cleanup_gpw(qs_env, e_cutoff_old, cutoff_old, relative_cutoff_old, pw_env_sub, &
885                          task_list_sub, auxbas_pw_pool, rho_r, rho_g, pot_g, psi_L)
886
887         CALL dbcsr_release(matrix_P_munu%matrix)
888         DEALLOCATE (matrix_P_munu%matrix)
889
890      ENDIF
891
892      DEALLOCATE (G_PQ_local)
893
894      CALL dbcsr_release(matrix_P_inu%matrix)
895      DEALLOCATE (matrix_P_inu%matrix)
896
897      CALL dbcsr_release(matrix_P_munu_nosym%matrix)
898      DEALLOCATE (matrix_P_munu_nosym%matrix)
899
900      ! release the full gamma_P_ia structure
901      DEALLOCATE (G_P_ia)
902
903      ! Release G_P_ia_beta and and matrix_P_inu_beta
904      IF (alpha_beta) THEN
905         CALL dbcsr_release(matrix_P_inu_beta%matrix)
906         DEALLOCATE (matrix_P_inu_beta%matrix)
907         DEALLOCATE (G_P_ia_beta)
908      ENDIF
909
910      ! move the forces in the correct place
911      IF (eri_method .EQ. do_eri_gpw) THEN
912         DO ikind = 1, SIZE(force)
913            force(ikind)%mp2_non_sep(:, :) = force(ikind)%rho_elec(:, :)
914            force(ikind)%rho_elec(:, :) = 0.0_dp
915         END DO
916      ENDIF
917
918      ! Now we move from the local matrices to the global ones
919      ! defined over all MPI tasks
920      ! Start with moving from the DBCSR to FM for the lagrangians
921      NULLIFY (L1_mu_i, fm_struct_tmp)
922      CALL cp_fm_struct_create(fm_struct_tmp, para_env=para_env_sub, context=blacs_env_sub, &
923                               nrow_global=dimen, ncol_global=homo)
924      CALL cp_fm_create(L1_mu_i, fm_struct_tmp, name="Lag_mu_i")
925      CALL cp_fm_struct_release(fm_struct_tmp)
926      CALL cp_fm_set_all(L1_mu_i, 0.0_dp)
927      CALL copy_dbcsr_to_fm(matrix=Lag_mu_i_1%matrix, fm=L1_mu_i)
928
929      ! release Lag_mu_i_1
930      CALL dbcsr_release(Lag_mu_i_1%matrix)
931      DEALLOCATE (Lag_mu_i_1%matrix)
932
933      NULLIFY (L2_nu_a, fm_struct_tmp)
934      CALL cp_fm_struct_create(fm_struct_tmp, para_env=para_env_sub, context=blacs_env_sub, &
935                               nrow_global=dimen, ncol_global=virtual)
936      CALL cp_fm_create(L2_nu_a, fm_struct_tmp, name="Lag_nu_a")
937      CALL cp_fm_struct_release(fm_struct_tmp)
938      CALL cp_fm_set_all(L2_nu_a, 0.0_dp)
939      CALL copy_dbcsr_to_fm(matrix=Lag_nu_a_2%matrix, fm=L2_nu_a)
940
941      ! release Lag_nu_a_2
942      CALL dbcsr_release(Lag_nu_a_2%matrix)
943      DEALLOCATE (Lag_nu_a_2%matrix)
944
945      ! The same for the beta Lagrangians
946      IF (alpha_beta) THEN
947         ! Now we move from the local matrices to the global ones
948         ! defined over all MPI tasks
949         ! Start with moving from the DBCSR to FM for the lagrangians
950         NULLIFY (L1_mu_i_beta, fm_struct_tmp)
951         CALL cp_fm_struct_create(fm_struct_tmp, para_env=para_env_sub, context=blacs_env_sub, &
952                                  nrow_global=dimen, ncol_global=homo_beta)
953         CALL cp_fm_create(L1_mu_i_beta, fm_struct_tmp, name="Lag_mu_i_beta")
954         CALL cp_fm_struct_release(fm_struct_tmp)
955         CALL cp_fm_set_all(L1_mu_i_beta, 0.0_dp)
956         CALL copy_dbcsr_to_fm(matrix=Lag_mu_i_1_beta%matrix, fm=L1_mu_i_beta)
957
958         ! release Lag_mu_i_1
959         CALL dbcsr_release(Lag_mu_i_1_beta%matrix)
960         DEALLOCATE (Lag_mu_i_1_beta%matrix)
961
962         NULLIFY (L2_nu_a_beta, fm_struct_tmp)
963         CALL cp_fm_struct_create(fm_struct_tmp, para_env=para_env_sub, context=blacs_env_sub, &
964                                  nrow_global=dimen, ncol_global=virtual_beta)
965         CALL cp_fm_create(L2_nu_a_beta, fm_struct_tmp, name="Lag_nu_a_beta")
966         CALL cp_fm_struct_release(fm_struct_tmp)
967         CALL cp_fm_set_all(L2_nu_a_beta, 0.0_dp)
968         CALL copy_dbcsr_to_fm(matrix=Lag_nu_a_2_beta%matrix, fm=L2_nu_a_beta)
969
970         ! release Lag_nu_a_2
971         CALL dbcsr_release(Lag_nu_a_2_beta%matrix)
972         DEALLOCATE (Lag_nu_a_2_beta%matrix)
973      ENDIF
974
975      ! Set the factor to multiply P_ij (depends on the open or closed shell)
976      factor = 1.0_dp
977      IF (alpha_beta) factor = 0.50_dp
978
979      CALL create_W_P(qs_env, mp2_env, mo_coeff, homo, virtual, dimen, para_env, para_env_sub, &
980                      Eigenval, L1_mu_i, L2_nu_a, factor, .FALSE.)
981      ! Alpha_beta_case
982      IF (alpha_beta) THEN
983         CALL create_W_P(qs_env, mp2_env, mo_coeff_beta, homo_beta, virtual_beta, dimen, para_env, &
984                         para_env_sub, Eigenval_beta, L1_mu_i_beta, L2_nu_a_beta, &
985                         factor, .TRUE.)
986      ENDIF
987
988      CALL timestop(handle)
989
990   END SUBROUTINE calc_ri_mp2_nonsep
991
992! **************************************************************************************************
993!> \brief ...
994!> \param G_P_munu ...
995!> \param G_P_munu_nosym ...
996!> \param mat_munu ...
997!> \param G_P_ia ...
998!> \param G_P_inu ...
999!> \param mo_coeff_v ...
1000!> \param mo_coeff_o ...
1001!> \param eps_filter ...
1002!> \param G_P_ia_beta ...
1003!> \param G_P_inu_beta ...
1004!> \param mo_coeff_v_beta ...
1005!> \param mo_coeff_o_beta ...
1006! **************************************************************************************************
1007   SUBROUTINE G_P_transform_MO_to_AO(G_P_munu, G_P_munu_nosym, mat_munu, G_P_ia, G_P_inu, &
1008                                     mo_coeff_v, mo_coeff_o, eps_filter, &
1009                                     G_P_ia_beta, G_P_inu_beta, mo_coeff_v_beta, mo_coeff_o_beta)
1010      TYPE(dbcsr_p_type), INTENT(INOUT)                  :: G_P_munu, G_P_munu_nosym, mat_munu
1011      TYPE(dbcsr_p_type), INTENT(IN)                     :: G_P_ia
1012      TYPE(dbcsr_p_type), INTENT(INOUT)                  :: G_P_inu
1013      TYPE(dbcsr_type), INTENT(IN)                       :: mo_coeff_v, mo_coeff_o
1014      REAL(KIND=dp), INTENT(IN)                          :: eps_filter
1015      TYPE(dbcsr_p_type), INTENT(IN), OPTIONAL           :: G_P_ia_beta
1016      TYPE(dbcsr_p_type), INTENT(INOUT), OPTIONAL        :: G_P_inu_beta
1017      TYPE(dbcsr_type), INTENT(IN), OPTIONAL             :: mo_coeff_v_beta, mo_coeff_o_beta
1018
1019      CHARACTER(LEN=*), PARAMETER :: routineN = 'G_P_transform_MO_to_AO', &
1020         routineP = moduleN//':'//routineN
1021
1022      INTEGER                                            :: handle
1023      LOGICAL                                            :: alpha_beta
1024
1025      alpha_beta = PRESENT(G_P_ia_beta) .AND. PRESENT(G_P_inu_beta) .AND. PRESENT(mo_coeff_v_beta) .AND. PRESENT(mo_coeff_o_beta)
1026
1027      CALL dbcsr_set(G_P_munu_nosym%matrix, 0.0_dp)
1028      CALL G_P_transform_alpha_beta(G_P_ia, G_P_inu, G_P_munu_nosym, mo_coeff_v, mo_coeff_o, [1.0_dp, 0.0_dp], eps_filter)
1029
1030      IF (alpha_beta) THEN
1031         CALL G_P_transform_alpha_beta(G_P_ia_beta, G_P_inu_beta, G_P_munu_nosym, mo_coeff_v_beta, mo_coeff_o_beta, &
1032                                       [0.5_dp, 0.5_dp], eps_filter)
1033      ENDIF
1034
1035      ! symmetrize
1036      CALL timeset(routineN//"_symmetrize", handle)
1037      CALL dbcsr_set(G_P_munu%matrix, 0.0_dp)
1038      CALL dbcsr_transposed(G_P_munu%matrix, G_P_munu_nosym%matrix)
1039      CALL dbcsr_add(G_P_munu%matrix, G_P_munu_nosym%matrix, &
1040                     alpha_scalar=2.0_dp, beta_scalar=2.0_dp)
1041      ! this is a trick to avoid that integrate_v_rspace starts to cry
1042      CALL dbcsr_copy_into_existing(mat_munu%matrix, G_P_munu%matrix)
1043      CALL dbcsr_copy(G_P_munu%matrix, mat_munu%matrix)
1044
1045      CALL timestop(handle)
1046
1047   END SUBROUTINE G_P_transform_MO_to_AO
1048
1049! **************************************************************************************************
1050!> \brief ...
1051!> \param G_P_ia ...
1052!> \param G_P_inu ...
1053!> \param G_P_munu ...
1054!> \param mo_coeff_v ...
1055!> \param mo_coeff_o ...
1056!> \param fraction_add ...
1057!> \param eps_filter ...
1058! **************************************************************************************************
1059   SUBROUTINE G_P_transform_alpha_beta(G_P_ia, G_P_inu, G_P_munu, mo_coeff_v, mo_coeff_o, fraction_add, eps_filter)
1060      TYPE(dbcsr_p_type), INTENT(IN)                     :: G_P_ia
1061      TYPE(dbcsr_p_type), INTENT(INOUT)                  :: G_P_inu, G_P_munu
1062      TYPE(dbcsr_type), INTENT(IN)                       :: mo_coeff_v, mo_coeff_o
1063      REAL(KIND=dp), DIMENSION(2), INTENT(IN)            :: fraction_add
1064      REAL(KIND=dp), INTENT(IN)                          :: eps_filter
1065
1066      CHARACTER(LEN=*), PARAMETER :: routineN = 'G_P_transform_alpha_beta', &
1067         routineP = moduleN//':'//routineN
1068
1069      INTEGER                                            :: handle
1070
1071      CALL timeset(routineN//"_back_v", handle)
1072      ! first back-transformation a->nu
1073      CALL dbcsr_set(G_P_inu%matrix, 0.0_dp)
1074      CALL dbcsr_multiply("N", "T", 1.0_dp, mo_coeff_v, G_P_ia%matrix, &
1075                          0.0_dp, G_P_inu%matrix, filter_eps=eps_filter)
1076      CALL timestop(handle)
1077
1078      ! second back-transformation i->mu
1079      CALL timeset(routineN//"_back_o", handle)
1080      CALL dbcsr_multiply("N", "T", fraction_add(1), G_P_inu%matrix, mo_coeff_o, &
1081                          fraction_add(2), G_P_munu%matrix, filter_eps=eps_filter)
1082
1083      CALL timestop(handle)
1084
1085   END SUBROUTINE G_P_transform_alpha_beta
1086
1087! **************************************************************************************************
1088!> \brief ...
1089!> \param mat_munu ...
1090!> \param matrix_P_inu ...
1091!> \param Lag_mu_i_1 ...
1092!> \param G_P_ia ...
1093!> \param mo_coeff_o ...
1094!> \param Lag_nu_a_2 ...
1095!> \param eps_filter ...
1096!> \param matrix_P_inu_beta ...
1097!> \param Lag_mu_i_1_beta ...
1098!> \param G_P_ia_beta ...
1099!> \param mo_coeff_o_beta ...
1100!> \param Lag_nu_a_2_beta ...
1101! **************************************************************************************************
1102   SUBROUTINE update_lagrangian(mat_munu, matrix_P_inu, Lag_mu_i_1, &
1103                                G_P_ia, mo_coeff_o, Lag_nu_a_2, &
1104                                eps_filter, &
1105                                matrix_P_inu_beta, Lag_mu_i_1_beta, G_P_ia_beta, mo_coeff_o_beta, Lag_nu_a_2_beta)
1106      TYPE(dbcsr_p_type), INTENT(IN)                     :: mat_munu
1107      TYPE(dbcsr_p_type), INTENT(INOUT)                  :: matrix_P_inu, Lag_mu_i_1, G_P_ia
1108      TYPE(dbcsr_type), POINTER                          :: mo_coeff_o
1109      TYPE(dbcsr_p_type), INTENT(INOUT)                  :: Lag_nu_a_2
1110      REAL(KIND=dp), INTENT(IN)                          :: eps_filter
1111      TYPE(dbcsr_p_type), INTENT(INOUT), OPTIONAL        :: matrix_P_inu_beta, Lag_mu_i_1_beta, &
1112                                                            G_P_ia_beta
1113      TYPE(dbcsr_type), OPTIONAL, POINTER                :: mo_coeff_o_beta
1114      TYPE(dbcsr_p_type), INTENT(INOUT), OPTIONAL        :: Lag_nu_a_2_beta
1115
1116      CHARACTER(LEN=*), PARAMETER :: routineN = 'update_lagrangian', &
1117         routineP = moduleN//':'//routineN
1118
1119      INTEGER                                            :: handle
1120      LOGICAL                                            :: alpha_beta
1121
1122      ! update lagrangian
1123      CALL timeset(routineN//"_Lag", handle)
1124
1125      alpha_beta = PRESENT(matrix_P_inu_beta) .AND. PRESENT(Lag_mu_i_1_beta) .AND. PRESENT(G_P_ia_beta) &
1126                   .AND. PRESENT(mo_coeff_o_beta) .AND. PRESENT(Lag_nu_a_2_beta)
1127
1128      CALL update_lagrangian_alpha_beta(mat_munu, matrix_P_inu, Lag_mu_i_1, &
1129                                        G_P_ia, mo_coeff_o, Lag_nu_a_2, eps_filter)
1130      IF (alpha_beta) THEN
1131         CALL update_lagrangian_alpha_beta(mat_munu, matrix_P_inu_beta, Lag_mu_i_1_beta, &
1132                                           G_P_ia_beta, mo_coeff_o_beta, Lag_nu_a_2_beta, eps_filter)
1133      ENDIF
1134
1135      CALL timestop(handle)
1136
1137   END SUBROUTINE update_lagrangian
1138
1139! **************************************************************************************************
1140!> \brief ...
1141!> \param mat_munu ...
1142!> \param matrix_P_inu ...
1143!> \param Lag_mu_i_1 ...
1144!> \param G_P_ia ...
1145!> \param mo_coeff_o ...
1146!> \param Lag_nu_a_2 ...
1147!> \param eps_filter ...
1148! **************************************************************************************************
1149   SUBROUTINE update_lagrangian_alpha_beta(mat_munu, matrix_P_inu, Lag_mu_i_1, &
1150                                           G_P_ia, mo_coeff_o, Lag_nu_a_2, &
1151                                           eps_filter)
1152      TYPE(dbcsr_p_type), INTENT(IN)                     :: mat_munu
1153      TYPE(dbcsr_p_type), INTENT(INOUT)                  :: matrix_P_inu, Lag_mu_i_1, G_P_ia
1154      TYPE(dbcsr_type), POINTER                          :: mo_coeff_o
1155      TYPE(dbcsr_p_type), INTENT(INOUT)                  :: Lag_nu_a_2
1156      REAL(KIND=dp), INTENT(IN)                          :: eps_filter
1157
1158      CHARACTER(LEN=*), PARAMETER :: routineN = 'update_lagrangian_alpha_beta', &
1159         routineP = moduleN//':'//routineN
1160
1161      ! first contract mat_munu with the half back transformed Gamma_i_nu
1162      ! in order to update Lag_mu_i_1
1163      CALL dbcsr_multiply("N", "N", 1.0_dp, mat_munu%matrix, matrix_P_inu%matrix, &
1164                          1.0_dp, Lag_mu_i_1%matrix, filter_eps=eps_filter)
1165
1166      ! transform first index of mat_munu and store the result into matrix_P_inu
1167      CALL dbcsr_set(matrix_P_inu%matrix, 0.0_dp)
1168      CALL dbcsr_multiply("N", "N", 1.0_dp, mat_munu%matrix, mo_coeff_o, &
1169                          0.0_dp, matrix_P_inu%matrix, filter_eps=eps_filter)
1170
1171      ! contract the transformend matrix_P_inu with the untransformend Gamma_i_a
1172      ! in order to update Lag_nu_a_2
1173      CALL dbcsr_multiply("N", "N", -1.0_dp, matrix_P_inu%matrix, G_P_ia%matrix, &
1174                          1.0_dp, Lag_nu_a_2%matrix, filter_eps=eps_filter)
1175
1176      ! release the actual gamma_P_ia
1177      CALL dbcsr_release(G_P_ia%matrix)
1178      DEALLOCATE (G_P_ia%matrix)
1179
1180   END SUBROUTINE update_lagrangian_alpha_beta
1181
1182! **************************************************************************************************
1183!> \brief ...
1184!> \param qs_env ...
1185!> \param mp2_env ...
1186!> \param mo_coeff ...
1187!> \param homo ...
1188!> \param virtual ...
1189!> \param dimen ...
1190!> \param para_env ...
1191!> \param para_env_sub ...
1192!> \param Eigenval ...
1193!> \param L1_mu_i ...
1194!> \param L2_nu_a ...
1195!> \param factor ...
1196!> \param alpha_beta ...
1197! **************************************************************************************************
1198   SUBROUTINE create_W_P(qs_env, mp2_env, mo_coeff, homo, virtual, dimen, para_env, para_env_sub, &
1199                         Eigenval, L1_mu_i, L2_nu_a, factor, alpha_beta)
1200      TYPE(qs_environment_type), POINTER                 :: qs_env
1201      TYPE(mp2_type), POINTER                            :: mp2_env
1202      TYPE(cp_fm_type), POINTER                          :: mo_coeff
1203      INTEGER, INTENT(IN)                                :: homo, virtual, dimen
1204      TYPE(cp_para_env_type), POINTER                    :: para_env, para_env_sub
1205      REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: Eigenval
1206      TYPE(cp_fm_type), POINTER                          :: L1_mu_i, L2_nu_a
1207      REAL(KIND=dp), INTENT(IN)                          :: factor
1208      LOGICAL, INTENT(IN)                                :: alpha_beta
1209
1210      CHARACTER(LEN=*), PARAMETER :: routineN = 'create_W_P', routineP = moduleN//':'//routineN
1211
1212      INTEGER :: color_exchange, comm_exchange, dummy_proc, handle, handle2, handle3, i, i_global, &
1213         i_local, iiB, iii, iproc, itmp(2), j_global, j_local, jjB, max_col_size, max_row_size, &
1214         my_B_virtual_end, my_B_virtual_start, mypcol, mypcol_1i, mypcol_2a, myprow, myprow_1i, &
1215         myprow_2a, ncol_block, ncol_block_1i, ncol_block_2a, ncol_local, ncol_local_1i, &
1216         ncol_local_2a, npcol, npcol_1i, npcol_2a, nprow, nprow_1i, nprow_2a, nrow_block, &
1217         nrow_block_1i, nrow_block_2a, nrow_local, nrow_local_1i, nrow_local_2a, number_of_rec, &
1218         number_of_send, proc_receive, proc_receive_static, proc_send, proc_send_ex
1219      INTEGER :: proc_send_static, proc_send_sub, proc_shift, rec_col_size, rec_counter, &
1220         rec_row_size, send_col_size, send_counter, send_pcol, send_prow, send_row_size, &
1221         size_rec_buffer, size_send_buffer
1222      INTEGER, ALLOCATABLE, DIMENSION(:) :: iii_vet, map_rec_size, map_send_size, pos_info, &
1223         pos_info_ex, proc_2_send_pos, proc_map_ex, req_send, sub_proc_map
1224      INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: grid_2_mepos, mepos_2_grid, &
1225                                                            mepos_2_grid_1i, mepos_2_grid_2a, &
1226                                                            sizes, sizes_1i, sizes_2a
1227      INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: col_indeces_info_1i, &
1228                                                            col_indeces_info_2a, &
1229                                                            row_indeces_info_1i, &
1230                                                            row_indeces_info_2a
1231      INTEGER, DIMENSION(:), POINTER                     :: col_indices, col_indices_1i, &
1232                                                            col_indices_2a, row_indices, &
1233                                                            row_indices_1i, row_indices_2a
1234      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: ab_rec, ab_send, mat_rec, mat_send
1235      TYPE(cp_blacs_env_type), POINTER                   :: blacs_env
1236      TYPE(cp_fm_struct_type), POINTER                   :: fm_struct_tmp
1237      TYPE(cp_fm_type), POINTER                          :: fm_P_ij, L_mu_q
1238      TYPE(cp_para_env_type), POINTER                    :: para_env_exchange
1239      TYPE(integ_mat_buffer_type), ALLOCATABLE, &
1240         DIMENSION(:)                                    :: buffer_rec, buffer_send
1241      TYPE(integ_mat_buffer_type_2D), ALLOCATABLE, &
1242         DIMENSION(:)                                    :: buffer_cyclic
1243
1244      CALL timeset(routineN, handle)
1245
1246      ! create the globally distributed mixed lagrangian
1247      NULLIFY (blacs_env)
1248      CALL get_qs_env(qs_env, blacs_env=blacs_env)
1249
1250      NULLIFY (L_mu_q, fm_struct_tmp)
1251      CALL cp_fm_struct_create(fm_struct_tmp, para_env=para_env, context=blacs_env, &
1252                               nrow_global=dimen, ncol_global=dimen)
1253      CALL cp_fm_create(L_mu_q, fm_struct_tmp, name="Lag_mu_q")
1254      CALL cp_fm_struct_release(fm_struct_tmp)
1255      CALL cp_fm_set_all(L_mu_q, 0.0_dp)
1256
1257      ! create all information array
1258      ALLOCATE (pos_info(0:para_env%num_pe - 1))
1259      pos_info = 0
1260      pos_info(para_env%mepos) = para_env_sub%mepos
1261      CALL mp_sum(pos_info, para_env%group)
1262
1263      ALLOCATE (sub_proc_map(-para_env_sub%num_pe:2*para_env_sub%num_pe - 1))
1264      sub_proc_map = 0
1265      DO i = 0, para_env_sub%num_pe - 1
1266         sub_proc_map(i) = i
1267         sub_proc_map(-i - 1) = para_env_sub%num_pe - i - 1
1268         sub_proc_map(para_env_sub%num_pe + i) = i
1269      END DO
1270
1271      ! get matrix information for the global
1272      CALL cp_fm_get_info(matrix=L_mu_q, &
1273                          nrow_local=nrow_local, &
1274                          ncol_local=ncol_local, &
1275                          row_indices=row_indices, &
1276                          col_indices=col_indices, &
1277                          nrow_block=nrow_block, &
1278                          ncol_block=ncol_block)
1279      myprow = L_mu_q%matrix_struct%context%mepos(1)
1280      mypcol = L_mu_q%matrix_struct%context%mepos(2)
1281      nprow = L_mu_q%matrix_struct%context%num_pe(1)
1282      npcol = L_mu_q%matrix_struct%context%num_pe(2)
1283
1284      ALLOCATE (grid_2_mepos(0:nprow - 1, 0:npcol - 1))
1285      grid_2_mepos = 0
1286      grid_2_mepos(myprow, mypcol) = para_env%mepos
1287      CALL mp_sum(grid_2_mepos, para_env%group)
1288
1289      ! get matrix information for L1_mu_i
1290      CALL cp_fm_get_info(matrix=L1_mu_i, &
1291                          nrow_local=nrow_local_1i, &
1292                          ncol_local=ncol_local_1i, &
1293                          row_indices=row_indices_1i, &
1294                          col_indices=col_indices_1i, &
1295                          nrow_block=nrow_block_1i, &
1296                          ncol_block=ncol_block_1i)
1297      myprow_1i = L1_mu_i%matrix_struct%context%mepos(1)
1298      mypcol_1i = L1_mu_i%matrix_struct%context%mepos(2)
1299      nprow_1i = L1_mu_i%matrix_struct%context%num_pe(1)
1300      npcol_1i = L1_mu_i%matrix_struct%context%num_pe(2)
1301
1302      ALLOCATE (mepos_2_grid_1i(0:para_env_sub%num_pe - 1, 2))
1303      mepos_2_grid_1i = 0
1304      mepos_2_grid_1i(para_env_sub%mepos, 1) = myprow_1i
1305      mepos_2_grid_1i(para_env_sub%mepos, 2) = mypcol_1i
1306      CALL mp_sum(mepos_2_grid_1i, para_env_sub%group)
1307
1308      ALLOCATE (sizes_1i(2, 0:para_env_sub%num_pe - 1))
1309      sizes_1i = 0
1310      sizes_1i(1, para_env_sub%mepos) = nrow_local_1i
1311      sizes_1i(2, para_env_sub%mepos) = ncol_local_1i
1312      CALL mp_sum(sizes_1i, para_env_sub%group)
1313
1314      ! get matrix information for L2_nu_a
1315      CALL cp_fm_get_info(matrix=L2_nu_a, &
1316                          nrow_local=nrow_local_2a, &
1317                          ncol_local=ncol_local_2a, &
1318                          row_indices=row_indices_2a, &
1319                          col_indices=col_indices_2a, &
1320                          nrow_block=nrow_block_2a, &
1321                          ncol_block=ncol_block_2a)
1322      myprow_2a = L2_nu_a%matrix_struct%context%mepos(1)
1323      mypcol_2a = L2_nu_a%matrix_struct%context%mepos(2)
1324      nprow_2a = L2_nu_a%matrix_struct%context%num_pe(1)
1325      npcol_2a = L2_nu_a%matrix_struct%context%num_pe(2)
1326
1327      ALLOCATE (mepos_2_grid_2a(0:para_env_sub%num_pe - 1, 2))
1328      mepos_2_grid_2a = 0
1329      mepos_2_grid_2a(para_env_sub%mepos, 1) = myprow_2a
1330      mepos_2_grid_2a(para_env_sub%mepos, 2) = mypcol_2a
1331      CALL mp_sum(mepos_2_grid_2a, para_env_sub%group)
1332
1333      ALLOCATE (sizes_2a(2, 0:para_env_sub%num_pe - 1))
1334      sizes_2a = 0
1335      sizes_2a(1, para_env_sub%mepos) = nrow_local_2a
1336      sizes_2a(2, para_env_sub%mepos) = ncol_local_2a
1337      CALL mp_sum(sizes_2a, para_env_sub%group)
1338
1339      ! Here we perform a ring communication scheme taking into account
1340      ! for the sub-group distribution of the source matrices.
1341      ! as a first step we need to redistribute the data within
1342      ! the subgroup.
1343      ! In order to do so we have to allocate the structure
1344      ! that will hold the local data involved in communication, this
1345      ! structure will be the same for processes in different subgroups
1346      ! sharing the same position in the subgroup.
1347      ! -1) create the exchange para_env
1348      color_exchange = para_env_sub%mepos
1349      CALL mp_comm_split_direct(para_env%group, comm_exchange, color_exchange)
1350      NULLIFY (para_env_exchange)
1351      CALL cp_para_env_create(para_env_exchange, comm_exchange)
1352      ! crate the proc maps exchange and info
1353      ALLOCATE (proc_map_ex(-para_env_exchange%num_pe:2*para_env_exchange%num_pe - 1))
1354      DO i = 0, para_env_exchange%num_pe - 1
1355         proc_map_ex(i) = i
1356         proc_map_ex(-i - 1) = para_env_exchange%num_pe - i - 1
1357         proc_map_ex(para_env_exchange%num_pe + i) = i
1358      END DO
1359      ALLOCATE (pos_info_ex(0:para_env%num_pe - 1))
1360      pos_info_ex = 0
1361      pos_info_ex(para_env%mepos) = para_env_exchange%mepos
1362      CALL mp_sum(pos_info_ex, para_env%group)
1363      ALLOCATE (sizes(2, 0:para_env_exchange%num_pe - 1))
1364      sizes = 0
1365      sizes(1, para_env_exchange%mepos) = nrow_local
1366      sizes(2, para_env_exchange%mepos) = ncol_local
1367      CALL mp_sum(sizes, para_env_exchange%group)
1368
1369      ! 0) store some info about indeces of the fm matrices (subgroup)
1370      CALL timeset(routineN//"_inx", handle2)
1371      ! matrix L1_mu_i
1372      max_row_size = MAXVAL(sizes_1i(1, :))
1373      max_col_size = MAXVAL(sizes_1i(2, :))
1374      ALLOCATE (row_indeces_info_1i(2, max_row_size, 0:para_env_sub%num_pe - 1))
1375      ALLOCATE (col_indeces_info_1i(2, max_col_size, 0:para_env_sub%num_pe - 1))
1376      row_indeces_info_1i = 0
1377      col_indeces_info_1i = 0
1378      dummy_proc = 0
1379      ! row
1380      DO iiB = 1, nrow_local_1i
1381         i_global = row_indices_1i(iiB)
1382         send_prow = cp_fm_indxg2p(i_global, nrow_block, dummy_proc, &
1383                                   L_mu_q%matrix_struct%first_p_pos(1), nprow)
1384         i_local = cp_fm_indxg2l(i_global, nrow_block, dummy_proc, &
1385                                 L_mu_q%matrix_struct%first_p_pos(1), nprow)
1386         row_indeces_info_1i(1, iiB, para_env_sub%mepos) = send_prow
1387         row_indeces_info_1i(2, iiB, para_env_sub%mepos) = i_local
1388      END DO
1389      ! col
1390      DO jjB = 1, ncol_local_1i
1391         j_global = col_indices_1i(jjB)
1392         send_pcol = cp_fm_indxg2p(j_global, ncol_block, dummy_proc, &
1393                                   L_mu_q%matrix_struct%first_p_pos(2), npcol)
1394         j_local = cp_fm_indxg2l(j_global, ncol_block, dummy_proc, &
1395                                 L_mu_q%matrix_struct%first_p_pos(2), npcol)
1396         col_indeces_info_1i(1, jjB, para_env_sub%mepos) = send_pcol
1397         col_indeces_info_1i(2, jjB, para_env_sub%mepos) = j_local
1398      END DO
1399      CALL mp_sum(row_indeces_info_1i, para_env_sub%group)
1400      CALL mp_sum(col_indeces_info_1i, para_env_sub%group)
1401
1402      ! matrix L2_nu_a
1403      max_row_size = MAXVAL(sizes_2a(1, :))
1404      max_col_size = MAXVAL(sizes_2a(2, :))
1405      ALLOCATE (row_indeces_info_2a(2, max_row_size, 0:para_env_sub%num_pe - 1))
1406      ALLOCATE (col_indeces_info_2a(2, max_col_size, 0:para_env_sub%num_pe - 1))
1407      row_indeces_info_2a = 0
1408      col_indeces_info_2a = 0
1409      ! row
1410      DO iiB = 1, nrow_local_2a
1411         i_global = row_indices_2a(iiB)
1412         send_prow = cp_fm_indxg2p(i_global, nrow_block, dummy_proc, &
1413                                   L_mu_q%matrix_struct%first_p_pos(1), nprow)
1414         i_local = cp_fm_indxg2l(i_global, nrow_block, dummy_proc, &
1415                                 L_mu_q%matrix_struct%first_p_pos(1), nprow)
1416         row_indeces_info_2a(1, iiB, para_env_sub%mepos) = send_prow
1417         row_indeces_info_2a(2, iiB, para_env_sub%mepos) = i_local
1418      END DO
1419      ! col
1420      DO jjB = 1, ncol_local_2a
1421         j_global = col_indices_2a(jjB) + homo
1422         send_pcol = cp_fm_indxg2p(j_global, ncol_block, dummy_proc, &
1423                                   L_mu_q%matrix_struct%first_p_pos(2), npcol)
1424         j_local = cp_fm_indxg2l(j_global, ncol_block, dummy_proc, &
1425                                 L_mu_q%matrix_struct%first_p_pos(2), npcol)
1426         col_indeces_info_2a(1, jjB, para_env_sub%mepos) = send_pcol
1427         col_indeces_info_2a(2, jjB, para_env_sub%mepos) = j_local
1428      END DO
1429      CALL mp_sum(row_indeces_info_2a, para_env_sub%group)
1430      CALL mp_sum(col_indeces_info_2a, para_env_sub%group)
1431      CALL timestop(handle2)
1432
1433      ! 1) define the map for sending data in the subgroup starting with L1_mu_i
1434      CALL timeset(routineN//"_subinfo", handle2)
1435      ALLOCATE (map_send_size(0:para_env_sub%num_pe - 1))
1436      map_send_size = 0
1437      DO jjB = 1, ncol_local_1i
1438         ! j_global=col_indices_1i(jjB)
1439         ! send_pcol=cp_fm_indxg2p(j_global,ncol_block,dummy_proc,&
1440         !                         L_mu_q%matrix_struct%first_p_pos(2),npcol)
1441         send_pcol = col_indeces_info_1i(1, jjB, para_env_sub%mepos)
1442         DO iiB = 1, nrow_local_1i
1443            ! i_global=row_indices_1i(iiB)
1444            ! send_prow=cp_fm_indxg2p(i_global,nrow_block,dummy_proc,&
1445            !                         L_mu_q%matrix_struct%first_p_pos(1),nprow)
1446            send_prow = row_indeces_info_1i(1, iiB, para_env_sub%mepos)
1447            proc_send = grid_2_mepos(send_prow, send_pcol)
1448            proc_send_sub = pos_info(proc_send)
1449            map_send_size(proc_send_sub) = map_send_size(proc_send_sub) + 1
1450         END DO
1451      END DO
1452      ! and the same for L2_nu_a
1453      DO jjB = 1, ncol_local_2a
1454         ! j_global=col_indices_2a(jjB)+homo
1455         ! send_pcol=cp_fm_indxg2p(j_global,ncol_block,dummy_proc,&
1456         !                         L_mu_q%matrix_struct%first_p_pos(2),npcol)
1457         send_pcol = col_indeces_info_2a(1, jjB, para_env_sub%mepos)
1458         DO iiB = 1, nrow_local_2a
1459            ! i_global=row_indices_2a(iiB)
1460            ! send_prow=cp_fm_indxg2p(i_global,nrow_block,dummy_proc,&
1461            !                         L_mu_q%matrix_struct%first_p_pos(1),nprow)
1462            send_prow = row_indeces_info_2a(1, iiB, para_env_sub%mepos)
1463            proc_send = grid_2_mepos(send_prow, send_pcol)
1464            proc_send_sub = pos_info(proc_send)
1465            map_send_size(proc_send_sub) = map_send_size(proc_send_sub) + 1
1466         END DO
1467      END DO
1468      ! and exchange data in order to create map_rec_size
1469      ALLOCATE (map_rec_size(0:para_env_sub%num_pe - 1))
1470      map_rec_size = 0
1471      CALL mp_alltoall(map_send_size, map_rec_size, 1, para_env_sub%group)
1472      CALL timestop(handle2)
1473
1474      ! 2) reorder data in sending buffer
1475      CALL timeset(routineN//"_sub_Bsend", handle2)
1476      ! count the number of messages (include myself)
1477      number_of_send = 0
1478      DO proc_shift = 0, para_env_sub%num_pe - 1
1479         proc_send = sub_proc_map(para_env_sub%mepos + proc_shift)
1480         IF (map_send_size(proc_send) > 0) THEN
1481            number_of_send = number_of_send + 1
1482         END IF
1483      END DO
1484      ! allocate the structure that will hold the messages to be sent
1485      ALLOCATE (buffer_send(number_of_send))
1486      send_counter = 0
1487      ALLOCATE (proc_2_send_pos(0:para_env_sub%num_pe - 1))
1488      proc_2_send_pos = 0
1489      DO proc_shift = 0, para_env_sub%num_pe - 1
1490         proc_send = sub_proc_map(para_env_sub%mepos + proc_shift)
1491         size_send_buffer = map_send_size(proc_send)
1492         IF (map_send_size(proc_send) > 0) THEN
1493            send_counter = send_counter + 1
1494            ! allocate the sending buffer (msg)
1495            ALLOCATE (buffer_send(send_counter)%msg(size_send_buffer))
1496            buffer_send(send_counter)%msg = 0.0_dp
1497            buffer_send(send_counter)%proc = proc_send
1498            proc_2_send_pos(proc_send) = send_counter
1499         END IF
1500      END DO
1501      ! loop over the locally held data and fill the buffer_send
1502      ! for doing that we need an array that keep track if the
1503      ! sequential increase of the index for each message
1504      ALLOCATE (iii_vet(number_of_send))
1505      iii_vet = 0
1506      DO jjB = 1, ncol_local_1i
1507         ! j_global=col_indices_1i(jjB)
1508         ! send_pcol=cp_fm_indxg2p(j_global,ncol_block,dummy_proc,&
1509         !                         L_mu_q%matrix_struct%first_p_pos(2),npcol)
1510         send_pcol = col_indeces_info_1i(1, jjB, para_env_sub%mepos)
1511         DO iiB = 1, nrow_local_1i
1512            ! i_global=row_indices_1i(iiB)
1513            ! send_prow=cp_fm_indxg2p(i_global,nrow_block,dummy_proc,&
1514            !                         L_mu_q%matrix_struct%first_p_pos(1),nprow)
1515            send_prow = row_indeces_info_1i(1, iiB, para_env_sub%mepos)
1516            proc_send = grid_2_mepos(send_prow, send_pcol)
1517            proc_send_sub = pos_info(proc_send)
1518            send_counter = proc_2_send_pos(proc_send_sub)
1519            iii_vet(send_counter) = iii_vet(send_counter) + 1
1520            iii = iii_vet(send_counter)
1521            buffer_send(send_counter)%msg(iii) = L1_mu_i%local_data(iiB, jjB)
1522         END DO
1523      END DO
1524      ! release the local data of L1_mu_i
1525      DEALLOCATE (L1_mu_i%local_data)
1526      ! and the same for L2_nu_a
1527      DO jjB = 1, ncol_local_2a
1528         ! j_global=col_indices_2a(jjB)+homo
1529         ! send_pcol=cp_fm_indxg2p(j_global,ncol_block,dummy_proc,&
1530         !                         L_mu_q%matrix_struct%first_p_pos(2),npcol)
1531         send_pcol = col_indeces_info_2a(1, jjB, para_env_sub%mepos)
1532         DO iiB = 1, nrow_local_2a
1533            ! i_global=row_indices_2a(iiB)
1534            ! send_prow=cp_fm_indxg2p(i_global,nrow_block,dummy_proc,&
1535            !                         L_mu_q%matrix_struct%first_p_pos(1),nprow)
1536            send_prow = row_indeces_info_2a(1, iiB, para_env_sub%mepos)
1537            proc_send = grid_2_mepos(send_prow, send_pcol)
1538            proc_send_sub = pos_info(proc_send)
1539            send_counter = proc_2_send_pos(proc_send_sub)
1540            iii_vet(send_counter) = iii_vet(send_counter) + 1
1541            iii = iii_vet(send_counter)
1542            buffer_send(send_counter)%msg(iii) = L2_nu_a%local_data(iiB, jjB)
1543         END DO
1544      END DO
1545      DEALLOCATE (L2_nu_a%local_data)
1546      DEALLOCATE (proc_2_send_pos)
1547      DEALLOCATE (iii_vet)
1548      CALL timestop(handle2)
1549
1550      ! 3) create the buffer for receive, post the message with irecv
1551      !    and send the messages with mp_isend
1552      CALL timeset(routineN//"_sub_isendrecv", handle2)
1553      ! count the number of messages to be received
1554      number_of_rec = 0
1555      DO proc_shift = 0, para_env_sub%num_pe - 1
1556         proc_receive = sub_proc_map(para_env_sub%mepos - proc_shift)
1557         IF (map_rec_size(proc_receive) > 0) THEN
1558            number_of_rec = number_of_rec + 1
1559         END IF
1560      END DO
1561      ALLOCATE (buffer_rec(number_of_rec))
1562      rec_counter = 0
1563      DO proc_shift = 0, para_env_sub%num_pe - 1
1564         proc_receive = sub_proc_map(para_env_sub%mepos - proc_shift)
1565         size_rec_buffer = map_rec_size(proc_receive)
1566         IF (map_rec_size(proc_receive) > 0) THEN
1567            rec_counter = rec_counter + 1
1568            ! prepare the buffer for receive
1569            ALLOCATE (buffer_rec(rec_counter)%msg(size_rec_buffer))
1570            buffer_rec(rec_counter)%msg = 0.0_dp
1571            buffer_rec(rec_counter)%proc = proc_receive
1572            ! post the message to be received (not need to send to myself)
1573            IF (proc_receive /= para_env_sub%mepos) THEN
1574               CALL mp_irecv(buffer_rec(rec_counter)%msg, proc_receive, para_env_sub%group, &
1575                             buffer_rec(rec_counter)%msg_req)
1576            END IF
1577         END IF
1578      END DO
1579      ! send messages
1580      ALLOCATE (req_send(number_of_send))
1581      req_send = mp_request_null
1582      send_counter = 0
1583      DO proc_shift = 0, para_env_sub%num_pe - 1
1584         proc_send = sub_proc_map(para_env_sub%mepos + proc_shift)
1585         IF (map_send_size(proc_send) > 0) THEN
1586            send_counter = send_counter + 1
1587            IF (proc_send == para_env_sub%mepos) THEN
1588               buffer_rec(send_counter)%msg = buffer_send(send_counter)%msg
1589            ELSE
1590               CALL mp_isend(buffer_send(send_counter)%msg, proc_send, para_env_sub%group, &
1591                             buffer_send(send_counter)%msg_req)
1592               req_send(send_counter) = buffer_send(send_counter)%msg_req
1593            END IF
1594         END IF
1595      END DO
1596      DEALLOCATE (map_send_size)
1597      CALL timestop(handle2)
1598
1599      ! 4) (if memory is a problem we should move this part after point 5)
1600      !    Here we create the new buffer for cyclic(ring) communication and
1601      !    we fill it with the data received from the other member of the
1602      !    subgroup
1603      CALL timeset(routineN//"_Bcyclic", handle2)
1604      ! first allocata new structure
1605      ALLOCATE (buffer_cyclic(0:para_env_exchange%num_pe - 1))
1606      DO iproc = 0, para_env_exchange%num_pe - 1
1607         rec_row_size = sizes(1, iproc)
1608         rec_col_size = sizes(2, iproc)
1609         ALLOCATE (buffer_cyclic(iproc)%msg(rec_row_size, rec_col_size))
1610         buffer_cyclic(iproc)%msg = 0.0_dp
1611      END DO
1612      ! now collect data from other member of the subgroup and fill
1613      ! buffer_cyclic
1614      rec_counter = 0
1615      DO proc_shift = 0, para_env_sub%num_pe - 1
1616         proc_receive = sub_proc_map(para_env_sub%mepos - proc_shift)
1617         size_rec_buffer = map_rec_size(proc_receive)
1618         IF (map_rec_size(proc_receive) > 0) THEN
1619            rec_counter = rec_counter + 1
1620
1621            ! wait for the message
1622            IF (proc_receive /= para_env_sub%mepos) CALL mp_wait(buffer_rec(rec_counter)%msg_req)
1623
1624            CALL timeset(routineN//"_fill", handle3)
1625            iii = 0
1626            DO jjB = 1, sizes_1i(2, proc_receive)
1627               ! j_global=cp_fm_indxl2g(jjB,ncol_block_1i,mepos_2_grid_1i(proc_receive,2),&
1628               !                        L1_mu_i%matrix_struct%first_p_pos(2),npcol_1i)
1629               ! send_pcol=cp_fm_indxg2p(j_global,ncol_block,dummy_proc,&
1630               !                         L_mu_q%matrix_struct%first_p_pos(2),npcol)
1631               ! j_local=cp_fm_indxg2l(j_global,ncol_block,dummy_proc,&
1632               !                       L_mu_q%matrix_struct%first_p_pos(2),npcol)
1633               send_pcol = col_indeces_info_1i(1, jjB, proc_receive)
1634               j_local = col_indeces_info_1i(2, jjB, proc_receive)
1635               DO iiB = 1, sizes_1i(1, proc_receive)
1636                  ! i_global=cp_fm_indxl2g(iiB,nrow_block_1i,mepos_2_grid_1i(proc_receive,1),&
1637                  !                        L1_mu_i%matrix_struct%first_p_pos(1),nprow_1i)
1638                  ! send_prow=cp_fm_indxg2p(i_global,nrow_block,dummy_proc,&
1639                  !                         L_mu_q%matrix_struct%first_p_pos(1),nprow)
1640                  send_prow = row_indeces_info_1i(1, iiB, proc_receive)
1641                  proc_send = grid_2_mepos(send_prow, send_pcol)
1642                  proc_send_sub = pos_info(proc_send)
1643                  IF (proc_send_sub /= para_env_sub%mepos) CYCLE
1644                  iii = iii + 1
1645                  ! i_local=cp_fm_indxg2l(i_global,nrow_block,dummy_proc,&
1646                  !                       L_mu_q%matrix_struct%first_p_pos(1),nprow)
1647                  i_local = row_indeces_info_1i(2, iiB, proc_receive)
1648                  proc_send_ex = pos_info_ex(proc_send)
1649                  buffer_cyclic(proc_send_ex)%msg(i_local, j_local) = buffer_rec(rec_counter)%msg(iii)
1650               END DO
1651            END DO
1652            ! and the same for L2_nu_a
1653            DO jjB = 1, sizes_2a(2, proc_receive)
1654               ! j_global=cp_fm_indxl2g(jjB,ncol_block_2a,mepos_2_grid_2a(proc_receive,2),&
1655               !                        L2_nu_a%matrix_struct%first_p_pos(2),npcol_2a)+homo
1656               ! send_pcol=cp_fm_indxg2p(j_global,ncol_block,dummy_proc,&
1657               !                         L_mu_q%matrix_struct%first_p_pos(2),npcol)
1658               ! j_local=cp_fm_indxg2l(j_global,ncol_block,dummy_proc,&
1659               !                       L_mu_q%matrix_struct%first_p_pos(2),npcol)
1660               send_pcol = col_indeces_info_2a(1, jjB, proc_receive)
1661               j_local = col_indeces_info_2a(2, jjB, proc_receive)
1662               DO iiB = 1, sizes_2a(1, proc_receive)
1663                  ! i_global=cp_fm_indxl2g(iiB,nrow_block_2a,mepos_2_grid_2a(proc_receive,1),&
1664                  !                        L2_nu_a%matrix_struct%first_p_pos(1),nprow_2a)
1665                  ! send_prow=cp_fm_indxg2p(i_global,nrow_block,dummy_proc,&
1666                  !                         L_mu_q%matrix_struct%first_p_pos(1),nprow)
1667                  send_prow = row_indeces_info_2a(1, iiB, proc_receive)
1668                  proc_send = grid_2_mepos(send_prow, send_pcol)
1669                  proc_send_sub = pos_info(proc_send)
1670                  IF (proc_send_sub /= para_env_sub%mepos) CYCLE
1671                  iii = iii + 1
1672                  ! i_local=cp_fm_indxg2l(i_global,nrow_block,dummy_proc,&
1673                  !                       L_mu_q%matrix_struct%first_p_pos(1),nprow)
1674                  i_local = row_indeces_info_2a(2, iiB, proc_receive)
1675                  proc_send_ex = pos_info_ex(proc_send)
1676                  buffer_cyclic(proc_send_ex)%msg(i_local, j_local) = buffer_rec(rec_counter)%msg(iii)
1677               END DO
1678            END DO
1679            CALL timestop(handle3)
1680
1681            ! deallocate the received message
1682            DEALLOCATE (buffer_rec(rec_counter)%msg)
1683         END IF
1684      END DO
1685      DEALLOCATE (row_indeces_info_1i)
1686      DEALLOCATE (col_indeces_info_1i)
1687      DEALLOCATE (row_indeces_info_2a)
1688      DEALLOCATE (col_indeces_info_2a)
1689      DEALLOCATE (buffer_rec)
1690      DEALLOCATE (map_rec_size)
1691      CALL timestop(handle2)
1692
1693      ! 5)  Wait for all messeges to be sent in the subgroup
1694      CALL timeset(routineN//"_sub_waitall", handle2)
1695      CALL mp_waitall(req_send(:))
1696      DO send_counter = 1, number_of_send
1697         DEALLOCATE (buffer_send(send_counter)%msg)
1698      END DO
1699      DEALLOCATE (buffer_send)
1700      DEALLOCATE (req_send)
1701      CALL timestop(handle2)
1702
1703      ! 6) Start with ring communication
1704      CALL timeset(routineN//"_ring", handle2)
1705      proc_send_static = proc_map_ex(para_env_exchange%mepos + 1)
1706      proc_receive_static = proc_map_ex(para_env_exchange%mepos - 1)
1707      max_row_size = MAXVAL(sizes(1, :))
1708      max_col_size = MAXVAL(sizes(2, :))
1709      ALLOCATE (mat_send(max_row_size, max_col_size))
1710      ALLOCATE (mat_rec(max_row_size, max_col_size))
1711      mat_send = 0.0_dp
1712      mat_send(1:nrow_local, 1:ncol_local) = buffer_cyclic(para_env_exchange%mepos)%msg(:, :)
1713      DEALLOCATE (buffer_cyclic(para_env_exchange%mepos)%msg)
1714      DO proc_shift = 1, para_env_exchange%num_pe - 1
1715         proc_send = proc_map_ex(para_env_exchange%mepos + proc_shift)
1716         proc_receive = proc_map_ex(para_env_exchange%mepos - proc_shift)
1717
1718         rec_row_size = sizes(1, proc_receive)
1719         rec_col_size = sizes(2, proc_receive)
1720
1721         mat_rec = 0.0_dp
1722         CALL mp_sendrecv(mat_send, proc_send_static, &
1723                          mat_rec, proc_receive_static, &
1724                          para_env_exchange%group)
1725
1726         mat_send = 0.0_dp
1727         mat_send(1:rec_row_size, 1:rec_col_size) = mat_rec(1:rec_row_size, 1:rec_col_size) + &
1728                                                    buffer_cyclic(proc_receive)%msg(:, :)
1729
1730         DEALLOCATE (buffer_cyclic(proc_receive)%msg)
1731      END DO
1732      ! and finaly
1733      CALL mp_sendrecv(mat_send, proc_send_static, &
1734                       mat_rec, proc_receive_static, &
1735                       para_env_exchange%group)
1736      L_mu_q%local_data(1:nrow_local, 1:ncol_local) = mat_rec(1:nrow_local, 1:ncol_local)
1737      DEALLOCATE (buffer_cyclic)
1738      DEALLOCATE (mat_send)
1739      DEALLOCATE (mat_rec)
1740      CALL timestop(handle2)
1741
1742      DEALLOCATE (proc_map_ex)
1743      ! release para_env_exchange
1744      CALL cp_para_env_release(para_env_exchange)
1745
1746      CALL cp_fm_release(L1_mu_i)
1747      CALL cp_fm_release(L2_nu_a)
1748      DEALLOCATE (pos_info_ex)
1749      DEALLOCATE (grid_2_mepos)
1750      DEALLOCATE (mepos_2_grid_1i)
1751      DEALLOCATE (mepos_2_grid_2a)
1752      DEALLOCATE (sizes)
1753      DEALLOCATE (sizes_1i)
1754      DEALLOCATE (sizes_2a)
1755
1756      ! update the P_ij block of P_MP2 with the
1757      ! non-singular ij pairs
1758      CALL timeset(routineN//"_Pij", handle2)
1759      NULLIFY (fm_P_ij, fm_struct_tmp)
1760      CALL cp_fm_struct_create(fm_struct_tmp, para_env=para_env, context=blacs_env, &
1761                               nrow_global=homo, ncol_global=homo)
1762      CALL cp_fm_create(fm_P_ij, fm_struct_tmp, name="fm_P_ij")
1763      CALL cp_fm_struct_release(fm_struct_tmp)
1764      CALL cp_fm_set_all(fm_P_ij, 0.0_dp)
1765
1766      CALL cp_gemm('T', 'N', homo, homo, dimen, 1.0_dp, &
1767                   mo_coeff, L_mu_q, 0.0_dp, fm_P_ij, &
1768                   a_first_col=1, &
1769                   a_first_row=1, &
1770                   b_first_col=1, &
1771                   b_first_row=1, &
1772                   c_first_col=1, &
1773                   c_first_row=1)
1774      ! don't know if it is better to transpose (for communication reasons)
1775      ! or just recompute the transposed matrix
1776      CALL cp_gemm('T', 'N', homo, homo, dimen, -2.0_dp, &
1777                   L_mu_q, mo_coeff, 2.0_dp, fm_P_ij, &
1778                   a_first_col=1, &
1779                   a_first_row=1, &
1780                   b_first_col=1, &
1781                   b_first_row=1, &
1782                   c_first_col=1, &
1783                   c_first_row=1)
1784      ! we have it, update P_ij local
1785      CALL cp_fm_get_info(matrix=fm_P_ij, &
1786                          nrow_local=nrow_local, &
1787                          ncol_local=ncol_local, &
1788                          row_indices=row_indices, &
1789                          col_indices=col_indices)
1790      DO jjB = 1, ncol_local
1791         j_global = col_indices(jjB)
1792         DO iiB = 1, nrow_local
1793            i_global = row_indices(iiB)
1794            ! diagonal elements alreasy updated
1795            IF (j_global == i_global) CYCLE
1796            ! check if the given element is above the threshold
1797            IF (ABS(Eigenval(j_global) - Eigenval(i_global)) < mp2_env%ri_mp2%eps_canonical) CYCLE
1798            IF (.NOT. alpha_beta) THEN
1799               mp2_env%ri_grad%P_ij(i_global, j_global) = &
1800                  factor*fm_P_ij%local_data(iiB, jjB)/(Eigenval(j_global) - Eigenval(i_global))
1801            ELSE
1802               mp2_env%ri_grad%P_ij_beta(i_global, j_global) = &
1803                  factor*fm_P_ij%local_data(iiB, jjB)/(Eigenval(j_global) - Eigenval(i_global))
1804            ENDIF
1805         END DO
1806      END DO
1807      ! release fm_P_ij
1808      CALL cp_fm_release(fm_P_ij)
1809      ! mp_sum it (we can avoid mp_sum, but for now let's keep it easy)
1810      IF (.NOT. alpha_beta) THEN
1811         CALL mp_sum(mp2_env%ri_grad%P_ij, para_env%group)
1812      ELSE
1813         CALL mp_sum(mp2_env%ri_grad%P_ij_beta, para_env%group)
1814      ENDIF
1815      CALL timestop(handle2)
1816
1817      ! Now create and fill the P matrix (MO)
1818      ! FOR NOW WE ASSUME P_ab AND P_ij ARE REPLICATED OVER EACH MPI RANK
1819      IF (.NOT. alpha_beta) THEN
1820         CALL timeset(routineN//"_PMO", handle2)
1821         NULLIFY (mp2_env%ri_grad%P_mo, fm_struct_tmp)
1822         CALL cp_fm_struct_create(fm_struct_tmp, para_env=para_env, context=blacs_env, &
1823                                  nrow_global=dimen, ncol_global=dimen)
1824         CALL cp_fm_create(mp2_env%ri_grad%P_mo, fm_struct_tmp, name="P_MP2_MO")
1825         CALL cp_fm_set_all(mp2_env%ri_grad%P_mo, 0.0_dp)
1826      ELSE
1827         CALL timeset(routineN//"_PMO", handle2)
1828         NULLIFY (mp2_env%ri_grad%P_mo_beta, fm_struct_tmp)
1829         CALL cp_fm_struct_create(fm_struct_tmp, para_env=para_env, context=blacs_env, &
1830                                  nrow_global=dimen, ncol_global=dimen)
1831         CALL cp_fm_create(mp2_env%ri_grad%P_mo_beta, fm_struct_tmp, name="P_MP2_MO")
1832         CALL cp_fm_set_all(mp2_env%ri_grad%P_mo_beta, 0.0_dp)
1833      ENDIF
1834
1835      ! start with the (easy) occ-occ block and locally held P_ab elements
1836      itmp = get_limit(virtual, para_env_sub%num_pe, para_env_sub%mepos)
1837      my_B_virtual_start = itmp(1)
1838      my_B_virtual_end = itmp(2)
1839
1840      IF (.NOT. alpha_beta) THEN
1841         CALL cp_fm_get_info(matrix=mp2_env%ri_grad%P_mo, &
1842                             nrow_local=nrow_local, &
1843                             ncol_local=ncol_local, &
1844                             row_indices=row_indices, &
1845                             col_indices=col_indices, &
1846                             nrow_block=nrow_block, &
1847                             ncol_block=ncol_block)
1848         myprow = mp2_env%ri_grad%P_mo%matrix_struct%context%mepos(1)
1849         mypcol = mp2_env%ri_grad%P_mo%matrix_struct%context%mepos(2)
1850         nprow = mp2_env%ri_grad%P_mo%matrix_struct%context%num_pe(1)
1851         npcol = mp2_env%ri_grad%P_mo%matrix_struct%context%num_pe(2)
1852      ELSE
1853         CALL cp_fm_get_info(matrix=mp2_env%ri_grad%P_mo_beta, &
1854                             nrow_local=nrow_local, &
1855                             ncol_local=ncol_local, &
1856                             row_indices=row_indices, &
1857                             col_indices=col_indices, &
1858                             nrow_block=nrow_block, &
1859                             ncol_block=ncol_block)
1860         myprow = mp2_env%ri_grad%P_mo_beta%matrix_struct%context%mepos(1)
1861         mypcol = mp2_env%ri_grad%P_mo_beta%matrix_struct%context%mepos(2)
1862         nprow = mp2_env%ri_grad%P_mo_beta%matrix_struct%context%num_pe(1)
1863         npcol = mp2_env%ri_grad%P_mo_beta%matrix_struct%context%num_pe(2)
1864      ENDIF
1865
1866      DO jjB = 1, ncol_local
1867         j_global = col_indices(jjB)
1868         DO iiB = 1, nrow_local
1869            i_global = row_indices(iiB)
1870            IF (.NOT. alpha_beta) THEN
1871               IF (i_global <= homo .AND. j_global <= homo) THEN
1872                  mp2_env%ri_grad%P_mo%local_data(iiB, jjB) = mp2_env%ri_grad%P_ij(i_global, j_global)
1873               END IF
1874               IF ((my_B_virtual_start <= i_global - homo .AND. i_global - homo <= my_B_virtual_end) .AND. (j_global > homo)) THEN
1875                  mp2_env%ri_grad%P_mo%local_data(iiB, jjB) = &
1876                     mp2_env%ri_grad%P_ab(i_global - homo - my_B_virtual_start + 1, j_global - homo)
1877               END IF
1878            ELSE
1879               IF (i_global <= homo .AND. j_global <= homo) THEN
1880                  mp2_env%ri_grad%P_mo_beta%local_data(iiB, jjB) = mp2_env%ri_grad%P_ij_beta(i_global, j_global)
1881               END IF
1882               IF ((my_B_virtual_start <= i_global - homo .AND. i_global - homo <= my_B_virtual_end) .AND. (j_global > homo)) THEN
1883                  mp2_env%ri_grad%P_mo_beta%local_data(iiB, jjB) = &
1884                     mp2_env%ri_grad%P_ab_beta(i_global - homo - my_B_virtual_start + 1, j_global - homo)
1885               END IF
1886            ENDIF
1887         END DO
1888      END DO
1889      ! deallocate the local P_ij
1890      IF (.NOT. alpha_beta) THEN
1891         DEALLOCATE (mp2_env%ri_grad%P_ij)
1892      ELSE
1893         DEALLOCATE (mp2_env%ri_grad%P_ij_beta)
1894      ENDIF
1895
1896      ! send around the sub_group the local data and check if we
1897      ! have to update our block with external elements
1898      ALLOCATE (mepos_2_grid(0:para_env_sub%num_pe - 1, 2))
1899      mepos_2_grid = 0
1900      mepos_2_grid(para_env_sub%mepos, 1) = myprow
1901      mepos_2_grid(para_env_sub%mepos, 2) = mypcol
1902      CALL mp_sum(mepos_2_grid, para_env_sub%group)
1903
1904      ALLOCATE (sizes(2, 0:para_env_sub%num_pe - 1))
1905      sizes = 0
1906      sizes(1, para_env_sub%mepos) = nrow_local
1907      sizes(2, para_env_sub%mepos) = ncol_local
1908      CALL mp_sum(sizes, para_env_sub%group)
1909
1910      ALLOCATE (ab_rec(nrow_local, ncol_local))
1911      DO proc_shift = 1, para_env_sub%num_pe - 1
1912         proc_send = sub_proc_map(para_env_sub%mepos + proc_shift)
1913         proc_receive = sub_proc_map(para_env_sub%mepos - proc_shift)
1914
1915         send_prow = mepos_2_grid(proc_send, 1)
1916         send_pcol = mepos_2_grid(proc_send, 2)
1917
1918         send_row_size = sizes(1, proc_send)
1919         send_col_size = sizes(2, proc_send)
1920
1921         ALLOCATE (ab_send(send_row_size, send_col_size))
1922         ab_send = 0.0_dp
1923
1924         ! first loop over row since in this way we can cycle
1925         DO iiB = 1, send_row_size
1926            i_global = cp_fm_indxl2g(iiB, nrow_block, send_prow, &
1927                                     mp2_env%ri_grad%P_mo%matrix_struct%first_p_pos(1), nprow)
1928            IF (i_global <= homo) CYCLE
1929            i_global = i_global - homo
1930            IF (.NOT. (my_B_virtual_start <= i_global .AND. i_global <= my_B_virtual_end)) CYCLE
1931            DO jjB = 1, send_col_size
1932               IF (.NOT. alpha_beta) THEN
1933                  j_global = cp_fm_indxl2g(jjB, ncol_block, send_pcol, &
1934                                           mp2_env%ri_grad%P_mo%matrix_struct%first_p_pos(2), npcol)
1935               ELSE
1936                  j_global = cp_fm_indxl2g(jjB, ncol_block, send_pcol, &
1937                                           mp2_env%ri_grad%P_mo_beta%matrix_struct%first_p_pos(2), npcol)
1938               ENDIF
1939               IF (j_global <= homo) CYCLE
1940               j_global = j_global - homo
1941               IF (.NOT. alpha_beta) THEN
1942                  ab_send(iiB, jjB) = mp2_env%ri_grad%P_ab(i_global - my_B_virtual_start + 1, j_global)
1943               ELSE
1944                  ab_send(iiB, jjB) = mp2_env%ri_grad%P_ab_beta(i_global - my_B_virtual_start + 1, j_global)
1945               ENDIF
1946            END DO
1947         END DO
1948
1949         ab_rec = 0.0_dp
1950         CALL mp_sendrecv(ab_send, proc_send, &
1951                          ab_rec, proc_receive, &
1952                          para_env_sub%group)
1953         IF (.NOT. alpha_beta) THEN
1954            mp2_env%ri_grad%P_mo%local_data(1:nrow_local, 1:ncol_local) = &
1955               mp2_env%ri_grad%P_mo%local_data(1:nrow_local, 1:ncol_local) + &
1956               ab_rec(1:nrow_local, 1:ncol_local)
1957         ELSE
1958            mp2_env%ri_grad%P_mo_beta%local_data(1:nrow_local, 1:ncol_local) = &
1959               mp2_env%ri_grad%P_mo_beta%local_data(1:nrow_local, 1:ncol_local) + &
1960               ab_rec(1:nrow_local, 1:ncol_local)
1961         ENDIF
1962
1963         DEALLOCATE (ab_send)
1964      END DO
1965      DEALLOCATE (ab_rec)
1966      DEALLOCATE (mepos_2_grid)
1967      DEALLOCATE (sizes)
1968
1969      ! deallocate the local P_ab'
1970      IF (.NOT. alpha_beta) THEN
1971         DEALLOCATE (mp2_env%ri_grad%P_ab)
1972      ELSE
1973         DEALLOCATE (mp2_env%ri_grad%P_ab_beta)
1974      ENDIF
1975      CALL timestop(handle2)
1976
1977      ! create also W_MP2_MO
1978      CALL timeset(routineN//"_WMO", handle2)
1979      IF (.NOT. alpha_beta) THEN
1980         NULLIFY (mp2_env%ri_grad%W_mo)
1981         CALL cp_fm_create(mp2_env%ri_grad%W_mo, fm_struct_tmp, name="W_MP2_MO")
1982         CALL cp_fm_set_all(mp2_env%ri_grad%W_mo, 0.0_dp)
1983         CALL cp_fm_struct_release(fm_struct_tmp)
1984
1985         ! all block
1986         CALL cp_gemm('T', 'N', dimen, dimen, dimen, 2.0_dp*factor, &
1987                      L_mu_q, mo_coeff, 0.0_dp, mp2_env%ri_grad%W_mo, &
1988                      a_first_col=1, &
1989                      a_first_row=1, &
1990                      b_first_col=1, &
1991                      b_first_row=1, &
1992                      c_first_col=1, &
1993                      c_first_row=1)
1994
1995         ! occ-occ block
1996         CALL cp_gemm('T', 'N', homo, homo, dimen, -2.0_dp*factor, &
1997                      L_mu_q, mo_coeff, 0.0_dp, mp2_env%ri_grad%W_mo, &
1998                      a_first_col=1, &
1999                      a_first_row=1, &
2000                      b_first_col=1, &
2001                      b_first_row=1, &
2002                      c_first_col=1, &
2003                      c_first_row=1)
2004
2005         ! occ-virt block
2006         CALL cp_gemm('T', 'N', homo, virtual, dimen, 2.0_dp*factor, &
2007                      mo_coeff, L_mu_q, 0.0_dp, mp2_env%ri_grad%W_mo, &
2008                      a_first_col=1, &
2009                      a_first_row=1, &
2010                      b_first_col=homo + 1, &
2011                      b_first_row=1, &
2012                      c_first_col=homo + 1, &
2013                      c_first_row=1)
2014      ELSE
2015         NULLIFY (mp2_env%ri_grad%W_mo_beta)
2016         CALL cp_fm_create(mp2_env%ri_grad%W_mo_beta, fm_struct_tmp, name="W_MP2_MO")
2017         CALL cp_fm_set_all(mp2_env%ri_grad%W_mo_beta, 0.0_dp)
2018         CALL cp_fm_struct_release(fm_struct_tmp)
2019
2020         ! all block
2021         CALL cp_gemm('T', 'N', dimen, dimen, dimen, 2.0_dp*factor, &
2022                      L_mu_q, mo_coeff, 0.0_dp, mp2_env%ri_grad%W_mo_beta, &
2023                      a_first_col=1, &
2024                      a_first_row=1, &
2025                      b_first_col=1, &
2026                      b_first_row=1, &
2027                      c_first_col=1, &
2028                      c_first_row=1)
2029
2030         ! occ-occ block
2031         CALL cp_gemm('T', 'N', homo, homo, dimen, -2.0_dp*factor, &
2032                      L_mu_q, mo_coeff, 0.0_dp, mp2_env%ri_grad%W_mo_beta, &
2033                      a_first_col=1, &
2034                      a_first_row=1, &
2035                      b_first_col=1, &
2036                      b_first_row=1, &
2037                      c_first_col=1, &
2038                      c_first_row=1)
2039
2040         ! occ-virt block
2041         CALL cp_gemm('T', 'N', homo, virtual, dimen, 2.0_dp*factor, &
2042                      mo_coeff, L_mu_q, 0.0_dp, mp2_env%ri_grad%W_mo_beta, &
2043                      a_first_col=1, &
2044                      a_first_row=1, &
2045                      b_first_col=homo + 1, &
2046                      b_first_row=1, &
2047                      c_first_col=homo + 1, &
2048                      c_first_row=1)
2049      ENDIF
2050      CALL timestop(handle2)
2051
2052      ! Calculate occ-virt block of the lagrangian in MO
2053      CALL timeset(routineN//"_Ljb", handle2)
2054      IF (.NOT. alpha_beta) THEN
2055         NULLIFY (mp2_env%ri_grad%L_jb, fm_struct_tmp)
2056         CALL cp_fm_struct_create(fm_struct_tmp, para_env=para_env, context=blacs_env, &
2057                                  nrow_global=homo, ncol_global=virtual)
2058         CALL cp_fm_create(mp2_env%ri_grad%L_jb, fm_struct_tmp, name="fm_L_jb")
2059         CALL cp_fm_struct_release(fm_struct_tmp)
2060         CALL cp_fm_set_all(mp2_env%ri_grad%L_jb, 0.0_dp)
2061
2062         ! first Virtual
2063         CALL cp_gemm('T', 'N', homo, virtual, dimen, 2.0_dp*factor, &
2064                      L_mu_q, mo_coeff, 0.0_dp, mp2_env%ri_grad%L_jb, &
2065                      a_first_col=1, &
2066                      a_first_row=1, &
2067                      b_first_col=homo + 1, &
2068                      b_first_row=1, &
2069                      c_first_col=1, &
2070                      c_first_row=1)
2071         ! then occupied
2072         CALL cp_gemm('T', 'N', homo, virtual, dimen, 2.0_dp*factor, &
2073                      mo_coeff, L_mu_q, 1.0_dp, mp2_env%ri_grad%L_jb, &
2074                      a_first_col=1, &
2075                      a_first_row=1, &
2076                      b_first_col=homo + 1, &
2077                      b_first_row=1, &
2078                      c_first_col=1, &
2079                      c_first_row=1)
2080      ELSE
2081         NULLIFY (mp2_env%ri_grad%L_jb_beta, fm_struct_tmp)
2082         CALL cp_fm_struct_create(fm_struct_tmp, para_env=para_env, context=blacs_env, &
2083                                  nrow_global=homo, ncol_global=virtual)
2084         CALL cp_fm_create(mp2_env%ri_grad%L_jb_beta, fm_struct_tmp, name="fm_L_jb")
2085         CALL cp_fm_struct_release(fm_struct_tmp)
2086         CALL cp_fm_set_all(mp2_env%ri_grad%L_jb_beta, 0.0_dp)
2087
2088         ! first Virtual
2089         CALL cp_gemm('T', 'N', homo, virtual, dimen, 2.0_dp*factor, &
2090                      L_mu_q, mo_coeff, 0.0_dp, mp2_env%ri_grad%L_jb_beta, &
2091                      a_first_col=1, &
2092                      a_first_row=1, &
2093                      b_first_col=homo + 1, &
2094                      b_first_row=1, &
2095                      c_first_col=1, &
2096                      c_first_row=1)
2097         ! then occupied
2098         CALL cp_gemm('T', 'N', homo, virtual, dimen, 2.0_dp*factor, &
2099                      mo_coeff, L_mu_q, 1.0_dp, mp2_env%ri_grad%L_jb_beta, &
2100                      a_first_col=1, &
2101                      a_first_row=1, &
2102                      b_first_col=homo + 1, &
2103                      b_first_row=1, &
2104                      c_first_col=1, &
2105                      c_first_row=1)
2106      ENDIF
2107      ! finally release L_mu_q
2108      CALL cp_fm_release(L_mu_q)
2109      CALL timestop(handle2)
2110
2111      ! here we should be done next CPHF
2112
2113      DEALLOCATE (sub_proc_map)
2114      DEALLOCATE (pos_info)
2115
2116      CALL timestop(handle)
2117
2118   END SUBROUTINE create_W_P
2119
2120END MODULE mp2_ri_grad
2121