1!--------------------------------------------------------------------------------------------------!
2!   CP2K: A general program to perform molecular dynamics simulations                              !
3!   Copyright (C) 2000 - 2019  CP2K developers group                                               !
4!--------------------------------------------------------------------------------------------------!
5! **************************************************************************************************
6!> \brief Routines useful for iterative matrix calculations
7!> \par History
8!>       2010.10 created [Joost VandeVondele]
9!> \author Joost VandeVondele
10! **************************************************************************************************
11MODULE iterate_matrix
12   USE arnoldi_api,                     ONLY: arnoldi_data_type,&
13                                              arnoldi_extremal
14   USE bibliography,                    ONLY: Richters2018,&
15                                              cite_reference
16   USE cp_log_handling,                 ONLY: cp_get_default_logger,&
17                                              cp_logger_get_default_unit_nr,&
18                                              cp_logger_type
19   USE dbcsr_api,                       ONLY: &
20        dbcsr_add, dbcsr_add_on_diag, dbcsr_copy, dbcsr_create, dbcsr_desymmetrize, dbcsr_filter, &
21        dbcsr_frobenius_norm, dbcsr_gershgorin_norm, dbcsr_get_diag, dbcsr_get_info, &
22        dbcsr_get_matrix_type, dbcsr_get_occupation, dbcsr_multiply, dbcsr_norm, &
23        dbcsr_norm_maxabsnorm, dbcsr_p_type, dbcsr_release, dbcsr_scale, dbcsr_set, &
24        dbcsr_set_diag, dbcsr_trace, dbcsr_transposed, dbcsr_type, dbcsr_type_no_symmetry
25   USE kinds,                           ONLY: dp,&
26                                              int_8
27   USE machine,                         ONLY: m_flush,&
28                                              m_walltime
29   USE mathconstants,                   ONLY: ifac
30   USE mathlib,                         ONLY: abnormal_value
31#include "./base/base_uses.f90"
32
33   IMPLICIT NONE
34
35   PRIVATE
36
37   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'iterate_matrix'
38
39   INTERFACE purify_mcweeny
40      MODULE PROCEDURE purify_mcweeny_orth, purify_mcweeny_nonorth
41   END INTERFACE
42
43   PUBLIC :: invert_Hotelling, matrix_sign_Newton_Schulz, matrix_sqrt_Newton_Schulz, &
44             matrix_sqrt_proot, matrix_sign_proot, &
45             purify_mcweeny, invert_Taylor, determinant
46
47CONTAINS
48
49! *****************************************************************************
50!> \brief Computes the determinant of a symmetric positive definite matrix
51!>        using the trace of the matrix logarithm via Mercator series:
52!>         det(A) = det(S)det(I+X)det(S), where S=diag(sqrt(Aii),..,sqrt(Ann))
53!>         det(I+X) = Exp(Trace(Ln(I+X)))
54!>         Ln(I+X) = X - X^2/2 + X^3/3 - X^4/4 + ..
55!>        The series converges only if the Frobenius norm of X is less than 1.
56!>        If it is more than one we compute (recursevily) the determinant of
57!>        the square root of (I+X).
58!> \param matrix ...
59!> \param det - determinant
60!> \param threshold ...
61!> \par History
62!>       2015.04 created [Rustam Z Khaliullin]
63!> \author Rustam Z. Khaliullin
64! **************************************************************************************************
65   RECURSIVE SUBROUTINE determinant(matrix, det, threshold)
66
67      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix
68      REAL(KIND=dp), INTENT(INOUT)                       :: det
69      REAL(KIND=dp), INTENT(IN)                          :: threshold
70
71      CHARACTER(LEN=*), PARAMETER :: routineN = 'determinant', routineP = moduleN//':'//routineN
72
73      INTEGER                                            :: handle, i, max_iter_lanczos, nsize, &
74                                                            order_lanczos, sign_iter, unit_nr
75      INTEGER(KIND=int_8)                                :: flop1
76      INTEGER, SAVE                                      :: recursion_depth = 0
77      REAL(KIND=dp)                                      :: det0, eps_lanczos, frobnorm, maxnorm, &
78                                                            occ_matrix, t1, t2, trace
79      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: diagonal
80      TYPE(cp_logger_type), POINTER                      :: logger
81      TYPE(dbcsr_type)                                   :: tmp1, tmp2, tmp3
82
83      CALL timeset(routineN, handle)
84
85      ! get a useful output_unit
86      logger => cp_get_default_logger()
87      IF (logger%para_env%ionode) THEN
88         unit_nr = cp_logger_get_default_unit_nr(logger, local=.TRUE.)
89      ELSE
90         unit_nr = -1
91      ENDIF
92
93      ! Note: tmp1 and tmp2 have the same matrix type as the
94      ! initial matrix (tmp3 does not have symmetry constraints)
95      ! this might lead to uninteded results with anti-symmetric
96      ! matrices
97      CALL dbcsr_create(tmp1, template=matrix, &
98                        matrix_type=dbcsr_type_no_symmetry)
99      CALL dbcsr_create(tmp2, template=matrix, &
100                        matrix_type=dbcsr_type_no_symmetry)
101      CALL dbcsr_create(tmp3, template=matrix, &
102                        matrix_type=dbcsr_type_no_symmetry)
103
104      ! compute the product of the diagonal elements
105      CALL dbcsr_get_info(matrix, nfullrows_total=nsize)
106      ALLOCATE (diagonal(nsize))
107      CALL dbcsr_get_diag(matrix, diagonal)
108      det = PRODUCT(diagonal)
109
110      ! create diagonal SQRTI matrix
111      diagonal(:) = 1.0_dp/(SQRT(diagonal(:)))
112      !ROLL CALL dbcsr_copy(tmp1,matrix)
113      CALL dbcsr_desymmetrize(matrix, tmp1)
114      CALL dbcsr_set(tmp1, 0.0_dp)
115      CALL dbcsr_set_diag(tmp1, diagonal)
116      CALL dbcsr_filter(tmp1, threshold)
117      DEALLOCATE (diagonal)
118
119      ! normalize the main diagonal, off-diagonal elements are scaled to
120      ! make the norm of the matrix less than 1
121      CALL dbcsr_multiply("N", "N", 1.0_dp, &
122                          matrix, &
123                          tmp1, &
124                          0.0_dp, tmp3, &
125                          filter_eps=threshold)
126      CALL dbcsr_multiply("N", "N", 1.0_dp, &
127                          tmp1, &
128                          tmp3, &
129                          0.0_dp, tmp2, &
130                          filter_eps=threshold)
131
132      ! subtract the main diagonal to create matrix X
133      CALL dbcsr_add_on_diag(tmp2, -1.0_dp)
134      frobnorm = dbcsr_frobenius_norm(tmp2)
135      IF (unit_nr > 0) THEN
136         IF (recursion_depth .EQ. 0) THEN
137            WRITE (unit_nr, '()')
138         ELSE
139            WRITE (unit_nr, '(T6,A28,1X,I15)') &
140               "Recursive iteration:", recursion_depth
141         ENDIF
142         WRITE (unit_nr, '(T6,A28,1X,F15.10)') &
143            "Frobenius norm:", frobnorm
144         CALL m_flush(unit_nr)
145      ENDIF
146
147      IF (frobnorm .GE. 1.0_dp) THEN
148
149         CALL dbcsr_add_on_diag(tmp2, 1.0_dp)
150         ! these controls should be provided as input
151         order_lanczos = 3
152         eps_lanczos = 1.0E-4_dp
153         max_iter_lanczos = 40
154         CALL matrix_sqrt_Newton_Schulz( &
155            tmp3, & ! output sqrt
156            tmp1, & ! output sqrti
157            tmp2, & ! input original
158            threshold=threshold, &
159            order=order_lanczos, &
160            eps_lanczos=eps_lanczos, &
161            max_iter_lanczos=max_iter_lanczos)
162         recursion_depth = recursion_depth + 1
163         CALL determinant(tmp3, det0, threshold)
164         recursion_depth = recursion_depth - 1
165         det = det*det0*det0
166
167      ELSE
168
169         ! create accumulator
170         CALL dbcsr_copy(tmp1, tmp2)
171         ! re-create to make use of symmetry
172         !ROLL CALL dbcsr_create(tmp3,template=matrix)
173
174         IF (unit_nr > 0) WRITE (unit_nr, *)
175
176         ! initialize the sign of the term
177         sign_iter = -1
178         DO i = 1, 100
179
180            t1 = m_walltime()
181
182            ! multiply X^i by X
183            ! note that the first iteration evaluates X^2
184            ! because the trace of X^1 is zero by construction
185            CALL dbcsr_multiply("N", "N", 1.0_dp, tmp1, tmp2, &
186                                0.0_dp, tmp3, &
187                                filter_eps=threshold, &
188                                flop=flop1)
189            CALL dbcsr_copy(tmp1, tmp3)
190
191            ! get trace
192            CALL dbcsr_trace(tmp1, trace)
193            trace = trace*sign_iter/(1.0_dp*(i + 1))
194            sign_iter = -sign_iter
195
196            ! update the determinant
197            det = det*EXP(trace)
198
199            occ_matrix = dbcsr_get_occupation(tmp1)
200            CALL dbcsr_norm(tmp1, &
201                            dbcsr_norm_maxabsnorm, norm_scalar=maxnorm)
202
203            t2 = m_walltime()
204
205            IF (unit_nr > 0) THEN
206               WRITE (unit_nr, '(T6,A,1X,I3,1X,F7.5,F16.10,F10.3,F11.3)') &
207                  "Determinant iter", i, occ_matrix, &
208                  det, t2 - t1, &
209                  flop1/(1.0E6_dp*MAX(0.001_dp, t2 - t1))
210               CALL m_flush(unit_nr)
211            ENDIF
212
213            ! exit if the trace is close to zero
214            IF (maxnorm < threshold) EXIT
215
216         ENDDO ! end iterations
217
218         IF (unit_nr > 0) THEN
219            WRITE (unit_nr, '()')
220            CALL m_flush(unit_nr)
221         ENDIF
222
223      ENDIF ! decide to do sqrt or not
224
225      IF (unit_nr > 0) THEN
226         IF (recursion_depth .EQ. 0) THEN
227            WRITE (unit_nr, '(T6,A28,1X,F15.10)') &
228               "Final determinant:", det
229            WRITE (unit_nr, '()')
230         ELSE
231            WRITE (unit_nr, '(T6,A28,1X,F15.10)') &
232               "Recursive determinant:", det
233         ENDIF
234         CALL m_flush(unit_nr)
235      ENDIF
236
237      CALL dbcsr_release(tmp1)
238      CALL dbcsr_release(tmp2)
239      CALL dbcsr_release(tmp3)
240
241      CALL timestop(handle)
242
243   END SUBROUTINE determinant
244
245! **************************************************************************************************
246!> \brief invert a symmetric positive definite diagonally dominant matrix
247!> \param matrix_inverse ...
248!> \param matrix ...
249!> \param threshold convergence threshold nased on the max abs
250!> \param use_inv_as_guess logical whether input can be used as guess for inverse
251!> \param norm_convergence convergence threshold for the 2-norm, useful for approximate solutions
252!> \param filter_eps filter_eps for matrix multiplications, if not passed nothing is filteres
253!> \param accelerator_order ...
254!> \param max_iter_lanczos ...
255!> \param eps_lanczos ...
256!> \param silent ...
257!> \par History
258!>       2010.10 created [Joost VandeVondele]
259!>       2011.10 guess option added [Rustam Z Khaliullin]
260!> \author Joost VandeVondele
261! **************************************************************************************************
262   SUBROUTINE invert_Taylor(matrix_inverse, matrix, threshold, use_inv_as_guess, &
263                            norm_convergence, filter_eps, accelerator_order, &
264                            max_iter_lanczos, eps_lanczos, silent)
265
266      TYPE(dbcsr_type), INTENT(INOUT), TARGET            :: matrix_inverse, matrix
267      REAL(KIND=dp), INTENT(IN)                          :: threshold
268      LOGICAL, INTENT(IN), OPTIONAL                      :: use_inv_as_guess
269      REAL(KIND=dp), INTENT(IN), OPTIONAL                :: norm_convergence, filter_eps
270      INTEGER, INTENT(IN), OPTIONAL                      :: accelerator_order, max_iter_lanczos
271      REAL(KIND=dp), INTENT(IN), OPTIONAL                :: eps_lanczos
272      LOGICAL, INTENT(IN), OPTIONAL                      :: silent
273
274      CHARACTER(LEN=*), PARAMETER :: routineN = 'invert_Taylor', routineP = moduleN//':'//routineN
275
276      INTEGER                                            :: accelerator_type, handle, i, &
277                                                            my_max_iter_lanczos, nrows, unit_nr
278      INTEGER(KIND=int_8)                                :: flop2
279      LOGICAL                                            :: converged, use_inv_guess
280      REAL(KIND=dp)                                      :: coeff, convergence, maxnorm_matrix, &
281                                                            my_eps_lanczos, occ_matrix, t1, t2
282      REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: p_diagonal
283      TYPE(cp_logger_type), POINTER                      :: logger
284      TYPE(dbcsr_type), TARGET                           :: tmp1, tmp2, tmp3_sym
285
286      CALL timeset(routineN, handle)
287
288      logger => cp_get_default_logger()
289      IF (logger%para_env%ionode) THEN
290         unit_nr = cp_logger_get_default_unit_nr(logger, local=.TRUE.)
291      ELSE
292         unit_nr = -1
293      ENDIF
294      IF (PRESENT(silent)) THEN
295         IF (silent) unit_nr = -1
296      END IF
297
298      convergence = threshold
299      IF (PRESENT(norm_convergence)) convergence = norm_convergence
300
301      accelerator_type = 0
302      IF (PRESENT(accelerator_order)) accelerator_type = accelerator_order
303      IF (accelerator_type .GT. 1) accelerator_type = 1
304
305      use_inv_guess = .FALSE.
306      IF (PRESENT(use_inv_as_guess)) use_inv_guess = use_inv_as_guess
307
308      my_max_iter_lanczos = 64
309      my_eps_lanczos = 1.0E-3_dp
310      IF (PRESENT(max_iter_lanczos)) my_max_iter_lanczos = max_iter_lanczos
311      IF (PRESENT(eps_lanczos)) my_eps_lanczos = eps_lanczos
312
313      CALL dbcsr_create(tmp1, template=matrix_inverse, matrix_type=dbcsr_type_no_symmetry)
314      CALL dbcsr_create(tmp2, template=matrix_inverse, matrix_type=dbcsr_type_no_symmetry)
315      CALL dbcsr_create(tmp3_sym, template=matrix_inverse)
316
317      CALL dbcsr_get_info(matrix, nfullrows_total=nrows)
318      ALLOCATE (p_diagonal(nrows))
319
320      ! generate the initial guess
321      IF (.NOT. use_inv_guess) THEN
322
323         SELECT CASE (accelerator_type)
324         CASE (0)
325            ! use tmp1 to hold off-diagonal elements
326            CALL dbcsr_desymmetrize(matrix, tmp1)
327            p_diagonal(:) = 0.0_dp
328            CALL dbcsr_set_diag(tmp1, p_diagonal)
329            !CALL dbcsr_print(tmp1)
330            ! invert the main diagonal
331            CALL dbcsr_get_diag(matrix, p_diagonal)
332            DO i = 1, nrows
333               IF (p_diagonal(i) .NE. 0.0_dp) THEN
334                  p_diagonal(i) = 1.0_dp/p_diagonal(i)
335               ENDIF
336            ENDDO
337            CALL dbcsr_set(matrix_inverse, 0.0_dp)
338            CALL dbcsr_add_on_diag(matrix_inverse, 1.0_dp)
339            CALL dbcsr_set_diag(matrix_inverse, p_diagonal)
340         CASE DEFAULT
341            CPABORT("Illegal accelerator order")
342         END SELECT
343
344      ELSE
345
346         CPABORT("Guess is NYI")
347
348      ENDIF
349
350      CALL dbcsr_multiply("N", "N", 1.0_dp, tmp1, matrix_inverse, &
351                          0.0_dp, tmp2, filter_eps=filter_eps)
352
353      IF (unit_nr > 0) WRITE (unit_nr, *)
354
355      ! scale the approximate inverse to be within the convergence radius
356      t1 = m_walltime()
357
358      ! done with the initial guess, start iterations
359      converged = .FALSE.
360      CALL dbcsr_desymmetrize(matrix_inverse, tmp1)
361      coeff = 1.0_dp
362      DO i = 1, 100
363
364         ! coeff = +/- 1
365         coeff = -1.0_dp*coeff
366         CALL dbcsr_multiply("N", "N", 1.0_dp, tmp1, tmp2, 0.0_dp, &
367                             tmp3_sym, &
368                             flop=flop2, filter_eps=filter_eps)
369         !flop=flop2)
370         CALL dbcsr_add(matrix_inverse, tmp3_sym, 1.0_dp, coeff)
371         CALL dbcsr_release(tmp1)
372         CALL dbcsr_create(tmp1, template=matrix_inverse, matrix_type=dbcsr_type_no_symmetry)
373         CALL dbcsr_desymmetrize(tmp3_sym, tmp1)
374
375         ! for the convergence check
376         CALL dbcsr_norm(tmp3_sym, &
377                         dbcsr_norm_maxabsnorm, norm_scalar=maxnorm_matrix)
378
379         t2 = m_walltime()
380         occ_matrix = dbcsr_get_occupation(matrix_inverse)
381
382         IF (unit_nr > 0) THEN
383            WRITE (unit_nr, '(T6,A,1X,I3,1X,F10.8,E12.3,F12.3,F13.3)') "Taylor iter", i, occ_matrix, &
384               maxnorm_matrix, t2 - t1, &
385               flop2/(1.0E6_dp*MAX(0.001_dp, t2 - t1))
386            CALL m_flush(unit_nr)
387         ENDIF
388
389         IF (maxnorm_matrix < convergence) THEN
390            converged = .TRUE.
391            EXIT
392         ENDIF
393
394         t1 = m_walltime()
395
396      ENDDO
397
398      !last convergence check
399      CALL dbcsr_multiply("N", "N", 1.0_dp, matrix, matrix_inverse, 0.0_dp, tmp1, &
400                          filter_eps=filter_eps)
401      CALL dbcsr_add_on_diag(tmp1, -1.0_dp)
402      !frob_matrix =  dbcsr_frobenius_norm(tmp1)
403      CALL dbcsr_norm(tmp1, dbcsr_norm_maxabsnorm, norm_scalar=maxnorm_matrix)
404      IF (unit_nr > 0) THEN
405         WRITE (unit_nr, '(T6,A,E12.5)') "Final Taylor error", maxnorm_matrix
406         WRITE (unit_nr, '()')
407         CALL m_flush(unit_nr)
408      ENDIF
409      IF (maxnorm_matrix > convergence) THEN
410         converged = .FALSE.
411         IF (unit_nr > 0) THEN
412            WRITE (unit_nr, *) 'Final convergence check failed'
413         ENDIF
414      ENDIF
415
416      IF (.NOT. converged) THEN
417         CPABORT("Taylor inversion did not converge")
418      ENDIF
419
420      CALL dbcsr_release(tmp1)
421      CALL dbcsr_release(tmp2)
422      CALL dbcsr_release(tmp3_sym)
423
424      DEALLOCATE (p_diagonal)
425
426      CALL timestop(handle)
427
428   END SUBROUTINE invert_Taylor
429
430! **************************************************************************************************
431!> \brief invert a symmetric positive definite matrix by Hotelling's method
432!>        explicit symmetrization makes this code not suitable for other matrix types
433!>        Currently a bit messy with the options, to to be cleaned soon
434!> \param matrix_inverse ...
435!> \param matrix ...
436!> \param threshold convergence threshold nased on the max abs
437!> \param use_inv_as_guess logical whether input can be used as guess for inverse
438!> \param norm_convergence convergence threshold for the 2-norm, useful for approximate solutions
439!> \param filter_eps filter_eps for matrix multiplications, if not passed nothing is filteres
440!> \param accelerator_order ...
441!> \param max_iter_lanczos ...
442!> \param eps_lanczos ...
443!> \param silent ...
444!> \par History
445!>       2010.10 created [Joost VandeVondele]
446!>       2011.10 guess option added [Rustam Z Khaliullin]
447!> \author Joost VandeVondele
448! **************************************************************************************************
449   SUBROUTINE invert_Hotelling(matrix_inverse, matrix, threshold, use_inv_as_guess, &
450                               norm_convergence, filter_eps, accelerator_order, &
451                               max_iter_lanczos, eps_lanczos, silent)
452
453      TYPE(dbcsr_type), INTENT(INOUT), TARGET            :: matrix_inverse, matrix
454      REAL(KIND=dp), INTENT(IN)                          :: threshold
455      LOGICAL, INTENT(IN), OPTIONAL                      :: use_inv_as_guess
456      REAL(KIND=dp), INTENT(IN), OPTIONAL                :: norm_convergence, filter_eps
457      INTEGER, INTENT(IN), OPTIONAL                      :: accelerator_order, max_iter_lanczos
458      REAL(KIND=dp), INTENT(IN), OPTIONAL                :: eps_lanczos
459      LOGICAL, INTENT(IN), OPTIONAL                      :: silent
460
461      CHARACTER(LEN=*), PARAMETER :: routineN = 'invert_Hotelling', &
462         routineP = moduleN//':'//routineN
463
464      INTEGER                                            :: accelerator_type, handle, i, &
465                                                            my_max_iter_lanczos, unit_nr
466      INTEGER(KIND=int_8)                                :: flop1, flop2
467      LOGICAL                                            :: arnoldi_converged, converged, &
468                                                            use_inv_guess
469      REAL(KIND=dp) :: convergence, frob_matrix, gershgorin_norm, max_ev, maxnorm_matrix, min_ev, &
470         my_eps_lanczos, my_filter_eps, occ_matrix, scalingf, t1, t2
471      TYPE(cp_logger_type), POINTER                      :: logger
472      TYPE(dbcsr_type), TARGET                           :: tmp1, tmp2
473
474      !TYPE(arnoldi_data_type)                            :: my_arnoldi
475      !TYPE(dbcsr_p_type), DIMENSION(1)                   :: mymat
476
477      CALL timeset(routineN, handle)
478
479      logger => cp_get_default_logger()
480      IF (logger%para_env%ionode) THEN
481         unit_nr = cp_logger_get_default_unit_nr(logger, local=.TRUE.)
482      ELSE
483         unit_nr = -1
484      ENDIF
485      IF (PRESENT(silent)) THEN
486         IF (silent) unit_nr = -1
487      END IF
488
489      convergence = threshold
490      IF (PRESENT(norm_convergence)) convergence = norm_convergence
491
492      accelerator_type = 1
493      IF (PRESENT(accelerator_order)) accelerator_type = accelerator_order
494      IF (accelerator_type .GT. 1) accelerator_type = 1
495
496      use_inv_guess = .FALSE.
497      IF (PRESENT(use_inv_as_guess)) use_inv_guess = use_inv_as_guess
498
499      my_max_iter_lanczos = 64
500      my_eps_lanczos = 1.0E-3_dp
501      IF (PRESENT(max_iter_lanczos)) my_max_iter_lanczos = max_iter_lanczos
502      IF (PRESENT(eps_lanczos)) my_eps_lanczos = eps_lanczos
503
504      my_filter_eps = threshold
505      IF (PRESENT(filter_eps)) my_filter_eps = filter_eps
506
507      ! generate the initial guess
508      IF (.NOT. use_inv_guess) THEN
509
510         SELECT CASE (accelerator_type)
511         CASE (0)
512            gershgorin_norm = dbcsr_gershgorin_norm(matrix)
513            frob_matrix = dbcsr_frobenius_norm(matrix)
514            CALL dbcsr_set(matrix_inverse, 0.0_dp)
515            CALL dbcsr_add_on_diag(matrix_inverse, 1/MIN(gershgorin_norm, frob_matrix))
516         CASE (1)
517            ! initialize matrix to unity and use arnoldi (below) to scale it into the convergence range
518            CALL dbcsr_set(matrix_inverse, 0.0_dp)
519            CALL dbcsr_add_on_diag(matrix_inverse, 1.0_dp)
520         CASE DEFAULT
521            CPABORT("Illegal accelerator order")
522         END SELECT
523
524         ! everything commutes, therefore our all products will be symmetric
525         CALL dbcsr_create(tmp1, template=matrix_inverse)
526
527      ELSE
528
529         ! It is unlikely that our guess will commute with the matrix, therefore the first product will
530         ! be non symmetric
531         CALL dbcsr_create(tmp1, template=matrix_inverse, matrix_type=dbcsr_type_no_symmetry)
532
533      ENDIF
534
535      CALL dbcsr_create(tmp2, template=matrix_inverse)
536
537      IF (unit_nr > 0) WRITE (unit_nr, *)
538
539      ! scale the approximate inverse to be within the convergence radius
540      t1 = m_walltime()
541
542      CALL dbcsr_multiply("N", "N", 1.0_dp, matrix_inverse, matrix, &
543                          0.0_dp, tmp1, flop=flop1, filter_eps=my_filter_eps)
544
545      IF (accelerator_type == 1) THEN
546
547         ! scale the matrix to get into the convergence range
548         CALL arnoldi_extremal(tmp1, max_eV, min_eV, threshold=my_eps_lanczos, &
549                               max_iter=my_max_iter_lanczos, converged=arnoldi_converged)
550         !mymat(1)%matrix => tmp1
551         !CALL setup_arnoldi_data(my_arnoldi, mymat, max_iter=30, threshold=1.0E-3_dp, selection_crit=1, &
552         !                        nval_request=2, nrestarts=2, generalized_ev=.FALSE., iram=.TRUE.)
553         !CALL arnoldi_ev(mymat, my_arnoldi)
554         !max_eV = REAL(get_selected_ritz_val(my_arnoldi, 2), dp)
555         !min_eV = REAL(get_selected_ritz_val(my_arnoldi, 1), dp)
556         !CALL deallocate_arnoldi_data(my_arnoldi)
557
558         IF (unit_nr > 0) THEN
559            WRITE (unit_nr, *)
560            WRITE (unit_nr, '(T6,A,1X,L1,A,E12.3)') "Lanczos converged: ", arnoldi_converged, " threshold:", my_eps_lanczos
561            WRITE (unit_nr, '(T6,A,1X,E12.3,E12.3)') "Est. extremal eigenvalues:", max_eV, min_eV
562            WRITE (unit_nr, '(T6,A,1X,E12.3)') "Est. condition number :", max_eV/MAX(min_eV, EPSILON(min_eV))
563         ENDIF
564
565         ! 2.0 would be the correct scaling however, we should make sure here, that we are in the convergence radius
566         scalingf = 1.9_dp/(max_eV + min_eV)
567         CALL dbcsr_scale(tmp1, scalingf)
568         CALL dbcsr_scale(matrix_inverse, scalingf)
569         min_ev = min_ev*scalingf
570
571      ENDIF
572
573      ! done with the initial guess, start iterations
574      converged = .FALSE.
575      DO i = 1, 100
576
577         ! tmp1 = S^-1 S
578
579         ! for the convergence check
580         CALL dbcsr_add_on_diag(tmp1, -1.0_dp)
581         CALL dbcsr_norm(tmp1, &
582                         dbcsr_norm_maxabsnorm, norm_scalar=maxnorm_matrix)
583         CALL dbcsr_add_on_diag(tmp1, +1.0_dp)
584
585         ! tmp2 = S^-1 S S^-1
586         CALL dbcsr_multiply("N", "N", 1.0_dp, tmp1, matrix_inverse, 0.0_dp, tmp2, &
587                             flop=flop2, filter_eps=my_filter_eps)
588         ! S^-1_{n+1} = 2 S^-1 - S^-1 S S^-1
589         CALL dbcsr_add(matrix_inverse, tmp2, 2.0_dp, -1.0_dp)
590
591         CALL dbcsr_filter(matrix_inverse, my_filter_eps)
592         t2 = m_walltime()
593         occ_matrix = dbcsr_get_occupation(matrix_inverse)
594
595         ! use the scalar form of the algorithm to trace the EV
596         IF (accelerator_type == 1) THEN
597            min_ev = min_ev*(2.0_dp - min_ev)
598            IF (PRESENT(norm_convergence)) maxnorm_matrix = ABS(min_eV - 1.0_dp)
599         ENDIF
600
601         IF (unit_nr > 0) THEN
602            WRITE (unit_nr, '(T6,A,1X,I3,1X,F10.8,E12.3,F12.3,F13.3)') "Hotelling iter", i, occ_matrix, &
603               maxnorm_matrix, t2 - t1, &
604               (flop1 + flop2)/(1.0E6_dp*MAX(0.001_dp, t2 - t1))
605            CALL m_flush(unit_nr)
606         ENDIF
607
608         IF (maxnorm_matrix < convergence) THEN
609            converged = .TRUE.
610            EXIT
611         ENDIF
612
613         ! scale the matrix for improved convergence
614         IF (accelerator_type == 1) THEN
615            min_ev = min_ev*2.0_dp/(min_ev + 1.0_dp)
616            CALL dbcsr_scale(matrix_inverse, 2.0_dp/(min_ev + 1.0_dp))
617         ENDIF
618
619         t1 = m_walltime()
620         CALL dbcsr_multiply("N", "N", 1.0_dp, matrix_inverse, matrix, &
621                             0.0_dp, tmp1, flop=flop1, filter_eps=my_filter_eps)
622
623      ENDDO
624
625      IF (.NOT. converged) THEN
626         CPABORT("Hotelling inversion did not converge")
627      ENDIF
628
629      ! try to symmetrize the output matrix
630      IF (dbcsr_get_matrix_type(matrix_inverse) == dbcsr_type_no_symmetry) THEN
631         CALL dbcsr_transposed(tmp2, matrix_inverse)
632         CALL dbcsr_add(matrix_inverse, tmp2, 0.5_dp, 0.5_dp)
633      END IF
634
635      IF (unit_nr > 0) THEN
636!           WRITE(unit_nr,'(T6,A,1X,I3,1X,F10.8,E12.3)') "Final Hotelling ",i,occ_matrix,&
637!              !frob_matrix/frob_matrix_base
638!              maxnorm_matrix
639         WRITE (unit_nr, '()')
640         CALL m_flush(unit_nr)
641      ENDIF
642
643      CALL dbcsr_release(tmp1)
644      CALL dbcsr_release(tmp2)
645
646      CALL timestop(handle)
647
648   END SUBROUTINE invert_Hotelling
649
650! **************************************************************************************************
651!> \brief compute the sign a matrix using Newton-Schulz iterations
652!> \param matrix_sign ...
653!> \param matrix ...
654!> \param threshold ...
655!> \param sign_order ...
656!> \par History
657!>       2010.10 created [Joost VandeVondele]
658!>       2019.05 extended to order byxond 2 [Robert Schade]
659!> \author Joost VandeVondele, Robert Schade
660! **************************************************************************************************
661   SUBROUTINE matrix_sign_Newton_Schulz(matrix_sign, matrix, threshold, sign_order)
662
663      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix_sign, matrix
664      REAL(KIND=dp), INTENT(IN)                          :: threshold
665      INTEGER, INTENT(IN), OPTIONAL                      :: sign_order
666
667      CHARACTER(LEN=*), PARAMETER :: routineN = 'matrix_sign_Newton_Schulz', &
668         routineP = moduleN//':'//routineN
669
670      INTEGER                                            :: count, handle, i, order, unit_nr
671      INTEGER(KIND=int_8)                                :: flops
672      REAL(KIND=dp)                                      :: a0, a1, a2, a3, a4, a5, floptot, &
673                                                            frob_matrix, frob_matrix_base, &
674                                                            gersh_matrix, occ_matrix, prefactor, &
675                                                            t1, t2
676      TYPE(cp_logger_type), POINTER                      :: logger
677      TYPE(dbcsr_type)                                   :: tmp1, tmp2, tmp3, tmp4
678
679      CALL timeset(routineN, handle)
680
681      logger => cp_get_default_logger()
682      IF (logger%para_env%ionode) THEN
683         unit_nr = cp_logger_get_default_unit_nr(logger, local=.TRUE.)
684      ELSE
685         unit_nr = -1
686      ENDIF
687
688      IF (PRESENT(sign_order)) THEN
689         order = sign_order
690      ELSE
691         order = 2
692      ENDIF
693
694      CALL dbcsr_create(tmp1, template=matrix_sign)
695
696      CALL dbcsr_create(tmp2, template=matrix_sign)
697      IF (order .GE. 4) THEN
698         CALL dbcsr_create(tmp3, template=matrix_sign)
699      ENDIF
700      IF (order .GE. 7) THEN
701         CALL dbcsr_create(tmp4, template=matrix_sign)
702      ENDIF
703
704      CALL dbcsr_copy(matrix_sign, matrix)
705      CALL dbcsr_filter(matrix_sign, threshold)
706
707      ! scale the matrix to get into the convergence range
708      frob_matrix = dbcsr_frobenius_norm(matrix_sign)
709      gersh_matrix = dbcsr_gershgorin_norm(matrix_sign)
710      CALL dbcsr_scale(matrix_sign, 1/MIN(frob_matrix, gersh_matrix))
711
712      IF (unit_nr > 0) WRITE (unit_nr, *)
713
714      count = 0
715      DO i = 1, 100
716         floptot = 0_dp
717         t1 = m_walltime()
718         ! tmp1 = X * X
719         CALL dbcsr_multiply("N", "N", -1.0_dp, matrix_sign, matrix_sign, 0.0_dp, tmp1, &
720                             filter_eps=threshold, flop=flops)
721         floptot = floptot + flops
722
723         ! check convergence (frob norm of what should be the identity matrix minus identity matrix)
724         frob_matrix_base = dbcsr_frobenius_norm(tmp1)
725         CALL dbcsr_add_on_diag(tmp1, +1.0_dp)
726         frob_matrix = dbcsr_frobenius_norm(tmp1)
727
728         ! f(y) approx 1/sqrt(1-y)
729         ! f(y)=1+y/2+3/8*y^2+5/16*y^3+35/128*y^4+63/256*y^5+231/1024*y^6
730         ! f2(y)=1+y/2=1/2*(2+y)
731         ! f3(y)=1+y/2+3/8*y^2=3/8*(8/3+4/3*y+y^2)
732         ! f4(y)=1+y/2+3/8*y^2+5/16*y^3=5/16*(16/5+8/5*y+6/5*y^2+y^3)
733         ! f5(y)=1+y/2+3/8*y^2+5/16*y^3+35/128*y^4=35/128*(128/35+128/70*y+48/35*y^2+8/7*y^3+y^4)
734         !      z(y)=(y+a_0)*y+a_1
735         ! f5(y)=35/128*((z(y)+y+a_2)*z(y)+a_3)
736         !      =35/128*((a_1^2+a_1a_2+a_3)+(2*a_0a_1+a_1+a_0a_2)y+(a_0^2+a_0+2a_1+a_2)y^2+(2a_0+1)y^3+y^4)
737         !    a_0=1/14
738         !    a_1=23819/13720
739         !    a_2=1269/980-2a_1=-3734/1715
740         !    a_3=832591127/188238400
741         ! f6(y)=1+y/2+3/8*y^2+5/16*y^3+35/128*y^4+63/256*y^5
742         !      =63/256*(256/63 + (128 y)/63 + (32 y^2)/21 + (80 y^3)/63 + (10 y^4)/9 + y^5)
743         ! f7(y)=1+y/2+3/8*y^2+5/16*y^3+35/128*y^4+63/256*y^5+231/1024*y^6
744         !      =231/1024*(1024/231+512/231*y+128/77*y^2+320/231*y^3+40/33*y^4+12/11*y^5+y^6)
745         ! z(y)=(y+a_0)*y+a_1
746         ! w(y)=(y+a_2)*z(y)+a_3
747         ! f7(y)=(w(y)+z(y)+a_4)*w(y)+a_5
748         ! a_0= 1.3686502058092053653287666647611728507211996691324048468010382350359929055186612505791532871573242422
749         ! a_1= 1.7089671854477436685850554669524985556296280184497503489303331821456795715195510972774979091893741568
750         ! a_2=-1.3231956603546599107833121193066273961757451236778593922555836895814474509732067051246078326118696968
751         ! a_3= 3.9876642330847931291749479958277754186675336169578593000744380254770411483327581042259415937710270453
752         ! a_4=-3.7273299006476825027065704937541279833880400042556351139273912137942678919776364526511485025132991667
753         ! a_5= 4.9369932474103023792021351907971943220607580694533770325967170245194362399287150565595441897740173578
754         !
755         ! y=1-X*X
756
757         ! tmp1 = I-x*x
758         IF (order .EQ. 2) THEN
759            prefactor = 0.5_dp
760
761            ! update the above to 3*I-X*X
762            CALL dbcsr_add_on_diag(tmp1, +2.0_dp)
763            occ_matrix = dbcsr_get_occupation(matrix_sign)
764         ELSE IF (order .EQ. 3) THEN
765            ! with one multiplication
766            ! tmp1=y
767            CALL dbcsr_copy(tmp2, tmp1)
768            CALL dbcsr_scale(tmp1, 4.0_dp/3.0_dp)
769            CALL dbcsr_add_on_diag(tmp1, 8.0_dp/3.0_dp)
770
771            ! tmp2=y^2
772            CALL dbcsr_multiply("N", "N", 1.0_dp, tmp2, tmp2, 1.0_dp, tmp1, &
773                                filter_eps=threshold, flop=flops)
774            floptot = floptot + flops
775            prefactor = 3.0_dp/8.0_dp
776
777         ELSE IF (order .EQ. 4) THEN
778            ! with two multiplications
779            ! tmp1=y
780            CALL dbcsr_copy(tmp3, tmp1)
781            CALL dbcsr_scale(tmp1, 8.0_dp/5.0_dp)
782            CALL dbcsr_add_on_diag(tmp1, 16.0_dp/5.0_dp)
783
784            !
785            CALL dbcsr_multiply("N", "N", 1.0_dp, tmp3, tmp3, 0.0_dp, tmp2, &
786                                filter_eps=threshold, flop=flops)
787            floptot = floptot + flops
788
789            CALL dbcsr_add(tmp1, tmp2, 1.0_dp, 6.0_dp/5.0_dp)
790
791            CALL dbcsr_multiply("N", "N", 1.0_dp, tmp2, tmp3, 1.0_dp, tmp1, &
792                                filter_eps=threshold, flop=flops)
793            floptot = floptot + flops
794
795            prefactor = 5.0_dp/16.0_dp
796         ELSE IF (order .EQ. -5) THEN
797            ! with three multiplications
798            ! tmp1=y
799            CALL dbcsr_copy(tmp3, tmp1)
800            CALL dbcsr_scale(tmp1, 128.0_dp/70.0_dp)
801            CALL dbcsr_add_on_diag(tmp1, 128.0_dp/35.0_dp)
802
803            !
804            CALL dbcsr_multiply("N", "N", 1.0_dp, tmp3, tmp3, 0.0_dp, tmp2, &
805                                filter_eps=threshold, flop=flops)
806            floptot = floptot + flops
807
808            CALL dbcsr_add(tmp1, tmp2, 1.0_dp, 48.0_dp/35.0_dp)
809
810            CALL dbcsr_multiply("N", "N", 1.0_dp, tmp2, tmp3, 0.0_dp, tmp2, &
811                                filter_eps=threshold, flop=flops)
812            floptot = floptot + flops
813
814            CALL dbcsr_add(tmp1, tmp2, 1.0_dp, 8.0_dp/7.0_dp)
815
816            CALL dbcsr_multiply("N", "N", 1.0_dp, tmp2, tmp3, 1.0_dp, tmp1, &
817                                filter_eps=threshold, flop=flops)
818            floptot = floptot + flops
819
820            prefactor = 35.0_dp/128.0_dp
821         ELSE IF (order .EQ. 5) THEN
822            ! with two multiplications
823            !      z(y)=(y+a_0)*y+a_1
824            ! f5(y)=35/128*((z(y)+y+a_2)*z(y)+a_3)
825            !      =35/128*((a_1^2+a_1a_2+a_3)+(2*a_0a_1+a_1+a_0a_2)y+(a_0^2+a_0+2a_1+a_2)y^2+(2a_0+1)y^3+y^4)
826            !    a_0=1/14
827            !    a_1=23819/13720
828            !    a_2=1269/980-2a_1=-3734/1715
829            !    a_3=832591127/188238400
830            a0 = 1.0_dp/14.0_dp
831            a1 = 23819.0_dp/13720.0_dp
832            a2 = -3734_dp/1715.0_dp
833            a3 = 832591127_dp/188238400.0_dp
834
835            ! tmp1=y
836            ! tmp3=z
837            CALL dbcsr_copy(tmp3, tmp1)
838            CALL dbcsr_add_on_diag(tmp3, a0)
839            CALL dbcsr_multiply("N", "N", 1.0_dp, tmp3, tmp1, 0.0_dp, tmp3, &
840                                filter_eps=threshold, flop=flops)
841            floptot = floptot + flops
842            CALL dbcsr_add_on_diag(tmp3, a1)
843
844            CALL dbcsr_add_on_diag(tmp1, a2)
845            CALL dbcsr_add(tmp1, tmp3, 1.0_dp, 1.0_dp)
846            CALL dbcsr_multiply("N", "N", 1.0_dp, tmp1, tmp3, 0.0_dp, tmp1, &
847                                filter_eps=threshold, flop=flops)
848            floptot = floptot + flops
849            CALL dbcsr_add_on_diag(tmp1, a3)
850
851            prefactor = 35.0_dp/128.0_dp
852         ELSE IF (order .EQ. 6) THEN
853            ! with four multiplications
854            ! f6(y)=63/256*(256/63 + (128 y)/63 + (32 y^2)/21 + (80 y^3)/63 + (10 y^4)/9 + y^5)
855            ! tmp1=y
856            CALL dbcsr_copy(tmp3, tmp1)
857            CALL dbcsr_scale(tmp1, 128.0_dp/63.0_dp)
858            CALL dbcsr_add_on_diag(tmp1, 256.0_dp/63.0_dp)
859
860            !
861            CALL dbcsr_multiply("N", "N", 1.0_dp, tmp3, tmp3, 0.0_dp, tmp2, &
862                                filter_eps=threshold, flop=flops)
863            floptot = floptot + flops
864
865            CALL dbcsr_add(tmp1, tmp2, 1.0_dp, 32.0_dp/21.0_dp)
866
867            CALL dbcsr_multiply("N", "N", 1.0_dp, tmp2, tmp3, 0.0_dp, tmp2, &
868                                filter_eps=threshold, flop=flops)
869            floptot = floptot + flops
870
871            CALL dbcsr_add(tmp1, tmp2, 1.0_dp, 80.0_dp/63.0_dp)
872
873            CALL dbcsr_multiply("N", "N", 1.0_dp, tmp2, tmp3, 0.0_dp, tmp2, &
874                                filter_eps=threshold, flop=flops)
875            floptot = floptot + flops
876
877            CALL dbcsr_add(tmp1, tmp2, 1.0_dp, 10.0_dp/9.0_dp)
878
879            CALL dbcsr_multiply("N", "N", 1.0_dp, tmp2, tmp3, 1.0_dp, tmp1, &
880                                filter_eps=threshold, flop=flops)
881            floptot = floptot + flops
882
883            prefactor = 63.0_dp/256.0_dp
884         ELSE IF (order .EQ. 7) THEN
885            ! with three multiplications
886
887            a0 = 1.3686502058092053653287666647611728507211996691324048468010382350359929055186612505791532871573242422_dp
888            a1 = 1.7089671854477436685850554669524985556296280184497503489303331821456795715195510972774979091893741568_dp
889            a2 = -1.3231956603546599107833121193066273961757451236778593922555836895814474509732067051246078326118696968_dp
890            a3 = 3.9876642330847931291749479958277754186675336169578593000744380254770411483327581042259415937710270453_dp
891            a4 = -3.7273299006476825027065704937541279833880400042556351139273912137942678919776364526511485025132991667_dp
892            a5 = 4.9369932474103023792021351907971943220607580694533770325967170245194362399287150565595441897740173578_dp
893            !      =231/1024*(1024/231+512/231*y+128/77*y^2+320/231*y^3+40/33*y^4+12/11*y^5+y^6)
894            ! z(y)=(y+a_0)*y+a_1
895            ! w(y)=(y+a_2)*z(y)+a_3
896            ! f7(y)=(w(y)+z(y)+a_4)*w(y)+a_5
897
898            ! tmp1=y
899            ! tmp3=z
900            CALL dbcsr_copy(tmp3, tmp1)
901            CALL dbcsr_add_on_diag(tmp3, a0)
902            CALL dbcsr_multiply("N", "N", 1.0_dp, tmp3, tmp1, 0.0_dp, tmp3, &
903                                filter_eps=threshold, flop=flops)
904            floptot = floptot + flops
905            CALL dbcsr_add_on_diag(tmp3, a1)
906
907            ! tmp4=w
908            CALL dbcsr_copy(tmp4, tmp1)
909            CALL dbcsr_add_on_diag(tmp4, a2)
910            CALL dbcsr_multiply("N", "N", 1.0_dp, tmp4, tmp3, 0.0_dp, tmp4, &
911                                filter_eps=threshold, flop=flops)
912            floptot = floptot + flops
913            CALL dbcsr_add_on_diag(tmp4, a3)
914
915            CALL dbcsr_add(tmp3, tmp4, 1.0_dp, 1.0_dp)
916            CALL dbcsr_add_on_diag(tmp3, a4)
917            CALL dbcsr_multiply("N", "N", 1.0_dp, tmp3, tmp4, 0.0_dp, tmp1, &
918                                filter_eps=threshold, flop=flops)
919            floptot = floptot + flops
920            CALL dbcsr_add_on_diag(tmp1, a5)
921
922            prefactor = 231.0_dp/1024.0_dp
923         ELSE
924            CPABORT("requested order is not implemented.")
925         ENDIF
926
927         ! tmp2 = X * prefactor *
928         CALL dbcsr_multiply("N", "N", prefactor, matrix_sign, tmp1, 0.0_dp, tmp2, &
929                             filter_eps=threshold, flop=flops)
930         floptot = floptot + flops
931
932         ! done iterating
933         ! CALL dbcsr_filter(tmp2,threshold)
934         CALL dbcsr_copy(matrix_sign, tmp2)
935         t2 = m_walltime()
936
937         occ_matrix = dbcsr_get_occupation(matrix_sign)
938
939         IF (unit_nr > 0) THEN
940            WRITE (unit_nr, '(T6,A,1X,I3,1X,F10.8,E12.3,F12.3,F13.3)') "NS sign iter ", i, occ_matrix, &
941               frob_matrix/frob_matrix_base, t2 - t1, &
942               floptot/(1.0E6_dp*MAX(0.001_dp, t2 - t1))
943            CALL m_flush(unit_nr)
944         ENDIF
945
946         ! frob_matrix/frob_matrix_base < SQRT(threshold)
947         IF (frob_matrix*frob_matrix < (threshold*frob_matrix_base*frob_matrix_base)) EXIT
948
949      ENDDO
950
951      ! this check is not really needed
952      CALL dbcsr_multiply("N", "N", +1.0_dp, matrix_sign, matrix_sign, 0.0_dp, tmp1, &
953                          filter_eps=threshold)
954      frob_matrix_base = dbcsr_frobenius_norm(tmp1)
955      CALL dbcsr_add_on_diag(tmp1, -1.0_dp)
956      frob_matrix = dbcsr_frobenius_norm(tmp1)
957      occ_matrix = dbcsr_get_occupation(matrix_sign)
958      IF (unit_nr > 0) THEN
959         WRITE (unit_nr, '(T6,A,1X,I3,1X,F10.8,E12.3)') "Final NS sign iter", i, occ_matrix, &
960            frob_matrix/frob_matrix_base
961         WRITE (unit_nr, '()')
962         CALL m_flush(unit_nr)
963      ENDIF
964
965      CALL dbcsr_release(tmp1)
966      CALL dbcsr_release(tmp2)
967      IF (order .GE. 4) THEN
968         CALL dbcsr_release(tmp3)
969      ENDIF
970      IF (order .GE. 7) THEN
971         CALL dbcsr_release(tmp4)
972      ENDIF
973
974      CALL timestop(handle)
975
976   END SUBROUTINE matrix_sign_Newton_Schulz
977
978   ! **************************************************************************************************
979!> \brief compute the sign a matrix using the general algorithm for the p-th root of Richters et al.
980!>                   Commun. Comput. Phys., 25 (2019), pp. 564-585.
981!> \param matrix_sign ...
982!> \param matrix ...
983!> \param threshold ...
984!> \param sign_order ...
985!> \par History
986!>       2019.03 created [Robert Schade]
987!> \author Robert Schade
988! **************************************************************************************************
989   SUBROUTINE matrix_sign_proot(matrix_sign, matrix, threshold, sign_order)
990
991      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix_sign, matrix
992      REAL(KIND=dp), INTENT(IN)                          :: threshold
993      INTEGER, INTENT(IN), OPTIONAL                      :: sign_order
994
995      CHARACTER(LEN=*), PARAMETER :: routineN = 'matrix_sign_proot', &
996         routineP = moduleN//':'//routineN
997
998      INTEGER                                            :: handle, order, unit_nr
999      INTEGER(KIND=int_8)                                :: flop0, flop1, flop2
1000      LOGICAL                                            :: converged, symmetrize
1001      REAL(KIND=dp)                                      :: frob_matrix, frob_matrix_base, occ_matrix
1002      TYPE(cp_logger_type), POINTER                      :: logger
1003      TYPE(dbcsr_type)                                   :: matrix2, matrix_sqrt, matrix_sqrt_inv, &
1004                                                            tmp1, tmp2
1005
1006      CALL timeset(routineN, handle)
1007
1008      logger => cp_get_default_logger()
1009      IF (logger%para_env%ionode) THEN
1010         unit_nr = cp_logger_get_default_unit_nr(logger, local=.TRUE.)
1011      ELSE
1012         unit_nr = -1
1013      ENDIF
1014
1015      IF (PRESENT(sign_order)) THEN
1016         order = sign_order
1017      ELSE
1018         order = 2
1019      ENDIF
1020
1021      CALL dbcsr_create(tmp1, template=matrix_sign)
1022
1023      CALL dbcsr_create(tmp2, template=matrix_sign)
1024
1025      CALL dbcsr_create(matrix2, template=matrix, matrix_type=dbcsr_type_no_symmetry)
1026      CALL dbcsr_multiply("N", "N", 1.0_dp, matrix, matrix, 0.0_dp, matrix2, &
1027                          filter_eps=threshold, flop=flop0)
1028      !CALL dbcsr_filter(matrix2, threshold)
1029
1030      !CALL dbcsr_copy(matrix_sign, matrix)
1031      !CALL dbcsr_filter(matrix_sign, threshold)
1032
1033      IF (unit_nr > 0) WRITE (unit_nr, *)
1034
1035      CALL dbcsr_create(matrix_sqrt, template=matrix2)
1036      CALL dbcsr_create(matrix_sqrt_inv, template=matrix2)
1037      IF (unit_nr > 0) WRITE (unit_nr, *) "Threshold=", threshold
1038
1039      symmetrize = .FALSE.
1040      CALL matrix_sqrt_proot(matrix_sqrt, matrix_sqrt_inv, matrix2, threshold, order, &
1041                             0.01_dp, 100, symmetrize, converged)
1042!      call matrix_sqrt_Newton_Schulz(matrix_sqrt, matrix_sqrt_inv, matrix2, threshold, order, &
1043!                                        0.01_dp,100, symmetrize,converged)
1044
1045      CALL dbcsr_multiply("N", "N", 1.0_dp, matrix, matrix_sqrt_inv, 0.0_dp, matrix_sign, &
1046                          filter_eps=threshold, flop=flop1)
1047
1048      ! this check is not really needed
1049      CALL dbcsr_multiply("N", "N", +1.0_dp, matrix_sign, matrix_sign, 0.0_dp, tmp1, &
1050                          filter_eps=threshold, flop=flop2)
1051      frob_matrix_base = dbcsr_frobenius_norm(tmp1)
1052      CALL dbcsr_add_on_diag(tmp1, -1.0_dp)
1053      frob_matrix = dbcsr_frobenius_norm(tmp1)
1054      occ_matrix = dbcsr_get_occupation(matrix_sign)
1055      IF (unit_nr > 0) THEN
1056         WRITE (unit_nr, '(T6,A,F10.8,E12.3)') "Final proot sign iter", occ_matrix, &
1057            frob_matrix/frob_matrix_base
1058         WRITE (unit_nr, '()')
1059         CALL m_flush(unit_nr)
1060      ENDIF
1061
1062      CALL dbcsr_release(tmp1)
1063      CALL dbcsr_release(tmp2)
1064      CALL dbcsr_release(matrix2)
1065      CALL dbcsr_release(matrix_sqrt)
1066      CALL dbcsr_release(matrix_sqrt_inv)
1067
1068      CALL timestop(handle)
1069
1070   END SUBROUTINE matrix_sign_proot
1071
1072! **************************************************************************************************
1073!> \brief compute the sqrt of a matrix via the sign function and the corresponding Newton-Schulz iterations
1074!>        the order of the algorithm should be 2..5, 3 or 5 is recommended
1075!> \param matrix_sqrt ...
1076!> \param matrix_sqrt_inv ...
1077!> \param matrix ...
1078!> \param threshold ...
1079!> \param order ...
1080!> \param eps_lanczos ...
1081!> \param max_iter_lanczos ...
1082!> \param symmetrize ...
1083!> \param converged ...
1084!> \par History
1085!>       2010.10 created [Joost VandeVondele]
1086!> \author Joost VandeVondele
1087! **************************************************************************************************
1088   SUBROUTINE matrix_sqrt_Newton_Schulz(matrix_sqrt, matrix_sqrt_inv, matrix, threshold, order, &
1089                                        eps_lanczos, max_iter_lanczos, symmetrize, converged)
1090      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix_sqrt, matrix_sqrt_inv, matrix
1091      REAL(KIND=dp), INTENT(IN)                          :: threshold
1092      INTEGER, INTENT(IN)                                :: order
1093      REAL(KIND=dp), INTENT(IN)                          :: eps_lanczos
1094      INTEGER, INTENT(IN)                                :: max_iter_lanczos
1095      LOGICAL, OPTIONAL                                  :: symmetrize, converged
1096
1097      CHARACTER(LEN=*), PARAMETER :: routineN = 'matrix_sqrt_Newton_Schulz', &
1098         routineP = moduleN//':'//routineN
1099
1100      INTEGER                                            :: handle, i, unit_nr
1101      INTEGER(KIND=int_8)                                :: flop1, flop2, flop3, flop4, flop5
1102      LOGICAL                                            :: arnoldi_converged, tsym
1103      REAL(KIND=dp)                                      :: a, b, c, conv, d, frob_matrix, &
1104                                                            frob_matrix_base, gershgorin_norm, &
1105                                                            max_ev, min_ev, oa, ob, oc, &
1106                                                            occ_matrix, od, scaling, t1, t2
1107      TYPE(cp_logger_type), POINTER                      :: logger
1108      TYPE(dbcsr_type)                                   :: tmp1, tmp2, tmp3
1109
1110      CALL timeset(routineN, handle)
1111
1112      logger => cp_get_default_logger()
1113      IF (logger%para_env%ionode) THEN
1114         unit_nr = cp_logger_get_default_unit_nr(logger, local=.TRUE.)
1115      ELSE
1116         unit_nr = -1
1117      ENDIF
1118
1119      IF (PRESENT(converged)) converged = .FALSE.
1120      IF (PRESENT(symmetrize)) THEN
1121         tsym = symmetrize
1122      ELSE
1123         tsym = .TRUE.
1124      ENDIF
1125
1126      ! for stability symmetry can not be assumed
1127      CALL dbcsr_create(tmp1, template=matrix, matrix_type=dbcsr_type_no_symmetry)
1128      CALL dbcsr_create(tmp2, template=matrix, matrix_type=dbcsr_type_no_symmetry)
1129      IF (order .GE. 4) THEN
1130         CALL dbcsr_create(tmp3, template=matrix, matrix_type=dbcsr_type_no_symmetry)
1131      ENDIF
1132
1133      CALL dbcsr_set(matrix_sqrt_inv, 0.0_dp)
1134      CALL dbcsr_add_on_diag(matrix_sqrt_inv, 1.0_dp)
1135      CALL dbcsr_filter(matrix_sqrt_inv, threshold)
1136      CALL dbcsr_copy(matrix_sqrt, matrix)
1137
1138      ! scale the matrix to get into the convergence range
1139      IF (order == 0) THEN
1140
1141         gershgorin_norm = dbcsr_gershgorin_norm(matrix_sqrt)
1142         frob_matrix = dbcsr_frobenius_norm(matrix_sqrt)
1143         scaling = 1.0_dp/MIN(frob_matrix, gershgorin_norm)
1144
1145      ELSE
1146
1147         ! scale the matrix to get into the convergence range
1148         CALL arnoldi_extremal(matrix_sqrt, max_ev, min_ev, threshold=eps_lanczos, &
1149                               max_iter=max_iter_lanczos, converged=arnoldi_converged)
1150         IF (unit_nr > 0) THEN
1151            WRITE (unit_nr, *)
1152            WRITE (unit_nr, '(T6,A,1X,L1,A,E12.3)') "Lanczos converged: ", arnoldi_converged, " threshold:", eps_lanczos
1153            WRITE (unit_nr, '(T6,A,1X,E12.3,E12.3)') "Est. extremal eigenvalues:", max_ev, min_ev
1154            WRITE (unit_nr, '(T6,A,1X,E12.3)') "Est. condition number :", max_ev/MAX(min_ev, EPSILON(min_ev))
1155         ENDIF
1156         ! conservatively assume we get a relatively large error (100*threshold_lanczos) in the estimates
1157         ! and adjust the scaling to be on the safe side
1158         scaling = 2.0_dp/(max_ev + min_ev + 100*eps_lanczos)
1159
1160      ENDIF
1161
1162      CALL dbcsr_scale(matrix_sqrt, scaling)
1163      CALL dbcsr_filter(matrix_sqrt, threshold)
1164      IF (unit_nr > 0) THEN
1165         WRITE (unit_nr, *)
1166         WRITE (unit_nr, *) "Order=", order
1167      ENDIF
1168
1169      DO i = 1, 100
1170
1171         t1 = m_walltime()
1172
1173         ! tmp1 = Zk * Yk - I
1174         CALL dbcsr_multiply("N", "N", 1.0_dp, matrix_sqrt_inv, matrix_sqrt, 0.0_dp, tmp1, &
1175                             filter_eps=threshold, flop=flop1)
1176         frob_matrix_base = dbcsr_frobenius_norm(tmp1)
1177         CALL dbcsr_add_on_diag(tmp1, -1.0_dp)
1178
1179         ! check convergence (frob norm of what should be the identity matrix minus identity matrix)
1180         frob_matrix = dbcsr_frobenius_norm(tmp1)
1181
1182         flop4 = 0; flop5 = 0
1183         SELECT CASE (order)
1184         CASE (0, 2)
1185            ! update the above to 0.5*(3*I-Zk*Yk)
1186            CALL dbcsr_add_on_diag(tmp1, -2.0_dp)
1187            CALL dbcsr_scale(tmp1, -0.5_dp)
1188         CASE (3)
1189            ! tmp2 = tmp1 ** 2
1190            CALL dbcsr_multiply("N", "N", 1.0_dp, tmp1, tmp1, 0.0_dp, tmp2, &
1191                                filter_eps=threshold, flop=flop4)
1192            ! tmp1 = 1/16 * (16*I-8*tmp1+6*tmp1**2-5*tmp1**3)
1193            CALL dbcsr_add(tmp1, tmp2, -4.0_dp, 3.0_dp)
1194            CALL dbcsr_add_on_diag(tmp1, 8.0_dp)
1195            CALL dbcsr_scale(tmp1, 0.125_dp)
1196         CASE (4) ! as expensive as case(5), so little need to use it
1197            ! tmp2 = tmp1 ** 2
1198            CALL dbcsr_multiply("N", "N", 1.0_dp, tmp1, tmp1, 0.0_dp, tmp2, &
1199                                filter_eps=threshold, flop=flop4)
1200            ! tmp3 = tmp2 * tmp1
1201            CALL dbcsr_multiply("N", "N", 1.0_dp, tmp2, tmp1, 0.0_dp, tmp3, &
1202                                filter_eps=threshold, flop=flop5)
1203            CALL dbcsr_scale(tmp1, -8.0_dp)
1204            CALL dbcsr_add_on_diag(tmp1, 16.0_dp)
1205            CALL dbcsr_add(tmp1, tmp2, 1.0_dp, 6.0_dp)
1206            CALL dbcsr_add(tmp1, tmp3, 1.0_dp, -5.0_dp)
1207            CALL dbcsr_scale(tmp1, 1/16.0_dp)
1208         CASE (5)
1209            ! Knuth's reformulation to evaluate the polynomial of 4th degree in 2 multiplications
1210            ! p = y4+A*y3+B*y2+C*y+D
1211            ! z := y * (y+a); P := (z+y+b) * (z+c) + d.
1212            ! a=(A-1)/2 ; b=B*(a+1)-C-a*(a+1)*(a+1)
1213            ! c=B-b-a*(a+1)
1214            ! d=D-bc
1215            oa = -40.0_dp/35.0_dp
1216            ob = 48.0_dp/35.0_dp
1217            oc = -64.0_dp/35.0_dp
1218            od = 128.0_dp/35.0_dp
1219            a = (oa - 1)/2
1220            b = ob*(a + 1) - oc - a*(a + 1)**2
1221            c = ob - b - a*(a + 1)
1222            d = od - b*c
1223            ! tmp2 = tmp1 ** 2 + a * tmp1
1224            CALL dbcsr_multiply("N", "N", 1.0_dp, tmp1, tmp1, 0.0_dp, tmp2, &
1225                                filter_eps=threshold, flop=flop4)
1226            CALL dbcsr_add(tmp2, tmp1, 1.0_dp, a)
1227            ! tmp3 = tmp2 + tmp1 + b
1228            CALL dbcsr_copy(tmp3, tmp2)
1229            CALL dbcsr_add(tmp3, tmp1, 1.0_dp, 1.0_dp)
1230            CALL dbcsr_add_on_diag(tmp3, b)
1231            ! tmp2 = tmp2 + c
1232            CALL dbcsr_add_on_diag(tmp2, c)
1233            ! tmp1 = tmp2 * tmp3
1234            CALL dbcsr_multiply("N", "N", 1.0_dp, tmp2, tmp3, 0.0_dp, tmp1, &
1235                                filter_eps=threshold, flop=flop5)
1236            ! tmp1 = tmp1 + d
1237            CALL dbcsr_add_on_diag(tmp1, d)
1238            ! final scale
1239            CALL dbcsr_scale(tmp1, 35.0_dp/128.0_dp)
1240         CASE DEFAULT
1241            CPABORT("Illegal order value")
1242         END SELECT
1243
1244         ! tmp2 = Yk * tmp1 = Y(k+1)
1245         CALL dbcsr_multiply("N", "N", 1.0_dp, matrix_sqrt, tmp1, 0.0_dp, tmp2, &
1246                             filter_eps=threshold, flop=flop2)
1247         ! CALL dbcsr_filter(tmp2,threshold)
1248         CALL dbcsr_copy(matrix_sqrt, tmp2)
1249
1250         ! tmp2 = tmp1 * Zk = Z(k+1)
1251         CALL dbcsr_multiply("N", "N", 1.0_dp, tmp1, matrix_sqrt_inv, 0.0_dp, tmp2, &
1252                             filter_eps=threshold, flop=flop3)
1253         ! CALL dbcsr_filter(tmp2,threshold)
1254         CALL dbcsr_copy(matrix_sqrt_inv, tmp2)
1255
1256         occ_matrix = dbcsr_get_occupation(matrix_sqrt_inv)
1257
1258         ! done iterating
1259         t2 = m_walltime()
1260
1261         conv = frob_matrix/frob_matrix_base
1262
1263         IF (unit_nr > 0) THEN
1264            WRITE (unit_nr, '(T6,A,1X,I3,1X,F10.8,E12.3,F12.3,F13.3)') "NS sqrt iter ", i, occ_matrix, &
1265               conv, t2 - t1, &
1266               (flop1 + flop2 + flop3 + flop4 + flop5)/(1.0E6_dp*MAX(0.001_dp, t2 - t1))
1267            CALL m_flush(unit_nr)
1268         ENDIF
1269
1270         IF (abnormal_value(conv)) &
1271            CPABORT("conv is an abnormal value (NaN/Inf).")
1272
1273         ! conv < SQRT(threshold)
1274         IF ((conv*conv) < threshold) THEN
1275            IF (PRESENT(converged)) converged = .TRUE.
1276            EXIT
1277         ENDIF
1278
1279      ENDDO
1280
1281      ! symmetrize the matrices as this is not guaranteed by the algorithm
1282      IF (tsym) THEN
1283         IF (unit_nr > 0) THEN
1284            WRITE (unit_nr, '(A20)') "SYMMETRIZING RESULTS"
1285         ENDIF
1286         CALL dbcsr_transposed(tmp1, matrix_sqrt_inv)
1287         CALL dbcsr_add(matrix_sqrt_inv, tmp1, 0.5_dp, 0.5_dp)
1288         CALL dbcsr_transposed(tmp1, matrix_sqrt)
1289         CALL dbcsr_add(matrix_sqrt, tmp1, 0.5_dp, 0.5_dp)
1290      ENDIF
1291
1292      ! this check is not really needed
1293      CALL dbcsr_multiply("N", "N", +1.0_dp, matrix_sqrt_inv, matrix_sqrt, 0.0_dp, tmp1, &
1294                          filter_eps=threshold)
1295      frob_matrix_base = dbcsr_frobenius_norm(tmp1)
1296      CALL dbcsr_add_on_diag(tmp1, -1.0_dp)
1297      frob_matrix = dbcsr_frobenius_norm(tmp1)
1298      occ_matrix = dbcsr_get_occupation(matrix_sqrt_inv)
1299      IF (unit_nr > 0) THEN
1300         WRITE (unit_nr, '(T6,A,1X,I3,1X,F10.8,E12.3)') "Final NS sqrt iter ", i, occ_matrix, &
1301            frob_matrix/frob_matrix_base
1302         WRITE (unit_nr, '()')
1303         CALL m_flush(unit_nr)
1304      ENDIF
1305
1306      ! scale to proper end results
1307      CALL dbcsr_scale(matrix_sqrt, 1/SQRT(scaling))
1308      CALL dbcsr_scale(matrix_sqrt_inv, SQRT(scaling))
1309
1310      CALL dbcsr_release(tmp1)
1311      CALL dbcsr_release(tmp2)
1312      IF (order .GE. 4) THEN
1313         CALL dbcsr_release(tmp3)
1314      ENDIF
1315
1316      CALL timestop(handle)
1317
1318   END SUBROUTINE matrix_sqrt_Newton_Schulz
1319
1320! **************************************************************************************************
1321!> \brief compute the sqrt of a matrix via the general algorithm for the p-th root of Richters et al.
1322!>                   Commun. Comput. Phys., 25 (2019), pp. 564-585.
1323!> \param matrix_sqrt ...
1324!> \param matrix_sqrt_inv ...
1325!> \param matrix ...
1326!> \param threshold ...
1327!> \param order ...
1328!> \param eps_lanczos ...
1329!> \param max_iter_lanczos ...
1330!> \param symmetrize ...
1331!> \param converged ...
1332!> \par History
1333!>       2019.04 created [Robert Schade]
1334!> \author Robert Schade
1335! **************************************************************************************************
1336   SUBROUTINE matrix_sqrt_proot(matrix_sqrt, matrix_sqrt_inv, matrix, threshold, order, &
1337                                eps_lanczos, max_iter_lanczos, symmetrize, converged)
1338      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix_sqrt, matrix_sqrt_inv, matrix
1339      REAL(KIND=dp), INTENT(IN)                          :: threshold
1340      INTEGER, INTENT(IN)                                :: order
1341      REAL(KIND=dp), INTENT(IN)                          :: eps_lanczos
1342      INTEGER, INTENT(IN)                                :: max_iter_lanczos
1343      LOGICAL, OPTIONAL                                  :: symmetrize, converged
1344
1345      CHARACTER(LEN=*), PARAMETER :: routineN = 'matrix_sqrt_proot', &
1346         routineP = moduleN//':'//routineN
1347
1348      INTEGER                                            :: choose, handle, i, ii, j, unit_nr
1349      INTEGER(KIND=int_8)                                :: f, flop1, flop2, flop3, flop4, flop5
1350      LOGICAL                                            :: arnoldi_converged, test, tsym
1351      REAL(KIND=dp)                                      :: conv, frob_matrix, frob_matrix_base, &
1352                                                            max_ev, min_ev, occ_matrix, scaling, &
1353                                                            t1, t2
1354      TYPE(cp_logger_type), POINTER                      :: logger
1355      TYPE(dbcsr_type)                                   :: BK2A, matrixS, Rmat, tmp1, tmp2
1356
1357      CALL cite_reference(Richters2018)
1358
1359      test = .FALSE.
1360
1361      CALL timeset(routineN, handle)
1362
1363      logger => cp_get_default_logger()
1364      IF (logger%para_env%ionode) THEN
1365         unit_nr = cp_logger_get_default_unit_nr(logger, local=.TRUE.)
1366      ELSE
1367         unit_nr = -1
1368      ENDIF
1369
1370      IF (PRESENT(converged)) converged = .FALSE.
1371      IF (PRESENT(symmetrize)) THEN
1372         tsym = symmetrize
1373      ELSE
1374         tsym = .TRUE.
1375      ENDIF
1376
1377      ! for stability symmetry can not be assumed
1378      CALL dbcsr_create(tmp1, template=matrix, matrix_type=dbcsr_type_no_symmetry)
1379      CALL dbcsr_create(tmp2, template=matrix, matrix_type=dbcsr_type_no_symmetry)
1380      CALL dbcsr_create(Rmat, template=matrix, matrix_type=dbcsr_type_no_symmetry)
1381      CALL dbcsr_create(matrixS, template=matrix, matrix_type=dbcsr_type_no_symmetry)
1382
1383      CALL dbcsr_copy(matrixS, matrix)
1384      IF (1 .EQ. 1) THEN
1385         ! scale the matrix to get into the convergence range
1386         CALL arnoldi_extremal(matrixS, max_ev, min_ev, threshold=eps_lanczos, &
1387                               max_iter=max_iter_lanczos, converged=arnoldi_converged)
1388         IF (unit_nr > 0) THEN
1389            WRITE (unit_nr, *)
1390            WRITE (unit_nr, '(T6,A,1X,L1,A,E12.3)') "Lanczos converged: ", arnoldi_converged, " threshold:", eps_lanczos
1391            WRITE (unit_nr, '(T6,A,1X,E12.3,E12.3)') "Est. extremal eigenvalues:", max_ev, min_ev
1392            WRITE (unit_nr, '(T6,A,1X,E12.3)') "Est. condition number :", max_ev/MAX(min_ev, EPSILON(min_ev))
1393         ENDIF
1394         ! conservatively assume we get a relatively large error (100*threshold_lanczos) in the estimates
1395         ! and adjust the scaling to be on the safe side
1396         scaling = 2.0_dp/(max_ev + min_ev + 100*eps_lanczos)
1397         CALL dbcsr_scale(matrixS, scaling)
1398         CALL dbcsr_filter(matrixS, threshold)
1399      ELSE
1400         scaling = 1.0_dp
1401      ENDIF
1402
1403      CALL dbcsr_set(matrix_sqrt_inv, 0.0_dp)
1404      CALL dbcsr_add_on_diag(matrix_sqrt_inv, 1.0_dp)
1405      !CALL dbcsr_filter(matrix_sqrt_inv, threshold)
1406
1407      IF (unit_nr > 0) THEN
1408         WRITE (unit_nr, *)
1409         WRITE (unit_nr, *) "Order=", order
1410      ENDIF
1411
1412      DO i = 1, 100
1413
1414         t1 = m_walltime()
1415         IF (1 .EQ. 1) THEN
1416            !build R=1-A B_K^2
1417            CALL dbcsr_multiply("N", "N", 1.0_dp, matrix_sqrt_inv, matrix_sqrt_inv, 0.0_dp, tmp1, &
1418                                filter_eps=threshold, flop=flop1)
1419            CALL dbcsr_multiply("N", "N", 1.0_dp, matrixS, tmp1, 0.0_dp, Rmat, &
1420                                filter_eps=threshold, flop=flop2)
1421            CALL dbcsr_scale(Rmat, -1.0_dp)
1422            CALL dbcsr_add_on_diag(Rmat, 1.0_dp)
1423
1424            flop4 = 0; flop5 = 0
1425            CALL dbcsr_set(tmp1, 0.0_dp)
1426            CALL dbcsr_add_on_diag(tmp1, 2.0_dp)
1427
1428            flop3 = 0
1429
1430            DO j = 2, order
1431               IF (j .EQ. 2) THEN
1432                  CALL dbcsr_copy(tmp2, Rmat)
1433               ELSE
1434                  f = 0
1435                  CALL dbcsr_multiply("N", "N", 1.0_dp, tmp2, Rmat, 0.0_dp, tmp2, &
1436                                      filter_eps=threshold, flop=f)
1437                  flop3 = flop3 + f
1438               ENDIF
1439               CALL dbcsr_add(tmp1, tmp2, 1.0_dp, 1.0_dp)
1440            ENDDO
1441         ELSE
1442            CALL dbcsr_create(BK2A, template=matrix, matrix_type=dbcsr_type_no_symmetry)
1443            CALL dbcsr_multiply("N", "N", 1.0_dp, matrix_sqrt_inv, matrixS, 0.0_dp, BK2A, &
1444                                filter_eps=threshold, flop=flop1)
1445            CALL dbcsr_multiply("N", "N", 1.0_dp, matrix_sqrt_inv, BK2A, 0.0_dp, BK2A, &
1446                                filter_eps=threshold, flop=flop2)
1447            CALL dbcsr_copy(Rmat, BK2A)
1448            CALL dbcsr_add_on_diag(Rmat, -1.0_dp)
1449
1450            CALL dbcsr_set(tmp1, 0.0_dp)
1451            CALL dbcsr_add_on_diag(tmp1, 1.0_dp)
1452
1453            CALL dbcsr_set(tmp2, 0.0_dp)
1454            CALL dbcsr_add_on_diag(tmp2, 1.0_dp)
1455
1456            flop3 = 0
1457            DO j = 1, order
1458               !choose=factorial(order)/(factorial(j)*factorial(order-j))
1459               choose = PRODUCT((/(ii, ii=1, order)/))/(PRODUCT((/(ii, ii=1, j)/))*PRODUCT((/(ii, ii=1, order - j)/)))
1460               CALL dbcsr_add(tmp1, tmp2, 1.0_dp, -1.0_dp*(-1)**j*choose)
1461               IF (j .LT. order) THEN
1462                  f = 0
1463                  CALL dbcsr_multiply("N", "N", 1.0_dp, tmp2, BK2A, 0.0_dp, tmp2, &
1464                                      filter_eps=threshold, flop=f)
1465                  flop3 = flop3 + f
1466               ENDIF
1467            ENDDO
1468            CALL dbcsr_release(BK2A)
1469         ENDIF
1470
1471         CALL dbcsr_multiply("N", "N", 0.5_dp, matrix_sqrt_inv, tmp1, 0.0_dp, matrix_sqrt_inv, &
1472                             filter_eps=threshold, flop=flop4)
1473
1474         occ_matrix = dbcsr_get_occupation(matrix_sqrt_inv)
1475
1476         ! done iterating
1477         t2 = m_walltime()
1478
1479         conv = dbcsr_frobenius_norm(Rmat)
1480
1481         IF (unit_nr > 0) THEN
1482            WRITE (unit_nr, '(T6,A,1X,I3,1X,F10.8,E12.3,F12.3,F13.3)') "PROOT sqrt iter ", i, occ_matrix, &
1483               conv, t2 - t1, &
1484               (flop1 + flop2 + flop3 + flop4 + flop5)/(1.0E6_dp*MAX(0.001_dp, t2 - t1))
1485            CALL m_flush(unit_nr)
1486         ENDIF
1487
1488         IF (abnormal_value(conv)) &
1489            CPABORT("conv is an abnormal value (NaN/Inf).")
1490
1491         ! conv < SQRT(threshold)
1492         IF ((conv*conv) < threshold) THEN
1493            IF (PRESENT(converged)) converged = .TRUE.
1494            EXIT
1495         ENDIF
1496
1497      ENDDO
1498
1499      ! scale to proper end results
1500      CALL dbcsr_scale(matrix_sqrt_inv, SQRT(scaling))
1501      CALL dbcsr_multiply("N", "N", 1.0_dp, matrix_sqrt_inv, matrix, 0.0_dp, matrix_sqrt, &
1502                          filter_eps=threshold, flop=flop5)
1503
1504      ! symmetrize the matrices as this is not guaranteed by the algorithm
1505      IF (tsym) THEN
1506         IF (unit_nr > 0) THEN
1507            WRITE (unit_nr, '(A20)') "SYMMETRIZING RESULTS"
1508         ENDIF
1509         CALL dbcsr_transposed(tmp1, matrix_sqrt_inv)
1510         CALL dbcsr_add(matrix_sqrt_inv, tmp1, 0.5_dp, 0.5_dp)
1511         CALL dbcsr_transposed(tmp1, matrix_sqrt)
1512         CALL dbcsr_add(matrix_sqrt, tmp1, 0.5_dp, 0.5_dp)
1513      ENDIF
1514
1515      ! this check is not really needed
1516      IF (test) THEN
1517         CALL dbcsr_multiply("N", "N", +1.0_dp, matrix_sqrt_inv, matrix_sqrt, 0.0_dp, tmp1, &
1518                             filter_eps=threshold)
1519         frob_matrix_base = dbcsr_frobenius_norm(tmp1)
1520         CALL dbcsr_add_on_diag(tmp1, -1.0_dp)
1521         frob_matrix = dbcsr_frobenius_norm(tmp1)
1522         occ_matrix = dbcsr_get_occupation(matrix_sqrt_inv)
1523         IF (unit_nr > 0) THEN
1524            WRITE (unit_nr, '(T6,A,1X,I3,1X,F10.8,E12.3)') "Final PROOT S^{-1/2} S^{1/2}-Eins error ", i, occ_matrix, &
1525               frob_matrix/frob_matrix_base
1526            WRITE (unit_nr, '()')
1527            CALL m_flush(unit_nr)
1528         ENDIF
1529
1530         ! this check is not really needed
1531         CALL dbcsr_multiply("N", "N", +1.0_dp, matrix_sqrt_inv, matrix_sqrt_inv, 0.0_dp, tmp2, &
1532                             filter_eps=threshold)
1533         CALL dbcsr_multiply("N", "N", +1.0_dp, tmp2, matrix, 0.0_dp, tmp1, &
1534                             filter_eps=threshold)
1535         frob_matrix_base = dbcsr_frobenius_norm(tmp1)
1536         CALL dbcsr_add_on_diag(tmp1, -1.0_dp)
1537         frob_matrix = dbcsr_frobenius_norm(tmp1)
1538         occ_matrix = dbcsr_get_occupation(matrix_sqrt_inv)
1539         IF (unit_nr > 0) THEN
1540            WRITE (unit_nr, '(T6,A,1X,I3,1X,F10.8,E12.3)') "Final PROOT S^{-1/2} S^{-1/2} S-Eins error ", i, occ_matrix, &
1541               frob_matrix/frob_matrix_base
1542            WRITE (unit_nr, '()')
1543            CALL m_flush(unit_nr)
1544         ENDIF
1545      ENDIF
1546
1547      CALL dbcsr_release(tmp1)
1548      CALL dbcsr_release(tmp2)
1549      CALL dbcsr_release(Rmat)
1550      CALL dbcsr_release(matrixS)
1551
1552      CALL timestop(handle)
1553   END SUBROUTINE matrix_sqrt_proot
1554
1555! **************************************************************************************************
1556!> \brief ...
1557!> \param matrix_exp ...
1558!> \param matrix ...
1559!> \param omega ...
1560!> \param alpha ...
1561!> \param threshold ...
1562! **************************************************************************************************
1563   SUBROUTINE matrix_exponential(matrix_exp, matrix, omega, alpha, threshold)
1564      ! compute matrix_exp=omega*exp(alpha*matrix)
1565      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix_exp, matrix
1566      REAL(KIND=dp), INTENT(IN)                          :: omega, alpha, threshold
1567
1568      CHARACTER(LEN=*), PARAMETER :: routineN = 'matrix_exponential', &
1569         routineP = moduleN//':'//routineN
1570      REAL(dp), PARAMETER                                :: one = 1.0_dp, toll = 1.E-17_dp, &
1571                                                            zero = 0.0_dp
1572
1573      INTEGER                                            :: handle, i, k, unit_nr
1574      REAL(dp)                                           :: factorial, norm_C, norm_D, norm_scalar
1575      TYPE(cp_logger_type), POINTER                      :: logger
1576      TYPE(dbcsr_type)                                   :: B, B_square, C, D, D_product
1577
1578      CALL timeset(routineN, handle)
1579
1580      logger => cp_get_default_logger()
1581      IF (logger%para_env%ionode) THEN
1582         unit_nr = cp_logger_get_default_unit_nr(logger, local=.TRUE.)
1583      ELSE
1584         unit_nr = -1
1585      ENDIF
1586
1587      ! Calculate the norm of the matrix alpha*matrix, and scale it until it is less than 1.0
1588      norm_scalar = ABS(alpha)*dbcsr_frobenius_norm(matrix)
1589
1590      ! k=scaling parameter
1591      k = 1
1592      DO
1593         IF ((norm_scalar/2.0_dp**k) <= one) EXIT
1594         k = k + 1
1595      END DO
1596
1597      ! copy and scale the input matrix in matrix C and in matrix D
1598      CALL dbcsr_create(C, template=matrix, matrix_type=dbcsr_type_no_symmetry)
1599      CALL dbcsr_copy(C, matrix)
1600      CALL dbcsr_scale(C, alpha_scalar=alpha/2.0_dp**k)
1601
1602      CALL dbcsr_create(D, template=matrix, matrix_type=dbcsr_type_no_symmetry)
1603      CALL dbcsr_copy(D, C)
1604
1605      !   write(*,*)
1606      !   write(*,*)
1607      !   CALL dbcsr_print(D, nodata=.FALSE., matlab_format=.TRUE., variable_name="D", unit_nr=6)
1608
1609      ! set the B matrix as B=Identity+D
1610      CALL dbcsr_create(B, template=matrix, matrix_type=dbcsr_type_no_symmetry)
1611      CALL dbcsr_copy(B, D)
1612      CALL dbcsr_add_on_diag(B, alpha_scalar=one)
1613
1614      !   CALL dbcsr_print(B, nodata=.FALSE., matlab_format=.TRUE., variable_name="B", unit_nr=6)
1615
1616      ! Calculate the norm of C and moltiply by toll to be used as a threshold
1617      norm_C = toll*dbcsr_frobenius_norm(matrix)
1618
1619      ! iteration for the trucated taylor series expansion
1620      CALL dbcsr_create(D_product, template=matrix, matrix_type=dbcsr_type_no_symmetry)
1621      i = 1
1622      DO
1623         i = i + 1
1624         ! compute D_product=D*C
1625         CALL dbcsr_multiply("N", "N", one, D, C, &
1626                             zero, D_product, filter_eps=threshold)
1627
1628         ! copy D_product in D
1629         CALL dbcsr_copy(D, D_product)
1630
1631         ! calculate B=B+D_product/fat(i)
1632         factorial = ifac(i)
1633         CALL dbcsr_add(B, D_product, one, factorial)
1634
1635         ! check for convergence using the norm of D (copy of the matrix D_product) and C
1636         norm_D = factorial*dbcsr_frobenius_norm(D)
1637         IF (norm_D < norm_C) EXIT
1638      END DO
1639
1640      ! start the k iteration for the squaring of the matrix
1641      CALL dbcsr_create(B_square, template=matrix, matrix_type=dbcsr_type_no_symmetry)
1642      DO i = 1, k
1643         !compute B_square=B*B
1644         CALL dbcsr_multiply("N", "N", one, B, B, &
1645                             zero, B_square, filter_eps=threshold)
1646         ! copy Bsquare in B to iterate
1647         CALL dbcsr_copy(B, B_square)
1648      END DO
1649
1650      ! copy B_square in matrix_exp and
1651      CALL dbcsr_copy(matrix_exp, B_square)
1652
1653      ! scale matrix_exp by omega, matrix_exp=omega*B_square
1654      CALL dbcsr_scale(matrix_exp, alpha_scalar=omega)
1655      ! write(*,*) alpha,omega
1656
1657      CALL timestop(handle)
1658
1659   END SUBROUTINE matrix_exponential
1660
1661! **************************************************************************************************
1662!> \brief McWeeny purification of a matrix in the orthonormal basis
1663!> \param matrix_p Matrix to purify (needs to be almost idempotent already)
1664!> \param threshold Threshold used as filter_eps and convergence criteria
1665!> \param max_steps Max number of iterations
1666!> \par History
1667!>       2013.01 created [Florian Schiffmann]
1668!>       2014.07 slightly refactored [Ole Schuett]
1669!> \author Florian Schiffmann
1670! **************************************************************************************************
1671   SUBROUTINE purify_mcweeny_orth(matrix_p, threshold, max_steps)
1672      TYPE(dbcsr_type), DIMENSION(:)                     :: matrix_p
1673      REAL(KIND=dp)                                      :: threshold
1674      INTEGER                                            :: max_steps
1675
1676      CHARACTER(LEN=*), PARAMETER :: routineN = 'purify_mcweeny_orth', &
1677         routineP = moduleN//':'//routineN
1678
1679      INTEGER                                            :: handle, i, ispin, unit_nr
1680      REAL(KIND=dp)                                      :: frob_norm, trace
1681      TYPE(cp_logger_type), POINTER                      :: logger
1682      TYPE(dbcsr_type)                                   :: matrix_pp, matrix_tmp
1683
1684      CALL timeset(routineN, handle)
1685      logger => cp_get_default_logger()
1686      IF (logger%para_env%ionode) THEN
1687         unit_nr = cp_logger_get_default_unit_nr(logger, local=.TRUE.)
1688      ELSE
1689         unit_nr = -1
1690      ENDIF
1691
1692      CALL dbcsr_create(matrix_pp, template=matrix_p(1), matrix_type=dbcsr_type_no_symmetry)
1693      CALL dbcsr_create(matrix_tmp, template=matrix_p(1), matrix_type=dbcsr_type_no_symmetry)
1694      CALL dbcsr_trace(matrix_p(1), trace)
1695
1696      DO ispin = 1, SIZE(matrix_p)
1697         DO i = 1, max_steps
1698            CALL dbcsr_multiply("N", "N", 1.0_dp, matrix_p(ispin), matrix_p(ispin), &
1699                                0.0_dp, matrix_pp, filter_eps=threshold)
1700
1701            ! test convergence
1702            CALL dbcsr_copy(matrix_tmp, matrix_pp)
1703            CALL dbcsr_add(matrix_tmp, matrix_p(ispin), 1.0_dp, -1.0_dp)
1704            frob_norm = dbcsr_frobenius_norm(matrix_tmp) ! tmp = PP - P
1705            IF (unit_nr > 0) WRITE (unit_nr, '(t3,a,f16.8)') "McWeeny: Deviation of idempotency", frob_norm
1706            IF (unit_nr > 0) CALL m_flush(unit_nr)
1707
1708            ! construct new P
1709            CALL dbcsr_copy(matrix_tmp, matrix_pp)
1710            CALL dbcsr_multiply("N", "N", -2.0_dp, matrix_pp, matrix_p(ispin), &
1711                                3.0_dp, matrix_tmp, filter_eps=threshold)
1712            CALL dbcsr_copy(matrix_p(ispin), matrix_tmp) ! tmp = 3PP - 2PPP
1713
1714            ! frob_norm < SQRT(trace*threshold)
1715            IF (frob_norm*frob_norm < trace*threshold) EXIT
1716         END DO
1717      END DO
1718
1719      CALL dbcsr_release(matrix_pp)
1720      CALL dbcsr_release(matrix_tmp)
1721      CALL timestop(handle)
1722   END SUBROUTINE purify_mcweeny_orth
1723
1724! **************************************************************************************************
1725!> \brief McWeeny purification of a matrix in the non-orthonormal basis
1726!> \param matrix_p Matrix to purify (needs to be almost idempotent already)
1727!> \param matrix_s Overlap-Matrix
1728!> \param threshold Threshold used as filter_eps and convergence criteria
1729!> \param max_steps Max number of iterations
1730!> \par History
1731!>       2013.01 created [Florian Schiffmann]
1732!>       2014.07 slightly refactored [Ole Schuett]
1733!> \author Florian Schiffmann
1734! **************************************************************************************************
1735   SUBROUTINE purify_mcweeny_nonorth(matrix_p, matrix_s, threshold, max_steps)
1736      TYPE(dbcsr_type), DIMENSION(:)                     :: matrix_p
1737      TYPE(dbcsr_type)                                   :: matrix_s
1738      REAL(KIND=dp)                                      :: threshold
1739      INTEGER                                            :: max_steps
1740
1741      CHARACTER(LEN=*), PARAMETER :: routineN = 'purify_mcweeny_nonorth', &
1742         routineP = moduleN//':'//routineN
1743
1744      INTEGER                                            :: handle, i, ispin, unit_nr
1745      REAL(KIND=dp)                                      :: frob_norm, trace
1746      TYPE(cp_logger_type), POINTER                      :: logger
1747      TYPE(dbcsr_type)                                   :: matrix_ps, matrix_psp, matrix_test
1748
1749      CALL timeset(routineN, handle)
1750
1751      logger => cp_get_default_logger()
1752      IF (logger%para_env%ionode) THEN
1753         unit_nr = cp_logger_get_default_unit_nr(logger, local=.TRUE.)
1754      ELSE
1755         unit_nr = -1
1756      ENDIF
1757
1758      CALL dbcsr_create(matrix_ps, template=matrix_p(1), matrix_type=dbcsr_type_no_symmetry)
1759      CALL dbcsr_create(matrix_psp, template=matrix_p(1), matrix_type=dbcsr_type_no_symmetry)
1760      CALL dbcsr_create(matrix_test, template=matrix_p(1), matrix_type=dbcsr_type_no_symmetry)
1761
1762      DO ispin = 1, SIZE(matrix_p)
1763         DO i = 1, max_steps
1764            CALL dbcsr_multiply("N", "N", 1.0_dp, matrix_p(ispin), matrix_s, &
1765                                0.0_dp, matrix_ps, filter_eps=threshold)
1766            CALL dbcsr_multiply("N", "N", 1.0_dp, matrix_ps, matrix_p(ispin), &
1767                                0.0_dp, matrix_psp, filter_eps=threshold)
1768            IF (i == 1) CALL dbcsr_trace(matrix_ps, trace)
1769
1770            ! test convergence
1771            CALL dbcsr_copy(matrix_test, matrix_psp)
1772            CALL dbcsr_add(matrix_test, matrix_p(ispin), 1.0_dp, -1.0_dp)
1773            frob_norm = dbcsr_frobenius_norm(matrix_test) ! test = PSP - P
1774            IF (unit_nr > 0) WRITE (unit_nr, '(t3,a,2f16.8)') "McWeeny: Deviation of idempotency", frob_norm
1775            IF (unit_nr > 0) CALL m_flush(unit_nr)
1776
1777            ! construct new P
1778            CALL dbcsr_copy(matrix_p(ispin), matrix_psp)
1779            CALL dbcsr_multiply("N", "N", -2.0_dp, matrix_ps, matrix_psp, &
1780                                3.0_dp, matrix_p(ispin), filter_eps=threshold)
1781
1782            ! frob_norm < SQRT(trace*threshold)
1783            IF (frob_norm*frob_norm < trace*threshold) EXIT
1784         END DO
1785      END DO
1786
1787      CALL dbcsr_release(matrix_ps)
1788      CALL dbcsr_release(matrix_psp)
1789      CALL dbcsr_release(matrix_test)
1790      CALL timestop(handle)
1791   END SUBROUTINE purify_mcweeny_nonorth
1792
1793END MODULE iterate_matrix
1794