1!--------------------------------------------------------------------------------------------------!
2!   CP2K: A general program to perform molecular dynamics simulations                              !
3!   Copyright (C) 2000 - 2019  CP2K developers group                                               !
4!--------------------------------------------------------------------------------------------------!
5
6! **************************************************************************************************
7!>   \brief
8!>     Routines to efficently collocate and integrate gaussians on a grid
9!>     These use most of Joost's tricks and a couple more...
10!>     result is *speed* and genericity
11!>   \author Fawzi Mohamed, 2007
12!>   \notes original available with BSD style license
13! **************************************************************************************************
14MODULE gauss_colloc
15
16#:include "colloc_int_kloop.fypp"
17
18   USE d3_poly,                         ONLY: &
19        grad_size3, poly_affine_t3, poly_affine_t3t, poly_p_eval2b, poly_p_eval3b, &
20        poly_padd_uneval2b, poly_padd_uneval3b, poly_size1, poly_size2, poly_size3
21   USE kinds,                           ONLY: dp,&
22                                              int_8
23   USE lgrid_types,                     ONLY: lgrid_type
24
25!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
26#include "./base/base_uses.f90"
27
28   IMPLICIT NONE
29   PRIVATE
30
31   PUBLIC :: collocGauss, &
32             integrateGaussFull
33
34#ifdef FD_DEBUG
35#define IF_CHECK(x,y) x
36#else
37#define IF_CHECK(x,y) y
38#endif
39
40   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'gauss_colloc'
41
42   REAL(dp), PARAMETER :: small = TINY(1.0_dp)
43
44   ! keep prettify happy (does not see the include)
45   INTEGER(KIND=int_8), PARAMETER           :: unused_import_of_int_8 = 1
46
47CONTAINS
48
49! **************************************************************************************************
50!> \brief collocate a periodically repeated gaussian on a non orthormbic grid
51!>
52!> this routine has been tested and works well with cells with
53!> det(h)/sqrt(tr(dot(h^T,h)))>0.2 (2 angles bigger than 24 deg or one angle
54!> bigger than 11 deg).
55!> Because of its numerics it might fail badly (infinity or NaN) with
56!> with more deformed cells. Avoiding this would be bossible only using
57!> IEEE numerics controls, which would also make everything slower and
58!> less supported.
59!> Still the actual numeric has been carefully tuned, and in normal cases
60!> and most anormal it should work.
61!> With det(h)/sqrt(tr(dot(h^T,h)))>0.2 I could not find any failure.
62!>
63!> \param h cell matrix
64!> \param h_inv inverse of the cell matrix
65!> \param grid the grid
66!> \param poly polynomial (d3_poly format)
67!> \param alphai exponential coeff
68!> \param posi position of the gaussian
69!> \param max_r2 maximum radius of collocation squared
70!> \param periodic array of 0 or 1 that says which dimensions have pbc (1=pbc)
71!> \param gdim dimension of the grid (grid might be a subset)
72!> \param local_bounds local bounds of the grid piece that is kept locally
73!>   (i.e. of grid) the global grid is assumed to atart at 0,0,0
74!> \param local_shift start indexes of the local slice (i.e. of grid)
75!> \param poly_shift position of posi in the polynomial reference system.
76!>  Set it to posi to use the global reference system.
77!> \param scale a global scale factor
78!> \param lgrid ...
79! **************************************************************************************************
80   SUBROUTINE collocGauss(h, h_inv, grid, poly, alphai, posi, max_r2, &
81                          periodic, gdim, local_bounds, local_shift, poly_shift, scale, lgrid)
82      REAL(dp), DIMENSION(0:2, 0:2), INTENT(in)          :: h, h_inv
83      REAL(dp), DIMENSION(0:, 0:, 0:), INTENT(inout)     :: grid
84      REAL(dp), DIMENSION(:), INTENT(inout)              :: poly
85      REAL(dp), INTENT(in)                               :: alphai
86      REAL(dp), DIMENSION(0:2), INTENT(in)               :: posi
87      REAL(dp), INTENT(in)                               :: max_r2
88      INTEGER, DIMENSION(0:2), INTENT(in)                :: periodic
89      INTEGER, DIMENSION(0:2), INTENT(in), OPTIONAL      :: gdim
90      INTEGER, DIMENSION(2, 0:2), INTENT(in), OPTIONAL   :: local_bounds
91      INTEGER, DIMENSION(0:2), INTENT(in), OPTIONAL      :: local_shift
92      REAL(dp), DIMENSION(0:2), INTENT(in), OPTIONAL     :: poly_shift
93      REAL(dp), INTENT(in), OPTIONAL                     :: scale
94      TYPE(lgrid_type), INTENT(inout), OPTIONAL          :: lgrid
95
96      CHARACTER(len=*), PARAMETER :: routineN = 'collocGauss', routineP = moduleN//':'//routineN
97
98      CALL colloc_int_body(h, h_inv, grid, poly, alphai, posi, max_r2, &
99                           periodic, gdim, local_bounds, local_shift, &
100                           poly_shift, scale, lgrid, integrate=.FALSE.)
101
102   END SUBROUTINE
103
104! **************************************************************************************************
105!> \brief integrates a gaussian times any polynomial up to a give order.
106!>
107!> Most things are the same as for collocGauss (see its comments).
108!> Returns the integrals of all the monomials in d3 format into poly
109!> \param h ...
110!> \param h_inv ...
111!> \param grid ...
112!> \param poly ...
113!> \param alphai ...
114!> \param posi ...
115!> \param max_r2 ...
116!> \param periodic ...
117!> \param gdim ...
118!> \param local_bounds ...
119!> \param local_shift ...
120!> \param poly_shift ...
121!> \param scale ...
122! **************************************************************************************************
123   SUBROUTINE integrateGaussFull(h, h_inv, grid, poly, alphai, posi, max_r2, &
124                                 periodic, gdim, local_bounds, local_shift, poly_shift, scale)
125      REAL(dp), DIMENSION(0:2, 0:2), INTENT(in)          :: h, h_inv
126      REAL(dp), DIMENSION(0:, 0:, 0:), INTENT(inout)     :: grid
127      REAL(dp), DIMENSION(:), INTENT(inout)              :: poly
128      REAL(dp), INTENT(in)                               :: alphai
129      REAL(dp), DIMENSION(0:2), INTENT(in)               :: posi
130      REAL(dp), INTENT(in)                               :: max_r2
131      INTEGER, DIMENSION(0:2), INTENT(in)                :: periodic
132      INTEGER, DIMENSION(0:2), INTENT(in), OPTIONAL      :: gdim
133      INTEGER, DIMENSION(2, 0:2), INTENT(in), OPTIONAL   :: local_bounds
134      INTEGER, DIMENSION(0:2), INTENT(in), OPTIONAL      :: local_shift
135      REAL(dp), DIMENSION(0:2), INTENT(in), OPTIONAL     :: poly_shift
136      REAL(dp), INTENT(in), OPTIONAL                     :: scale
137
138      CHARACTER(len=*), PARAMETER :: routineN = 'integrateGaussFull', &
139         routineP = moduleN//':'//routineN
140
141      CALL colloc_int_body(h, h_inv, grid, poly, alphai, posi, max_r2, &
142                           periodic, gdim, local_bounds, local_shift, &
143                           poly_shift, scale, integrate=.TRUE.)
144
145   END SUBROUTINE
146
147! **************************************************************************************************
148!> \brief Common code for collocGauss() and integrateGaussFull()
149!> \param h ...
150!> \param h_inv ...
151!> \param grid ...
152!> \param poly ...
153!> \param alphai ...
154!> \param posi ...
155!> \param max_r2 ...
156!> \param periodic ...
157!> \param gdim ...
158!> \param local_bounds ...
159!> \param local_shift ...
160!> \param poly_shift ...
161!> \param scale ...
162!> \param lgrid ...
163!> \param integrate ...
164! **************************************************************************************************
165   SUBROUTINE colloc_int_body(h, h_inv, grid, poly, alphai, posi, max_r2, &
166                              periodic, gdim, local_bounds, local_shift, poly_shift, scale, lgrid, integrate)
167      REAL(dp), DIMENSION(0:2, 0:2), &
168         INTENT(in)                             :: h, h_inv
169      REAL(dp), DIMENSION(0:, 0:, 0:), &
170         INTENT(inout)                          :: grid
171      REAL(dp), DIMENSION(:), INTENT(inout)    :: poly
172      REAL(dp), INTENT(in)                     :: alphai
173      REAL(dp), DIMENSION(0:2), INTENT(in)     :: posi
174      REAL(dp), INTENT(in)                     :: max_r2
175      INTEGER, DIMENSION(0:2), INTENT(in)      :: periodic
176      INTEGER, DIMENSION(0:2), INTENT(in), &
177         OPTIONAL                               :: gdim
178      INTEGER, DIMENSION(2, 0:2), INTENT(in), &
179         OPTIONAL                               :: local_bounds
180      INTEGER, DIMENSION(0:2), INTENT(in), &
181         OPTIONAL                               :: local_shift
182      REAL(dp), DIMENSION(0:2), INTENT(in), &
183         OPTIONAL                               :: poly_shift
184      REAL(dp), INTENT(in), OPTIONAL           :: scale
185      TYPE(lgrid_type), INTENT(inout), &
186         OPTIONAL                               :: lgrid
187      LOGICAL                                  :: integrate
188
189      CHARACTER(len=*), PARAMETER :: routineN = 'colloc_int_body', &
190                                     routineP = moduleN//':'//routineN
191
192      INTEGER, DIMENSION(0:2), PARAMETER       :: permut = (/2, 1, 0/)
193      INTEGER :: grad, i, i0, ii, iiShift, iiShift2, iistart, iistart2, ij, &
194                 ijShift, iJump, ik, ikShift, ikShift2, ikstart, ikstart2, iend, iend2, &
195                 imax, imax1, imin, imin1, istart, istart2, j, jend, jJump, jmax, jmax1, jmin, &
196                 jmin1, jstart, k, kend, kend2, kgrad, kJump, kmax, kmax1, kmin, kmin1, &
197                 kstart, kstart2, max_j, size_jk, size_k, size_ijk, ig, ithread, nthread
198      INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: k_bounds
199      INTEGER, DIMENSION(0:2)                  :: cellShift, l_shift, l_ub, &
200                                                  ndim, period, shiftPos, ldim2
201      INTEGER, DIMENSION(2, 0:2)               :: l_bounds
202      LOGICAL                                  :: has_overlap
203      REAL(dp) :: cci0, cci1, cci2, ccj0, ccj0_i0, ccj0_i1, ccj0_i2, ccj1, &
204                  ccj1_i0, ccj1_i1, ccj2, cck0, cck0_0, cck0_0_p, cck0_i, cck0_i2, &
205                  cck0_ij, cck0_j, cck0_j2, cck0_j_p, cck1, cck1_0, cck1_0_p, cck1_i, &
206                  cck1_j, cck2, delta_i, delta_j, delta_k, g_scale, i_coeffn_i, icoeff0, &
207                  ii_coeff0, ii_coeff2, ii_coeff2_jump, ii_coeffn, ii_coeffn_jump, &
208                  ij_coeff0, ij_coeff0_jump, ij_coeff1, ik_coeff0, ik_coeff0_jump, &
209                  ik_coeff1, j_coeffn_i, j_coeffn_j, jcoeff0, jj_coeff0, jj_coeff2, &
210                  jj_coeffn, jk_coeff0, jk_coeff1, k_coeffn_i, k_coeffn_j, k_coeffn_k, &
211                  kcoeff0, kk_coeff0, kk_coeff2, kk_coeffn, m(0:2, 0:2), maxr2, p_kk, &
212                  r_0
213      REAL(dp) :: res_0, res_i, res_j, res_k, scaled_h(0:2, 0:2), sqDi, sqDj, &
214                  sqDk
215      REAL(dp), ALLOCATABLE, DIMENSION(:)      :: poly_ijk, poly_jk, xi
216      REAL(dp), DIMENSION(0:2)                 :: l, normD, p_shift, resPos, &
217                                                  resPosReal, riPos, rpos, wrPos
218
219      REAL(dp) :: det, gval
220      REAL(dp), ALLOCATABLE, DIMENSION(:)      :: k_vals
221      INTEGER, PARAMETER :: npoly = 1
222      REAL(dp), ALLOCATABLE, DIMENSION(:)      :: poly_k
223      REAL(dp) :: p_v
224
225#define IF_FLAT(x,y) y
226
227      ithread = 0
228!$    ithread = omp_get_thread_num()
229
230      nthread = 1
231!$    nthread = omp_get_num_threads()
232
233      IF (integrate) THEN
234         poly = 0.0_dp
235      ELSE
236         IF (ALL(poly == 0.0_dp)) RETURN
237      ENDIF
238
239      IF (PRESENT(poly_shift)) THEN
240         p_shift = poly_shift
241      ELSE
242         p_shift = 0.0_dp
243      END IF
244
245      ldim2(permut(0)) = SIZE(grid, 1)
246      ldim2(permut(1)) = SIZE(grid, 2)
247      ldim2(permut(2)) = SIZE(grid, 3)
248
249      IF (PRESENT(gdim)) THEN
250         DO i = 0, 2
251            ndim(permut(i)) = gdim(i)
252         END DO
253      ELSE
254         ndim = ldim2
255      END IF
256      g_scale = 1.0_dp
257      IF (PRESENT(scale)) g_scale = scale
258
259      IF (integrate) THEN
260         det = (h(0, 0)*(h(1, 1)*h(2, 2)-h(1, 2)*h(2, 1)) &
261                -h(1, 0)*(h(0, 1)*h(2, 2)-h(0, 2)*h(2, 1)) &
262                +h(2, 0)*(h(0, 1)*h(1, 2)-h(0, 2)*h(1, 1)))
263         g_scale = g_scale*ABS(det)/REAL(INT(ndim(0), KIND=int_8)*INT(ndim(1), KIND=int_8)*INT(ndim(2), KIND=int_8), dp)
264      ENDIF
265
266      IF (PRESENT(local_bounds)) THEN
267         DO i = 0, 2
268            l_bounds(:, permut(i)) = local_bounds(:, i)
269         END DO
270      ELSE
271         l_bounds(1, :) = 0
272         l_bounds(2, :) = ndim-1
273      END IF
274      IF (PRESENT(local_shift)) THEN
275         DO i = 0, 2
276            l_shift(permut(i)) = local_shift(i)
277         END DO
278      ELSE
279         l_shift = 0 ! l_bounds(1,:)
280      END IF
281      l_ub = l_bounds(2, :)-l_bounds(1, :)+l_shift
282      DO i = 0, 2
283         CPASSERT(l_ub(i) < ldim2(i))
284      END DO
285
286      DO i = 0, 2
287         period(permut(i)) = periodic(i)
288      END DO
289      CPASSERT(ALL(l_bounds(2, :) < ndim .OR. period(:) == 1))
290      CPASSERT(ALL(l_bounds(1, :) >= 0 .OR. period(:) == 1))
291      CPASSERT(ALL(l_bounds(2, :)-l_bounds(1, :) < ndim))
292      rPos = 0.0_dp
293      DO j = 0, 2
294         DO i = 0, 2
295            rPos(permut(i)) = rPos(permut(i))+h_inv(i, j)*posi(j)
296         END DO
297      END DO
298      cellShift = FLOOR(rPos)*period
299      wrPos = rPos-REAL(cellShift, dp)
300      riPos = wrPos*ndim
301      shiftPos = FLOOR(riPos+0.5_dp)
302      resPos = riPos-shiftPos
303      normD = 1.0_dp/REAL(ndim, dp)
304      scaled_h = 0.0_dp
305      DO j = 0, 2
306         DO i = 0, 2
307            scaled_h(i, permut(j)) = h(i, j)*normD(permut(j))
308         END DO
309      END DO
310      resPosReal = 0.0_dp
311      DO j = 0, 2
312         DO i = 0, 2
313            resPosReal(i) = resPosReal(i)+h(i, j)*(wrPos(permut(j))-normD(permut(j))*REAL(shiftPos(permut(j)), dp))
314         END DO
315      END DO
316
317      !maxr2=0.0_dp
318      !DO j=0,2
319      !    DO i=0,2
320      !        ! guarantee at least the nearest points (this increases the sphere, increase just the box?)
321      !        maxr2=maxr2+(scaled_h(i,j))**2
322      !    END DO
323      !END DO
324      maxr2 = max_r2 !MAX(max_r2,maxr2)
325
326      ! build up quadratic form (ellipsoid)
327      m = 0.0_dp
328      DO j = 0, 2
329         DO i = 0, 2
330            DO k = 0, 2
331               m(i, j) = m(i, j)+scaled_h(k, i)*scaled_h(k, j)
332            END DO
333         END DO
334      END DO
335
336      l = 0.0_dp
337      DO j = 0, 2
338         DO i = 0, 2
339            l(j) = l(j)-2.0*resPos(i)*m(i, j)
340         END DO
341      END DO
342
343      r_0 = 0.0_dp
344      DO i = 0, 2
345         r_0 = r_0-0.5*resPos(i)*l(i)
346      END DO
347
348      ! calc i boundaries
349      cci2 = (m(2, 2)*m(0, 0)*m(1, 1)-m(2, 2)*m(0, 1)**2-m(1, 1)*m(0, 2)**2 &
350              +2.0_dp*m(0, 2)*m(0, 1)*m(1, 2)-m(0, 0)*m(1, 2)**2)/(m(2, 2)*m(1, 1)-m(1, 2)**2)
351      cci1 = -(-m(2, 2)*l(0)*m(1, 1)+m(2, 2)*m(0, 1)*l(1)+l(2)*m(0, 2)*m(1, 1) &
352               +l(0)*m(1, 2)**2-l(2)*m(0, 1)*m(1, 2)-m(1, 2)*l(1)*m(0, 2))/(m(2, 2)*m(1, 1)-m(1, 2)**2)
353      cci0 = -((-4.0_dp*m(2, 2)*r_0*m(1, 1)+m(2, 2)*l(1)**2+l(2)**2*m(1, 1) &
354                -2.0_dp*l(1)*m(1, 2)*l(2)+4.0_dp*r_0*m(1, 2)**2) &
355               /(m(2, 2)*m(1, 1)-m(1, 2)**2))/4.0_dp-maxr2
356      delta_i = cci1*cci1-4.0_dp*cci2*cci0
357      IF (delta_i <= 0) RETURN
358      sqDi = SQRT(delta_i)
359      imin = CEILING((-cci1-sqDi)/(2.0_dp*cci2))
360      imax = FLOOR((-cci1+sqDi)/(2.0_dp*cci2))
361      !! box early return
362
363      IF (period(0) == 1) THEN
364         has_overlap = imax-imin+1 > ndim(0) .OR. (l_bounds(1, 0) == 0 .AND. l_bounds(2, 0) == ndim(0)-1)
365         IF (.NOT. has_overlap) THEN
366            imin1 = MODULO(imin+shiftPos(0), ndim(0))
367            imax1 = imin1+imax-imin+1
368            IF (imin1 < l_bounds(1, 0)) THEN
369               has_overlap = imax1 >= l_bounds(1, 0)
370            ELSE
371               has_overlap = imin1 <= l_bounds(2, 0) .OR. (imax1 >= ndim(0) .AND. l_bounds(1, 0) <= imax1+ndim(0))
372            END IF
373            IF (.NOT. has_overlap) RETURN
374         END IF
375      ELSE
376         IF (imax+shiftPos(0) < l_bounds(1, 0) .OR. imin+shiftPos(0) > l_bounds(2, 0)) RETURN
377      END IF
378
379      ! j box bounds
380      has_overlap = l_bounds(1, 1) == 0 .AND. l_bounds(2, 1) == ndim(1)-1
381      IF (.NOT. has_overlap) THEN
382         ccj2 = (m(0, 0)*m(2, 2)*m(1, 1)-m(0, 0)*m(1, 2)**2-m(0, 1)**2*m(2, 2) &
383                 +2.0_dp*m(0, 1)*m(0, 2)*m(1, 2)-m(1, 1)*m(0, 2)**2) &
384                /(m(0, 0)*m(2, 2)-m(0, 2)**2)
385         ccj1 = -(-m(0, 0)*l(1)*m(2, 2)+m(0, 0)*l(2)*m(1, 2)+l(0)*m(0, 1)*m(2, 2) &
386                  -m(0, 2)*l(2)*m(0, 1)-l(0)*m(0, 2)*m(1, 2)+l(1)*m(0, 2)**2) &
387                /(m(0, 0)*m(2, 2)-m(0, 2)**2)
388         ccj0 = (4.0_dp*m(0, 0)*m(2, 2)*r_0-m(0, 0)*l(2)**2-m(2, 2)*l(0)**2 &
389                 +2.0_dp*m(0, 2)*l(2)*l(0)-4.0_dp*r_0*m(0, 2)**2) &
390                /(m(0, 0)*m(2, 2)-m(0, 2)**2)/4.0_dp-maxr2
391         delta_j = ccj1*ccj1-4.0_dp*ccj2*ccj0
392         IF (delta_j <= 0) RETURN
393         sqDj = SQRT(delta_j)
394         jmin = CEILING((-ccj1-sqDj)/(2.0_dp*ccj2))
395         jmax = FLOOR((-ccj1+sqDj)/(2.0_dp*ccj2))
396         IF (period(1) == 1) THEN
397            IF (jmax-jmin+1 < ndim(1)) THEN
398               jmin1 = MODULO(jmin+shiftPos(1), ndim(1))
399               jmax1 = jmin1+jmax-jmin+1
400               IF (jmin1 < l_bounds(1, 1)) THEN
401                  has_overlap = jmax1 >= l_bounds(1, 1)
402               ELSE
403                  has_overlap = jmin1 <= l_bounds(2, 1) .OR. (jmax1 >= ndim(1) .AND. (l_bounds(1, 1) <= jmax1-ndim(1)))
404               END IF
405               IF (.NOT. has_overlap) RETURN
406            END IF
407         ELSE
408            IF (jmax+shiftPos(1) < l_bounds(1, 1) .OR. jmin+shiftPos(1) > l_bounds(2, 1)) RETURN
409         END IF
410      END IF
411
412      ! k box bounds
413      has_overlap = l_bounds(1, 2) == 0 .AND. l_bounds(2, 2) == ndim(2)-1
414      IF (.NOT. has_overlap) THEN
415         cck2 = (m(0, 0)*m(2, 2)*m(1, 1)-m(0, 0)*m(1, 2)**2-m(0, 1)**2*m(2, 2) &
416                 +2.0_dp*m(0, 1)*m(0, 2)*m(1, 2)-m(1, 1)*m(0, 2)**2)/(m(0, 0)*m(1, 1)-m(0, 1)**2)
417         cck1 = (m(0, 0)*l(2)*m(1, 1)-m(0, 0)*m(1, 2)*l(1)-m(0, 2)*l(0)*m(1, 1) &
418                 +l(0)*m(0, 1)*m(1, 2)-l(2)*m(0, 1)**2+m(0, 1)*l(1)*m(0, 2))/(m(0, 0)*m(1, 1)-m(0, 1)**2)
419         cck0 = (4.0_dp*m(0, 0)*m(1, 1)*r_0-m(0, 0)*l(1)**2-m(1, 1)*l(0)**2 &
420                 +2.0_dp*m(0, 1)*l(1)*l(0)-4.0_dp*r_0*m(0, 1)**2) &
421                /(m(0, 0)*m(1, 1)-m(0, 1)**2)/4.0_dp-maxr2
422         delta_k = cck1*cck1-4.0_dp*cck2*cck0
423         IF (delta_k <= 0) RETURN
424         sqDk = SQRT(delta_k)
425         kmin = CEILING((-cck1-sqDk)/(2.0_dp*cck2))
426         kmax = FLOOR((-cck1+sqDk)/(2.0_dp*cck2))
427
428         IF (period(2) == 1) THEN
429            IF (kmax-kmin+1 < ndim(2)) THEN
430               kmin1 = MODULO(kmin+shiftPos(2), ndim(2))
431               kmax1 = kmin1+kmax-kmin+1
432               IF (kmin1 < l_bounds(1, 2)) THEN
433                  has_overlap = kmax1 >= l_bounds(1, 2)
434               ELSE
435                  has_overlap = kmin1 <= l_bounds(2, 2) .OR. &
436                                (kmax1 >= ndim(2) .AND. (l_bounds(1, 2) <= MODULO(kmax1, ndim(2))))
437               END IF
438               IF (.NOT. has_overlap) RETURN
439            END IF
440         ELSE
441            IF (kmax+shiftPos(2) < l_bounds(1, 2) .OR. kmin+shiftPos(2) > l_bounds(2, 2)) RETURN
442         END IF
443      END IF
444
445      ! k bounds (cache a la cube_info, or inversely integrate in the collocate loop?)
446      ccj2 = (m(2, 2)*m(1, 1)-m(1, 2)**2)/m(2, 2)
447      ccj1_i1 = (2*m(2, 2)*m(0, 1)-2*m(0, 2)*m(1, 2))/m(2, 2)
448      ccj1_i0 = (-l(2)*m(1, 2)+m(2, 2)*l(1))/m(2, 2)
449      ccj0_i2 = (m(2, 2)*m(0, 0)-m(0, 2)**2)/m(2, 2)
450      ccj0_i1 = (m(2, 2)*l(0)-m(0, 2)*l(2))/m(2, 2)
451      ccj0_i0 = (m(2, 2)*r_0-0.25*l(2)**2)/m(2, 2)-maxr2
452      cck2 = m(2, 2)
453      cck1_i = 2*m(0, 2)
454      cck1_j = 2*m(1, 2)
455      cck1_0 = l(2)
456      cck0_i2 = m(0, 0)
457      cck0_ij = 2*m(0, 1)
458      cck0_i = l(0)
459      cck0_j2 = m(1, 1)
460      cck0_j = l(1)
461      cck0_0 = r_0-maxr2
462
463      ! find maximum number of j
464      max_j = 0
465      DO i0 = 0, 1
466         i = (imin+imax)/2+i0
467         ccj1 = ccj1_i1*i+ccj1_i0
468         ccj0 = (ccj0_i2*i+ccj0_i1)*i+ccj0_i0
469         delta_j = ccj1*ccj1-4*ccj2*ccj0
470         IF (delta_j >= 0) THEN
471            sqDj = SQRT(delta_j)
472            max_j = MAX(max_j, FLOOR((-ccj1+sqDj)/(2.0_dp*ccj2)) &
473                        -CEILING((-ccj1-sqDj)/(2.0_dp*ccj2))+1)
474         END IF
475      END DO
476      max_j = max_j+1 ! just to be sure...
477      IF (period(1) == 0) max_j = MIN(max_j, l_bounds(2, 1)-l_bounds(1, 1)+1)
478
479      IF (period(0) == 0) THEN
480         imin = MAX(l_bounds(1, 0)-shiftPos(0), imin)
481         imax = MIN(l_bounds(2, 0)-shiftPos(0), imax)
482      END IF
483
484      ! k bounds (cache a la cube_info?)
485      has_overlap = .FALSE.
486      ALLOCATE (k_bounds(0:1, 0:max_j-1, 0:MAX(0, imax-imin+1)))
487      ! k_bounds=0
488      istart = imin
489      iiShift = shiftPos(0)-l_bounds(2, 0)+istart
490      IF (iiShift > 0) iiShift = iiShift+ndim(0)-1
491      iiShift = (iiShift/ndim(0))*ndim(0)-shiftPos(0)
492      !iiShift=CEILING(REAL(shiftPos(0)+istart-l_bounds(2,0))/REAL(ndim(0)))*ndim(0)-shiftPos(0))
493      istart = MAX(iiShift+l_bounds(1, 0), istart)
494      iend = MIN(iiShift+l_bounds(2, 0), imax)
495      iJump = ndim(0)-l_bounds(2, 0)+l_bounds(1, 0)-1
496      jJump = ndim(1)-l_bounds(2, 1)+l_bounds(1, 1)-1
497      DO
498         DO i = istart, iend
499            ccj1 = ccj1_i1*i+ccj1_i0
500            ccj0 = (ccj0_i2*i+ccj0_i1)*i+ccj0_i0
501            delta_j = ccj1*ccj1-4*ccj2*ccj0
502            IF (delta_j < 0) CONTINUE
503            sqDj = SQRT(delta_j)
504            jmin = CEILING((-ccj1-sqDj)/(2.0_dp*ccj2))
505            jmax = FLOOR((-ccj1+sqDj)/(2.0_dp*ccj2))
506            cck0_0_p = cck0_0+(cck0_i2*i+cck0_i)*i
507            cck0_j_p = cck0_j+cck0_ij*i
508            cck1_0_p = cck1_0+cck1_i*i
509            IF (period(1) == 0) THEN
510               jmin = MAX(l_bounds(1, 1)-shiftPos(1), jmin)
511               jmax = MIN(l_bounds(2, 1)-shiftPos(1), jmax)
512            END IF
513            jstart = jmin
514            ijShift = shiftPos(1)+jstart-l_bounds(2, 1)
515            IF (ijShift > 0) ijShift = ijShift+ndim(1)-1
516            ijShift = (ijShift/ndim(1))*ndim(1)-shiftPos(1)
517            ! ijShift=CEILING(REAL(shiftPos(1)+jstart-l_bounds(2,1))/REAL(ndim(1)))*ndim(1)-shiftPos(1)
518            jstart = MAX(ijShift+l_bounds(1, 1), jstart)
519            jend = MIN(ijShift+l_bounds(2, 1), jmax)
520            DO
521               DO j = jstart, jend
522                  cck1 = cck1_0_p+cck1_j*j
523                  cck0 = cck0_0_p+(cck0_j_p+cck0_j2*j)*j
524
525                  delta_k = cck1*cck1-4*cck2*cck0
526                  IF (delta_k < 0) THEN
527                     k_bounds(0, j-jmin, i-imin) = 0 ! CEILING((-cck1)/(2.0_dp*cck2))
528                     k_bounds(1, j-jmin, i-imin) = -1 ! k_bounds(0,j-jmin,i-imin)-1
529                  ELSE
530                     sqDk = SQRT(delta_k)
531                     kmin = CEILING((-cck1-sqDk)/(2.0_dp*cck2))
532                     kmax = FLOOR((-cck1+sqDk)/(2.0_dp*cck2))
533
534                     ! ! reduce kmax,kmin
535                     ! ! this should be done later if k_bounds are shared by threads with different slices
536                     ! ikShift=FLOOR(REAL(shiftPos(2)+kmax-l_bounds(1,2))/REAL(ndim(2)))*ndim(2)-shiftPos(2)
537                     ! kmax=MIN(kmax,ikShift+l_bounds(2,2))
538                     ! ikShift2=CEILING(REAL(shiftPos(2)-l_bounds(2,2)+kmin)/REAL(ndim(2)))*ndim(2)-shiftPos(2)
539                     ! kmin=MAX(kmin,ikShift2+l_bounds(1,2))
540
541                     k_bounds(0, j-jmin, i-imin) = kmin
542                     k_bounds(1, j-jmin, i-imin) = kmax
543                     IF (kmax >= kmin) has_overlap = .TRUE.
544                  END IF
545               END DO
546               jstart = jend+jJump+1
547               IF (jstart > jmax) EXIT
548               jend = MIN(jend+ndim(1), jmax)
549            END DO
550         END DO
551         istart = iend+iJump+1
552         IF (istart > imax) EXIT
553         iend = MIN(iend+ndim(0), imax)
554      END DO
555      IF (period(2) == 0) THEN
556         k_bounds(0, :, :) = MAX(l_bounds(1, 2)-shiftPos(2), k_bounds(0, :, :))
557         k_bounds(1, :, :) = MIN(l_bounds(2, 2)-shiftPos(2), k_bounds(1, :, :))
558      END IF
559
560      IF (.NOT. has_overlap) RETURN
561
562      ! poly x,y,z -> i,j,k
563      grad = grad_size3(SIZE(poly)/npoly)
564      size_jk = poly_size2(grad)*npoly
565      size_k = poly_size1(grad)*npoly
566      size_ijk = poly_size3(grad)*npoly
567      ALLOCATE (poly_ijk(size_ijk), &
568                poly_jk(size_jk), &
569                xi(grad+1))
570
571      IF (integrate) THEN
572         ALLOCATE (k_vals(0:grad))
573      ELSE
574         ALLOCATE (poly_k(0:size_k-1))
575      ENDIF
576
577      IF (integrate) THEN
578         CPASSERT(SIZE(poly) == poly_size3(grad))
579         poly_ijk = 0.0_dp
580      ELSE
581         CALL poly_affine_t3(poly, scaled_h, -resPosReal+p_shift, poly_ijk, &
582                             npoly=npoly)
583      ENDIF
584
585      ij_coeff0 = EXP(-2.0_dp*alphai*m(0, 1))
586      ik_coeff0 = EXP(-2.0_dp*alphai*m(0, 2))
587      ii_coeff0 = EXP(-alphai*m(0, 0))
588      jk_coeff0 = EXP(-2.0_dp*alphai*m(1, 2))
589      jj_coeff0 = EXP(-alphai*m(1, 1))
590      kk_coeff0 = EXP(-alphai*m(2, 2))
591      jk_coeff1 = 1.0_dp/jk_coeff0
592      ij_coeff1 = 1.0_dp/ij_coeff0
593      ik_coeff1 = 1.0_dp/ik_coeff0
594      ii_coeff2 = ii_coeff0*ii_coeff0
595      jj_coeff2 = jj_coeff0*jj_coeff0
596      kk_coeff2 = kk_coeff0*kk_coeff0
597      icoeff0 = EXP(-alphai*l(0))
598      jcoeff0 = EXP(-alphai*l(1))
599      kcoeff0 = EXP(-alphai*l(2))
600      res_0 = EXP(-alphai*r_0)*g_scale
601
602      i_coeffn_i = icoeff0
603      j_coeffn_i = jcoeff0
604      k_coeffn_i = kcoeff0
605      ii_coeffn = i_coeffn_i*ii_coeff0
606      res_i = res_0
607
608      iJump = ndim(0)-l_bounds(2, 0)+l_bounds(1, 0)-1
609      istart = MAX(0, imin)
610      iiShift = shiftPos(0)-l_bounds(2, 0)+istart
611      IF (iiShift > 0) iiShift = iiShift+ndim(0)-1
612      iiShift = (iiShift/ndim(0))*ndim(0)-shiftPos(0)
613      !iiShift=CEILING(REAL(shiftPos(0)+istart-l_bounds(2,0))/REAL(ndim(0)))*ndim(0)-shiftPos(0)
614      istart = MAX(iiShift+l_bounds(1, 0), istart)
615      iistart = istart-iiShift-l_bounds(1, 0)+l_shift(0)
616      istart2 = MIN(-1, imax)
617      iiShift2 = shiftPos(0)+istart2-l_bounds(1, 0)
618      IF (iiShift2 < 0) iiShift2 = iiShift2-ndim(0)+1
619      iiShift2 = (iiShift2/ndim(0))*ndim(0)-shiftPos(0)
620      !iiShift2=FLOOR(REAL(shiftPos(0)+istart2-l_bounds(1,0))/REAL(ndim(0)))*ndim(0)-shiftPos(0)
621      istart2 = MIN(iiShift2+l_bounds(2, 0), istart2)
622      iistart2 = istart2-iiShift2-l_bounds(1, 0)+l_shift(0)
623
624      IF (iJump /= 0 .AND. (iistart+imax-istart >= ndim(0)+l_shift(0) .OR. &
625                            iistart2+imin-istart2 <= l_ub(0)-ndim(0))) THEN
626         ! will wrap
627         ij_coeff0_jump = ij_coeff0**(iJump)
628         ik_coeff0_jump = ik_coeff0**(iJump)
629         ii_coeff2_jump = ii_coeff2**(iJump)
630         ii_coeffn_jump = ii_coeff0**((iJump)*(iJump-1))
631      ELSE
632         ij_coeff0_jump = 1.0_dp
633         ik_coeff0_jump = 1.0_dp
634         ii_coeff2_jump = 1.0_dp
635         ii_coeffn_jump = 1.0_dp
636      END IF
637
638      iend = MIN(iiShift+l_bounds(2, 0), imax)
639      ii = iistart IF_FLAT(*iidim,)
640      IF (i > 0) THEN
641         ii_coeffn = i_coeffn_i*ii_coeff0**(2*istart+1)
642         j_coeffn_i = jcoeff0*ij_coeff0**istart
643         k_coeffn_i = kcoeff0*ik_coeff0**istart
644         res_i = res_0*(ii_coeff0**istart*i_coeffn_i)**istart
645      END IF
646      DO
647         DO i = istart, iend
648            ! perform j loop
649            IF (ABS(res_i) > small) THEN
650               CALL j_loop
651            END IF
652            j_coeffn_i = j_coeffn_i*ij_coeff0
653            k_coeffn_i = k_coeffn_i*ik_coeff0
654            res_i = res_i*ii_coeffn
655            ii_coeffn = ii_coeffn*ii_coeff2
656            ii = ii+IF_FLAT(iidim, 1)
657         END DO
658         istart = iend+iJump+1
659         IF (istart > imax) EXIT
660         iend = MIN(iend+ndim(0), imax)
661         ii = l_shift(0) IF_FLAT(*iidim,)
662         j_coeffn_i = j_coeffn_i*ij_coeff0_jump
663         k_coeffn_i = k_coeffn_i*ik_coeff0_jump
664         res_i = res_i*ii_coeffn**(iJump)*ii_coeffn_jump
665         ii_coeffn = ii_coeffn*ii_coeff2_jump
666      END DO
667
668      ! neg i side
669      i_coeffn_i = 1.0_dp/icoeff0
670      j_coeffn_i = jcoeff0
671      k_coeffn_i = kcoeff0
672      res_i = res_0
673      ii_coeffn = i_coeffn_i*ii_coeff0
674
675      iend2 = MAX(iiShift2+l_bounds(1, 0), imin)
676      ii = iistart2 IF_FLAT(*iidim,)
677      IF (istart2 < -1) THEN
678         ii_coeffn = i_coeffn_i*ii_coeff0**(-(2*istart2+1))
679         j_coeffn_i = jcoeff0*ij_coeff0**(istart2+1)
680         k_coeffn_i = kcoeff0*ik_coeff0**(istart2+1)
681         res_i = res_0*(ii_coeff0**(-istart2-1)*i_coeffn_i)**(-istart2-1)
682      END IF
683      DO
684         DO i = istart2, iend2, -1
685            j_coeffn_i = j_coeffn_i*ij_coeff1
686            k_coeffn_i = k_coeffn_i*ik_coeff1
687            res_i = res_i*ii_coeffn
688            ii_coeffn = ii_coeffn*ii_coeff2
689
690            ! perform j loop
691            IF (ABS(res_i) > small) THEN
692               CALL j_loop
693            END IF
694            ii = ii-IF_FLAT(iidim, 1)
695         END DO
696         istart2 = iend2-iJump-1
697         IF (istart2 < imin) EXIT
698         iend2 = MAX(iend2-ndim(0), imin)
699         ii = l_ub(0) IF_FLAT(*iidim,)
700         j_coeffn_i = j_coeffn_i/ij_coeff0_jump
701         k_coeffn_i = k_coeffn_i/ik_coeff0_jump
702         res_i = res_i*ii_coeffn**iJump*ii_coeffn_jump
703         ii_coeffn = ii_coeffn*ii_coeff2_jump
704      END DO
705
706      ! the final cleanup
707      IF (integrate) THEN
708         CALL poly_affine_t3t(poly_ijk, scaled_h, -resPosReal+p_shift, poly, &
709                              npoly=npoly)
710      END IF
711
712      ! rely on compiler to deallocate ALLOCATABLEs
713
714   CONTAINS
715
716! **************************************************************************************************
717!> \brief ...
718! **************************************************************************************************
719      SUBROUTINE j_loop()
720         ! calculate j bounds
721         ccj1 = ccj1_i1*i+ccj1_i0
722         ccj0 = (ccj0_i2*i+ccj0_i1)*i+ccj0_i0
723         delta_j = ccj1*ccj1-4*ccj2*ccj0
724         IF (delta_j < 0) THEN
725            RETURN
726         END IF
727         sqDj = SQRT(delta_j)
728         jmin = CEILING((-ccj1-sqDj)/(2.0_dp*ccj2))
729         jmax = FLOOR((-ccj1+sqDj)/(2.0_dp*ccj2))
730
731         IF (integrate) THEN
732            poly_jk = 0.0_dp
733         ELSE
734            CALL poly_p_eval3b(IF_CHECK(poly_ijk, poly_ijk(1)), size_ijk, REAL(i, dp), &
735                               IF_CHECK(poly_jk, poly_jk(1)), size_jk, &
736                               npoly=npoly, grad=grad, xi=IF_CHECK(xi, xi(1)))
737         ENDIF
738
739         IF (period(1) == 0) THEN
740            jmin = MAX(l_bounds(1, 1)-shiftPos(1), jmin)
741            jmax = MIN(l_bounds(2, 1)-shiftPos(1), jmax)
742         END IF
743
744         ! pos j side
745         j_coeffn_j = j_coeffn_i
746         k_coeffn_j = k_coeffn_i
747         jj_coeffn = j_coeffn_j*jj_coeff0
748         res_j = res_i
749
750         jJump = ndim(1)-l_bounds(2, 1)+l_bounds(1, 1)
751         jstart = MAX(0, jmin)
752         ijShift = shiftPos(1)+jstart-l_bounds(2, 1)
753         IF (ijShift > 0) ijShift = ijShift+ndim(1)-1
754         ijShift = (ijShift/ndim(1))*ndim(1)-shiftPos(1)
755         !ijShift=CEILING(REAL(shiftPos(1)+jstart-l_bounds(2,1))/REAL(ndim(1)))*ndim(1)-shiftPos(1)
756         jstart = MAX(ijShift+l_bounds(1, 1), jstart)
757         jend = MIN(ijShift+l_bounds(2, 1), jmax)
758         ij = (jstart-ijShift-l_bounds(1, 1)+l_shift(1)) IF_FLAT(*ijdim,)
759
760         IF (jstart > 0) THEN
761            k_coeffn_j = k_coeffn_i*jk_coeff0**jstart
762            jj_coeffn = j_coeffn_j*jj_coeff0**(2*jstart+1)
763            res_j = res_i*(jj_coeff0**jstart*j_coeffn_j)**jstart
764         END IF
765         DO
766            DO j = jstart, jend
767               kmin = k_bounds(0, j-jmin, i-imin)
768               kmax = k_bounds(1, j-jmin, i-imin)
769               ! do k loop
770               IF (res_j /= 0.0_dp .AND. k_coeffn_j /= 0.0_dp .AND. kmin <= kmax &
771                   .AND. ABS(res_j) > small) THEN
772                  CALL k_loop
773               END IF
774               k_coeffn_j = k_coeffn_j*jk_coeff0
775               res_j = res_j*jj_coeffn
776               jj_coeffn = jj_coeffn*jj_coeff2
777               ij = ij+IF_FLAT(ijdim, 1)
778            END DO
779            jstart = jend+jJump
780            IF (jstart > jmax) EXIT
781            ij = l_shift(1) IF_FLAT(*ijdim,)
782            jend = MIN(jend+ndim(1), jmax)
783            IF (jJump /= 1) THEN ! remove if?
784               k_coeffn_j = k_coeffn_i*jk_coeff0**jstart
785               jj_coeffn = j_coeffn_j*jj_coeff0**(2*jstart+1)
786               res_j = res_i*(jj_coeff0**jstart*j_coeffn_j)**jstart
787            END IF
788         END DO
789
790         ! neg j side
791         j_coeffn_j = 1.0_dp/j_coeffn_i
792         k_coeffn_j = k_coeffn_i
793         jj_coeffn = j_coeffn_j*jj_coeff0
794         res_j = res_i
795
796         jstart = MIN(-1, jmax)
797         ijShift = shiftPos(1)+jstart-l_bounds(1, 1)
798         IF (ijShift < 0) ijShift = ijShift-ndim(1)+1
799         ijShift = (ijShift/ndim(1))*ndim(1)-shiftPos(1)
800         !ijShift=FLOOR(REAL(shiftPos(1)+jstart-l_bounds(1,1))/REAL(ndim(1)))*ndim(1)-shiftPos(1))
801         jstart = MIN(ijShift+l_bounds(2, 1), jstart)
802         jend = MAX(ijShift+l_bounds(1, 1), jmin)
803         ij = (jstart-ijShift-l_bounds(1, 1)+l_shift(1)) IF_FLAT(*ijdim,)
804         IF (jstart < -1) THEN
805            k_coeffn_j = k_coeffn_i*jk_coeff0**(jstart+1)
806            jj_coeffn = j_coeffn_j*jj_coeff0**(-(2*jstart+1))
807            res_j = res_i*(jj_coeff0**(-jstart-1)*j_coeffn_j)**(-jstart-1)
808         END IF
809         DO
810            DO j = jstart, jend, -1
811               k_coeffn_j = k_coeffn_j*jk_coeff1
812               res_j = res_j*jj_coeffn
813               jj_coeffn = jj_coeffn*jj_coeff2
814
815               kmin = k_bounds(0, j-jmin, i-imin)
816               kmax = k_bounds(1, j-jmin, i-imin)
817               ! perform k loop
818               IF (res_j /= 0.0_dp .AND. k_coeffn_j /= 0.0_dp .AND. kmin <= kmax &
819                   .AND. ABS(res_j) > small) THEN
820                  CALL k_loop
821               END IF
822               ij = ij-IF_FLAT(ijdim, 1)
823            END DO
824            jstart = jend-jJump
825            IF (jstart < jmin) EXIT
826            jend = MAX(jend-ndim(1), jmin)
827            ij = l_ub(1) IF_FLAT(*ijdim,)
828            IF (jJump /= 1) THEN ! remove if?
829               k_coeffn_j = k_coeffn_i*jk_coeff0**(jstart+1)
830               jj_coeffn = j_coeffn_j*jj_coeff0**(-(2*jstart+1))
831               res_j = res_i*(jj_coeff0**(-jstart-1)*j_coeffn_j)**(-jstart-1)
832            END IF
833         END DO
834
835         IF (integrate) THEN
836            CALL poly_padd_uneval3b(IF_CHECK(poly_ijk, poly_ijk(1)), size_ijk, REAL(i, dp), &
837                                    IF_CHECK(poly_jk, poly_jk(1)), size_jk, &
838                                    npoly=npoly, grad=grad, xi=IF_CHECK(xi, xi(1)))
839            !CALL poly_padd_uneval3(poly_ijk,REAL(i,dp),poly_jk,npoly=npoly)
840         ENDIF
841      END SUBROUTINE
842
843! **************************************************************************************************
844!> \brief ...
845! **************************************************************************************************
846      SUBROUTINE k_loop()
847         IF (integrate) THEN
848            SELECT CASE (grad)
849            CASE (1)
850               CALL kloop1_int()
851            CASE (2)
852               CALL kloop2_int()
853            CASE (3)
854               CALL kloop3_int()
855            CASE (4)
856               CALL kloop4_int()
857            CASE (5)
858               CALL kloop5_int()
859            CASE (6)
860               CALL kloop6_int()
861            CASE (7)
862               CALL kloop7_int()
863            CASE (8)
864               CALL kloop8_int()
865            CASE default
866               CALL kloopdefault_int(grad)
867            END SELECT
868         ELSE
869            SELECT CASE (grad)
870            CASE (1)
871               CALL kloop1_col()
872            CASE (2)
873               CALL kloop2_col()
874            CASE (3)
875               CALL kloop3_col()
876            CASE (4)
877               CALL kloop4_col()
878            CASE (5)
879               CALL kloop5_col()
880            CASE (6)
881               CALL kloop6_col()
882            CASE (7)
883               CALL kloop7_col()
884            CASE (8)
885               CALL kloop8_col()
886            CASE default
887               CALL kloopdefault_col(grad)
888            END SELECT
889         ENDIF
890      END SUBROUTINE
891
892$:colloc_int_kloop()
893$:colloc_int_kloop(FMG_INTEGRATE_FULL=True)
894
895#:for i in range(1, 9)
896$:colloc_int_kloop(grad_val=i)
897$:colloc_int_kloop(grad_val=i, FMG_INTEGRATE_FULL=True)
898#:endfor
899
900#undef IF_FLAT
901   END SUBROUTINE colloc_int_body
902
903END MODULE gauss_colloc
904