1c     Modifications (c) 2021 Advanced Micro Devices, Inc. All Rights Reserved.
2
3C #ifndef OFFLOAD_DEBUG_DATA
4C #define OFFLOAD_DEBUG_DATA
5C #endif
6
7#ifdef USE_OMP_TEAMS_DISTRIBUTE
8#define TEAMS_DISTRIBUTE teams distribute
9#else
10#define TEAMS_DISTRIBUTE
11#endif
12
13#ifdef USE_OMP_SCHEDULE_STATIC_1
14#define OMP_SCHEDULE schedule(static,1)
15#else
16#define OMP_SCHEDULE
17#endif
18
19      SUBROUTINE ccsd_t(d_t1,k_t1_offset,d_t2,k_t2_offset,
20     1            d_v2,k_v2_offset,energy1,energy2,size_t1)
21C
22C     $Id$
23C
24      IMPLICIT NONE
25#include "global.fh"
26#include "mafdecls.fh"
27#include "util.fh"
28#include "errquit.fh"
29#include "tce.fh"
30#include "tce_main.fh"
31#include "offl.fh"
32#include "ccsd_t_ps.fh"
33      integer d_t1
34      integer k_t1_offset
35      integer d_t2
36      integer k_t2_offset
37      integer d_v2
38      integer k_v2_offset
39      integer t_h1b, t_h1
40      integer t_h2b, t_h2
41      integer t_h3b, t_h3
42      integer t_p4b, t_p4
43      integer t_p5b, t_p5
44      integer t_p6b, t_p6
45#ifdef USE_F90_ALLOCATABLE
46      double precision, allocatable :: f_singles(:),f_doubles(:)
47#ifdef USE_FASTMEM
48      !dec$ attributes fastmem :: f_singles, f_doubles
49#endif
50      integer alloc_error
51#else
52      integer k_singles,l_singles
53      integer k_doubles,l_doubles
54#endif
55      integer size,i
56      integer nxtask0
57      integer next
58      integer nprocs
59      integer count
60      integer offset_p4,offset_p5,offset_p6
61      integer offset_h1,offset_h2,offset_h3
62      integer range_p4,range_p5,range_p6
63      integer range_h1,range_h2,range_h3
64c - T1/X1 LOCALIZATION -------------------
65      integer l_t1_local,k_t1_local
66      integer size_t1
67c ---------------------------------------
68      double precision energy(2)
69      double precision energy1,energy2
70      integer k_aux,k_list,l_aux,l_list
71      integer ccsd_t_6tasks,tot_task,w_thresh,task_thresh
72      integer k,iptr,k_grain
73      double precision wall
74      integer tasks_skip
75      integer n_progr,pct_progr
76      parameter(n_progr=20)
77      logical i_progr(n_progr)
78      external nxtask0,ccsd_t_6tasks
79      logical offload_enabled
80      external offload_enabled
81C
82c
83c - T1/X1 LOCALIZATION ----------
84c    opening l_t1_local and l_x1_local
85        if (.not.MA_PUSH_GET(mt_dbl,size_t1,'t1_local',
86     1      l_t1_local,k_t1_local))
87     1      call errquit('ccsd_t: t1_local size=',size_t1,MA_ERR)
88        call ma_zero(dbl_mb(k_t1_local),size_t1)
89c    copy d_t1 ==> l_t1_local
90ccx        call ga_get(d_t1,1,size_t1,1,1,dbl_mb(k_t1_local),1)
91      call get_block(d_t1,dbl_mb(k_t1_local),size_t1,0)
92c -------------------------------
93C
94      nprocs = GA_NNODES()
95      energy(1)=0.0d0
96      energy(2)=0.0d0
97      energy1 = 0.0d0
98      energy2 = 0.0d0
99c     estimate triplesx size
100      range_p4=0
101      do t_p4b = noab+1,noab+nvab
102         range_p4 = max(range_p4,int_mb(k_range+t_p4b-1))
103      enddo
104      range_h1=0
105      do t_h1b = 1,noab
106         range_h1 = max(range_h1,int_mb(k_range+t_h1b-1))
107      enddo
108      size=(range_p4**3)*(range_h1**3)
109
110      call util_align64(size)
111      triplesx_mxlgth=size
112#ifdef USE_F90_ALLOCATABLE
113      allocate( f_singles(1:size), stat=alloc_error)
114      if (alloc_error.ne.0) then
115        call errquit('ccsd_t: MA error sgl',0,MA_ERR)
116      endif
117      allocate( f_doubles(1:size), stat=alloc_error)
118      if (alloc_error.ne.0) then
119        call errquit('ccsd_t: MA error dbl',0,MA_ERR)
120      endif
121#if USE_OPENMP
122#if USE_OFFLOAD
123      if (offload_enabled()) then
124!$omp target enter data
125!$omp&map(alloc:f_singles)
126!$omp&map(alloc:f_doubles)
127#ifdef OFFLOAD_DEBUG_DATA
128        write (*,*) ga_nodeid(), 'alloc f_singles', size, 'elements'
129        write (*,*) ga_nodeid(), 'alloc f_doubles', size, 'elements'
130#endif
131      endif
132#endif
133#endif
134#else
135#if USE_OFFLOAD
136      call errquit('ccsd_t: offload not supported with MA',0,0)
137#endif
138      if (.not.MA_PUSH_GET(mt_dbl,size,'(T) singles',l_singles,
139     1     k_singles)) call errquit('ccsd_t: MA error sgl',
140     2     size,MA_ERR)
141      if (.not.MA_PUSH_GET(mt_dbl,size,'(T) doubles',l_doubles,
142     1     k_doubles)) call errquit('ccsd_t: MA error dbl',
143     2     size,MA_ERR)
144#endif
145      wall=-util_wallsec()
146      tot_task= ccsd_t_6tasks(restricted,noab,nvab,
147     1                        int_mb(k_spin),int_mb(k_sym))
148      if (.not.ma_push_get(mt_int,7*tot_task,"list.task",
149     1  l_list,k_list)) call errquit("k_list",1,MA_ERR)
150      if (.not.ma_push_get(mt_int,7*tot_task,"auxtask",
151     1  l_aux,k_aux)) call errquit("k_aux",2,MA_ERR)
152c
153c     get first task with weight lt ? 8
154c
155c      w_thresh=10
156      w_thresh=0
157      w_thresh=w_thresh**6
158      call ccsd_t_neword(tot_task, w_thresh,task_thresh,
159     R     restricted,noab,nvab,
160     K     int_mb(k_spin),int_mb(k_sym),
161     K     int_mb(k_range),
162     A     int_mb(k_aux),int_mb(k_list))
163
164
165      if (.not.MA_POP_STACK(l_aux))
166     1     call errquit('ordering',3,MA_ERR)
167      count = 0
168      k_grain=1
169      tasks_skip=0
170      next = nxtask0(nprocs,k_grain,tasks_skip)
171      do k=1,n_progr
172         i_progr(k)=.true.
173      enddo
174c     stagger start of loop
175      call util_mpinap(100)
176      if(task_thresh.gt.1) then
177      do k=1,task_thresh-1
178c
179         if (next.eq.count) then
180#ifdef USE_F90_ALLOCATABLE
181            call ccsd_t_loop(k,energy1,energy2,
182     &              int_mb(k_list),int_mb(k_range),int_mb(k_offset),
183     &              f_singles,f_doubles,
184     &              k_t1_local,k_t1_offset,
185     &              d_t2,d_v2,k_t2_offset,k_v2_offset,
186     &              restricted,k_evl_sorted,size)
187#else
188            call ccsd_t_loop(k,energy1,energy2,
189     &              int_mb(k_list),int_mb(k_range),int_mb(k_offset),
190     &              dbl_mb(k_singles),dbl_mb(k_doubles),
191     &              k_t1_local,k_t1_offset,
192     &              d_t2,d_v2,k_t2_offset,k_v2_offset,
193     &              restricted,k_evl_sorted,size)
194#endif
195c
196            if(ga_nodeid().eq.2) then
197               pct_progr=(k*n_progr)/tot_task
198               if(i_progr(pct_progr)) then
199                  i_progr(pct_progr)=.false.
200                  write(6,'(a,i5,a,i4,a,f15.1,a,f9.1)')
201     &                  '0task ',k,'  done ',
202     &                  int((k*100d0)/tot_task),'%  at',
203     &                  wall+util_wallsec(),' sec, (size)^1/6= ',
204     &                  (size)**(1d0/6d0)
205                  call util_flush(6)
206               endif
207            endif
208            next = nxtask0(nprocs,k_grain,tasks_skip)
209         endif
210         count = count + 1
211      enddo
212      endif
213      if(task_thresh.le.tot_task) then
214      next = nxtask0(-nprocs,k_grain,tasks_skip)
215      if(.true.) then
216         next = nxtask0(nprocs,k_grain,tasks_skip)
217         count=0
218         do k=task_thresh,tot_task
219            if (next.eq.count) then
220#ifdef USE_F90_ALLOCATABLE
221               call ccsd_t_loop(k,energy1,energy2,
222     &                 int_mb(k_list),int_mb(k_range),int_mb(k_offset),
223     &                 f_singles,f_doubles,
224     &                 k_t1_local,k_t1_offset,
225     &                 d_t2,d_v2,k_t2_offset,k_v2_offset,
226     &                 restricted,k_evl_sorted,size)
227#else
228               call ccsd_t_loop(k,energy1,energy2,
229     &                 int_mb(k_list),int_mb(k_range),int_mb(k_offset),
230     &                 dbl_mb(k_singles),dbl_mb(k_doubles),
231     &                 k_t1_local,k_t1_offset,
232     &                 d_t2,d_v2,k_t2_offset,k_v2_offset,
233     &                 restricted,k_evl_sorted,size)
234#endif
235c
236               next = nxtask0(nprocs,k_grain,tasks_skip)
237               if(ga_nodeid().eq.2) then
238                  pct_progr=(k*n_progr)/tot_task
239                  if(i_progr(pct_progr)) then
240                     i_progr(pct_progr)=.false.
241                     write(6,'(a,i8,a,i4,a,f15.1,a,f9.1)')
242     &                     ' task',k,'  done ',
243     &                     int((k*100d0)/tot_task),'%  at',
244     &                     wall+util_wallsec(),' sec, (size)^1/6= ',
245     &                     (size)**(1d0/6d0)
246                     call util_flush(6)
247                  endif
248               endif
249            endif
250            count = count + 1
251         enddo
252      endif
253      endif
254
255      if (.not.MA_POP_STACK(l_list))
256     1     call errquit('ordering',3,MA_ERR)
257#ifdef USE_F90_ALLOCATABLE
258#if USE_OPENMP
259#if USE_OFFLOAD
260      if (offload_enabled()) then
261!$omp target exit data
262!$omp&map(release:f_singles)
263!$omp&map(release:f_doubles)
264#ifdef OFFLOAD_DEBUG_DATA
265        write (*,*) ga_nodeid(), 'release f_singles', size, 'elements'
266        write (*,*) ga_nodeid(), 'release f_doubles', size, 'elements'
267#endif
268      endif
269#endif
270#endif
271      deallocate( f_doubles, stat=alloc_error)
272      if (alloc_error.ne.0) then
273        call errquit('ccsd_t doubles',3,MA_ERR)
274      endif
275      deallocate( f_singles, stat=alloc_error)
276      if (alloc_error.ne.0) then
277        call errquit('ccsd_t singles',4,MA_ERR)
278      endif
279#else
280      if (.not.MA_POP_STACK(l_doubles))
281     1     call errquit('ccsd_t doubles',3,MA_ERR)
282      if (.not.MA_POP_STACK(l_singles))
283     1     call errquit('ccsd_t singles',4,MA_ERR)
284#endif
285      next = nxtask0(-nprocs,k_grain,tasks_skip)
286c      call ga_sync()
287      energy(1) = energy1
288      energy(2) = energy2
289      call ga_mask_sync(.false.,.true.)
290      call ga_dgop(1975,energy,2,'+')
291      energy1 = energy(1)
292      energy2 = energy(2)
293c - T1/X1 LOCALIZATION ------
294         if(.not.MA_POP_STACK(l_t1_local))
295     &      call errquit('ccsd_t: l_t1_local',4,MA_ERR)
296c ---------------------------
297      return
298      end
299
300
301
302
303
304!     wrapper to ccsd_t_dot because of offload ugliness
305      subroutine ccsd_t_esum(a_singles, a_doubles, restricted,
306     &                       h1b,h2b,h3b,p4b,p5b,p6b,
307     &                       o_h1,o_h2,o_h3,
308     &                       o_p4,o_p5,o_p6,
309     &                       r_h1,r_h2,r_h3,
310     &                       r_p4,r_p5,r_p6,
311     &                       energy1,energy2)
312      implicit none
313      integer h1b, h2b, h3b, p4b, p5b, p6b
314      double precision o_h1(*),o_h2(*),o_h3(*)
315      double precision o_p4(*),o_p5(*),o_p6(*)
316      integer r_h1,r_h2,r_h3
317      integer r_p4,r_p5,r_p6
318      double precision a_singles(*)
319      double precision a_doubles(*)
320      logical restricted
321      double precision energy1,energy2
322      external offload_enabled
323      logical offload_enabled
324#ifdef USE_OPENMP
325#ifdef USE_OFFLOAD
326      if(offload_enabled()) then
327         call offl_ccsd_t_dot(a_singles,a_doubles,restricted,
328     &        h1b,h2b,h3b,p4b,p5b,p6b,
329     &        o_h1,o_h2,o_h3,o_p4,o_p5,o_p6,
330     &        r_h1,r_h2,r_h3,r_p4,r_p5,r_p6,
331     &        energy1,energy2)
332         else
333#endif
334#endif
335            call ccsd_t_dot(a_singles,a_doubles, restricted,
336     &        h1b,h2b,h3b,p4b,p5b,p6b,
337     &        o_h1,o_h2,o_h3,o_p4,o_p5,o_p6,
338     &        r_h1,r_h2,r_h3,r_p4,r_p5,r_p6,
339     &        energy1,energy2)
340#ifdef USE_OPENMP
341#ifdef USE_OFFLOAD
342         endif
343#endif
344#endif
345
346      return
347      end
348
349
350
351      subroutine ccsd_t_loop(k,energy1,energy2,
352     &     k_list,k_range,k_offset,a_singles,a_doubles,
353     &     k_t1_local,k_t1_offset,d_t2,d_v2,k_t2_offset,k_v2_offset,
354     &     restricted,k_evl_sorted,size)
355      implicit none
356#include "mafdecls.fh"
357#include "ccsd_t_ps.fh"
358      integer k
359      double precision energy1,energy2
360      integer k_list(7,*)
361      integer k_range(*),k_offset(*)
362      double precision a_singles(*),a_doubles(*)
363      logical restricted
364      integer k_evl_sorted
365      integer k_t1_local,k_t1_offset
366      integer d_t2,d_v2
367      integer k_t2_offset,k_v2_offset
368      integer size
369#if USE_OFFLOAD
370      external offload_enabled
371      logical offload_enabled
372#endif
373c
374      integer t_p4b,t_p5b,t_p6b,t_h1b,t_h2b,t_h3b
375      integer range_p4,range_p5,range_p6
376      integer range_h1,range_h2,range_h3
377      integer offset_p4,offset_p5,offset_p6
378      integer offset_h1,offset_h2,offset_h3
379c
380      t_p4b=k_list(1,k)
381      t_p5b=k_list(2,k)
382      t_p6b=k_list(3,k)
383      t_h1b=k_list(4,k)
384      t_h2b=k_list(5,k)
385      t_h3b=k_list(6,k)
386      range_p4 = k_range(t_p4b)
387      range_p5 = k_range(t_p5b)
388      range_p6 = k_range(t_p6b)
389      range_h1 = k_range(t_h1b)
390      range_h2 = k_range(t_h2b)
391      range_h3 = k_range(t_h3b)
392      offset_p4 = k_evl_sorted+k_offset(t_p4b)-1
393      offset_p5 = k_evl_sorted+k_offset(t_p5b)-1
394      offset_p6 = k_evl_sorted+k_offset(t_p6b)-1
395      offset_h1 = k_evl_sorted+k_offset(t_h1b)-1
396      offset_h2 = k_evl_sorted+k_offset(t_h2b)-1
397      offset_h3 = k_evl_sorted+k_offset(t_h3b)-1
398c
399      size = range_p4 * range_p5 * range_p6
400     &     * range_h1 * range_h2 * range_h3
401c zeroing ---
402#if USE_OPENMP
403#if USE_OFFLOAD
404      if (offload_enabled()) then
405        call offl_zero(a_singles, size)
406        call offl_zero(a_doubles, size)
407      else
408#endif
409#endif
410        call dcopy(size, 0.0d0, 0, a_singles, 1)
411        call dcopy(size, 0.0d0, 0, a_doubles, 1)
412#if USE_OPENMP
413#if USE_OFFLOAD
414      endif
415#endif
416#endif
417c -----------
418      if (otceps) call pstat_on(ps_cctsng)
419#ifdef USE_OFFLOAD
420      call offl_ccsd_t_singles_l(a_singles,
421     1                 k_t1_local,d_v2,k_t1_offset,k_v2_offset,
422     1                 t_h1b,t_h2b,t_h3b,t_p4b,t_p5b,t_p6b,2)
423#else
424      call ccsd_t_singles_l(a_singles,
425     1                 k_t1_local,d_v2,k_t1_offset,k_v2_offset,
426     1                 t_h1b,t_h2b,t_h3b,t_p4b,t_p5b,t_p6b,2)
427#endif
428      if (otceps) call pstat_off(ps_cctsng)
429
430
431      if (otceps) call pstat_on(ps_cctdbl)
432      call ccsd_t_doubles_l(a_doubles,
433     1                 d_t2,d_v2,k_t2_offset,k_v2_offset,
434     1                 t_h1b,t_h2b,t_h3b,t_p4b,t_p5b,t_p6b,2)
435      if (otceps) call pstat_off(ps_cctdbl)
436      call ccsd_t_esum(a_singles,a_doubles,restricted,
437     &                 t_h1b,t_h2b,t_h3b,t_p4b,t_p5b,t_p6b,
438     &                 dbl_mb(offset_h1+1),dbl_mb(offset_h2+1),
439     &                 dbl_mb(offset_h3+1),dbl_mb(offset_p4+1),
440     &                 dbl_mb(offset_p5+1),dbl_mb(offset_p6+1),
441     &                 range_h1,range_h2,range_h3,
442     &                 range_p4,range_p5,range_p6,
443     &                 energy1,energy2)
444      return
445      end
446
447#ifdef USE_OPENMP
448#ifdef USE_OFFLOAD
449      subroutine offl_zero(array, size)
450      implicit none
451      double precision array(size)
452      integer size
453      integer ii
454!$omp target
455!$omp TEAMS_DISTRIBUTE parallel do OMP_SCHEDULE
456      do ii=1,size
457         array(ii)=0d0
458      enddo
459!$omp end TEAMS_DISTRIBUTE parallel do
460!$omp end target
461      return
462      end
463#endif
464#endif