1#define ALLOC alloc_if(.true.)  free_if(.false.)
2#define FREE  alloc_if(.false.) free_if(.true.)
3#define REUSE alloc_if(.false.) free_if(.false.)
4
5      subroutine ccsd_trpdrv_offload(t1,
6     &     f1n,f1t,f2n,f2t,f3n,f3t,f4n,f4t,eorb,
7     &     g_objo,g_objv,g_coul,g_exch,
8     &     ncor,nocc,nvir,emp4,emp5,
9     &     oseg_lo,oseg_hi,
10     &     kchunk, Tij, Tkj, Tia, Tka, Xia, Xka, Jia, Jka, Kia, Kka,
11     &     Jij, Jkj, Kij, Kkj, Dja, Djka, Djia)
12C     $Id: ccsd_trpdrv_offload.F 26674 2015-01-08 14:36:59Z jhammond $
13      implicit none
14c
15cdir$ ATTRIBUTES OFFLOAD : mic :: dgemm
16cdir$ ATTRIBUTES OFFLOAD : mic :: lnov
17cdir$ ATTRIBUTES OFFLOAD : mic :: lnvv
18cdir$ ATTRIBUTES OFFLOAD : mic :: omp_set_num_threads
19cdir$ ATTRIBUTES OFFLOAD : mic :: omp_set_nested
20cdir$ ATTRIBUTES OFFLOAD : mic :: mkl_set_num_threads
21cdir$ ATTRIBUTES OFFLOAD : mic :: mkl_set_dynamic
22cdir$ ATTRIBUTES OFFLOAD : mic :: kmp_set_defaults
23
24#include "global.fh"
25#include "ccsd_len.fh"
26#include "ccsdps.fh"
27c
28      integer ncor,nocc,nvir
29      double precision t1(*)
30      double precision f1n(nvir,nvir),f1t(nvir,nvir)
31      double precision f2n(nvir,nvir),f2t(nvir,nvir)
32      double precision f3n(nvir,nvir),f3t(nvir,nvir)
33      double precision f4n(nvir,nvir),f4t(nvir,nvir)
34
35      double precision Tij(nvir*nvir), Tia(nvir*nocc)
36      double precision Xia(nvir*nocc)
37      double precision Jia(nvir*nvir), Jij(nvir*nocc)
38      double precision Kia(nvir*nvir), Kij(nvir*nocc)
39      double precision Djia(nvir)
40
41      double precision dintc1(nvir), dintx1(nvir)
42      double precision t1v2(nvir)
43      double precision t1v1(nvir*kchunk)
44      double precision dintc2(nvir*kchunk)
45      double precision dintx2(nvir*kchunk)
46      double precision eorb(*)
47      double precision Tkj(*), Tka(*)
48      double precision Xka(*)
49      double precision Jka(*), Jkj(*)
50      double precision Kka(*), Kkj(*)
51      double precision Dja(*), Djka(*)
52! used to make inline threaded tengy correct - for now
53c
54
55      double precision emp4,emp5
56      double precision emp4z,emp5z,denomz
57      double precision emp4iz,emp5iz,emp4kz,emp5kz
58      double precision eaijk,ea
59      integer g_objo,g_objv,g_coul,g_exch
60      integer inode,next,nodes,iam
61      integer oseg_lo,oseg_hi
62      integer a,b,c,i,j,k,akold,av
63      integer klo, khi, kchunk,nocc2
64      integer nxtask
65      external nxtask
66c
67c  Dependencies (global array, local array, handle):
68c
69c  These are waited on first
70c
71c      g_objv, Dja,  nbh_objv1
72c      g_objv, Djka(1+(k-klo)*nvir), nbh_objv4(k)
73c      g_objv, Djia, nbh_objv5
74c
75c  These are waited on later
76c
77c      g_objv, Tka,  nbh_objv2
78c      g_objv, Xka,  nbh_objv3
79c      g_objv, Tia,  nbh_objv6
80c      g_objv, Xia,  nbh_objv7
81c      g_objo, Tkj,  nbh_objo1
82c      g_objo, Jkj,  nbh_objo2
83c      g_objo, Kkj,  nbh_objo3
84c      g_objo, Tij,  nbh_objo4
85c      g_objo, Jij,  nbh_objo5
86c      g_objo, Kij,  nbh_objo6
87c      g_exch, Kka,  nbh_exch1
88c      g_exch, Kia,  nbh_exch2
89c      g_coul, Jka,  nbh_coul1
90c      g_coul, Jia,  nbh_coul2
91c
92c  non-blocking handles
93c
94       integer nbh_objv1,nbh_objv2,nbh_objv3
95       integer nbh_objv5,nbh_objv6,nbh_objv7
96       integer nbh_objv4(nocc)
97c
98       integer nbh_objo1,nbh_objo2,nbh_objo3
99       integer nbh_objo4,nbh_objo5,nbh_objo6
100c
101       integer nbh_exch1,nbh_exch2,nbh_coul1,nbh_coul2
102       integer n_nvir,nc_no1,n_nn,nv_no,klnn,klno,nv_nk
103       integer kklnn,kklno
104c
105#ifdef _OPENMP
106      integer  omp_get_thread_num
107      external omp_get_thread_num
108      integer  omp_get_num_threads
109      external omp_get_num_threads
110      integer  omp_get_max_threads
111      external omp_get_max_threads
112      if (ga_nodeid().eq.0) write(6,99) omp_get_max_threads()
113   99 format(2x,'Using ',i2,' OpenMP threads in CCSD(T)')
114#endif
115c
116CDIR$ ASSUME_ALIGNED f1t: 64
117CDIR$ ASSUME_ALIGNED f2n: 64
118CDIR$ ASSUME_ALIGNED f2t: 64
119CDIR$ ASSUME_ALIGNED f3n: 64
120CDIR$ ASSUME_ALIGNED f3t: 64
121CDIR$ ASSUME_ALIGNED f4n: 64
122CDIR$ ASSUME_ALIGNED f4t: 64
123CDIR$ ASSUME_ALIGNED dintc1: 64
124CDIR$ ASSUME_ALIGNED dintx1: 64
125CDIR$ ASSUME_ALIGNED t1v1: 64
126CDIR$ ASSUME_ALIGNED dintc2: 64
127CDIR$ ASSUME_ALIGNED dintx2: 64
128CDIR$ ASSUME_ALIGNED t1v2: 64
129CDIR$ ASSUME_ALIGNED Tij: 64
130CDIR$ ASSUME_ALIGNED Tia: 64
131CDIR$ ASSUME_ALIGNED Xia: 64
132CDIR$ ASSUME_ALIGNED Jia: 64
133CDIR$ ASSUME_ALIGNED Jij: 64
134CDIR$ ASSUME_ALIGNED Kia: 64
135CDIR$ ASSUME_ALIGNED Kij: 64
136
137c
138         call omp_set_nested(.true.)
139!$omp parallel
140!$omp& shared(eorb,kchunk,ea)
141!$omp& shared(Tkj,Tka,Xka,Jka,Kka,Jkj,Kkj)
142!$omp& shared(ncor,nocc,nvir)
143!$omp& shared(emp4z,emp5z)
144!$omp& private(n_nvir,n_nn,nv_no,klnn,klno,kklnn,kklno)
145!$omp& private(f1n,f1t,f2n,f2t,f3n,f3t,f4n,f4t)
146!$omp& private(Tij,Tia,Xia,Jia,Jij,Kia,Kij)
147!$omp& private(Djia,dintc1,dintx1,t1v2,t1v1,dintc2,dintx2)
148!$omp& private(i,j,a,klo,k,eaijk,khi)
149!$omp single
150      nodes = ga_nnodes()
151      iam = ga_nodeid()
152c
153c      call ga_sync() ! ga_sync called just before trpdrv in aoccsd2
154c
155      emp4z=0.0d0
156      emp5z=0.0d0
157      if (occsdps) then
158         call pstat_on(ps_trpdrv)
159      else
160         call qenter('trpdrv',0)
161      endif
162      inode=-1
163      nocc2=nocc/2
164      next=nxtask(nodes, 1)
165      n_nvir = nvir*nvir
166      nc_no1 = ncor+nocc+1
167      nv_no = nvir*nocc
168      nv_nk	= nvir*kchunk
169      n_nn = ncor+nocc+nvir
170      klnn = ((kchunk-1)*lnvv)+n_nvir
171      klno = ((kchunk-1)*lnov)+nv_no
172!dir$ offload begin target(mic)
173         call omp_set_num_threads(64)
174         call omp_set_nested(.true.)
175         call mkl_set_num_threads(4)
176         call mkl_set_dynamic(.false.)
177!dir$ end offload
178
179!dir$ offload_transfer target(mic)
180     I   nocopy(f1n:length(n_nvir) ALLOC)
181     I   nocopy(f1t:length(n_nvir) ALLOC)
182     I   nocopy(f2n:length(n_nvir) ALLOC)
183     I   nocopy(f2t:length(n_nvir) ALLOC)
184     I   nocopy(f3n:length(n_nvir) ALLOC)
185     I   nocopy(f3t:length(n_nvir) ALLOC)
186     I   nocopy(f4n:length(n_nvir) ALLOC)
187     I   nocopy(f4t:length(n_nvir) ALLOC)
188     N   in(eorb(ncor+1:n_nn) : ALLOC)
189     I   nocopy(Jia:length(n_nvir) ALLOC)
190     I   nocopy(Kia:length(n_nvir) ALLOC)
191     I   nocopy(Tia:length(nv_no) ALLOC)
192     I   nocopy(Xia:length(nv_no) ALLOC)
193     I   nocopy(Tij:length(n_nvir) ALLOC)
194     I   nocopy(Kij:length(nv_no) ALLOC)
195     I   nocopy(Jij:length(nv_no) ALLOC)
196     N   nocopy(t1v2:length(nvir) ALLOC)
197     N   nocopy(dintc1:length(nvir) ALLOC)
198     N   nocopy(dintx1:length(nvir) ALLOC)
199     I   nocopy(t1v1:length(nv_nk) ALLOC)
200     I   nocopy(dintc2:length(nv_nk) ALLOC)
201     I   nocopy(dintx2:length(nv_nk) ALLOC)
202     I   nocopy(Tkj:length(klnn)  ALLOC)
203     I   nocopy(Kkj:length(klno)  ALLOC)
204     I   nocopy(Jkj:length(klno)  ALLOC)
205     I   nocopy(Jka:length(klnn)  ALLOC)
206     I   nocopy(Tka:length(klno)  ALLOC)
207     I   nocopy(Kka:length(klnn)  ALLOC)
208     I   nocopy(Xka:length(klno)  ALLOC)
209      do klo = 1, nocc, kchunk
210         akold=0
211         khi = min(nocc, klo+kchunk-1)
212         do a=oseg_lo,oseg_hi
213            av=a-ncor-nocc
214            do j=1,nocc
215               inode=inode+1
216               if (inode.eq.next)then
217
218                  call ga_nbget(g_objv,1+(j-1)*lnov,j*lnov,av,av,Dja,
219     &                          lnov,nbh_objv1)
220                  do k = klo, khi
221                     call ga_nbget(g_objv,1+(j-1)*nvir+(k-1)*lnov,
222     &                    j*nvir+(k-1)*lnov,av,av,
223     &                    Djka(1+(k-klo)*nvir),nvir,nbh_objv4(k))
224                  enddo
225                  call ga_nbget(g_objo,(klo-1)*lnvv+1,khi*lnvv,j,j,Tkj,
226     &                          (khi-klo+1)*lnvv,nbh_objo1)
227                  call ga_nbget(g_objo,lnovv+(klo-1)*lnov+1,
228     &                          lnovv+khi*lnov,j,j,Jkj,
229     &                          (khi-klo+1)*lnov,nbh_objo2)
230                  call ga_nbget(g_objo,lnovv+lnoov+(klo-1)*lnov+1,
231     &                          lnovv+lnoov+khi*lnov,j,j,Kkj,
232     &                          (khi-klo+1)*lnov,nbh_objo3)
233                  if (akold .ne. a) then
234                     akold = a
235                     call ga_nbget(g_coul,1,lnvv,(a-oseg_lo)*nocc+klo,
236     &                    (a-oseg_lo)*nocc+khi,Jka,lnvv,nbh_coul1)
237                     call ga_nbget(g_exch,1,lnvv,(a-oseg_lo)*nocc+klo,
238     &                    (a-oseg_lo)*nocc+khi,Kka,lnvv,nbh_exch1)
239                     call ga_nbget(g_objv,1+lnoov+(klo-1)*lnov,
240     &                    lnoov+khi*lnov,av,av,Tka,(khi-klo+1)*lnov,
241     &                    nbh_objv2)
242                     call ga_nbget(g_objv,1+2*lnoov+(klo-1)*lnov,
243     &                    2*lnoov+khi*lnov,av,av,Xka,(khi-klo+1)*lnov,
244     &                    nbh_objv3)
245                  endif
246
247      kklnn=((khi-klo)*lnvv)+n_nvir
248      kklno=((khi-klo)*lnov)+nv_no
249      ea=eorb(a)
250!$omp task  shared( t1,eorb,kchunk,av,
251     &  Tkj,Tka,Xka,Jka,Kka,Jkj,Kkj,Dja,Djka,
252     &  ncor,nocc,nvir,emp4,emp5,oseg_lo,
253     &  g_objo,g_objv,g_coul,g_exch,
254     &  nbh_objv1,nbh_objv2,nbh_objv3,nbh_objo1,nbh_objo2,
255     &  nbh_objo3,nbh_exch1,nbh_coul1,nbh_objv4)
256     &   firstprivate(klo,khi,eaijk)
257       call   ccsd_iloop_host(t1,eorb,
258     &     g_objo,g_objv,g_coul,g_exch,
259     &     ncor,nocc,nvir,emp4,emp5,oseg_lo,
260     &     kchunk,Tkj,Tka,Xka,Jka,Kka,Jkj,Kkj, Dja, Djka,
261     &     j,a,klo,khi,av,eaijk,
262     &     nbh_objv1,nbh_objv2,nbh_objv3,nbh_objo1,nbh_objo2,
263     &	  nbh_objo3,nbh_exch1,nbh_coul1,nbh_objv4)
264!$omp end task
265 !dir$ offload_transfer target(mic)
266     I   in(Tkj:length(kklnn)  REUSE)
267     I   in(Kkj:length(kklno)  REUSE)
268     I   in(Jkj:length(kklno)  REUSE)
269     I   in(Jka:length(kklnn)  REUSE)
270     I   in(Tka:length(kklno)  REUSE)
271     I   in(Kka:length(kklnn)  REUSE)
272     I   in(Xka:length(kklno)  REUSE)
273                do i=1,nocc2
274!$omp  critical
275                     call ga_nbget(g_objv,1+(j-1)*nvir+(i-1)*lnov,
276     &                    j*nvir+(i-1)*lnov,av,av,Djia,nvir,nbh_objv5)
277                     call ga_nbget(g_objo,(i-1)*lnvv+1,i*lnvv,j,j,Tij,
278     &                    lnvv,nbh_objo4)
279                     call ga_nbget(g_objo,lnovv+(i-1)*lnov+1,
280     &                    lnovv+i*lnov,j,j,Jij,lnov,nbh_objo5)
281                     call ga_nbget(g_objo,lnovv+lnoov+(i-1)*lnov+1,
282     &                    lnovv+lnoov+i*lnov,j,j,Kij,lnov,nbh_objo6)
283                     call ga_nbget(g_coul,1,lnvv,(a-oseg_lo)*nocc+i,
284     &                    (a-oseg_lo)*nocc+i,Jia,lnvv,nbh_coul2)
285                     call ga_nbget(g_exch,1,lnvv,(a-oseg_lo)*nocc+i,
286     &                    (a-oseg_lo)*nocc+i,Kia,lnvv,nbh_exch2)
287                     call ga_nbget(g_objv,1+lnoov+(i-1)*lnov,
288     &                    lnoov+i*lnov,av,av,Tia,lnov,nbh_objv6)
289                     call ga_nbget(g_objv,1+2*lnoov+(i-1)*lnov,
290     &                    2*lnoov+i*lnov,av,av,Xia,lnov,nbh_objv7)
291!$omp end critical
292                     call dcopy(nvir,t1((i-1)*nvir+1),1,t1v2,1)
293                     call ga_nbwait(nbh_objv1) ! Dja
294                     call dcopy(nvir,Dja(1+(i-1)*nvir),1,dintc1,1)
295                     call ga_nbwait(nbh_objv5) ! Djia
296                     call dcopy(nvir,Djia,1,dintx1,1)
297
298                            call ga_nbwait(nbh_objv2)
299                            call ga_nbwait(nbh_objv3)
300                            call ga_nbwait(nbh_objv6)
301                            call ga_nbwait(nbh_objv7)
302                            call ga_nbwait(nbh_objo1)
303                            call ga_nbwait(nbh_objo2)
304                            call ga_nbwait(nbh_objo3)
305                            call ga_nbwait(nbh_objo4)
306                            call ga_nbwait(nbh_objo5)
307                            call ga_nbwait(nbh_objo6)
308                            call ga_nbwait(nbh_exch1)
309                            call ga_nbwait(nbh_exch2)
310                            call ga_nbwait(nbh_coul1)
311                            call ga_nbwait(nbh_coul2)
312                     do k=klo,min(khi,i)
313            call dcopy(nvir,t1((k-1)*nvir+1),1,t1v1((k-klo)*nvir+1),1)
314         call dcopy(nvir,Dja(1+(k-1)*nvir),1,dintc2((k-klo)*nvir+1),1)
315                            call ga_nbwait(nbh_objv4(k)) ! Djka
316      call dcopy(nvir,Djka(1+(k-klo)*nvir),1,dintx2((k-klo)*nvir+1),1)
317                     end do
318
319!dir$ offload begin target(mic)  inout(emp4z,emp5z)
320     I   in(emp4iz,emp4kz,emp5iz,emp5kz)
321     I   in(nvir,nocc,klo,lnvv,lnov,ncor)
322     I   in(eaijk,b,c,denomz,k,ea,i,j,khi)
323     I   nocopy(Tkj:length(kklnn)  REUSE)
324     I   nocopy(Kkj:length(kklno) REUSE)
325     I   nocopy(Jkj:length(kklno)  REUSE)
326     I   nocopy(Jka:length(kklnn)  REUSE)
327     I   nocopy(Tka:length(kklno)  REUSE)
328     I   nocopy(Kka:length(kklnn)  REUSE)
329     I   nocopy(Xka:length(kklno) REUSE)
330     I   nocopy(f1n:length(n_nvir) REUSE)
331     I   nocopy(f1t:length(n_nvir) REUSE)
332     I   nocopy(f2n:length(n_nvir) REUSE)
333     I   nocopy(f2t:length(n_nvir) REUSE)
334     I   nocopy(f3n:length(n_nvir) REUSE)
335     I   nocopy(f3t:length(n_nvir) REUSE)
336     I   nocopy(f4n:length(n_nvir) REUSE)
337     I   nocopy(f4t:length(n_nvir) REUSE)
338     N   nocopy(eorb(ncor+1:n_nn) : REUSE)
339     I   in(t1v1:length(nv_nk) REUSE)
340     I   in(dintc2:length(nv_nk) REUSE)
341     I   in(dintx2:length(nv_nk) REUSE)
342     I   in(Jia:length(n_nvir) REUSE)
343     I   in(Kia:length(n_nvir) REUSE)
344     I   in(Tia:length(nv_no) REUSE)
345     I   in(Xia:length(nv_no) REUSE)
346     I   in(Tij:length(n_nvir) REUSE)
347     I   in(Kij:length(nv_no) REUSE)
348     I   in(Jij:length(nv_no) REUSE)
349     N   in(t1v2:length(nvir) REUSE)
350     N   in(dintc1:length(nvir) REUSE)
351     N   in(dintx1:length(nvir) REUSE)
352
353                     do k=klo,min(khi,i)
354                        emp4iz = 0.0d0
355                        emp5iz = 0.0d0
356                        emp4kz = 0.0d0
357                        emp5kz = 0.0d0
358
359                        eaijk=ea - (eorb(ncor+i)
360     &                           +eorb(ncor+j)
361     &                           +eorb(ncor+k) )
362!$omp parallel
363!$omp& shared(eorb,eaijk)
364!$omp& shared(f1n,f2n,f3n,f4n,f1t,f2t,f3t,f4t)
365!$omp& shared(t1v1,dintc1,dintx1)
366!$omp& shared(t1v2,dintc2,dintx2)
367!$omp& private(b,c,denomz)
368!$omp& firstprivate(ncor,nocc,nvir,lnov,lnvv,i,j,k,klo)
369
370!$omp sections
371!$omp section
372                        call dgemm('n','t',nvir,nvir,nvir,1.0d0,
373     1                       Jia,nvir,Tkj(1+(k-klo)*lnvv),nvir,0.0d0,
374     2                       f1n,nvir)
375                        call dgemm('n','n',nvir,nvir,nocc,-1.0d0,
376     1                       Tia,nvir,Kkj(1+(k-klo)*lnov),nocc,1.0d0,
377     2                       f1n,nvir)
378!$omp section
379                        call dgemm('n','t',nvir,nvir,nvir,1.0d0,
380     1                       Kia,nvir,Tkj(1+(k-klo)*lnvv),nvir,0.0d0,
381     2                       f2n,nvir)
382                        call dgemm('n','n',nvir,nvir,nocc,-1.0d0,
383     1                       Xia,nvir,Kkj(1+(k-klo)*lnov),nocc,1.0d0,
384     2                       f2n,nvir)
385!$omp section
386                        call dgemm('n','n',nvir,nvir,nvir,1.0d0,
387     1                       Jia,nvir,Tkj(1+(k-klo)*lnvv),nvir,0.0d0,
388     2                       f3n,nvir)
389                        call dgemm('n','n',nvir,nvir,nocc,-1.0d0,
390     1                       Tia,nvir,Jkj(1+(k-klo)*lnov),nocc,1.0d0,
391     2                       f3n,nvir)
392!$omp section
393                        call dgemm('n','n',nvir,nvir,nvir,1.0d0,
394     1                       Kia,nvir,Tkj(1+(k-klo)*lnvv),nvir,0.0d0,
395     2                       f4n,nvir)
396                        call dgemm('n','n',nvir,nvir,nocc,-1.0d0,
397     1                       Xia,nvir,Jkj(1+(k-klo)*lnov),nocc,1.0d0,
398     2                       f4n,nvir)
399!$omp section
400                        call dgemm('n','t',nvir,nvir,nvir,1.0d0,
401     1                       Jka(1+(k-klo)*lnvv),nvir,Tij,nvir,0.0d0,
402     2                       f1t,nvir)
403                        call dgemm('n','n',nvir,nvir,nocc,-1.0d0,
404     1                       Tka(1+(k-klo)*lnov),nvir,Kij,nocc,1.0d0,
405     2                       f1t,nvir)
406!$omp section
407                        call dgemm('n','t',nvir,nvir,nvir,1.0d0,
408     1                       Kka(1+(k-klo)*lnvv),nvir,Tij,nvir,0.0d0,
409     2                       f2t,nvir)
410                        call dgemm('n','n',nvir,nvir,nocc,-1.0d0,
411     1                       Xka(1+(k-klo)*lnov),nvir,Kij,nocc,1.0d0,
412     2                       f2t,nvir)
413!$omp section
414                        call dgemm('n','n',nvir,nvir,nvir,1.0d0,
415     1                       Jka(1+(k-klo)*lnvv),nvir,Tij,nvir,0.0d0,
416     2                       f3t,nvir)
417                        call dgemm('n','n',nvir,nvir,nocc,-1.0d0,
418     1                       Tka(1+(k-klo)*lnov),nvir,Jij,nocc,1.0d0,
419     2                       f3t,nvir)
420!$omp section
421                        call dgemm('n','n',nvir,nvir,nvir,1.0d0,
422     1                       Kka(1+(k-klo)*lnvv),nvir,Tij,nvir,0.0d0,
423     2                       f4t,nvir)
424                        call dgemm('n','n',nvir,nvir,nocc,-1.0d0,
425     1                       Xka(1+(k-klo)*lnov),nvir,Jij,nocc,1.0d0,
426     2                       f4t,nvir)
427!$omp end sections
428
429
430
431
432!$omp do collapse(2)
433!$omp& schedule(static)
434!$omp& reduction(+:emp5iz,emp4iz)
435!$omp& reduction(+:emp5kz,emp4kz)
436               do b=1,nvir
437                 do c=1,nvir
438                   denomz=-1.0d0/( eorb(ncor+nocc+b)
439     &                           +eorb(ncor+nocc+c)+eaijk )
440                   emp4iz=emp4iz+denomz*
441     &                  (f1t(b,c)+f1n(c,b)+f2t(c,b)+f3n(b,c)+f4n(c,b))*
442     &                  (f1t(b,c)-2*f2t(b,c)-2*f3t(b,c)+f4t(b,c))
443                   emp4iz=emp4iz-denomz*
444     &                  (f1n(b,c)+f1t(c,b)+f2n(c,b)+f3n(c,b))*
445     &                  (2*f1t(b,c)-f2t(b,c)-f3t(b,c)+2*f4t(b,c))
446                   emp4iz=emp4iz+3*denomz*(
447     &                  f1n(b,c)*(f1n(b,c)+f3n(c,b)+2*f4t(c,b))+
448     &                  f2n(b,c)*f2t(c,b)+f3n(b,c)*f4t(b,c))
449                   emp4kz=emp4kz+denomz*
450     &                  (f1n(b,c)+f1t(c,b)+f2n(c,b)+f3t(b,c)+f4t(c,b))*
451     &                  (f1n(b,c)-2*f2n(b,c)-2*f3n(b,c)+f4n(b,c))
452                   emp4kz=emp4kz-denomz*
453     &                  (f1t(b,c)+f1n(c,b)+f2t(c,b)+f3t(c,b))*
454     &                  (2*f1n(b,c)-f2n(b,c)-f3n(b,c)+2*f4n(b,c))
455                   emp4kz=emp4kz+3*denomz*(
456     &                  f1t(b,c)*(f1t(b,c)+f3t(c,b)+2*f4n(c,b))+
457     &                  f2t(b,c)*f2n(c,b)+f3t(b,c)*f4n(b,c))
458                   emp5iz=emp5iz+denomz*t1v1((k-klo)*nvir+b)*dintx1(c)*
459     &                 (    f1t(b,c)+f2n(b,c)+f4n(c,b)
460     &                  -2*(f3t(b,c)+f4n(b,c)+f2n(c,b)+
461     &                      f1n(b,c)+f2t(b,c)+f3n(c,b))
462     &                  +4*(f3n(b,c)+f4t(b,c)+f1n(c,b)))
463                   emp5iz=emp5iz+denomz*t1v1((k-klo)*nvir+b)*dintc1(c)*
464     &                 (     f1n(b,c)+f4n(b,c)+f1t(c,b)
465     &                   -2*(f2n(b,c)+f3n(b,c)+f2t(c,b)))
466                   emp5kz=emp5kz+denomz*t1v2(b)*dintx2((k-klo)*nvir+c)*
467     &                 (    f1n(b,c)+f2t(b,c)+f4t(c,b)
468     &                  -2*(f3n(b,c)+f4t(b,c)+f2t(c,b)+
469     &                      f1t(b,c)+f2n(b,c)+f3t(c,b))
470     &                  +4*(f3t(b,c)+f4n(b,c)+f1t(c,b)))
471                   emp5kz=emp5kz+denomz*t1v2(b)*dintc2((k-klo)*nvir+c)*
472     &                 (     f1t(b,c)+f4t(b,c)+f1n(c,b)
473     &                   -2*(f2t(b,c)+f3t(b,c)+f2n(c,b)))
474                 enddo
475               enddo
476!$omp end do
477!$omp end parallel
478
479                         emp4z = emp4z + emp4iz
480                         emp5z = emp5z + emp5iz
481                         if (i.ne.k) then
482                             emp4z = emp4z + emp4kz
483                             emp5z = emp5z + emp5kz
484                         end if ! (i.ne.k)
485                     end do    ! k
486!dir$ end offload
487                  end do       ! i
488!$omp   taskwait
489c                  if (iprt.gt.50)then
490c                     write(6,1234)iam,a,j,emp4,emp5
491c 1234                format(' iam aijk',3i5,2e15.5)
492c                  end if
493                  next=nxtask(nodes, 1)
494               end if
495            end do
496            if(ga_nodeid().eq.0) then
497               write(6,4321) ' ccsd(t): done ',
498     A              a-(ncor+nocc)+((klo-1)/kchunk)*nvir,
499     O              ' out of ',(nocc/kchunk)*nvir,
500     O              ' progress: ',
501     O              ((a-(ncor+nocc)+((klo-1)/kchunk)*nvir)*100d0)/
502     D              ((nocc/kchunk)*nvir),
503     P              '%'
504 4321          format(a,i8,a,i8,a,f6.1,a1)
505            endif
506         end do
507      end do
508                         emp4 =emp4 + emp4z
509                         emp5 = emp5 + emp5z
510      call ga_sync()
511      next=nxtask(-nodes, 1)
512      call ga_sync()
513      if (occsdps) then
514         call pstat_off(ps_trpdrv)
515      else
516         call qexit('trpdrv',0)
517      endif
518c
519!dir$ offload_transfer target(mic)
520     I   nocopy(f1n:length(n_nvir) FREE)
521     I   nocopy(f1t:length(n_nvir) FREE)
522     I   nocopy(f2n:length(n_nvir) FREE)
523     I   nocopy(f2t:length(n_nvir) FREE)
524     I   nocopy(f3n:length(n_nvir) FREE)
525     I   nocopy(f3t:length(n_nvir) FREE)
526     I   nocopy(f4n:length(n_nvir) FREE)
527     I   nocopy(f4t:length(n_nvir) FREE)
528     N   nocopy(eorb(ncor+1:n_nn) : FREE)
529     I   nocopy(Jia:length(n_nvir) FREE)
530     I   nocopy(Kia:length(n_nvir) FREE)
531     I   nocopy(Tia:length(nv_no) FREE)
532     I   nocopy(Xia:length(nv_no) FREE)
533     I   nocopy(Tij:length(n_nvir) FREE)
534     I   nocopy(Kij:length(nv_no) FREE)
535     I   nocopy(Jij:length(nv_no) FREE)
536     N   nocopy(t1v2:length(nvir) FREE)
537     N   nocopy(dintc1:length(nvir) FREE)
538     N   nocopy(dintx1:length(nvir) FREE)
539     I   nocopy(t1v1:length(nv_nk) FREE)
540     I   nocopy(dintc2:length(nv_nk) FREE)
541     I   nocopy(dintx2:length(nv_nk) FREE)
542     I   nocopy(Tkj:length(klnn) FREE)
543     I   nocopy(Kkj:length(klno) FREE)
544     I   nocopy(Jkj:length(klno) FREE)
545     I   nocopy(Jka:length(klnn) FREE)
546     I   nocopy(Tka:length(klno) FREE)
547     I   nocopy(Kka:length(klnn) FREE)
548     I   nocopy(Xka:length(klno) FREE)
549
550!$omp end single
551!$omp end parallel
552      end
553
554
555
556      subroutine ccsd_iloop_host(t1,eorb,
557     &     g_objo,g_objv,g_coul,g_exch,
558     &     ncor,nocc,nvir,emp4,emp5,oseg_lo,
559     &     kchunk,Tkj,Tka,Xka,Jka,Kka,Jkj,Kkj,Dja,Djka,
560     &     j,a,klo,khi,av,eaijk,
561     &     nbh_objv1,nbh_objv2,nbh_objv3,nbh_objo1,nbh_objo2,
562     &	  nbh_objo3,nbh_exch1,nbh_coul1,nbh_objv4)
563C     $Id: ccsd_trpdrv_omp.F 26674 2015-01-08 14:36:59Z jhammond $
564      implicit none
565c
566#include "global.fh"
567#include "ccsd_len.fh"
568#include "ccsdps.fh"
569      integer ncor,nocc,nvir
570      double precision t1(*)
571      double precision f1n(nvir,nvir),f1t(nvir,nvir)
572      double precision f2n(nvir,nvir),f2t(nvir,nvir)
573      double precision f3n(nvir,nvir),f3t(nvir,nvir)
574      double precision f4n(nvir,nvir),f4t(nvir,nvir)
575      double precision eorb(*)
576
577      double precision Tij(nvir*nvir), Tia(nvir*nocc)
578      double precision Xia(nvir*nocc)
579      double precision Jia(nvir*nvir), Jij(nvir*nocc)
580      double precision Kia(nvir*nvir), Kij(nvir*nocc)
581      double precision Djia(nvir)
582
583      double precision Tkj(*), Tka(*)
584      double precision Xka(*)
585      double precision Jka(*), Jkj(*)
586      double precision Kka(*), Kkj(*)
587      double precision Dja(*), Djka(*)
588
589      double precision dintc1(nvir),dintx1(nvir),t1v1(nvir)
590      double precision dintc2(nvir),dintx2(nvir),t1v2(nvir)
591c
592      double precision emp4,emp5,denom
593      double precision emp4i,emp5i,emp4k,emp5k
594      double precision eaijk
595      integer g_objo,g_objv,g_coul,g_exch
596      integer oseg_lo,oseg_hi
597      integer a,b,c,i,j,k,av
598      integer klo, khi,nocc2,kchunk
599
600
601
602       integer nbh_objv1,nbh_objv2,nbh_objv3
603       integer nbh_objv5,nbh_objv6,nbh_objv7
604       integer nbh_objv4(nocc)
605c
606       integer nbh_objo1,nbh_objo2,nbh_objo3
607       integer nbh_objo4,nbh_objo5,nbh_objo6
608c
609       integer nbh_exch1,nbh_exch2,nbh_coul1,nbh_coul2
610
611            nocc2=nocc/2
612                  do i=nocc2+1,nocc
613!$omp critical
614                     call ga_nbget(g_objv,1+(j-1)*nvir+(i-1)*lnov,
615     &                    j*nvir+(i-1)*lnov,av,av,Djia,nvir,nbh_objv5)
616                     call ga_nbget(g_objo,(i-1)*lnvv+1,i*lnvv,j,j,Tij,
617     &                    lnvv,nbh_objo4)
618                     call ga_nbget(g_objo,lnovv+(i-1)*lnov+1,
619     &                    lnovv+i*lnov,j,j,Jij,lnov,nbh_objo5)
620                     call ga_nbget(g_objo,lnovv+lnoov+(i-1)*lnov+1,
621     &                    lnovv+lnoov+i*lnov,j,j,Kij,lnov,nbh_objo6)
622                     call ga_nbget(g_coul,1,lnvv,(a-oseg_lo)*nocc+i,
623     &                    (a-oseg_lo)*nocc+i,Jia,lnvv,nbh_coul2)
624                     call ga_nbget(g_exch,1,lnvv,(a-oseg_lo)*nocc+i,
625     &                    (a-oseg_lo)*nocc+i,Kia,lnvv,nbh_exch2)
626                     call ga_nbget(g_objv,1+lnoov+(i-1)*lnov,
627     &                    lnoov+i*lnov,av,av,Tia,lnov,nbh_objv6)
628                     call ga_nbget(g_objv,1+2*lnoov+(i-1)*lnov,
629     &                    2*lnoov+i*lnov,av,av,Xia,lnov,nbh_objv7)
630!$omp end critical
631                     call dcopy(nvir,t1((i-1)*nvir+1),1,t1v2,1)
632                     call ga_nbwait(nbh_objv1) ! Dja
633                     call dcopy(nvir,Dja(1+(i-1)*nvir),1,dintc1,1)
634                     call ga_nbwait(nbh_objv5) ! Djia
635                     call dcopy(nvir,Djia,1,dintx1,1)
636
637
638                            call ga_nbwait(nbh_objv2)
639                            call ga_nbwait(nbh_objv3)
640                            call ga_nbwait(nbh_objv6)
641                            call ga_nbwait(nbh_objv7)
642                            call ga_nbwait(nbh_objo1)
643                            call ga_nbwait(nbh_objo2)
644                            call ga_nbwait(nbh_objo3)
645                            call ga_nbwait(nbh_objo4)
646                            call ga_nbwait(nbh_objo5)
647                            call ga_nbwait(nbh_objo6)
648                            call ga_nbwait(nbh_exch1)
649                            call ga_nbwait(nbh_exch2)
650                            call ga_nbwait(nbh_coul1)
651                            call ga_nbwait(nbh_coul2)
652
653                     do k=klo,min(khi,i)
654
655                        call dcopy(nvir,t1((k-1)*nvir+1),1,t1v1,1)
656                        call dcopy(nvir,Dja(1+(k-1)*nvir),1,dintc2,1)
657                        call ga_nbwait(nbh_objv4(k)) ! Djka
658                      call dcopy(nvir,Djka(1+(k-klo)*nvir),1,dintx2,1)
659
660                        emp4i = 0.0d0
661                        emp5i = 0.0d0
662                        emp4k = 0.0d0
663                        emp5k = 0.0d0
664
665
666!$omp parallel
667!$omp& shared(eorb)
668!$omp& shared(f1n,f2n,f3n,f4n,f1t,f2t,f3t,f4t)
669!$omp& shared(t1v1,dintc1,dintx1)
670!$omp& shared(t1v2,dintc2,dintx2)
671!$omp& private(b,c,eaijk,denom)
672!$omp& firstprivate(ncor,nocc,nvir,lnov,lnvv,i,j,k,klo)
673
674!$omp sections
675!$omp section
676                        call dgemm('n','t',nvir,nvir,nvir,1.0d0,
677     1                       Jia,nvir,Tkj(1+(k-klo)*lnvv),nvir,0.0d0,
678     2                       f1n,nvir)
679                        call dgemm('n','n',nvir,nvir,nocc,-1.0d0,
680     1                       Tia,nvir,Kkj(1+(k-klo)*lnov),nocc,1.0d0,
681     2                       f1n,nvir)
682!$omp section
683                        call dgemm('n','t',nvir,nvir,nvir,1.0d0,
684     1                       Kia,nvir,Tkj(1+(k-klo)*lnvv),nvir,0.0d0,
685     2                       f2n,nvir)
686                        call dgemm('n','n',nvir,nvir,nocc,-1.0d0,
687     1                       Xia,nvir,Kkj(1+(k-klo)*lnov),nocc,1.0d0,
688     2                       f2n,nvir)
689!$omp section
690                        call dgemm('n','n',nvir,nvir,nvir,1.0d0,
691     1                       Jia,nvir,Tkj(1+(k-klo)*lnvv),nvir,0.0d0,
692     2                       f3n,nvir)
693                        call dgemm('n','n',nvir,nvir,nocc,-1.0d0,
694     1                       Tia,nvir,Jkj(1+(k-klo)*lnov),nocc,1.0d0,
695     2                       f3n,nvir)
696!$omp section
697                        call dgemm('n','n',nvir,nvir,nvir,1.0d0,
698     1                       Kia,nvir,Tkj(1+(k-klo)*lnvv),nvir,0.0d0,
699     2                       f4n,nvir)
700                        call dgemm('n','n',nvir,nvir,nocc,-1.0d0,
701     1                       Xia,nvir,Jkj(1+(k-klo)*lnov),nocc,1.0d0,
702     2                       f4n,nvir)
703!$omp section
704                        call dgemm('n','t',nvir,nvir,nvir,1.0d0,
705     1                       Jka(1+(k-klo)*lnvv),nvir,Tij,nvir,0.0d0,
706     2                       f1t,nvir)
707                        call dgemm('n','n',nvir,nvir,nocc,-1.0d0,
708     1                       Tka(1+(k-klo)*lnov),nvir,Kij,nocc,1.0d0,
709     2                       f1t,nvir)
710!$omp section
711                        call dgemm('n','t',nvir,nvir,nvir,1.0d0,
712     1                       Kka(1+(k-klo)*lnvv),nvir,Tij,nvir,0.0d0,
713     2                       f2t,nvir)
714                        call dgemm('n','n',nvir,nvir,nocc,-1.0d0,
715     1                       Xka(1+(k-klo)*lnov),nvir,Kij,nocc,1.0d0,
716     2                       f2t,nvir)
717!$omp section
718                        call dgemm('n','n',nvir,nvir,nvir,1.0d0,
719     1                       Jka(1+(k-klo)*lnvv),nvir,Tij,nvir,0.0d0,
720     2                       f3t,nvir)
721                        call dgemm('n','n',nvir,nvir,nocc,-1.0d0,
722     1                       Tka(1+(k-klo)*lnov),nvir,Jij,nocc,1.0d0,
723     2                       f3t,nvir)
724!$omp section
725                        call dgemm('n','n',nvir,nvir,nvir,1.0d0,
726     1                       Kka(1+(k-klo)*lnvv),nvir,Tij,nvir,0.0d0,
727     2                       f4t,nvir)
728                        call dgemm('n','n',nvir,nvir,nocc,-1.0d0,
729     1                       Xka(1+(k-klo)*lnov),nvir,Jij,nocc,1.0d0,
730     2                       f4t,nvir)
731!$omp end sections
732
733
734
735                        eaijk=eorb(a) - (  eorb(ncor+i)
736     &                                    +eorb(ncor+j)
737     &                                    +eorb(ncor+k) )
738!$omp do collapse(2)
739!$omp& schedule(static)
740!$omp& reduction(+:emp5i,emp4i)
741!$omp& reduction(+:emp5k,emp4k)
742               do b=1,nvir
743                 do c=1,nvir
744                   denom=-1.0d0/( eorb(ncor+nocc+b)
745     &                           +eorb(ncor+nocc+c)+eaijk )
746                   emp4i=emp4i+denom*
747     &                  (f1t(b,c)+f1n(c,b)+f2t(c,b)+f3n(b,c)+f4n(c,b))*
748     &                  (f1t(b,c)-2*f2t(b,c)-2*f3t(b,c)+f4t(b,c))
749                   emp4i=emp4i-denom*
750     &                  (f1n(b,c)+f1t(c,b)+f2n(c,b)+f3n(c,b))*
751     &                  (2*f1t(b,c)-f2t(b,c)-f3t(b,c)+2*f4t(b,c))
752                   emp4i=emp4i+3*denom*(
753     &                  f1n(b,c)*(f1n(b,c)+f3n(c,b)+2*f4t(c,b))+
754     &                  f2n(b,c)*f2t(c,b)+f3n(b,c)*f4t(b,c))
755                   emp4k=emp4k+denom*
756     &                  (f1n(b,c)+f1t(c,b)+f2n(c,b)+f3t(b,c)+f4t(c,b))*
757     &                  (f1n(b,c)-2*f2n(b,c)-2*f3n(b,c)+f4n(b,c))
758                   emp4k=emp4k-denom*
759     &                  (f1t(b,c)+f1n(c,b)+f2t(c,b)+f3t(c,b))*
760     &                  (2*f1n(b,c)-f2n(b,c)-f3n(b,c)+2*f4n(b,c))
761                   emp4k=emp4k+3*denom*(
762     &                  f1t(b,c)*(f1t(b,c)+f3t(c,b)+2*f4n(c,b))+
763     &                  f2t(b,c)*f2n(c,b)+f3t(b,c)*f4n(b,c))
764                   emp5i=emp5i+denom*t1v1(b)*dintx1(c)*
765     &                 (    f1t(b,c)+f2n(b,c)+f4n(c,b)
766     &                  -2*(f3t(b,c)+f4n(b,c)+f2n(c,b)+
767     &                      f1n(b,c)+f2t(b,c)+f3n(c,b))
768     &                  +4*(f3n(b,c)+f4t(b,c)+f1n(c,b)))
769                   emp5i=emp5i+denom*t1v1(b)*dintc1(c)*
770     &                 (     f1n(b,c)+f4n(b,c)+f1t(c,b)
771     &                   -2*(f2n(b,c)+f3n(b,c)+f2t(c,b)))
772                   emp5k=emp5k+denom*t1v2(b)*dintx2(c)*
773     &                 (    f1n(b,c)+f2t(b,c)+f4t(c,b)
774     &                  -2*(f3n(b,c)+f4t(b,c)+f2t(c,b)+
775     &                      f1t(b,c)+f2n(b,c)+f3t(c,b))
776     &                  +4*(f3t(b,c)+f4n(b,c)+f1t(c,b)))
777                   emp5k=emp5k+denom*t1v2(b)*dintc2(c)*
778     &                 (     f1t(b,c)+f4t(b,c)+f1n(c,b)
779     &                   -2*(f2t(b,c)+f3t(b,c)+f2n(c,b)))
780                 enddo
781               enddo
782!$omp end do
783!$omp end parallel
784
785                         emp4 = emp4 + emp4i
786                         emp5 = emp5 + emp5i
787                         if (i.ne.k) then
788                             emp4 = emp4 + emp4k
789                             emp5 = emp5 + emp5k
790                         end if ! (i.ne.k)
791                     end do    ! k
792                  end do       ! i
793        end
794