1#define ALLOC alloc_if(.true.) free_if(.false.) 2#define FREE alloc_if(.false.) free_if(.true.) 3#define REUSE alloc_if(.false.) free_if(.false.) 4 5 subroutine ccsd_trpdrv_offload(t1, 6 & f1n,f1t,f2n,f2t,f3n,f3t,f4n,f4t,eorb, 7 & g_objo,g_objv,g_coul,g_exch, 8 & ncor,nocc,nvir,emp4,emp5, 9 & oseg_lo,oseg_hi, 10 & kchunk, Tij, Tkj, Tia, Tka, Xia, Xka, Jia, Jka, Kia, Kka, 11 & Jij, Jkj, Kij, Kkj, Dja, Djka, Djia) 12C $Id: ccsd_trpdrv_offload.F 26674 2015-01-08 14:36:59Z jhammond $ 13 implicit none 14c 15cdir$ ATTRIBUTES OFFLOAD : mic :: dgemm 16cdir$ ATTRIBUTES OFFLOAD : mic :: lnov 17cdir$ ATTRIBUTES OFFLOAD : mic :: lnvv 18cdir$ ATTRIBUTES OFFLOAD : mic :: omp_set_num_threads 19cdir$ ATTRIBUTES OFFLOAD : mic :: omp_set_nested 20cdir$ ATTRIBUTES OFFLOAD : mic :: mkl_set_num_threads 21cdir$ ATTRIBUTES OFFLOAD : mic :: mkl_set_dynamic 22cdir$ ATTRIBUTES OFFLOAD : mic :: kmp_set_defaults 23 24#include "global.fh" 25#include "ccsd_len.fh" 26#include "ccsdps.fh" 27c 28 integer ncor,nocc,nvir 29 double precision t1(*) 30 double precision f1n(nvir,nvir),f1t(nvir,nvir) 31 double precision f2n(nvir,nvir),f2t(nvir,nvir) 32 double precision f3n(nvir,nvir),f3t(nvir,nvir) 33 double precision f4n(nvir,nvir),f4t(nvir,nvir) 34 35 double precision Tij(nvir*nvir), Tia(nvir*nocc) 36 double precision Xia(nvir*nocc) 37 double precision Jia(nvir*nvir), Jij(nvir*nocc) 38 double precision Kia(nvir*nvir), Kij(nvir*nocc) 39 double precision Djia(nvir) 40 41 double precision dintc1(nvir), dintx1(nvir) 42 double precision t1v2(nvir) 43 double precision t1v1(nvir*kchunk) 44 double precision dintc2(nvir*kchunk) 45 double precision dintx2(nvir*kchunk) 46 double precision eorb(*) 47 double precision Tkj(*), Tka(*) 48 double precision Xka(*) 49 double precision Jka(*), Jkj(*) 50 double precision Kka(*), Kkj(*) 51 double precision Dja(*), Djka(*) 52! used to make inline threaded tengy correct - for now 53c 54 55 double precision emp4,emp5 56 double precision emp4z,emp5z,denomz 57 double precision emp4iz,emp5iz,emp4kz,emp5kz 58 double precision eaijk,ea 59 integer g_objo,g_objv,g_coul,g_exch 60 integer inode,next,nodes,iam 61 integer oseg_lo,oseg_hi 62 integer a,b,c,i,j,k,akold,av 63 integer klo, khi, kchunk,nocc2 64 integer nxtask 65 external nxtask 66c 67c Dependencies (global array, local array, handle): 68c 69c These are waited on first 70c 71c g_objv, Dja, nbh_objv1 72c g_objv, Djka(1+(k-klo)*nvir), nbh_objv4(k) 73c g_objv, Djia, nbh_objv5 74c 75c These are waited on later 76c 77c g_objv, Tka, nbh_objv2 78c g_objv, Xka, nbh_objv3 79c g_objv, Tia, nbh_objv6 80c g_objv, Xia, nbh_objv7 81c g_objo, Tkj, nbh_objo1 82c g_objo, Jkj, nbh_objo2 83c g_objo, Kkj, nbh_objo3 84c g_objo, Tij, nbh_objo4 85c g_objo, Jij, nbh_objo5 86c g_objo, Kij, nbh_objo6 87c g_exch, Kka, nbh_exch1 88c g_exch, Kia, nbh_exch2 89c g_coul, Jka, nbh_coul1 90c g_coul, Jia, nbh_coul2 91c 92c non-blocking handles 93c 94 integer nbh_objv1,nbh_objv2,nbh_objv3 95 integer nbh_objv5,nbh_objv6,nbh_objv7 96 integer nbh_objv4(nocc) 97c 98 integer nbh_objo1,nbh_objo2,nbh_objo3 99 integer nbh_objo4,nbh_objo5,nbh_objo6 100c 101 integer nbh_exch1,nbh_exch2,nbh_coul1,nbh_coul2 102 integer n_nvir,nc_no1,n_nn,nv_no,klnn,klno,nv_nk 103 integer kklnn,kklno 104c 105#ifdef _OPENMP 106 integer omp_get_thread_num 107 external omp_get_thread_num 108 integer omp_get_num_threads 109 external omp_get_num_threads 110 integer omp_get_max_threads 111 external omp_get_max_threads 112 if (ga_nodeid().eq.0) write(6,99) omp_get_max_threads() 113 99 format(2x,'Using ',i2,' OpenMP threads in CCSD(T)') 114#endif 115c 116CDIR$ ASSUME_ALIGNED f1t: 64 117CDIR$ ASSUME_ALIGNED f2n: 64 118CDIR$ ASSUME_ALIGNED f2t: 64 119CDIR$ ASSUME_ALIGNED f3n: 64 120CDIR$ ASSUME_ALIGNED f3t: 64 121CDIR$ ASSUME_ALIGNED f4n: 64 122CDIR$ ASSUME_ALIGNED f4t: 64 123CDIR$ ASSUME_ALIGNED dintc1: 64 124CDIR$ ASSUME_ALIGNED dintx1: 64 125CDIR$ ASSUME_ALIGNED t1v1: 64 126CDIR$ ASSUME_ALIGNED dintc2: 64 127CDIR$ ASSUME_ALIGNED dintx2: 64 128CDIR$ ASSUME_ALIGNED t1v2: 64 129CDIR$ ASSUME_ALIGNED Tij: 64 130CDIR$ ASSUME_ALIGNED Tia: 64 131CDIR$ ASSUME_ALIGNED Xia: 64 132CDIR$ ASSUME_ALIGNED Jia: 64 133CDIR$ ASSUME_ALIGNED Jij: 64 134CDIR$ ASSUME_ALIGNED Kia: 64 135CDIR$ ASSUME_ALIGNED Kij: 64 136 137c 138 call omp_set_nested(.true.) 139!$omp parallel 140!$omp& shared(eorb,kchunk,ea) 141!$omp& shared(Tkj,Tka,Xka,Jka,Kka,Jkj,Kkj) 142!$omp& shared(ncor,nocc,nvir) 143!$omp& shared(emp4z,emp5z) 144!$omp& private(n_nvir,n_nn,nv_no,klnn,klno,kklnn,kklno) 145!$omp& private(f1n,f1t,f2n,f2t,f3n,f3t,f4n,f4t) 146!$omp& private(Tij,Tia,Xia,Jia,Jij,Kia,Kij) 147!$omp& private(Djia,dintc1,dintx1,t1v2,t1v1,dintc2,dintx2) 148!$omp& private(i,j,a,klo,k,eaijk,khi) 149!$omp single 150 nodes = ga_nnodes() 151 iam = ga_nodeid() 152c 153c call ga_sync() ! ga_sync called just before trpdrv in aoccsd2 154c 155 emp4z=0.0d0 156 emp5z=0.0d0 157 if (occsdps) then 158 call pstat_on(ps_trpdrv) 159 else 160 call qenter('trpdrv',0) 161 endif 162 inode=-1 163 nocc2=nocc/2 164 next=nxtask(nodes, 1) 165 n_nvir = nvir*nvir 166 nc_no1 = ncor+nocc+1 167 nv_no = nvir*nocc 168 nv_nk = nvir*kchunk 169 n_nn = ncor+nocc+nvir 170 klnn = ((kchunk-1)*lnvv)+n_nvir 171 klno = ((kchunk-1)*lnov)+nv_no 172!dir$ offload begin target(mic) 173 call omp_set_num_threads(64) 174 call omp_set_nested(.true.) 175 call mkl_set_num_threads(4) 176 call mkl_set_dynamic(.false.) 177!dir$ end offload 178 179!dir$ offload_transfer target(mic) 180 I nocopy(f1n:length(n_nvir) ALLOC) 181 I nocopy(f1t:length(n_nvir) ALLOC) 182 I nocopy(f2n:length(n_nvir) ALLOC) 183 I nocopy(f2t:length(n_nvir) ALLOC) 184 I nocopy(f3n:length(n_nvir) ALLOC) 185 I nocopy(f3t:length(n_nvir) ALLOC) 186 I nocopy(f4n:length(n_nvir) ALLOC) 187 I nocopy(f4t:length(n_nvir) ALLOC) 188 N in(eorb(ncor+1:n_nn) : ALLOC) 189 I nocopy(Jia:length(n_nvir) ALLOC) 190 I nocopy(Kia:length(n_nvir) ALLOC) 191 I nocopy(Tia:length(nv_no) ALLOC) 192 I nocopy(Xia:length(nv_no) ALLOC) 193 I nocopy(Tij:length(n_nvir) ALLOC) 194 I nocopy(Kij:length(nv_no) ALLOC) 195 I nocopy(Jij:length(nv_no) ALLOC) 196 N nocopy(t1v2:length(nvir) ALLOC) 197 N nocopy(dintc1:length(nvir) ALLOC) 198 N nocopy(dintx1:length(nvir) ALLOC) 199 I nocopy(t1v1:length(nv_nk) ALLOC) 200 I nocopy(dintc2:length(nv_nk) ALLOC) 201 I nocopy(dintx2:length(nv_nk) ALLOC) 202 I nocopy(Tkj:length(klnn) ALLOC) 203 I nocopy(Kkj:length(klno) ALLOC) 204 I nocopy(Jkj:length(klno) ALLOC) 205 I nocopy(Jka:length(klnn) ALLOC) 206 I nocopy(Tka:length(klno) ALLOC) 207 I nocopy(Kka:length(klnn) ALLOC) 208 I nocopy(Xka:length(klno) ALLOC) 209 do klo = 1, nocc, kchunk 210 akold=0 211 khi = min(nocc, klo+kchunk-1) 212 do a=oseg_lo,oseg_hi 213 av=a-ncor-nocc 214 do j=1,nocc 215 inode=inode+1 216 if (inode.eq.next)then 217 218 call ga_nbget(g_objv,1+(j-1)*lnov,j*lnov,av,av,Dja, 219 & lnov,nbh_objv1) 220 do k = klo, khi 221 call ga_nbget(g_objv,1+(j-1)*nvir+(k-1)*lnov, 222 & j*nvir+(k-1)*lnov,av,av, 223 & Djka(1+(k-klo)*nvir),nvir,nbh_objv4(k)) 224 enddo 225 call ga_nbget(g_objo,(klo-1)*lnvv+1,khi*lnvv,j,j,Tkj, 226 & (khi-klo+1)*lnvv,nbh_objo1) 227 call ga_nbget(g_objo,lnovv+(klo-1)*lnov+1, 228 & lnovv+khi*lnov,j,j,Jkj, 229 & (khi-klo+1)*lnov,nbh_objo2) 230 call ga_nbget(g_objo,lnovv+lnoov+(klo-1)*lnov+1, 231 & lnovv+lnoov+khi*lnov,j,j,Kkj, 232 & (khi-klo+1)*lnov,nbh_objo3) 233 if (akold .ne. a) then 234 akold = a 235 call ga_nbget(g_coul,1,lnvv,(a-oseg_lo)*nocc+klo, 236 & (a-oseg_lo)*nocc+khi,Jka,lnvv,nbh_coul1) 237 call ga_nbget(g_exch,1,lnvv,(a-oseg_lo)*nocc+klo, 238 & (a-oseg_lo)*nocc+khi,Kka,lnvv,nbh_exch1) 239 call ga_nbget(g_objv,1+lnoov+(klo-1)*lnov, 240 & lnoov+khi*lnov,av,av,Tka,(khi-klo+1)*lnov, 241 & nbh_objv2) 242 call ga_nbget(g_objv,1+2*lnoov+(klo-1)*lnov, 243 & 2*lnoov+khi*lnov,av,av,Xka,(khi-klo+1)*lnov, 244 & nbh_objv3) 245 endif 246 247 kklnn=((khi-klo)*lnvv)+n_nvir 248 kklno=((khi-klo)*lnov)+nv_no 249 ea=eorb(a) 250!$omp task shared( t1,eorb,kchunk,av, 251 & Tkj,Tka,Xka,Jka,Kka,Jkj,Kkj,Dja,Djka, 252 & ncor,nocc,nvir,emp4,emp5,oseg_lo, 253 & g_objo,g_objv,g_coul,g_exch, 254 & nbh_objv1,nbh_objv2,nbh_objv3,nbh_objo1,nbh_objo2, 255 & nbh_objo3,nbh_exch1,nbh_coul1,nbh_objv4) 256 & firstprivate(klo,khi,eaijk) 257 call ccsd_iloop_host(t1,eorb, 258 & g_objo,g_objv,g_coul,g_exch, 259 & ncor,nocc,nvir,emp4,emp5,oseg_lo, 260 & kchunk,Tkj,Tka,Xka,Jka,Kka,Jkj,Kkj, Dja, Djka, 261 & j,a,klo,khi,av,eaijk, 262 & nbh_objv1,nbh_objv2,nbh_objv3,nbh_objo1,nbh_objo2, 263 & nbh_objo3,nbh_exch1,nbh_coul1,nbh_objv4) 264!$omp end task 265 !dir$ offload_transfer target(mic) 266 I in(Tkj:length(kklnn) REUSE) 267 I in(Kkj:length(kklno) REUSE) 268 I in(Jkj:length(kklno) REUSE) 269 I in(Jka:length(kklnn) REUSE) 270 I in(Tka:length(kklno) REUSE) 271 I in(Kka:length(kklnn) REUSE) 272 I in(Xka:length(kklno) REUSE) 273 do i=1,nocc2 274!$omp critical 275 call ga_nbget(g_objv,1+(j-1)*nvir+(i-1)*lnov, 276 & j*nvir+(i-1)*lnov,av,av,Djia,nvir,nbh_objv5) 277 call ga_nbget(g_objo,(i-1)*lnvv+1,i*lnvv,j,j,Tij, 278 & lnvv,nbh_objo4) 279 call ga_nbget(g_objo,lnovv+(i-1)*lnov+1, 280 & lnovv+i*lnov,j,j,Jij,lnov,nbh_objo5) 281 call ga_nbget(g_objo,lnovv+lnoov+(i-1)*lnov+1, 282 & lnovv+lnoov+i*lnov,j,j,Kij,lnov,nbh_objo6) 283 call ga_nbget(g_coul,1,lnvv,(a-oseg_lo)*nocc+i, 284 & (a-oseg_lo)*nocc+i,Jia,lnvv,nbh_coul2) 285 call ga_nbget(g_exch,1,lnvv,(a-oseg_lo)*nocc+i, 286 & (a-oseg_lo)*nocc+i,Kia,lnvv,nbh_exch2) 287 call ga_nbget(g_objv,1+lnoov+(i-1)*lnov, 288 & lnoov+i*lnov,av,av,Tia,lnov,nbh_objv6) 289 call ga_nbget(g_objv,1+2*lnoov+(i-1)*lnov, 290 & 2*lnoov+i*lnov,av,av,Xia,lnov,nbh_objv7) 291!$omp end critical 292 call dcopy(nvir,t1((i-1)*nvir+1),1,t1v2,1) 293 call ga_nbwait(nbh_objv1) ! Dja 294 call dcopy(nvir,Dja(1+(i-1)*nvir),1,dintc1,1) 295 call ga_nbwait(nbh_objv5) ! Djia 296 call dcopy(nvir,Djia,1,dintx1,1) 297 298 call ga_nbwait(nbh_objv2) 299 call ga_nbwait(nbh_objv3) 300 call ga_nbwait(nbh_objv6) 301 call ga_nbwait(nbh_objv7) 302 call ga_nbwait(nbh_objo1) 303 call ga_nbwait(nbh_objo2) 304 call ga_nbwait(nbh_objo3) 305 call ga_nbwait(nbh_objo4) 306 call ga_nbwait(nbh_objo5) 307 call ga_nbwait(nbh_objo6) 308 call ga_nbwait(nbh_exch1) 309 call ga_nbwait(nbh_exch2) 310 call ga_nbwait(nbh_coul1) 311 call ga_nbwait(nbh_coul2) 312 do k=klo,min(khi,i) 313 call dcopy(nvir,t1((k-1)*nvir+1),1,t1v1((k-klo)*nvir+1),1) 314 call dcopy(nvir,Dja(1+(k-1)*nvir),1,dintc2((k-klo)*nvir+1),1) 315 call ga_nbwait(nbh_objv4(k)) ! Djka 316 call dcopy(nvir,Djka(1+(k-klo)*nvir),1,dintx2((k-klo)*nvir+1),1) 317 end do 318 319!dir$ offload begin target(mic) inout(emp4z,emp5z) 320 I in(emp4iz,emp4kz,emp5iz,emp5kz) 321 I in(nvir,nocc,klo,lnvv,lnov,ncor) 322 I in(eaijk,b,c,denomz,k,ea,i,j,khi) 323 I nocopy(Tkj:length(kklnn) REUSE) 324 I nocopy(Kkj:length(kklno) REUSE) 325 I nocopy(Jkj:length(kklno) REUSE) 326 I nocopy(Jka:length(kklnn) REUSE) 327 I nocopy(Tka:length(kklno) REUSE) 328 I nocopy(Kka:length(kklnn) REUSE) 329 I nocopy(Xka:length(kklno) REUSE) 330 I nocopy(f1n:length(n_nvir) REUSE) 331 I nocopy(f1t:length(n_nvir) REUSE) 332 I nocopy(f2n:length(n_nvir) REUSE) 333 I nocopy(f2t:length(n_nvir) REUSE) 334 I nocopy(f3n:length(n_nvir) REUSE) 335 I nocopy(f3t:length(n_nvir) REUSE) 336 I nocopy(f4n:length(n_nvir) REUSE) 337 I nocopy(f4t:length(n_nvir) REUSE) 338 N nocopy(eorb(ncor+1:n_nn) : REUSE) 339 I in(t1v1:length(nv_nk) REUSE) 340 I in(dintc2:length(nv_nk) REUSE) 341 I in(dintx2:length(nv_nk) REUSE) 342 I in(Jia:length(n_nvir) REUSE) 343 I in(Kia:length(n_nvir) REUSE) 344 I in(Tia:length(nv_no) REUSE) 345 I in(Xia:length(nv_no) REUSE) 346 I in(Tij:length(n_nvir) REUSE) 347 I in(Kij:length(nv_no) REUSE) 348 I in(Jij:length(nv_no) REUSE) 349 N in(t1v2:length(nvir) REUSE) 350 N in(dintc1:length(nvir) REUSE) 351 N in(dintx1:length(nvir) REUSE) 352 353 do k=klo,min(khi,i) 354 emp4iz = 0.0d0 355 emp5iz = 0.0d0 356 emp4kz = 0.0d0 357 emp5kz = 0.0d0 358 359 eaijk=ea - (eorb(ncor+i) 360 & +eorb(ncor+j) 361 & +eorb(ncor+k) ) 362!$omp parallel 363!$omp& shared(eorb,eaijk) 364!$omp& shared(f1n,f2n,f3n,f4n,f1t,f2t,f3t,f4t) 365!$omp& shared(t1v1,dintc1,dintx1) 366!$omp& shared(t1v2,dintc2,dintx2) 367!$omp& private(b,c,denomz) 368!$omp& firstprivate(ncor,nocc,nvir,lnov,lnvv,i,j,k,klo) 369 370!$omp sections 371!$omp section 372 call dgemm('n','t',nvir,nvir,nvir,1.0d0, 373 1 Jia,nvir,Tkj(1+(k-klo)*lnvv),nvir,0.0d0, 374 2 f1n,nvir) 375 call dgemm('n','n',nvir,nvir,nocc,-1.0d0, 376 1 Tia,nvir,Kkj(1+(k-klo)*lnov),nocc,1.0d0, 377 2 f1n,nvir) 378!$omp section 379 call dgemm('n','t',nvir,nvir,nvir,1.0d0, 380 1 Kia,nvir,Tkj(1+(k-klo)*lnvv),nvir,0.0d0, 381 2 f2n,nvir) 382 call dgemm('n','n',nvir,nvir,nocc,-1.0d0, 383 1 Xia,nvir,Kkj(1+(k-klo)*lnov),nocc,1.0d0, 384 2 f2n,nvir) 385!$omp section 386 call dgemm('n','n',nvir,nvir,nvir,1.0d0, 387 1 Jia,nvir,Tkj(1+(k-klo)*lnvv),nvir,0.0d0, 388 2 f3n,nvir) 389 call dgemm('n','n',nvir,nvir,nocc,-1.0d0, 390 1 Tia,nvir,Jkj(1+(k-klo)*lnov),nocc,1.0d0, 391 2 f3n,nvir) 392!$omp section 393 call dgemm('n','n',nvir,nvir,nvir,1.0d0, 394 1 Kia,nvir,Tkj(1+(k-klo)*lnvv),nvir,0.0d0, 395 2 f4n,nvir) 396 call dgemm('n','n',nvir,nvir,nocc,-1.0d0, 397 1 Xia,nvir,Jkj(1+(k-klo)*lnov),nocc,1.0d0, 398 2 f4n,nvir) 399!$omp section 400 call dgemm('n','t',nvir,nvir,nvir,1.0d0, 401 1 Jka(1+(k-klo)*lnvv),nvir,Tij,nvir,0.0d0, 402 2 f1t,nvir) 403 call dgemm('n','n',nvir,nvir,nocc,-1.0d0, 404 1 Tka(1+(k-klo)*lnov),nvir,Kij,nocc,1.0d0, 405 2 f1t,nvir) 406!$omp section 407 call dgemm('n','t',nvir,nvir,nvir,1.0d0, 408 1 Kka(1+(k-klo)*lnvv),nvir,Tij,nvir,0.0d0, 409 2 f2t,nvir) 410 call dgemm('n','n',nvir,nvir,nocc,-1.0d0, 411 1 Xka(1+(k-klo)*lnov),nvir,Kij,nocc,1.0d0, 412 2 f2t,nvir) 413!$omp section 414 call dgemm('n','n',nvir,nvir,nvir,1.0d0, 415 1 Jka(1+(k-klo)*lnvv),nvir,Tij,nvir,0.0d0, 416 2 f3t,nvir) 417 call dgemm('n','n',nvir,nvir,nocc,-1.0d0, 418 1 Tka(1+(k-klo)*lnov),nvir,Jij,nocc,1.0d0, 419 2 f3t,nvir) 420!$omp section 421 call dgemm('n','n',nvir,nvir,nvir,1.0d0, 422 1 Kka(1+(k-klo)*lnvv),nvir,Tij,nvir,0.0d0, 423 2 f4t,nvir) 424 call dgemm('n','n',nvir,nvir,nocc,-1.0d0, 425 1 Xka(1+(k-klo)*lnov),nvir,Jij,nocc,1.0d0, 426 2 f4t,nvir) 427!$omp end sections 428 429 430 431 432!$omp do collapse(2) 433!$omp& schedule(static) 434!$omp& reduction(+:emp5iz,emp4iz) 435!$omp& reduction(+:emp5kz,emp4kz) 436 do b=1,nvir 437 do c=1,nvir 438 denomz=-1.0d0/( eorb(ncor+nocc+b) 439 & +eorb(ncor+nocc+c)+eaijk ) 440 emp4iz=emp4iz+denomz* 441 & (f1t(b,c)+f1n(c,b)+f2t(c,b)+f3n(b,c)+f4n(c,b))* 442 & (f1t(b,c)-2*f2t(b,c)-2*f3t(b,c)+f4t(b,c)) 443 emp4iz=emp4iz-denomz* 444 & (f1n(b,c)+f1t(c,b)+f2n(c,b)+f3n(c,b))* 445 & (2*f1t(b,c)-f2t(b,c)-f3t(b,c)+2*f4t(b,c)) 446 emp4iz=emp4iz+3*denomz*( 447 & f1n(b,c)*(f1n(b,c)+f3n(c,b)+2*f4t(c,b))+ 448 & f2n(b,c)*f2t(c,b)+f3n(b,c)*f4t(b,c)) 449 emp4kz=emp4kz+denomz* 450 & (f1n(b,c)+f1t(c,b)+f2n(c,b)+f3t(b,c)+f4t(c,b))* 451 & (f1n(b,c)-2*f2n(b,c)-2*f3n(b,c)+f4n(b,c)) 452 emp4kz=emp4kz-denomz* 453 & (f1t(b,c)+f1n(c,b)+f2t(c,b)+f3t(c,b))* 454 & (2*f1n(b,c)-f2n(b,c)-f3n(b,c)+2*f4n(b,c)) 455 emp4kz=emp4kz+3*denomz*( 456 & f1t(b,c)*(f1t(b,c)+f3t(c,b)+2*f4n(c,b))+ 457 & f2t(b,c)*f2n(c,b)+f3t(b,c)*f4n(b,c)) 458 emp5iz=emp5iz+denomz*t1v1((k-klo)*nvir+b)*dintx1(c)* 459 & ( f1t(b,c)+f2n(b,c)+f4n(c,b) 460 & -2*(f3t(b,c)+f4n(b,c)+f2n(c,b)+ 461 & f1n(b,c)+f2t(b,c)+f3n(c,b)) 462 & +4*(f3n(b,c)+f4t(b,c)+f1n(c,b))) 463 emp5iz=emp5iz+denomz*t1v1((k-klo)*nvir+b)*dintc1(c)* 464 & ( f1n(b,c)+f4n(b,c)+f1t(c,b) 465 & -2*(f2n(b,c)+f3n(b,c)+f2t(c,b))) 466 emp5kz=emp5kz+denomz*t1v2(b)*dintx2((k-klo)*nvir+c)* 467 & ( f1n(b,c)+f2t(b,c)+f4t(c,b) 468 & -2*(f3n(b,c)+f4t(b,c)+f2t(c,b)+ 469 & f1t(b,c)+f2n(b,c)+f3t(c,b)) 470 & +4*(f3t(b,c)+f4n(b,c)+f1t(c,b))) 471 emp5kz=emp5kz+denomz*t1v2(b)*dintc2((k-klo)*nvir+c)* 472 & ( f1t(b,c)+f4t(b,c)+f1n(c,b) 473 & -2*(f2t(b,c)+f3t(b,c)+f2n(c,b))) 474 enddo 475 enddo 476!$omp end do 477!$omp end parallel 478 479 emp4z = emp4z + emp4iz 480 emp5z = emp5z + emp5iz 481 if (i.ne.k) then 482 emp4z = emp4z + emp4kz 483 emp5z = emp5z + emp5kz 484 end if ! (i.ne.k) 485 end do ! k 486!dir$ end offload 487 end do ! i 488!$omp taskwait 489c if (iprt.gt.50)then 490c write(6,1234)iam,a,j,emp4,emp5 491c 1234 format(' iam aijk',3i5,2e15.5) 492c end if 493 next=nxtask(nodes, 1) 494 end if 495 end do 496 if(ga_nodeid().eq.0) then 497 write(6,4321) ' ccsd(t): done ', 498 A a-(ncor+nocc)+((klo-1)/kchunk)*nvir, 499 O ' out of ',(nocc/kchunk)*nvir, 500 O ' progress: ', 501 O ((a-(ncor+nocc)+((klo-1)/kchunk)*nvir)*100d0)/ 502 D ((nocc/kchunk)*nvir), 503 P '%' 504 4321 format(a,i8,a,i8,a,f6.1,a1) 505 endif 506 end do 507 end do 508 emp4 =emp4 + emp4z 509 emp5 = emp5 + emp5z 510 call ga_sync() 511 next=nxtask(-nodes, 1) 512 call ga_sync() 513 if (occsdps) then 514 call pstat_off(ps_trpdrv) 515 else 516 call qexit('trpdrv',0) 517 endif 518c 519!dir$ offload_transfer target(mic) 520 I nocopy(f1n:length(n_nvir) FREE) 521 I nocopy(f1t:length(n_nvir) FREE) 522 I nocopy(f2n:length(n_nvir) FREE) 523 I nocopy(f2t:length(n_nvir) FREE) 524 I nocopy(f3n:length(n_nvir) FREE) 525 I nocopy(f3t:length(n_nvir) FREE) 526 I nocopy(f4n:length(n_nvir) FREE) 527 I nocopy(f4t:length(n_nvir) FREE) 528 N nocopy(eorb(ncor+1:n_nn) : FREE) 529 I nocopy(Jia:length(n_nvir) FREE) 530 I nocopy(Kia:length(n_nvir) FREE) 531 I nocopy(Tia:length(nv_no) FREE) 532 I nocopy(Xia:length(nv_no) FREE) 533 I nocopy(Tij:length(n_nvir) FREE) 534 I nocopy(Kij:length(nv_no) FREE) 535 I nocopy(Jij:length(nv_no) FREE) 536 N nocopy(t1v2:length(nvir) FREE) 537 N nocopy(dintc1:length(nvir) FREE) 538 N nocopy(dintx1:length(nvir) FREE) 539 I nocopy(t1v1:length(nv_nk) FREE) 540 I nocopy(dintc2:length(nv_nk) FREE) 541 I nocopy(dintx2:length(nv_nk) FREE) 542 I nocopy(Tkj:length(klnn) FREE) 543 I nocopy(Kkj:length(klno) FREE) 544 I nocopy(Jkj:length(klno) FREE) 545 I nocopy(Jka:length(klnn) FREE) 546 I nocopy(Tka:length(klno) FREE) 547 I nocopy(Kka:length(klnn) FREE) 548 I nocopy(Xka:length(klno) FREE) 549 550!$omp end single 551!$omp end parallel 552 end 553 554 555 556 subroutine ccsd_iloop_host(t1,eorb, 557 & g_objo,g_objv,g_coul,g_exch, 558 & ncor,nocc,nvir,emp4,emp5,oseg_lo, 559 & kchunk,Tkj,Tka,Xka,Jka,Kka,Jkj,Kkj,Dja,Djka, 560 & j,a,klo,khi,av,eaijk, 561 & nbh_objv1,nbh_objv2,nbh_objv3,nbh_objo1,nbh_objo2, 562 & nbh_objo3,nbh_exch1,nbh_coul1,nbh_objv4) 563C $Id: ccsd_trpdrv_omp.F 26674 2015-01-08 14:36:59Z jhammond $ 564 implicit none 565c 566#include "global.fh" 567#include "ccsd_len.fh" 568#include "ccsdps.fh" 569 integer ncor,nocc,nvir 570 double precision t1(*) 571 double precision f1n(nvir,nvir),f1t(nvir,nvir) 572 double precision f2n(nvir,nvir),f2t(nvir,nvir) 573 double precision f3n(nvir,nvir),f3t(nvir,nvir) 574 double precision f4n(nvir,nvir),f4t(nvir,nvir) 575 double precision eorb(*) 576 577 double precision Tij(nvir*nvir), Tia(nvir*nocc) 578 double precision Xia(nvir*nocc) 579 double precision Jia(nvir*nvir), Jij(nvir*nocc) 580 double precision Kia(nvir*nvir), Kij(nvir*nocc) 581 double precision Djia(nvir) 582 583 double precision Tkj(*), Tka(*) 584 double precision Xka(*) 585 double precision Jka(*), Jkj(*) 586 double precision Kka(*), Kkj(*) 587 double precision Dja(*), Djka(*) 588 589 double precision dintc1(nvir),dintx1(nvir),t1v1(nvir) 590 double precision dintc2(nvir),dintx2(nvir),t1v2(nvir) 591c 592 double precision emp4,emp5,denom 593 double precision emp4i,emp5i,emp4k,emp5k 594 double precision eaijk 595 integer g_objo,g_objv,g_coul,g_exch 596 integer oseg_lo,oseg_hi 597 integer a,b,c,i,j,k,av 598 integer klo, khi,nocc2,kchunk 599 600 601 602 integer nbh_objv1,nbh_objv2,nbh_objv3 603 integer nbh_objv5,nbh_objv6,nbh_objv7 604 integer nbh_objv4(nocc) 605c 606 integer nbh_objo1,nbh_objo2,nbh_objo3 607 integer nbh_objo4,nbh_objo5,nbh_objo6 608c 609 integer nbh_exch1,nbh_exch2,nbh_coul1,nbh_coul2 610 611 nocc2=nocc/2 612 do i=nocc2+1,nocc 613!$omp critical 614 call ga_nbget(g_objv,1+(j-1)*nvir+(i-1)*lnov, 615 & j*nvir+(i-1)*lnov,av,av,Djia,nvir,nbh_objv5) 616 call ga_nbget(g_objo,(i-1)*lnvv+1,i*lnvv,j,j,Tij, 617 & lnvv,nbh_objo4) 618 call ga_nbget(g_objo,lnovv+(i-1)*lnov+1, 619 & lnovv+i*lnov,j,j,Jij,lnov,nbh_objo5) 620 call ga_nbget(g_objo,lnovv+lnoov+(i-1)*lnov+1, 621 & lnovv+lnoov+i*lnov,j,j,Kij,lnov,nbh_objo6) 622 call ga_nbget(g_coul,1,lnvv,(a-oseg_lo)*nocc+i, 623 & (a-oseg_lo)*nocc+i,Jia,lnvv,nbh_coul2) 624 call ga_nbget(g_exch,1,lnvv,(a-oseg_lo)*nocc+i, 625 & (a-oseg_lo)*nocc+i,Kia,lnvv,nbh_exch2) 626 call ga_nbget(g_objv,1+lnoov+(i-1)*lnov, 627 & lnoov+i*lnov,av,av,Tia,lnov,nbh_objv6) 628 call ga_nbget(g_objv,1+2*lnoov+(i-1)*lnov, 629 & 2*lnoov+i*lnov,av,av,Xia,lnov,nbh_objv7) 630!$omp end critical 631 call dcopy(nvir,t1((i-1)*nvir+1),1,t1v2,1) 632 call ga_nbwait(nbh_objv1) ! Dja 633 call dcopy(nvir,Dja(1+(i-1)*nvir),1,dintc1,1) 634 call ga_nbwait(nbh_objv5) ! Djia 635 call dcopy(nvir,Djia,1,dintx1,1) 636 637 638 call ga_nbwait(nbh_objv2) 639 call ga_nbwait(nbh_objv3) 640 call ga_nbwait(nbh_objv6) 641 call ga_nbwait(nbh_objv7) 642 call ga_nbwait(nbh_objo1) 643 call ga_nbwait(nbh_objo2) 644 call ga_nbwait(nbh_objo3) 645 call ga_nbwait(nbh_objo4) 646 call ga_nbwait(nbh_objo5) 647 call ga_nbwait(nbh_objo6) 648 call ga_nbwait(nbh_exch1) 649 call ga_nbwait(nbh_exch2) 650 call ga_nbwait(nbh_coul1) 651 call ga_nbwait(nbh_coul2) 652 653 do k=klo,min(khi,i) 654 655 call dcopy(nvir,t1((k-1)*nvir+1),1,t1v1,1) 656 call dcopy(nvir,Dja(1+(k-1)*nvir),1,dintc2,1) 657 call ga_nbwait(nbh_objv4(k)) ! Djka 658 call dcopy(nvir,Djka(1+(k-klo)*nvir),1,dintx2,1) 659 660 emp4i = 0.0d0 661 emp5i = 0.0d0 662 emp4k = 0.0d0 663 emp5k = 0.0d0 664 665 666!$omp parallel 667!$omp& shared(eorb) 668!$omp& shared(f1n,f2n,f3n,f4n,f1t,f2t,f3t,f4t) 669!$omp& shared(t1v1,dintc1,dintx1) 670!$omp& shared(t1v2,dintc2,dintx2) 671!$omp& private(b,c,eaijk,denom) 672!$omp& firstprivate(ncor,nocc,nvir,lnov,lnvv,i,j,k,klo) 673 674!$omp sections 675!$omp section 676 call dgemm('n','t',nvir,nvir,nvir,1.0d0, 677 1 Jia,nvir,Tkj(1+(k-klo)*lnvv),nvir,0.0d0, 678 2 f1n,nvir) 679 call dgemm('n','n',nvir,nvir,nocc,-1.0d0, 680 1 Tia,nvir,Kkj(1+(k-klo)*lnov),nocc,1.0d0, 681 2 f1n,nvir) 682!$omp section 683 call dgemm('n','t',nvir,nvir,nvir,1.0d0, 684 1 Kia,nvir,Tkj(1+(k-klo)*lnvv),nvir,0.0d0, 685 2 f2n,nvir) 686 call dgemm('n','n',nvir,nvir,nocc,-1.0d0, 687 1 Xia,nvir,Kkj(1+(k-klo)*lnov),nocc,1.0d0, 688 2 f2n,nvir) 689!$omp section 690 call dgemm('n','n',nvir,nvir,nvir,1.0d0, 691 1 Jia,nvir,Tkj(1+(k-klo)*lnvv),nvir,0.0d0, 692 2 f3n,nvir) 693 call dgemm('n','n',nvir,nvir,nocc,-1.0d0, 694 1 Tia,nvir,Jkj(1+(k-klo)*lnov),nocc,1.0d0, 695 2 f3n,nvir) 696!$omp section 697 call dgemm('n','n',nvir,nvir,nvir,1.0d0, 698 1 Kia,nvir,Tkj(1+(k-klo)*lnvv),nvir,0.0d0, 699 2 f4n,nvir) 700 call dgemm('n','n',nvir,nvir,nocc,-1.0d0, 701 1 Xia,nvir,Jkj(1+(k-klo)*lnov),nocc,1.0d0, 702 2 f4n,nvir) 703!$omp section 704 call dgemm('n','t',nvir,nvir,nvir,1.0d0, 705 1 Jka(1+(k-klo)*lnvv),nvir,Tij,nvir,0.0d0, 706 2 f1t,nvir) 707 call dgemm('n','n',nvir,nvir,nocc,-1.0d0, 708 1 Tka(1+(k-klo)*lnov),nvir,Kij,nocc,1.0d0, 709 2 f1t,nvir) 710!$omp section 711 call dgemm('n','t',nvir,nvir,nvir,1.0d0, 712 1 Kka(1+(k-klo)*lnvv),nvir,Tij,nvir,0.0d0, 713 2 f2t,nvir) 714 call dgemm('n','n',nvir,nvir,nocc,-1.0d0, 715 1 Xka(1+(k-klo)*lnov),nvir,Kij,nocc,1.0d0, 716 2 f2t,nvir) 717!$omp section 718 call dgemm('n','n',nvir,nvir,nvir,1.0d0, 719 1 Jka(1+(k-klo)*lnvv),nvir,Tij,nvir,0.0d0, 720 2 f3t,nvir) 721 call dgemm('n','n',nvir,nvir,nocc,-1.0d0, 722 1 Tka(1+(k-klo)*lnov),nvir,Jij,nocc,1.0d0, 723 2 f3t,nvir) 724!$omp section 725 call dgemm('n','n',nvir,nvir,nvir,1.0d0, 726 1 Kka(1+(k-klo)*lnvv),nvir,Tij,nvir,0.0d0, 727 2 f4t,nvir) 728 call dgemm('n','n',nvir,nvir,nocc,-1.0d0, 729 1 Xka(1+(k-klo)*lnov),nvir,Jij,nocc,1.0d0, 730 2 f4t,nvir) 731!$omp end sections 732 733 734 735 eaijk=eorb(a) - ( eorb(ncor+i) 736 & +eorb(ncor+j) 737 & +eorb(ncor+k) ) 738!$omp do collapse(2) 739!$omp& schedule(static) 740!$omp& reduction(+:emp5i,emp4i) 741!$omp& reduction(+:emp5k,emp4k) 742 do b=1,nvir 743 do c=1,nvir 744 denom=-1.0d0/( eorb(ncor+nocc+b) 745 & +eorb(ncor+nocc+c)+eaijk ) 746 emp4i=emp4i+denom* 747 & (f1t(b,c)+f1n(c,b)+f2t(c,b)+f3n(b,c)+f4n(c,b))* 748 & (f1t(b,c)-2*f2t(b,c)-2*f3t(b,c)+f4t(b,c)) 749 emp4i=emp4i-denom* 750 & (f1n(b,c)+f1t(c,b)+f2n(c,b)+f3n(c,b))* 751 & (2*f1t(b,c)-f2t(b,c)-f3t(b,c)+2*f4t(b,c)) 752 emp4i=emp4i+3*denom*( 753 & f1n(b,c)*(f1n(b,c)+f3n(c,b)+2*f4t(c,b))+ 754 & f2n(b,c)*f2t(c,b)+f3n(b,c)*f4t(b,c)) 755 emp4k=emp4k+denom* 756 & (f1n(b,c)+f1t(c,b)+f2n(c,b)+f3t(b,c)+f4t(c,b))* 757 & (f1n(b,c)-2*f2n(b,c)-2*f3n(b,c)+f4n(b,c)) 758 emp4k=emp4k-denom* 759 & (f1t(b,c)+f1n(c,b)+f2t(c,b)+f3t(c,b))* 760 & (2*f1n(b,c)-f2n(b,c)-f3n(b,c)+2*f4n(b,c)) 761 emp4k=emp4k+3*denom*( 762 & f1t(b,c)*(f1t(b,c)+f3t(c,b)+2*f4n(c,b))+ 763 & f2t(b,c)*f2n(c,b)+f3t(b,c)*f4n(b,c)) 764 emp5i=emp5i+denom*t1v1(b)*dintx1(c)* 765 & ( f1t(b,c)+f2n(b,c)+f4n(c,b) 766 & -2*(f3t(b,c)+f4n(b,c)+f2n(c,b)+ 767 & f1n(b,c)+f2t(b,c)+f3n(c,b)) 768 & +4*(f3n(b,c)+f4t(b,c)+f1n(c,b))) 769 emp5i=emp5i+denom*t1v1(b)*dintc1(c)* 770 & ( f1n(b,c)+f4n(b,c)+f1t(c,b) 771 & -2*(f2n(b,c)+f3n(b,c)+f2t(c,b))) 772 emp5k=emp5k+denom*t1v2(b)*dintx2(c)* 773 & ( f1n(b,c)+f2t(b,c)+f4t(c,b) 774 & -2*(f3n(b,c)+f4t(b,c)+f2t(c,b)+ 775 & f1t(b,c)+f2n(b,c)+f3t(c,b)) 776 & +4*(f3t(b,c)+f4n(b,c)+f1t(c,b))) 777 emp5k=emp5k+denom*t1v2(b)*dintc2(c)* 778 & ( f1t(b,c)+f4t(b,c)+f1n(c,b) 779 & -2*(f2t(b,c)+f3t(b,c)+f2n(c,b))) 780 enddo 781 enddo 782!$omp end do 783!$omp end parallel 784 785 emp4 = emp4 + emp4i 786 emp5 = emp5 + emp5i 787 if (i.ne.k) then 788 emp4 = emp4 + emp4k 789 emp5 = emp5 + emp5k 790 end if ! (i.ne.k) 791 end do ! k 792 end do ! i 793 end 794