1c Modifications (c) 2021 Advanced Micro Devices, Inc. All Rights Reserved. 2 3C #ifndef OFFLOAD_DEBUG_DATA 4C #define OFFLOAD_DEBUG_DATA 5C #endif 6 7#ifdef USE_OMP_TEAMS_DISTRIBUTE 8#define TEAMS_DISTRIBUTE teams distribute 9#else 10#define TEAMS_DISTRIBUTE 11#endif 12 13#ifdef USE_OMP_SCHEDULE_STATIC_1 14#define OMP_SCHEDULE schedule(static,1) 15#else 16#define OMP_SCHEDULE 17#endif 18 19 SUBROUTINE ccsd_t(d_t1,k_t1_offset,d_t2,k_t2_offset, 20 1 d_v2,k_v2_offset,energy1,energy2,size_t1) 21C 22C $Id$ 23C 24 IMPLICIT NONE 25#include "global.fh" 26#include "mafdecls.fh" 27#include "util.fh" 28#include "errquit.fh" 29#include "tce.fh" 30#include "tce_main.fh" 31#include "offl.fh" 32#include "ccsd_t_ps.fh" 33 integer d_t1 34 integer k_t1_offset 35 integer d_t2 36 integer k_t2_offset 37 integer d_v2 38 integer k_v2_offset 39 integer t_h1b, t_h1 40 integer t_h2b, t_h2 41 integer t_h3b, t_h3 42 integer t_p4b, t_p4 43 integer t_p5b, t_p5 44 integer t_p6b, t_p6 45#ifdef USE_F90_ALLOCATABLE 46 double precision, allocatable :: f_singles(:),f_doubles(:) 47#ifdef USE_FASTMEM 48 !dec$ attributes fastmem :: f_singles, f_doubles 49#endif 50 integer alloc_error 51#else 52 integer k_singles,l_singles 53 integer k_doubles,l_doubles 54#endif 55 integer size,i 56 integer nxtask0 57 integer next 58 integer nprocs 59 integer count 60 integer offset_p4,offset_p5,offset_p6 61 integer offset_h1,offset_h2,offset_h3 62 integer range_p4,range_p5,range_p6 63 integer range_h1,range_h2,range_h3 64c - T1/X1 LOCALIZATION ------------------- 65 integer l_t1_local,k_t1_local 66 integer size_t1 67c --------------------------------------- 68 double precision energy(2) 69 double precision energy1,energy2 70 integer k_aux,k_list,l_aux,l_list 71 integer ccsd_t_6tasks,tot_task,w_thresh,task_thresh 72 integer k,iptr,k_grain 73 double precision wall 74 integer tasks_skip 75 integer n_progr,pct_progr 76 parameter(n_progr=20) 77 logical i_progr(n_progr) 78 external nxtask0,ccsd_t_6tasks 79 logical offload_enabled 80 external offload_enabled 81C 82c 83c - T1/X1 LOCALIZATION ---------- 84c opening l_t1_local and l_x1_local 85 if (.not.MA_PUSH_GET(mt_dbl,size_t1,'t1_local', 86 1 l_t1_local,k_t1_local)) 87 1 call errquit('ccsd_t: t1_local size=',size_t1,MA_ERR) 88 call ma_zero(dbl_mb(k_t1_local),size_t1) 89c copy d_t1 ==> l_t1_local 90ccx call ga_get(d_t1,1,size_t1,1,1,dbl_mb(k_t1_local),1) 91 call get_block(d_t1,dbl_mb(k_t1_local),size_t1,0) 92c ------------------------------- 93C 94 nprocs = GA_NNODES() 95 energy(1)=0.0d0 96 energy(2)=0.0d0 97 energy1 = 0.0d0 98 energy2 = 0.0d0 99c estimate triplesx size 100 range_p4=0 101 do t_p4b = noab+1,noab+nvab 102 range_p4 = max(range_p4,int_mb(k_range+t_p4b-1)) 103 enddo 104 range_h1=0 105 do t_h1b = 1,noab 106 range_h1 = max(range_h1,int_mb(k_range+t_h1b-1)) 107 enddo 108 size=(range_p4**3)*(range_h1**3) 109 110 call util_align64(size) 111 triplesx_mxlgth=size 112#ifdef USE_F90_ALLOCATABLE 113 allocate( f_singles(1:size), stat=alloc_error) 114 if (alloc_error.ne.0) then 115 call errquit('ccsd_t: MA error sgl',0,MA_ERR) 116 endif 117 allocate( f_doubles(1:size), stat=alloc_error) 118 if (alloc_error.ne.0) then 119 call errquit('ccsd_t: MA error dbl',0,MA_ERR) 120 endif 121#if USE_OPENMP 122#if USE_OFFLOAD 123 if (offload_enabled()) then 124!$omp target enter data 125!$omp&map(alloc:f_singles) 126!$omp&map(alloc:f_doubles) 127#ifdef OFFLOAD_DEBUG_DATA 128 write (*,*) ga_nodeid(), 'alloc f_singles', size, 'elements' 129 write (*,*) ga_nodeid(), 'alloc f_doubles', size, 'elements' 130#endif 131 endif 132#endif 133#endif 134#else 135#if USE_OFFLOAD 136 call errquit('ccsd_t: offload not supported with MA',0,0) 137#endif 138 if (.not.MA_PUSH_GET(mt_dbl,size,'(T) singles',l_singles, 139 1 k_singles)) call errquit('ccsd_t: MA error sgl', 140 2 size,MA_ERR) 141 if (.not.MA_PUSH_GET(mt_dbl,size,'(T) doubles',l_doubles, 142 1 k_doubles)) call errquit('ccsd_t: MA error dbl', 143 2 size,MA_ERR) 144#endif 145 wall=-util_wallsec() 146 tot_task= ccsd_t_6tasks(restricted,noab,nvab, 147 1 int_mb(k_spin),int_mb(k_sym)) 148 if (.not.ma_push_get(mt_int,7*tot_task,"list.task", 149 1 l_list,k_list)) call errquit("k_list",1,MA_ERR) 150 if (.not.ma_push_get(mt_int,7*tot_task,"auxtask", 151 1 l_aux,k_aux)) call errquit("k_aux",2,MA_ERR) 152c 153c get first task with weight lt ? 8 154c 155c w_thresh=10 156 w_thresh=0 157 w_thresh=w_thresh**6 158 call ccsd_t_neword(tot_task, w_thresh,task_thresh, 159 R restricted,noab,nvab, 160 K int_mb(k_spin),int_mb(k_sym), 161 K int_mb(k_range), 162 A int_mb(k_aux),int_mb(k_list)) 163 164 165 if (.not.MA_POP_STACK(l_aux)) 166 1 call errquit('ordering',3,MA_ERR) 167 count = 0 168 k_grain=1 169 tasks_skip=0 170 next = nxtask0(nprocs,k_grain,tasks_skip) 171 do k=1,n_progr 172 i_progr(k)=.true. 173 enddo 174c stagger start of loop 175 call util_mpinap(100) 176 if(task_thresh.gt.1) then 177 do k=1,task_thresh-1 178c 179 if (next.eq.count) then 180#ifdef USE_F90_ALLOCATABLE 181 call ccsd_t_loop(k,energy1,energy2, 182 & int_mb(k_list),int_mb(k_range),int_mb(k_offset), 183 & f_singles,f_doubles, 184 & k_t1_local,k_t1_offset, 185 & d_t2,d_v2,k_t2_offset,k_v2_offset, 186 & restricted,k_evl_sorted,size) 187#else 188 call ccsd_t_loop(k,energy1,energy2, 189 & int_mb(k_list),int_mb(k_range),int_mb(k_offset), 190 & dbl_mb(k_singles),dbl_mb(k_doubles), 191 & k_t1_local,k_t1_offset, 192 & d_t2,d_v2,k_t2_offset,k_v2_offset, 193 & restricted,k_evl_sorted,size) 194#endif 195c 196 if(ga_nodeid().eq.2) then 197 pct_progr=(k*n_progr)/tot_task 198 if(i_progr(pct_progr)) then 199 i_progr(pct_progr)=.false. 200 write(6,'(a,i5,a,i4,a,f15.1,a,f9.1)') 201 & '0task ',k,' done ', 202 & int((k*100d0)/tot_task),'% at', 203 & wall+util_wallsec(),' sec, (size)^1/6= ', 204 & (size)**(1d0/6d0) 205 call util_flush(6) 206 endif 207 endif 208 next = nxtask0(nprocs,k_grain,tasks_skip) 209 endif 210 count = count + 1 211 enddo 212 endif 213 if(task_thresh.le.tot_task) then 214 next = nxtask0(-nprocs,k_grain,tasks_skip) 215 if(.true.) then 216 next = nxtask0(nprocs,k_grain,tasks_skip) 217 count=0 218 do k=task_thresh,tot_task 219 if (next.eq.count) then 220#ifdef USE_F90_ALLOCATABLE 221 call ccsd_t_loop(k,energy1,energy2, 222 & int_mb(k_list),int_mb(k_range),int_mb(k_offset), 223 & f_singles,f_doubles, 224 & k_t1_local,k_t1_offset, 225 & d_t2,d_v2,k_t2_offset,k_v2_offset, 226 & restricted,k_evl_sorted,size) 227#else 228 call ccsd_t_loop(k,energy1,energy2, 229 & int_mb(k_list),int_mb(k_range),int_mb(k_offset), 230 & dbl_mb(k_singles),dbl_mb(k_doubles), 231 & k_t1_local,k_t1_offset, 232 & d_t2,d_v2,k_t2_offset,k_v2_offset, 233 & restricted,k_evl_sorted,size) 234#endif 235c 236 next = nxtask0(nprocs,k_grain,tasks_skip) 237 if(ga_nodeid().eq.2) then 238 pct_progr=(k*n_progr)/tot_task 239 if(i_progr(pct_progr)) then 240 i_progr(pct_progr)=.false. 241 write(6,'(a,i8,a,i4,a,f15.1,a,f9.1)') 242 & ' task',k,' done ', 243 & int((k*100d0)/tot_task),'% at', 244 & wall+util_wallsec(),' sec, (size)^1/6= ', 245 & (size)**(1d0/6d0) 246 call util_flush(6) 247 endif 248 endif 249 endif 250 count = count + 1 251 enddo 252 endif 253 endif 254 255 if (.not.MA_POP_STACK(l_list)) 256 1 call errquit('ordering',3,MA_ERR) 257#ifdef USE_F90_ALLOCATABLE 258#if USE_OPENMP 259#if USE_OFFLOAD 260 if (offload_enabled()) then 261!$omp target exit data 262!$omp&map(release:f_singles) 263!$omp&map(release:f_doubles) 264#ifdef OFFLOAD_DEBUG_DATA 265 write (*,*) ga_nodeid(), 'release f_singles', size, 'elements' 266 write (*,*) ga_nodeid(), 'release f_doubles', size, 'elements' 267#endif 268 endif 269#endif 270#endif 271 deallocate( f_doubles, stat=alloc_error) 272 if (alloc_error.ne.0) then 273 call errquit('ccsd_t doubles',3,MA_ERR) 274 endif 275 deallocate( f_singles, stat=alloc_error) 276 if (alloc_error.ne.0) then 277 call errquit('ccsd_t singles',4,MA_ERR) 278 endif 279#else 280 if (.not.MA_POP_STACK(l_doubles)) 281 1 call errquit('ccsd_t doubles',3,MA_ERR) 282 if (.not.MA_POP_STACK(l_singles)) 283 1 call errquit('ccsd_t singles',4,MA_ERR) 284#endif 285 next = nxtask0(-nprocs,k_grain,tasks_skip) 286c call ga_sync() 287 energy(1) = energy1 288 energy(2) = energy2 289 call ga_mask_sync(.false.,.true.) 290 call ga_dgop(1975,energy,2,'+') 291 energy1 = energy(1) 292 energy2 = energy(2) 293c - T1/X1 LOCALIZATION ------ 294 if(.not.MA_POP_STACK(l_t1_local)) 295 & call errquit('ccsd_t: l_t1_local',4,MA_ERR) 296c --------------------------- 297 return 298 end 299 300 301 302 303 304! wrapper to ccsd_t_dot because of offload ugliness 305 subroutine ccsd_t_esum(a_singles, a_doubles, restricted, 306 & h1b,h2b,h3b,p4b,p5b,p6b, 307 & o_h1,o_h2,o_h3, 308 & o_p4,o_p5,o_p6, 309 & r_h1,r_h2,r_h3, 310 & r_p4,r_p5,r_p6, 311 & energy1,energy2) 312 implicit none 313 integer h1b, h2b, h3b, p4b, p5b, p6b 314 double precision o_h1(*),o_h2(*),o_h3(*) 315 double precision o_p4(*),o_p5(*),o_p6(*) 316 integer r_h1,r_h2,r_h3 317 integer r_p4,r_p5,r_p6 318 double precision a_singles(*) 319 double precision a_doubles(*) 320 logical restricted 321 double precision energy1,energy2 322 external offload_enabled 323 logical offload_enabled 324#ifdef USE_OPENMP 325#ifdef USE_OFFLOAD 326 if(offload_enabled()) then 327 call offl_ccsd_t_dot(a_singles,a_doubles,restricted, 328 & h1b,h2b,h3b,p4b,p5b,p6b, 329 & o_h1,o_h2,o_h3,o_p4,o_p5,o_p6, 330 & r_h1,r_h2,r_h3,r_p4,r_p5,r_p6, 331 & energy1,energy2) 332 else 333#endif 334#endif 335 call ccsd_t_dot(a_singles,a_doubles, restricted, 336 & h1b,h2b,h3b,p4b,p5b,p6b, 337 & o_h1,o_h2,o_h3,o_p4,o_p5,o_p6, 338 & r_h1,r_h2,r_h3,r_p4,r_p5,r_p6, 339 & energy1,energy2) 340#ifdef USE_OPENMP 341#ifdef USE_OFFLOAD 342 endif 343#endif 344#endif 345 346 return 347 end 348 349 350 351 subroutine ccsd_t_loop(k,energy1,energy2, 352 & k_list,k_range,k_offset,a_singles,a_doubles, 353 & k_t1_local,k_t1_offset,d_t2,d_v2,k_t2_offset,k_v2_offset, 354 & restricted,k_evl_sorted,size) 355 implicit none 356#include "mafdecls.fh" 357#include "ccsd_t_ps.fh" 358 integer k 359 double precision energy1,energy2 360 integer k_list(7,*) 361 integer k_range(*),k_offset(*) 362 double precision a_singles(*),a_doubles(*) 363 logical restricted 364 integer k_evl_sorted 365 integer k_t1_local,k_t1_offset 366 integer d_t2,d_v2 367 integer k_t2_offset,k_v2_offset 368 integer size 369#if USE_OFFLOAD 370 external offload_enabled 371 logical offload_enabled 372#endif 373c 374 integer t_p4b,t_p5b,t_p6b,t_h1b,t_h2b,t_h3b 375 integer range_p4,range_p5,range_p6 376 integer range_h1,range_h2,range_h3 377 integer offset_p4,offset_p5,offset_p6 378 integer offset_h1,offset_h2,offset_h3 379c 380 t_p4b=k_list(1,k) 381 t_p5b=k_list(2,k) 382 t_p6b=k_list(3,k) 383 t_h1b=k_list(4,k) 384 t_h2b=k_list(5,k) 385 t_h3b=k_list(6,k) 386 range_p4 = k_range(t_p4b) 387 range_p5 = k_range(t_p5b) 388 range_p6 = k_range(t_p6b) 389 range_h1 = k_range(t_h1b) 390 range_h2 = k_range(t_h2b) 391 range_h3 = k_range(t_h3b) 392 offset_p4 = k_evl_sorted+k_offset(t_p4b)-1 393 offset_p5 = k_evl_sorted+k_offset(t_p5b)-1 394 offset_p6 = k_evl_sorted+k_offset(t_p6b)-1 395 offset_h1 = k_evl_sorted+k_offset(t_h1b)-1 396 offset_h2 = k_evl_sorted+k_offset(t_h2b)-1 397 offset_h3 = k_evl_sorted+k_offset(t_h3b)-1 398c 399 size = range_p4 * range_p5 * range_p6 400 & * range_h1 * range_h2 * range_h3 401c zeroing --- 402#if USE_OPENMP 403#if USE_OFFLOAD 404 if (offload_enabled()) then 405 call offl_zero(a_singles, size) 406 call offl_zero(a_doubles, size) 407 else 408#endif 409#endif 410 call dcopy(size, 0.0d0, 0, a_singles, 1) 411 call dcopy(size, 0.0d0, 0, a_doubles, 1) 412#if USE_OPENMP 413#if USE_OFFLOAD 414 endif 415#endif 416#endif 417c ----------- 418 if (otceps) call pstat_on(ps_cctsng) 419#ifdef USE_OFFLOAD 420 call offl_ccsd_t_singles_l(a_singles, 421 1 k_t1_local,d_v2,k_t1_offset,k_v2_offset, 422 1 t_h1b,t_h2b,t_h3b,t_p4b,t_p5b,t_p6b,2) 423#else 424 call ccsd_t_singles_l(a_singles, 425 1 k_t1_local,d_v2,k_t1_offset,k_v2_offset, 426 1 t_h1b,t_h2b,t_h3b,t_p4b,t_p5b,t_p6b,2) 427#endif 428 if (otceps) call pstat_off(ps_cctsng) 429 430 431 if (otceps) call pstat_on(ps_cctdbl) 432 call ccsd_t_doubles_l(a_doubles, 433 1 d_t2,d_v2,k_t2_offset,k_v2_offset, 434 1 t_h1b,t_h2b,t_h3b,t_p4b,t_p5b,t_p6b,2) 435 if (otceps) call pstat_off(ps_cctdbl) 436 call ccsd_t_esum(a_singles,a_doubles,restricted, 437 & t_h1b,t_h2b,t_h3b,t_p4b,t_p5b,t_p6b, 438 & dbl_mb(offset_h1+1),dbl_mb(offset_h2+1), 439 & dbl_mb(offset_h3+1),dbl_mb(offset_p4+1), 440 & dbl_mb(offset_p5+1),dbl_mb(offset_p6+1), 441 & range_h1,range_h2,range_h3, 442 & range_p4,range_p5,range_p6, 443 & energy1,energy2) 444 return 445 end 446 447#ifdef USE_OPENMP 448#ifdef USE_OFFLOAD 449 subroutine offl_zero(array, size) 450 implicit none 451 double precision array(size) 452 integer size 453 integer ii 454!$omp target 455!$omp TEAMS_DISTRIBUTE parallel do OMP_SCHEDULE 456 do ii=1,size 457 array(ii)=0d0 458 enddo 459!$omp end TEAMS_DISTRIBUTE parallel do 460!$omp end target 461 return 462 end 463#endif 464#endif