1 2#define NBLOCKS 2 3 4* 5* $Id$ 6* 7 8* *********************************************************** 9* * * 10* * C3dB_pfft library * 11* * (MPI implemenation) * 12* * * 13* * Author - Eric Bylaska * 14* * date - 3/23/96 * 15* * * 16* *********************************************************** 17 18* *********************************** 19* * * 20* * C3dB_pfft_init * 21* * * 22* *********************************** 23 24 subroutine C3dB_pfft_init() 25 implicit none 26 27#include "bafdecls.fh" 28#include "errquit.fh" 29#include "C3dB.fh" 30#include "C3dB_pfft.fh" 31 32* **** local variables **** 33 integer taskid,MASTER 34 parameter (MASTER=0) 35 36 double precision eps 37 parameter (eps=1d-12) 38 39 logical value,yrow,zrow,yzslab 40 integer nxh,nyh,nzh,q,p 41 integer k1,k2,k3,fb,nbb 42 integer i,j,k 43 integer index2 44 double precision ggcut,g1,g2,g3,gg 45 integer zero_arow2(2),zero_arow3(2) 46 47* **** external functions *** 48 integer control_pfft3_qsize,brillioun_nbrillq 49 real*8 lattice_ggcut,lattice_wggcut,lattice_unitg 50 real*8 brillioun_k 51 external control_pfft3_qsize,brillioun_nbrillq 52 external lattice_ggcut,lattice_wggcut,lattice_unitg 53 external brillioun_k 54 55 call Parallel3d_taskid_i(taskid) 56 nxh = nx(1)/2 57 nyh = ny(1)/2 58 nzh = nz(1)/2 59 60 if (mapping.eq.1) then 61 62 value = BA_alloc_get(mt_log,nx(1)*nq(1), 63 > 'zero_row3',zero_row3(2,0),zero_row3(1,0)) 64 value = value.and.BA_alloc_get(mt_log,nx(1)*nq(1), 65 > 'zero_row3',zero_row3(2,1),zero_row3(1,1)) 66 67 value = value.and.BA_alloc_get(mt_log,nx(1)*nq(1), 68 > 'zero_row2',zero_row2(2,0),zero_row2(1,0)) 69 value = value.and.BA_alloc_get(mt_log,nx(1)*nq(1), 70 > 'zero_row2',zero_row2(2,1),zero_row2(1,1)) 71 72 value = value.and.BA_alloc_get(mt_log,nx(1), 73 > 'zero_slab23',zero_slab23(2,0),zero_slab23(1,0)) 74 value = value.and.BA_alloc_get(mt_log,nx(1), 75 > 'zero_slab23',zero_slab23(2,1),zero_slab23(1,1)) 76 if (.not. value) call errquit('out of heap memory',0, MA_ERR) 77 78 value = value.and.BA_push_get(mt_log,(nx(1)*ny(1)), 79 > 'zero_arow3',zero_arow3(2),zero_arow3(1)) 80 if (.not. value) call errquit('out of stack memory',0,MA_ERR) 81 82 83 !*** fb==0 - density, fb=1 - all wavefunctions/brillioun zones *** 84 do fb=0,1 85 86 if (fb.eq.0) then 87 ggcut = lattice_ggcut() 88 else 89 ggcut = lattice_wggcut() 90 end if 91 92* **** find zero_row3 - (i,*,k) rows that are zero **** 93* **** note the crazy notation used below **** 94 do q=1,nx(1)*nq(1) 95 log_mb(zero_row3(1,fb)+q-1) =.true. 96 end do 97 do q=1,nx(1)*ny(1) 98 log_mb(zero_arow3(1)+q-1) =.true. 99 end do 100 101 102 do k2 = -nyh+1, nyh-1 103 do k1 = -nxh+1, nxh-1 104 i=k1 105 j=k2 106 if (i .lt. 0) i = i + nx(1) 107 if (j .lt. 0) j = j + ny(1) 108 i=i+1 109 j=j+1 110 zrow = .true. 111 do k3 = -nzh+1, nzh-1 112 if (fb.eq.0) then 113 g1 = k1*lattice_unitg(1,1) 114 > + k3*lattice_unitg(1,2) 115 > + k2*lattice_unitg(1,3) 116 g2 = k1*lattice_unitg(2,1) 117 > + k3*lattice_unitg(2,2) 118 > + k2*lattice_unitg(2,3) 119 g3 = k1*lattice_unitg(3,1) 120 > + k3*lattice_unitg(3,2) 121 > + k2*lattice_unitg(3,3) 122 gg = g1*g1 + g2*g2 + g3*g3 123 gg= gg-ggcut 124 if (gg.lt.-eps) zrow = .false. 125 else 126 do nbb=1,brillioun_nbrillq() 127 g1 = k1*lattice_unitg(1,1) 128 > + k3*lattice_unitg(1,2) 129 > + k2*lattice_unitg(1,3) 130 > + brillioun_k(1,nbb) 131 g2 = k1*lattice_unitg(2,1) 132 > + k3*lattice_unitg(2,2) 133 > + k2*lattice_unitg(2,3) 134 > + brillioun_k(2,nbb) 135 g3 = k1*lattice_unitg(3,1) 136 > + k3*lattice_unitg(3,2) 137 > + k2*lattice_unitg(3,3) 138 > + brillioun_k(3,nbb) 139 gg = g1*g1 + g2*g2 + g3*g3 140 gg= gg-ggcut 141 if (gg.lt.-eps) zrow = .false. 142 end do 143 endif 144 145 end do 146 if (.not.zrow) then 147 log_mb(zero_arow3(1)+(i-1)+nx(1)*(j-1)) =.false. 148 call C3dB_ktoqp(1,j,q,p) 149 if (p.eq.taskid) then 150 index2 = i + nx(1)*(q-1) 151 log_mb(zero_row3(1,fb)+index2-1) =.false. 152 end if 153 end if 154 155 end do 156 end do 157 158 call C3dB_c_ptranspose_jk_init(fb,log_mb(zero_arow3(1))) 159 160 161* **** find zero_slab23 - (i,*,*) slabs that are zero **** 162 do k1 = 1,nx(1) 163 log_mb(zero_slab23(1,fb)+k1-1) =.true. 164 end do 165 166 do k1 = -nxh+1, nxh-1 167 i=k1 168 if (i .lt. 0) i = i + nx(1) 169 i=i+1 170 yzslab = .true. 171 do k3 = -nzh+1, nzh-1 172 do k2 = -nyh+1, nyh-1 173 if (fb.eq.0) then 174 g1 = k1*lattice_unitg(1,1) 175 > + k2*lattice_unitg(1,2) 176 > + k3*lattice_unitg(1,3) 177 g2 = k1*lattice_unitg(2,1) 178 > + k2*lattice_unitg(2,2) 179 > + k3*lattice_unitg(2,3) 180 g3 = k1*lattice_unitg(3,1) 181 > + k2*lattice_unitg(3,2) 182 > + k3*lattice_unitg(3,3) 183 gg = g1*g1 + g2*g2 + g3*g3 184 gg= gg-ggcut 185 if (gg.lt.-eps) yzslab = .false. 186 else 187 do nbb=1,brillioun_nbrillq() 188 g1 = k1*lattice_unitg(1,1) 189 > + k2*lattice_unitg(1,2) 190 > + k3*lattice_unitg(1,3) 191 > + brillioun_k(1,nbb) 192 g2 = k1*lattice_unitg(2,1) 193 > + k2*lattice_unitg(2,2) 194 > + k3*lattice_unitg(2,3) 195 > + brillioun_k(2,nbb) 196 g3 = k1*lattice_unitg(3,1) 197 > + k2*lattice_unitg(3,2) 198 > + k3*lattice_unitg(3,3) 199 > + brillioun_k(3,nbb) 200 gg = g1*g1 + g2*g2 + g3*g3 201 gg= gg-ggcut 202 if (gg.lt.-eps) yzslab = .false. 203 end do 204 end if 205 end do 206 end do 207 if (.not.yzslab) then 208 log_mb(zero_slab23(1,fb)+i-1) =.false. 209 end if 210 211 end do 212 213* **** find zero_row2 - (i,*,k) rows that are zero after fft of (i,j,*) **** 214 do k3 = 1,nz(1) 215 do k1 = 1,nx(1) 216 call C3dB_ktoqp(1,k3,q,p) 217 if (p.eq.taskid) then 218 index2 = k1 + nx(1)*(q-1) 219 log_mb(zero_row2(1,fb)+index2-1) 220 > = log_mb(zero_slab23(1,fb)+k1-1) 221 end if 222 end do 223 end do 224 225 226 end do !*fb* 227 value = BA_pop_stack(zero_arow3(2)) 228 if (.not. value) call errquit('error freeing stack',0,MA_ERR) 229 230 !*** mapping == 2 *** 231 else 232 value = BA_alloc_get(mt_log,nq3(1), 233 > 'zero_row3',zero_row3(2,0),zero_row3(1,0)) 234 value = value.and.BA_alloc_get(mt_log,nq3(1), 235 > 'zero_row3',zero_row3(2,1),zero_row3(1,1)) 236 237 value = value.and.BA_alloc_get(mt_log,nq2(1), 238 > 'zero_row2',zero_row2(2,0),zero_row2(1,0)) 239 value = value.and.BA_alloc_get(mt_log,nq2(1), 240 > 'zero_row2',zero_row2(2,1),zero_row2(1,1)) 241 242 value = value.and.BA_alloc_get(mt_log,nx(1), 243 > 'zero_slab23',zero_slab23(2,0),zero_slab23(1,0)) 244 value = value.and.BA_alloc_get(mt_log,nx(1), 245 > 'zero_slab23',zero_slab23(2,1),zero_slab23(1,1)) 246 if (.not. value) call errquit('out of heap memory',0, MA_ERR) 247 248 value = value.and.BA_push_get(mt_log,nx(1)*nz(1), 249 > 'zero_arow2',zero_arow2(2),zero_arow2(1)) 250 value = value.and.BA_push_get(mt_log,nx(1)*ny(1), 251 > 'zero_arow3',zero_arow3(2),zero_arow3(1)) 252 if (.not. value) call errquit('out of stack memory',0,MA_ERR) 253 254 !*** fb==0 - density, fb=1 - all wavefunctions/brillioun zones *** 255 do fb=0,1 256 257 if (fb.eq.0) then 258 ggcut = lattice_ggcut() 259 else 260 ggcut = lattice_wggcut() 261 end if 262 263* **** find zero_row3 - (i,j,*) rows that are zero **** 264 do q = 1,nq3(1) 265 log_mb(zero_row3(1,fb)+q-1) =.true. 266 end do 267 do q = 1,nx(1)*ny(1) 268 log_mb(zero_arow3(1)+q-1) =.true. 269 end do 270 271 do k2 = -nyh+1, nyh-1 272 do k1 = -nxh+1, nxh-1 273 i=k1 274 j=k2 275 if (i .lt. 0) i = i + nx(1) 276 if (j .lt. 0) j = j + ny(1) 277 i=i+1 278 j=j+1 279 zrow = .true. 280 do k3 = -nzh+1, nzh-1 281 if (fb.eq.0) then 282 g1 = k1*lattice_unitg(1,1) 283 > + k2*lattice_unitg(1,2) 284 > + k3*lattice_unitg(1,3) 285 g2 = k1*lattice_unitg(2,1) 286 > + k2*lattice_unitg(2,2) 287 > + k3*lattice_unitg(2,3) 288 g3 = k1*lattice_unitg(3,1) 289 > + k2*lattice_unitg(3,2) 290 > + k3*lattice_unitg(3,3) 291 gg = g1*g1 + g2*g2 + g3*g3 292 gg= gg-ggcut 293 if (gg.lt.-eps) zrow = .false. 294 else 295 do nbb=1,brillioun_nbrillq() 296 g1 = k1*lattice_unitg(1,1) 297 > + k2*lattice_unitg(1,2) 298 > + k3*lattice_unitg(1,3) 299 > + brillioun_k(1,nbb) 300 g2 = k1*lattice_unitg(2,1) 301 > + k2*lattice_unitg(2,2) 302 > + k3*lattice_unitg(2,3) 303 > + brillioun_k(2,nbb) 304 g3 = k1*lattice_unitg(3,1) 305 > + k2*lattice_unitg(3,2) 306 > + k3*lattice_unitg(3,3) 307 > + brillioun_k(3,nbb) 308 gg = g1*g1 + g2*g2 + g3*g3 309 gg= gg-ggcut 310 if (gg.lt.-eps) zrow = .false. 311 end do 312 end if 313 end do 314 if (.not.zrow) then 315 log_mb(zero_arow3(1)+(i-1)+nx(1)*(j-1)) =.false. 316 q = int_mb(q_map3(1,1)+(i-1)+(j-1)*(nx(1))) 317 p = int_mb(p_map3(1,1)+(i-1)+(j-1)*(nx(1))) 318 if (p.eq.taskid) then 319 log_mb(zero_row3(1,fb)+q-1) =.false. 320 end if 321 end if 322 323 end do 324 end do 325 326* **** find zero_slab23 - (i,*,*) slabs that are zero **** 327 do k1 = 1,nx(1) 328 log_mb(zero_slab23(1,fb)+k1-1) =.true. 329 end do 330 331 do k1 = -nxh+1, nxh-1 332 i=k1 333 if (i .lt. 0) i = i + nx(1) 334 i=i+1 335 yzslab = .true. 336 do k3 = -nzh+1, nzh-1 337 do k2 = -nyh+1, nyh-1 338 if (fb.eq.0) then 339 g1 = k1*lattice_unitg(1,1) 340 > + k2*lattice_unitg(1,2) 341 > + k3*lattice_unitg(1,3) 342 g2 = k1*lattice_unitg(2,1) 343 > + k2*lattice_unitg(2,2) 344 > + k3*lattice_unitg(2,3) 345 g3 = k1*lattice_unitg(3,1) 346 > + k2*lattice_unitg(3,2) 347 > + k3*lattice_unitg(3,3) 348 gg = g1*g1 + g2*g2 + g3*g3 349 gg= gg-ggcut 350 if (gg.lt.-eps) yzslab = .false. 351 else 352 do nbb=1,brillioun_nbrillq() 353 g1 = k1*lattice_unitg(1,1) 354 > + k2*lattice_unitg(1,2) 355 > + k3*lattice_unitg(1,3) 356 > + brillioun_k(1,nbb) 357 g2 = k1*lattice_unitg(2,1) 358 > + k2*lattice_unitg(2,2) 359 > + k3*lattice_unitg(2,3) 360 > + brillioun_k(2,nbb) 361 g3 = k1*lattice_unitg(3,1) 362 > + k2*lattice_unitg(3,2) 363 > + k3*lattice_unitg(3,3) 364 > + brillioun_k(3,nbb) 365 gg = g1*g1 + g2*g2 + g3*g3 366 gg= gg-ggcut 367 if (gg.lt.-eps) yzslab = .false. 368 end do 369 end if 370 end do 371 end do 372 if (.not.yzslab) then 373 log_mb(zero_slab23(1,fb)+i-1) =.false. 374 end if 375 376 end do 377 378 379* **** find zero_row2 - (i,*,k) rows that are zero after fft of (i,j,*) **** 380 do k = 1,nz(1) 381 do i = 1,nx(1) 382 q = int_mb(q_map2(1,1)+(k-1)+(i-1)*(nz(1))) 383 p = int_mb(p_map2(1,1)+(k-1)+(i-1)*(nz(1))) 384 385 log_mb(zero_arow2(1)+i-1+nx(1)*(k-1)) 386 > = log_mb(zero_slab23(1,fb)+i-1) 387 if (p.eq.taskid) then 388 log_mb(zero_row2(1,fb)+q-1) 389 > = log_mb(zero_slab23(1,fb)+i-1) 390 end if 391 end do 392 end do 393 394 call C3dB_c_ptranspose_ijk_init(fb, 395 > log_mb(zero_arow2(1)), 396 > log_mb(zero_arow3(1))) 397 398 399 end do 400 401 value = BA_pop_stack(zero_arow3(2)) 402 value = value.and.BA_pop_stack(zero_arow2(2)) 403 if (.not. value) call errquit('error pop stack',0,MA_ERR) 404 405 406 end if 407 408 call C3dB_pfft3_queue_init(control_pfft3_qsize()) 409 410 return 411 end 412 413 414 415 416* *********************************** 417* * * 418* * C3dB_pfft_end * 419* * * 420* *********************************** 421 422 subroutine C3dB_pfft_end() 423 implicit none 424 425#include "bafdecls.fh" 426#include "errquit.fh" 427#include "C3dB.fh" 428#include "C3dB_pfft.fh" 429 430 431 432* **** indexing variables **** 433 integer iq_to_i1(2,0:1) 434 integer iq_to_i2(2,0:1) 435 integer iz_to_i2(2,0:1) 436 integer i1_start(2,0:1) 437 integer i2_start(2,0:1) 438 common / c_ptrans_blk1 / iq_to_i1,iq_to_i2,iz_to_i2, 439 > i1_start,i2_start 440 441* **** indexing variables **** 442 integer jq_to_i1(2,0:1) 443 integer jq_to_i2(2,0:1) 444 integer jz_to_i2(2,0:1) 445 integer j1_start(2,0:1) 446 integer j2_start(2,0:1) 447 common / c_ptrans_blk2 / jq_to_i1,jq_to_i2,jz_to_i2, 448 > j1_start,j2_start 449 450 451* *** hilbert tranpose data structure **** 452 integer h_iq_to_i1(2,6,0:1) 453 integer h_iq_to_i2(2,6,0:1) 454 integer h_iz_to_i2(2,6,0:1) 455 integer h_iz_to_i2_count(6,0:1) 456 integer h_i1_start(2,6,0:1) 457 integer h_i2_start(2,6,0:1) 458 common / c_ptrans_blk_ijk / h_iq_to_i1, 459 > h_iq_to_i2, 460 > h_iz_to_i2, 461 > h_iz_to_i2_count, 462 > h_i1_start, 463 > h_i2_start 464 465 466 logical value 467 integer i 468 469 value = BA_free_heap(zero_row2(2,0)) 470 value = value.and.BA_free_heap(zero_row2(2,1)) 471 value = value.and.BA_free_heap(zero_row3(2,0)) 472 value = value.and.BA_free_heap(zero_row3(2,1)) 473 value = value.and.BA_free_heap(zero_slab23(2,0)) 474 value = value.and.BA_free_heap(zero_slab23(2,1)) 475 476 if (mapping.eq.1) then 477 value = value.and.BA_free_heap(iq_to_i1(2,0)) 478 value = value.and.BA_free_heap(iq_to_i1(2,1)) 479 value = value.and.BA_free_heap(iq_to_i2(2,0)) 480 value = value.and.BA_free_heap(iq_to_i2(2,1)) 481 value = value.and.BA_free_heap(iz_to_i2(2,0)) 482 value = value.and.BA_free_heap(iz_to_i2(2,1)) 483 value = value.and.BA_free_heap(i1_start(2,0)) 484 value = value.and.BA_free_heap(i1_start(2,1)) 485 value = value.and.BA_free_heap(i2_start(2,0)) 486 value = value.and.BA_free_heap(i2_start(2,1)) 487 488 value = value.and.BA_free_heap(jq_to_i1(2,0)) 489 value = value.and.BA_free_heap(jq_to_i1(2,1)) 490 value = value.and.BA_free_heap(jq_to_i2(2,0)) 491 value = value.and.BA_free_heap(jq_to_i2(2,1)) 492 value = value.and.BA_free_heap(jz_to_i2(2,0)) 493 value = value.and.BA_free_heap(jz_to_i2(2,1)) 494 value = value.and.BA_free_heap(j1_start(2,0)) 495 value = value.and.BA_free_heap(j1_start(2,1)) 496 value = value.and.BA_free_heap(j2_start(2,0)) 497 value = value.and.BA_free_heap(j2_start(2,1)) 498 end if 499 500 if (mapping.eq.2) then 501 do i=1,6 502 value = value.and.BA_free_heap(h_i1_start(2,i,0)) 503 value = value.and.BA_free_heap(h_i2_start(2,i,0)) 504 value = value.and.BA_free_heap(h_iq_to_i1(2,i,0)) 505 value = value.and.BA_free_heap(h_iq_to_i2(2,i,0)) 506 value = value.and.BA_free_heap(h_iz_to_i2(2,i,0)) 507 value = value.and.BA_free_heap(h_i1_start(2,i,1)) 508 value = value.and.BA_free_heap(h_i2_start(2,i,1)) 509 value = value.and.BA_free_heap(h_iq_to_i1(2,i,1)) 510 value = value.and.BA_free_heap(h_iq_to_i2(2,i,1)) 511 value = value.and.BA_free_heap(h_iz_to_i2(2,i,1)) 512 end do 513 end if 514 515 if (.not. value) call errquit('error freeing heap',0,MA_ERR) 516 517 call C3dB_pfft3_queue_end() 518 return 519 end 520 521 522 523 524* *********************************** 525* * * 526* * C3dB_cr_pfft3b * 527* * * 528* *********************************** 529 530 subroutine C3dB_cr_pfft3b(nb,nbb,A) 531 532***************************************************** 533* * 534* This routine performs the operation of * 535* a three dimensional complex to complex * 536* inverse fft * 537* A(nx,ny(nb),nz(nb)) <- FFT3^(-1)[A(kx,ky,kz)] * 538* * 539* Entry - * 540* A: a column distribuded 3d block * 541* tmp: tempory work space must be at * 542* least the size of (complex) * 543* (nfft*nfft + 1) + 10*nfft * 544* * 545* Exit - A is transformed and the imaginary * 546* part of A is set to zero * 547* uses - C3dB_c_ptranspose_jk, dcopy * 548* * 549***************************************************** 550 551 implicit none 552 integer nb,nbb 553 complex*16 A(*) 554 555#include "bafdecls.fh" 556#include "errquit.fh" 557 558#include "C3dB.fh" 559#include "C3dB_pfft.fh" 560 561 integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS) 562 common / C3dB_fft / tmpx,tmpy,tmpz 563 564 565* *** local variables *** 566 integer i,j,k,q,indx,fb 567 integer indx2 568#ifdef MLIB 569 integer ierr 570#endif 571 572 573 !integer tmp1(2),tmp2(2),tmp3(2) 574 integer tmp2(2),tmp3(2) 575 logical value 576 577 578 call nwpw_timing_start(1) 579 580 !*** set fb *** 581 if (nbb.eq.0) then 582 fb = 0 583 else 584 fb = 1 585 end if 586 587 588* ***** allocate temporary space **** 589 !call C3dB_nfft3d(nb,nfft3d) 590 value = BA_push_get(mt_dcpl,(nfft3d(nb)),'ffttmp2', 591 > tmp2(2),tmp2(1)) 592 value = value.and. 593 > BA_push_get(mt_dcpl,(nfft3d(nb)),'ffttmp3',tmp3(2),tmp3(1)) 594 if (.not. value) call errquit('out of stack memory',0, MA_ERR) 595 596 597 !********************** 598 !**** slab mapping **** 599 !********************** 600 if (mapping.eq.1) then 601 602* **************************************************** 603* *** do fft along ky dimension *** 604* *** A(kx,ny(nb),kz) <- fft1d^(-1)[A(kx,ky,kz)] *** 605* **************************************************** 606#ifdef MLIB 607 !call z1dfft(dbl_mb(tmp3(1)),nz(nb),dcpl_mb(tmpz(1)),-3,ierr) 608 indx2 = 0 609 do q=1,nq(nb) 610 do i=1,nx(nb) 611 indx2 = indx2 + 1 612 if (.not.log_mb(zero_row3(1,fb)+indx2-1)) then 613 614 indx = i + (q-1)*nx(nb)*nz(nb) 615 call zcopy(nz(nb),A(indx),nx(nb),dcpl_mb(tmp2(1)),1) 616 call z1dfft(dcpl_mb(tmp2(1)),nz(nb), 617 > dcpl_mb(tmpz(1,nb)),-2,ierr) 618 call zcopy(nz(nb),dcpl_mb(tmp2(1)),1,A(indx),nx(nb)) 619 620 end if 621 end do 622 end do 623 624#else 625 !call dcffti(nz(nb),dcpl_mb(tmp1(1))) 626 indx2 = 0 627 do q=1,nq(nb) 628 do i=1,nx(nb) 629 indx2 = indx2 + 1 630 if (.not.log_mb(zero_row3(1,fb)+indx2-1)) then 631 indx = i + (q-1)*nx(nb)*nz(nb) 632 call zcopy(nz(nb),A(indx),nx(nb),dcpl_mb(tmp2(1)),1) 633 call dcfftb(nz(nb),dcpl_mb(tmp2(1)),dcpl_mb(tmpz(1,nb))) 634 call zcopy(nz(nb),dcpl_mb(tmp2(1)),1,A(indx),nx(nb)) 635 end if 636 end do 637 end do 638#endif 639 640* ******************************************** 641* *** Do a ptranspose of A *** 642* *** A(kx,kz,ny(nb)) <- A(kx,ny(nb),kz) *** 643* ******************************************** 644 call C3dB_c_ptranspose1_jk(fb,A,dcpl_mb(tmp2(1)),dcpl_mb(tmp3(1))) 645 646* ************************************************* 647* *** do fft along kz dimension *** 648* *** A(kx,nz(nb),ny(nb)) <- fft1d^(-1)[A(kx,kz,ny(nb))] *** 649* ************************************************* 650#ifdef MLIB 651 652 indx2 = 0 653 do q=1,nq(nb) 654 do i=1,nx(nb) 655 indx2 = indx2 + 1 656 if (.not.log_mb(zero_row2(1,fb)+indx2-1)) then 657 indx = i + (q-1)*nx(nb)*ny(nb) 658 call zcopy(ny(nb),A(indx),nx(nb),dcpl_mb(tmp2(1)),1) 659 call z1dfft(dcpl_mb(tmp2(1)),ny(nb), 660 > dcpl_mb(tmpy(1,nb)),-2,ierr) 661 call zcopy(ny(nb),dcpl_mb(tmp2(1)),1,A(indx),nx(nb)) 662 end if 663 end do 664 end do 665 666#else 667 indx2 = 0 668 do q=1,nq(nb) 669 do i=1,nx(nb) 670 indx2 = indx2 + 1 671 if (.not.log_mb(zero_row2(1,fb)+indx2-1)) then 672 indx = i + (q-1)*nx(nb)*ny(nb) 673 call zcopy(ny(nb),A(indx),nx(nb),dcpl_mb(tmp2(1)),1) 674 call dcfftb(ny(nb),dcpl_mb(tmp2(1)),dcpl_mb(tmpy(1,nb))) 675 call zcopy(ny(nb),dcpl_mb(tmp2(1)),1,A(indx),nx(nb)) 676 end if 677 end do 678 end do 679#endif 680 681* ********************************************************************* 682* *** do fft along kx dimension *** 683* *** A(nx(nb),nz(nb),ny(nb)) <- fft1d^(-1)[A(kx,nz(nb),ny(nb))] *** 684* ********************************************************************* 685#ifdef MLIB 686 !call drc1ft (dbl_mb(tmp3(1)),nx(nb),dcpl_mb(tmp1(1)),-3,ierr) 687 indx = 1 688 do q=1,nq(nb) 689 do j=1,ny(nb) 690 call z1dfft(A(indx),nx(nb), 691 > dcpl_mb(tmpx(1,nb)),-2,ierr) 692 indx = indx + nx(nb) 693 end do 694 end do 695#else 696 indx = 1 697 do q=1,nq(nb) 698 do j=1,ny(nb) 699 call dcfftb(nx(nb),A(indx),dcpl_mb(tmpx(1,nb))) 700 indx = indx + nx(nb) 701 end do 702 end do 703#endif 704 705 706 !************************* 707 !**** hilbert mapping **** 708 !************************* 709 else 710 711* ************************************************* 712* *** do fft along kz dimension *** 713* *** A(nz(nb),kx,ky) <- fft1d^(-1)[A(kz,kx,ky)] *** 714* ************************************************* 715#ifdef MLIB 716 indx = 1 717 do q=1,nq3(nb) 718 if (.not.log_mb(zero_row3(1,fb)+q-1)) then 719 call z1dfft(A(indx),nz(nb),dcpl_mb(tmpz(1,nb)),-2,ierr) 720 end if 721 indx = indx + nz(nb) 722 end do 723#else 724 indx = 1 725 do q=1,nq3(nb) 726 if (.not.log_mb(zero_row3(1,fb)+q-1)) then 727 call dcfftb(nz(nb),A(indx),dcpl_mb(tmpz(1,nb))) 728 end if 729 indx = indx + nz(nb) 730 end do 731#endif 732 733 call C3dB_c_ptranspose_ijk(fb,3,A, 734 > dcpl_mb(tmp2(1)), 735 > dcpl_mb(tmp3(1))) 736 737* ************************************************* 738* *** do fft along ky dimension *** 739* *** A(ny(nb),nz(nb),kx) <- fft1d^(-1)[A(ky,nz(nb),kx)] *** 740* ************************************************* 741#ifdef MLIB 742 indx = 1 743 do q=1,nq2(nb) 744 if (.not.log_mb(zero_row2(1,fb)+q-1)) then 745 call z1dfft(A(indx),ny(nb),dcpl_mb(tmpy(1,nb)),-2,ierr) 746 end if 747 indx = indx + ny(nb) 748 end do 749#else 750 indx = 1 751 do q=1,nq2(nb) 752 if (.not.log_mb(zero_row2(1,fb)+q-1)) then 753 call dcfftb(ny(nb),A(indx),dcpl_mb(tmpy(1,nb))) 754 end if 755 indx = indx + ny(nb) 756 end do 757#endif 758 759 call C3dB_c_ptranspose_ijk(fb,4,A, 760 > dcpl_mb(tmp2(1)), 761 > dcpl_mb(tmp3(1))) 762 763* ************************************************* 764* *** do fft along kx dimension *** 765* *** A(nx(nb),ny(nb),nz(nb)) <- fft1d^(-1)[A(kx,ny(nb),nz(nb))] *** 766* ************************************************* 767#ifdef MLIB 768 indx = 1 769 do q=1,nq1(nb) 770 call z1dfft(A(indx),nx(nb),dcpl_mb(tmpx(1,nb)),-2,ierr) 771 indx = indx + nx(nb) 772 end do 773#else 774 indx = 1 775 do q=1,nq1(nb) 776 call dcfftb(nx(nb),A(indx),dcpl_mb(tmpx(1,nb))) 777 indx = indx + nx(nb) 778 end do 779#endif 780 781 end if 782 783* **** deallocate temporary space **** 784 value = BA_pop_stack(tmp3(2)) 785 value = BA_pop_stack(tmp2(2)) 786 !value = BA_pop_stack(tmp1(2)) 787 788 call nwpw_timing_end(1) 789 return 790 end 791 792 793 794 795* *********************************** 796* * * 797* * C3dB_rc_pfft3f * 798* * * 799* *********************************** 800 801 subroutine C3dB_rc_pfft3f(nb,nbb,A) 802 803***************************************************** 804* * 805* This routine performs the operation of * 806* a three dimensional complex to complex fft * 807* A(kx,ky,kz) <- FFT3[A(nx(nb),ny(nb),nz(nb))] * 808* * 809* Entry - * 810* A: a column distribuded 3d block * 811* tmp: tempory work space must be at * 812* least the size of (complex) * 813* (nfft*nfft + 1) + 10*nfft * 814* * 815* Exit - A is transformed * 816* * 817* uses - ptranspose1 subroutine * 818* * 819***************************************************** 820 821 implicit none 822 integer nb,nbb 823 complex*16 A(*) 824 825#include "bafdecls.fh" 826#include "errquit.fh" 827 828#include "C3dB.fh" 829#include "C3dB_pfft.fh" 830 831 integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS) 832 common / C3dB_fft / tmpx,tmpy,tmpz 833 834* *** local variables *** 835 integer i,j,k,q,indx,indx1,indx2,fb 836#ifdef MLIB 837 integer ierr 838#endif 839 840 !integer tmp1(2),tmp2(2),tmp3(2) 841 integer tmp2(2),tmp3(2) 842 logical value 843 844 845 call nwpw_timing_start(1) 846 847 !*** set fb *** 848 if (nbb.eq.0) then 849 fb = 0 850 else 851 fb = 1 852 end if 853 854* ***** allocate temporary space **** 855 !call C3dB_nfft3d(nb,nfft3d) 856 value = BA_push_get(mt_dcpl,(nfft3d(nb)),'tmp2',tmp2(2),tmp2(1)) 857 value = value.and. 858 > BA_push_get(mt_dcpl,(nfft3d(nb)),'tmp3',tmp3(2),tmp3(1)) 859 if (.not. value) call errquit('out of stack memory',0, MA_ERR) 860 861 !********************** 862 !**** slab mapping **** 863 !********************** 864 if (mapping.eq.1) then 865* ******************************************** 866* *** do fft along nx(nb) dimension *** 867* *** A(kx,nz(nb),ny(nb)) <- fft1d[A(nx(nb),nz(nb),ny(nb))] *** 868* ******************************************** 869#ifdef MLIB 870 indx = 1 871 do q=1,nq(nb) 872 do j=1,ny(nb) 873 call z1dfft(A(indx),nx(nb),dcpl_mb(tmpx(1,nb)),1,ierr) 874 indx = indx + nx(nb) 875 end do 876 end do 877#else 878 indx = 1 879 do q=1,nq(nb) 880 do j=1,ny(nb) 881 call dcfftf(nx(nb),A(indx),dcpl_mb(tmpx(1,nb))) 882 indx = indx + nx(nb) 883 end do 884 end do 885#endif 886 887 888* ******************************************** 889* *** do fft along nz(nb) dimension *** 890* *** A(kx,kz,nz(nb)) <- fft1d[A(kx,nz(nb),ny(nb))] *** 891* ******************************************** 892 893#ifdef MLIB 894 895 indx2 = 0 896 do q=1,nq(nb) 897 do i=1,nx(nb) 898 indx2 = indx2 + 1 899 if (.not.log_mb(zero_row2(1,fb)+indx2-1)) then 900 901 indx = i + (q-1)*nx(nb)*ny(nb) 902 call zcopy(ny(nb),A(indx),nx(nb),dcpl_mb(tmp2(1)),1) 903 call z1dfft(dcpl_mb(tmp2(1)),ny(nb),dcpl_mb(tmpy(1,nb)),1,ierr) 904 call zcopy(ny(nb),dcpl_mb(tmp2(1)),1,A(indx),nx(nb)) 905 906 end if 907 end do 908 end do 909#else 910 911 indx2 = 0 912 do q=1,nq(nb) 913 do i=1,nx(nb) 914 indx2 = indx2 + 1 915 if (.not.log_mb(zero_row2(1,fb)+indx2-1)) then 916 917 indx = i + (q-1)*nx(nb)*ny(nb) 918 call zcopy(ny(nb),A(indx),nx(nb),dcpl_mb(tmp2(1)),1) 919 call dcfftf(ny(nb),dcpl_mb(tmp2(1)),dcpl_mb(tmpy(1,nb))) 920 call zcopy(ny(nb),dcpl_mb(tmp2(1)),1,A(indx),nx(nb)) 921 end if 922 end do 923 end do 924 925#endif 926 927* ******************************************** 928* *** Do a ptranspose of A *** 929* *** A(ky,ny(nb),kz) <- A(kx,kz,ny(nb)) *** 930* ******************************************** 931 call C3dB_c_ptranspose2_jk(fb,A,dcpl_mb(tmp2(1)),dcpl_mb(tmp3(1))) 932 933 934* ******************************************** 935* *** do fft along ny(nb) dimension *** 936* *** A(kx,ky,kz) <- fft1d[A(kx,ny(nb),kz)] *** 937* ******************************************** 938#ifdef MLIB 939 940 indx2 = 0 941 do q=1,nq(nb) 942 do i=1,nx(nb) 943 indx2 = indx2 + 1 944 if (.not.log_mb(zero_row3(1,fb)+indx2-1)) then 945 indx = i + (q-1)*nx(nb)*ny(nb) 946 call zcopy(nz(nb),A(indx),nx(nb),dcpl_mb(tmp2(1)),1) 947 call z1dfft(dcpl_mb(tmp2(1)),nz(nb),dcpl_mb(tmpz(1,nb)),1,ierr) 948 call zcopy(nz(nb),dcpl_mb(tmp2(1)),1,A(indx),nx(nb)) 949 950 end if 951 end do 952 end do 953#else 954 955 indx2 = 0 956 do q=1,nq(nb) 957 do i=1,nx(nb) 958 indx2 = indx2 + 1 959 if (.not.log_mb(zero_row3(1,fb)+indx2-1)) then 960 indx = i + (q-1)*nx(nb)*ny(nb) 961 call zcopy(nz(nb),A(indx),nx(nb),dcpl_mb(tmp2(1)),1) 962 call dcfftf(nz(nb),dcpl_mb(tmp2(1)),dcpl_mb(tmpz(1,nb))) 963 call zcopy(nz(nb),dcpl_mb(tmp2(1)),1,A(indx),nx(nb)) 964 end if 965 end do 966 end do 967 968#endif 969 970 971 !************************* 972 !**** hilbert mapping **** 973 !************************* 974 else 975 976* ******************************************** 977* *** do fft along nx(nb) dimension *** 978* *** A(kx,ny(nb),nz(nb)) <- fft1d[A(nx(nb),ny(nb),nz(nb))] *** 979* ******************************************** 980#ifdef MLIB 981 indx = 1 982 do q=1,nq1(nb) 983 call z1dfft(A(indx),nx(nb),dcpl_mb(tmpx(1,nb)),1,ierr) 984 indx = indx + nx(nb) 985 end do 986#else 987 indx = 1 988 do q=1,nq1(nb) 989 call dcfftf(nx(nb),A(indx),dcpl_mb(tmpx(1,nb))) 990 indx = indx + nx(nb) 991 end do 992#endif 993 994 call C3dB_c_ptranspose_ijk(fb,1,A, 995 > dcpl_mb(tmp2(1)), 996 > dcpl_mb(tmp3(1))) 997 998* ******************************************** 999* *** do fft along ny(nb) dimension *** 1000* *** A(ky,nz(nb),kx) <- fft1d[A(ny(nb),nz(nb),kx)] *** 1001* ******************************************** 1002#ifdef MLIB 1003 indx = 1 1004 do q=1,nq2(nb) 1005 if (.not.log_mb(zero_row2(1,fb)+q-1)) then 1006 call z1dfft(A(indx),ny(nb),dcpl_mb(tmpy(1,nb)),1,ierr) 1007 end if 1008 indx = indx + ny(nb) 1009 end do 1010#else 1011 indx = 1 1012 do q=1,nq2(nb) 1013 if (.not.log_mb(zero_row2(1,fb)+q-1)) then 1014 call dcfftf(ny(nb),A(indx),dcpl_mb(tmpy(1,nb))) 1015 end if 1016 indx = indx + ny(nb) 1017 end do 1018#endif 1019 1020 call C3dB_c_ptranspose_ijk(fb,2,A, 1021 > dcpl_mb(tmp2(1)), 1022 > dcpl_mb(tmp3(1))) 1023 1024* ******************************************** 1025* *** do fft along nz(nb) dimension *** 1026* *** A(kz,kx,ky) <- fft1d[A(nz(nb),kx,ky)] *** 1027* ******************************************** 1028#ifdef MLIB 1029 indx = 1 1030 do q=1,nq3(nb) 1031 if (.not.log_mb(zero_row3(1,fb)+q-1)) then 1032 call z1dfft(A(indx),nz(nb),dcpl_mb(tmpz(1,nb)),1,ierr) 1033 end if 1034 indx = indx + nz(nb) 1035 end do 1036#else 1037 indx = 1 1038 do q=1,nq3(nb) 1039 if (.not.log_mb(zero_row3(1,fb)+q-1)) then 1040 call dcfftf(nz(nb),A(indx),dcpl_mb(tmpz(1,nb))) 1041 end if 1042 indx = indx + nz(nb) 1043 end do 1044#endif 1045 1046 end if 1047 1048 1049* **** deallocate temporary space **** 1050 value = BA_pop_stack(tmp3(2)) 1051 value = BA_pop_stack(tmp2(2)) 1052 !value = BA_pop_stack(tmp1(2)) 1053 1054 call nwpw_timing_end(1) 1055 return 1056 end 1057 1058 1059 1060 1061* *********************************** 1062* * * 1063* * C3dB_c_ptranspose_jk_init * 1064* * * 1065* *********************************** 1066 1067 subroutine C3dB_c_ptranspose_jk_init(fb,zero_arow3) 1068 implicit none 1069 integer fb 1070 logical zero_arow3(*) 1071 1072#include "bafdecls.fh" 1073#include "errquit.fh" 1074#include "C3dB.fh" 1075 1076 1077* **** indexing variables **** 1078 integer iq_to_i1(2,0:1) 1079 integer iq_to_i2(2,0:1) 1080 integer iz_to_i2(2,0:1) 1081 integer i1_start(2,0:1) 1082 integer i2_start(2,0:1) 1083 common / c_ptrans_blk1 / iq_to_i1,iq_to_i2,iz_to_i2, 1084 > i1_start,i2_start 1085 1086* **** indexing variables **** 1087 integer jq_to_i1(2,0:1) 1088 integer jq_to_i2(2,0:1) 1089 integer jz_to_i2(2,0:1) 1090 integer j1_start(2,0:1) 1091 integer j2_start(2,0:1) 1092 common / c_ptrans_blk2 / jq_to_i1,jq_to_i2,jz_to_i2, 1093 > j1_start,j2_start 1094 1095 1096 1097* **** local variables **** 1098 integer proc_to,proc_from 1099 integer pto,qto,np,taskid 1100 integer pfrom,qfrom 1101 integer phere,qhere 1102 integer index1,index2,index3 1103 integer jndex1,jndex2,jndex3 1104 integer itmp,ii,jj 1105 integer i,j,k,it 1106 logical value 1107 1108 1109 1110* **** allocate ptrans_blk1 and ptrans_blk2 common block **** 1111 value = BA_alloc_get(mt_int,(nx(1)*ny(1)*nq(1)), 1112 > 'piq_to_i1',iq_to_i1(2,fb),iq_to_i1(1,fb)) 1113 value=value.and.BA_alloc_get(mt_int,(nx(1)*ny(1)*nq(1)), 1114 > 'piq_to_i2',iq_to_i2(2,fb),iq_to_i2(1,fb)) 1115 value=value.and.BA_alloc_get(mt_int,(nx(1)*ny(1)*nq(1)), 1116 > 'piz_to_i2',iz_to_i2(2,fb),iz_to_i2(1,fb)) 1117 1118 value = value.and.BA_alloc_get(mt_int,(nz(1)+1), 1119 > 'pi1_start',i1_start(2,fb),i1_start(1,fb)) 1120 value = value.and.BA_alloc_get(mt_int,(nz(1)+1), 1121 > 'pi2_start',i2_start(2,fb),i2_start(1,fb)) 1122 1123 1124 value = BA_alloc_get(mt_int,(nx(1)*ny(1)*nq(1)), 1125 > 'riq_to_i1',jq_to_i1(2,fb),jq_to_i1(1,fb)) 1126 value=value.and.BA_alloc_get(mt_int,(nx(1)*ny(1)*nq(1)), 1127 > 'riq_to_i2',jq_to_i2(2,fb),jq_to_i2(1,fb)) 1128 value=value.and.BA_alloc_get(mt_int,(nx(1)*ny(1)*nq(1)), 1129 > 'riz_to_i2',jz_to_i2(2,fb),jz_to_i2(1,fb)) 1130 1131 value = value.and.BA_alloc_get(mt_int,(nz(1)+1), 1132 > 'ri1_start',j1_start(2,fb),j1_start(1,fb)) 1133 value = value.and.BA_alloc_get(mt_int,(nz(1)+1), 1134 > 'ri2_start',j2_start(2,fb),j2_start(1,fb)) 1135 1136 if (.not. value) 1137 > call errquit('C3dB_ptranspose_jk_init:out of heap',0,MA_ERR) 1138 1139 call Parallel3d_taskid_i(taskid) 1140 call Parallel3d_np_i(np) 1141 1142 1143 index1 = 1 1144 index2 = 1 1145 index3 = 1 1146 jndex1 = 1 1147 jndex2 = 1 1148 jndex3 = 1 1149 do it=0,np-1 1150 proc_to = mod(taskid+it,np) 1151 proc_from = mod(taskid-it+np,np) 1152 int_mb(i1_start(1,fb)+it) = index1 1153 int_mb(i2_start(1,fb)+it) = index2 1154 int_mb(j1_start(1,fb)+it) = jndex1 1155 int_mb(j2_start(1,fb)+it) = jndex2 1156 1157 do k=1,nz(1) 1158 do j=1,ny(1) 1159 1160* **** packing scheme **** 1161 call C3dB_ktoqp(1,k,qhere,phere) 1162 call C3dB_ktoqp(1,j,qto,pto) 1163 if ((phere.eq.taskid).and.(pto.eq.proc_to)) then 1164 do i=1,nx(1) 1165 ii = i + nx(1)*(k-1) 1166 jj = i + nx(1)*(j-1) 1167 itmp = i + (j-1)*nx(1) 1168 > + (qhere-1)*nx(1)*ny(1) 1169 1170 if (.not.zero_arow3(ii)) then 1171 int_mb(iq_to_i1(1,fb)+index1-1) = itmp 1172 index1 = index1 + 1 1173 end if 1174 1175 if (.not.zero_arow3(jj)) then 1176 int_mb(jq_to_i1(1,fb)+jndex1-1) = itmp 1177 jndex1 = jndex1 + 1 1178 end if 1179 1180 end do 1181 end if 1182 1183* **** unpacking scheme **** 1184 call C3dB_ktoqp(1,j,qhere,phere) 1185 call C3dB_ktoqp(1,k,qfrom,pfrom) 1186 if ((phere.eq.taskid).and.(pfrom.eq.proc_from)) then 1187 do i=1,nx(1) 1188 ii = i + nx(1)*(k-1) 1189 jj = i + nx(1)*(j-1) 1190 itmp = i + (k-1)*nx(1) 1191 > + (qhere-1)*nx(1)*ny(1) 1192 if (zero_arow3(ii)) then 1193 int_mb(iz_to_i2(1,fb)+index3-1) = itmp 1194 index3 = index3 + 1 1195 else 1196 int_mb(iq_to_i2(1,fb)+index2-1) = itmp 1197 index2 = index2 + 1 1198 end if 1199 1200 if (zero_arow3(jj)) then 1201 int_mb(jz_to_i2(1,fb)+jndex3-1) = itmp 1202 jndex3 = jndex3 + 1 1203 else 1204 int_mb(jq_to_i2(1,fb)+jndex2-1) = itmp 1205 jndex2 = jndex2 + 1 1206 end if 1207 end do 1208 end if 1209 end do 1210 end do 1211 end do 1212 int_mb(i1_start(1,fb)+np) = index1 1213 int_mb(i2_start(1,fb)+np) = index2 1214 int_mb(j1_start(1,fb)+np) = jndex1 1215 int_mb(j2_start(1,fb)+np) = jndex2 1216 1217 1218 return 1219 end 1220 1221 1222 1223 1224 1225* *********************************** 1226* * * 1227* * C3dB_c_ptranspose_ijk_init * 1228* * * 1229* *********************************** 1230 1231 subroutine C3dB_c_ptranspose_ijk_init(fb,zero_arow2,zero_arow3) 1232 implicit none 1233 integer fb 1234 logical zero_arow2(*) 1235 logical zero_arow3(*) 1236 1237#include "bafdecls.fh" 1238#include "errquit.fh" 1239#include "C3dB.fh" 1240 1241 1242* *** hilbert tranpose data structure **** 1243 integer h_iq_to_i1(2,6,0:1) 1244 integer h_iq_to_i2(2,6,0:1) 1245 integer h_iz_to_i2(2,6,0:1) 1246 integer h_iz_to_i2_count(6,0:1) 1247 integer h_i1_start(2,6,0:1) 1248 integer h_i2_start(2,6,0:1) 1249 common / c_ptrans_blk_ijk / h_iq_to_i1, 1250 > h_iq_to_i2, 1251 > h_iz_to_i2, 1252 > h_iz_to_i2_count, 1253 > h_i1_start, 1254 > h_i2_start 1255 1256* **** local variables **** 1257 logical value,iszero 1258 integer proc_to,proc_from 1259 integer pto,qto,np,taskid 1260 integer phere,qhere 1261 integer index1,index2,index3,itmp 1262 integer i,j,k,it 1263 1264 1265 call Parallel3d_taskid_i(taskid) 1266 call Parallel3d_np_i(np) 1267 1268 1269* ******************************************************** 1270* **** map1to2 mapping - done - tranpose operation #1 *** 1271* **** (ny,nz,nx) <-- (nx,ny,nz) *** 1272* **** use zero_arow2 *** 1273* ******************************************************** 1274 1275* **** allocate trans_blk_ijk common block **** 1276 value = BA_alloc_get(mt_int,(nx(1)*nq1(1)), 1277 > 'h_iq_to_i1_1', 1278 > h_iq_to_i1(2,1,fb), 1279 > h_iq_to_i1(1,1,fb)) 1280 value = value.and. 1281 > BA_alloc_get(mt_int,(ny(1)*nq2(1)), 1282 > 'h_iq_to_i2_1', 1283 > h_iq_to_i2(2,1,fb), 1284 > h_iq_to_i2(1,1,fb)) 1285 value = value.and. 1286 > BA_alloc_get(mt_int,(ny(1)*nq2(1)), 1287 > 'h_iz_to_i2_1', 1288 > h_iz_to_i2(2,1,fb), 1289 > h_iz_to_i2(1,1,fb)) 1290 value = value.and. 1291 > BA_alloc_get(mt_int,(np+1), 1292 > 'h_i1_start_1', 1293 > h_i1_start(2,1,fb), 1294 > h_i1_start(1,1,fb)) 1295 value = value.and. 1296 > BA_alloc_get(mt_int,(np+1), 1297 > 'h_i2_start_1', 1298 > h_i2_start(2,1,fb), 1299 > h_i2_start(1,1,fb)) 1300 if (.not.value) 1301 > call errquit('C3dB_transpose_ijk_initt:out of heap',0,MA_ERR) 1302 1303 index1 = 1 1304 index2 = 1 1305 index3 = 1 1306 do it=0,np-1 1307 proc_to = mod(taskid+it,np) 1308 proc_from = mod(taskid-it+np,np) 1309 int_mb(h_i1_start(1,1,fb)+it) = index1 1310 int_mb(h_i2_start(1,1,fb)+it) = index2 1311 1312 do k=1,nz(1) 1313 do j=1,ny(1) 1314 do i=1,nx(1) 1315 iszero = zero_arow2(i+(k-1)*nx(1)) 1316 1317* **** packing scheme **** 1318 phere = int_mb(p_map1(1,1)+(j-1)+(k-1)*ny(1)) 1319 qhere = int_mb(q_map1(1,1)+(j-1)+(k-1)*ny(1)) 1320 1321 pto = int_mb(p_map2(1,1)+(k-1)+(i-1)*nz(1)) 1322 qto = int_mb(q_map2(1,1)+(k-1)+(i-1)*nz(1)) 1323 1324 1325 if ((phere.eq.taskid).and.(pto.eq.proc_to)) then 1326 itmp = i + (qhere-1)*nx(1) 1327 if (.not.iszero) then 1328 int_mb(h_iq_to_i1(1,1,fb)+index1-1) = itmp 1329 index1 = index1 + 1 1330 end if 1331 end if 1332 1333* **** unpacking scheme **** 1334 if ((pto.eq.taskid).and.(phere.eq.proc_from)) then 1335 itmp = j + (qto-1)*ny(1) 1336 if (.not.iszero) then 1337 int_mb(h_iq_to_i2(1,1,fb)+index2-1) = itmp 1338 index2 = index2 + 1 1339 else 1340 int_mb(h_iz_to_i2(1,1,fb)+index3-1) = itmp 1341 index3 = index3 + 1 1342 end if 1343 end if 1344 1345 end do 1346 end do 1347 end do 1348 1349 end do 1350 int_mb(h_i1_start(1,1,fb)+np) = index1 1351 int_mb(h_i2_start(1,1,fb)+np) = index2 1352 h_iz_to_i2_count(1,fb) = index3 - 1 1353 1354 1355 1356 1357 1358* ********************************************************* 1359* **** map2to3 mapping - done - transpose operation #2 **** 1360* **** (nz,nx,ny) <-- (ny,nz,nx) **** 1361* **** use zero_arow3 **** 1362* ********************************************************* 1363 1364* **** allocate trans_blk_ijk common block **** 1365 value = BA_alloc_get(mt_int,(ny(1)*nq2(1)), 1366 > 'h_iq_to_i1_2', 1367 > h_iq_to_i1(2,2,fb), 1368 > h_iq_to_i1(1,2,fb)) 1369 value = value.and. 1370 > BA_alloc_get(mt_int,(nz(1)*nq3(1)), 1371 > 'h_iq_to_i2_2', 1372 > h_iq_to_i2(2,2,fb), 1373 > h_iq_to_i2(1,2,fb)) 1374 value = value.and. 1375 > BA_alloc_get(mt_int,(nz(1)*nq3(1)), 1376 > 'h_iz_to_i2_2', 1377 > h_iz_to_i2(2,2,fb), 1378 > h_iz_to_i2(1,2,fb)) 1379 value = value.and. 1380 > BA_alloc_get(mt_int,(np+1), 1381 > 'h_i1_start_2', 1382 > h_i1_start(2,2,fb), 1383 > h_i1_start(1,2,fb)) 1384 value = value.and. 1385 > BA_alloc_get(mt_int,(np+1), 1386 > 'h_i2_start_2', 1387 > h_i2_start(2,2,fb), 1388 > h_i2_start(1,2,fb)) 1389 if (.not.value) 1390 > call errquit('C3dB_transpose_ijk_init:out of heap',0,MA_ERR) 1391 1392 index1 = 1 1393 index2 = 1 1394 index3 = 1 1395 do it=0,np-1 1396 proc_to = mod(taskid+it,np) 1397 proc_from = mod(taskid-it+np,np) 1398 int_mb(h_i1_start(1,2,fb)+it) = index1 1399 int_mb(h_i2_start(1,2,fb)+it) = index2 1400 1401 do k=1,nz(1) 1402 do j=1,ny(1) 1403 do i=1,nx(1) 1404 1405 iszero = zero_arow3(i+(j-1)*nx(1)) 1406 1407* **** packing scheme **** 1408 phere = int_mb(p_map2(1,1)+(k-1)+(i-1)*nz(1)) 1409 qhere = int_mb(q_map2(1,1)+(k-1)+(i-1)*nz(1)) 1410 1411 pto = int_mb(p_map3(1,1)+(i-1)+(j-1)*nx(1)) 1412 qto = int_mb(q_map3(1,1)+(i-1)+(j-1)*nx(1)) 1413 1414 1415 if ((phere.eq.taskid).and.(pto.eq.proc_to)) then 1416 itmp = j + (qhere-1)*ny(1) 1417 if (.not.iszero) then 1418 int_mb(h_iq_to_i1(1,2,fb)+index1-1) = itmp 1419 index1 = index1 + 1 1420 end if 1421 end if 1422 1423* **** unpacking scheme **** 1424 if ((pto.eq.taskid).and.(phere.eq.proc_from)) then 1425 itmp = k + (qto-1)*nz(1) 1426 if (.not.iszero) then 1427 int_mb(h_iq_to_i2(1,2,fb)+index2-1) = itmp 1428 index2 = index2 + 1 1429 else 1430 int_mb(h_iz_to_i2(1,2,fb)+index3-1) = itmp 1431 index3 = index3 + 1 1432 end if 1433 end if 1434 1435 1436 end do 1437 end do 1438 end do 1439 1440 end do 1441 int_mb(h_i1_start(1,2,fb)+np) = index1 1442 int_mb(h_i2_start(1,2,fb)+np) = index2 1443 h_iz_to_i2_count(2,fb) = index3 - 1 1444 1445 1446 1447 1448 1449 1450* ******************************************************** 1451* **** map3to2 mapping - done - tranpose operation #3 **** 1452* **** (ny,nz,nx) <-- (nz,nx,ny) **** 1453* **** use zero_arow3 **** 1454* ******************************************************** 1455 1456* **** allocate trans_blk_ijk common block **** 1457 value = BA_alloc_get(mt_int,(nz(1)*nq3(1)), 1458 > 'h_iq_to_i1_3', 1459 > h_iq_to_i1(2,3,fb), 1460 > h_iq_to_i1(1,3,fb)) 1461 value = value.and. 1462 > BA_alloc_get(mt_int,(ny(1)*nq2(1)), 1463 > 'h_iq_to_i2_3', 1464 > h_iq_to_i2(2,3,fb), 1465 > h_iq_to_i2(1,3,fb)) 1466 value = value.and. 1467 > BA_alloc_get(mt_int,(ny(1)*nq2(1)), 1468 > 'h_iz_to_i2_3', 1469 > h_iz_to_i2(2,3,fb), 1470 > h_iz_to_i2(1,3,fb)) 1471 value = value.and. 1472 > BA_alloc_get(mt_int,(np+1), 1473 > 'h_i1_start_3', 1474 > h_i1_start(2,3,fb), 1475 > h_i1_start(1,3,fb)) 1476 value = value.and. 1477 > BA_alloc_get(mt_int,(np+1), 1478 > 'h_i2_start_3', 1479 > h_i2_start(2,3,fb), 1480 > h_i2_start(1,3,fb)) 1481 if (.not.value) 1482 > call errquit('C3dB_transpose_ijk_init:out of heap',0,MA_ERR) 1483 1484 index1 = 1 1485 index2 = 1 1486 index3 = 1 1487 do it=0,np-1 1488 proc_to = mod(taskid+it,np) 1489 proc_from = mod(taskid-it+np,np) 1490 int_mb(h_i1_start(1,3,fb)+it) = index1 1491 int_mb(h_i2_start(1,3,fb)+it) = index2 1492 1493 do k=1,nz(1) 1494 do j=1,ny(1) 1495 do i=1,nx(1) 1496 1497 iszero = zero_arow3(i+(j-1)*nx(1)) 1498 1499* **** packing scheme **** 1500 phere = int_mb(p_map3(1,1)+(i-1)+(j-1)*nx(1)) 1501 qhere = int_mb(q_map3(1,1)+(i-1)+(j-1)*nx(1)) 1502 1503 pto = int_mb(p_map2(1,1)+(k-1)+(i-1)*nz(1)) 1504 qto = int_mb(q_map2(1,1)+(k-1)+(i-1)*nz(1)) 1505 1506 1507 if ((phere.eq.taskid).and.(pto.eq.proc_to)) then 1508 itmp = k + (qhere-1)*nz(1) 1509 if (.not.iszero) then 1510 int_mb(h_iq_to_i1(1,3,fb)+index1-1) = itmp 1511 index1 = index1 + 1 1512 end if 1513 end if 1514 1515* **** unpacking scheme **** 1516 if ((pto.eq.taskid).and.(phere.eq.proc_from)) then 1517 itmp = j + (qto-1)*ny(1) 1518 if (.not.iszero) then 1519 int_mb(h_iq_to_i2(1,3,fb)+index2-1) = itmp 1520 index2 = index2 + 1 1521 else 1522 int_mb(h_iz_to_i2(1,3,fb)+index3-1) = itmp 1523 index3 = index3 + 1 1524 end if 1525 end if 1526 1527 1528 end do 1529 end do 1530 end do 1531 1532 end do 1533 int_mb(h_i1_start(1,3,fb)+np) = index1 1534 int_mb(h_i2_start(1,3,fb)+np) = index2 1535 h_iz_to_i2_count(3,fb) = index3 - 1 1536 1537 1538 1539 1540* ******************************************************** 1541* **** map2to1 mapping - done - tranpose operation #4 **** 1542* **** (nx,ny,nz) <-- (ny,nz,nx) **** 1543* **** use zero_arow2 **** 1544* ******************************************************** 1545 1546* **** allocate trans_blk_ijk common block **** 1547 value = BA_alloc_get(mt_int,(ny(1)*nq2(1)), 1548 > 'h_iq_to_i1_4', 1549 > h_iq_to_i1(2,4,fb), 1550 > h_iq_to_i1(1,4,fb)) 1551 value = value.and. 1552 > BA_alloc_get(mt_int,(nx(1)*nq1(1)), 1553 > 'h_iq_to_i2_4', 1554 > h_iq_to_i2(2,4,fb), 1555 > h_iq_to_i2(1,4,fb)) 1556 value = value.and. 1557 > BA_alloc_get(mt_int,(nx(1)*nq1(1)), 1558 > 'h_iz_to_i2_4', 1559 > h_iz_to_i2(2,4,fb), 1560 > h_iz_to_i2(1,4,fb)) 1561 value = value.and. 1562 > BA_alloc_get(mt_int,(np+1), 1563 > 'h_i1_start_4', 1564 > h_i1_start(2,4,fb), 1565 > h_i1_start(1,4,fb)) 1566 value = value.and. 1567 > BA_alloc_get(mt_int,(np+1), 1568 > 'h_i2_start_4', 1569 > h_i2_start(2,4,fb), 1570 > h_i2_start(1,4,fb)) 1571 if (.not.value) 1572 > call errquit('C3dB_transpose_ijk_init:out of heap',0,MA_ERR) 1573 1574 index1 = 1 1575 index2 = 1 1576 index3 = 1 1577 do it=0,np-1 1578 proc_to = mod(taskid+it,np) 1579 proc_from = mod(taskid-it+np,np) 1580 int_mb(h_i1_start(1,4,fb)+it) = index1 1581 int_mb(h_i2_start(1,4,fb)+it) = index2 1582 1583 do k=1,nz(1) 1584 do j=1,ny(1) 1585 do i=1,nx(1) 1586 1587 iszero = zero_arow2(i+(k-1)*nx(1)) 1588 1589* **** packing scheme **** 1590 phere = int_mb(p_map2(1,1)+(k-1)+(i-1)*nz(1)) 1591 qhere = int_mb(q_map2(1,1)+(k-1)+(i-1)*nz(1)) 1592 1593 pto = int_mb(p_map1(1,1)+(j-1)+(k-1)*ny(1)) 1594 qto = int_mb(q_map1(1,1)+(j-1)+(k-1)*ny(1)) 1595 1596 1597 if ((phere.eq.taskid).and.(pto.eq.proc_to)) then 1598 itmp = j + (qhere-1)*ny(1) 1599 if (.not.iszero) then 1600 int_mb(h_iq_to_i1(1,4,fb)+index1-1) = itmp 1601 index1 = index1 + 1 1602 end if 1603 end if 1604 1605* **** unpacking scheme **** 1606 if ((pto.eq.taskid).and.(phere.eq.proc_from)) then 1607 itmp = i + (qto-1)*nx(1) 1608 if (.not.iszero) then 1609 int_mb(h_iq_to_i2(1,4,fb)+index2-1) = itmp 1610 index2 = index2 + 1 1611 else 1612 int_mb(h_iz_to_i2(1,4,fb)+index3-1) = itmp 1613 index3 = index3 + 1 1614 end if 1615 end if 1616 1617 1618 end do 1619 end do 1620 end do 1621 1622 end do 1623 int_mb(h_i1_start(1,4,fb)+np) = index1 1624 int_mb(h_i2_start(1,4,fb)+np) = index2 1625 h_iz_to_i2_count(4,fb) = index3 - 1 1626 1627 1628 1629 1630 1631* ********************************************************** 1632* **** map1to3 mapping - done - tranpose operation # 5 **** 1633* ********************************************************** 1634 1635* **** allocate trans_blk_ijk common block **** 1636 value = BA_alloc_get(mt_int,(nx(1)*nq1(1)), 1637 > 'h_iq_to_i1_5', 1638 > h_iq_to_i1(2,5,fb), 1639 > h_iq_to_i1(1,5,fb)) 1640 value = value.and. 1641 > BA_alloc_get(mt_int,(nz(1)*nq3(1)), 1642 > 'h_iq_to_i2_5', 1643 > h_iq_to_i2(2,5,fb), 1644 > h_iq_to_i2(1,5,fb)) 1645 value = value.and. 1646 > BA_alloc_get(mt_int,(nz(1)*nq3(1)), 1647 > 'h_iz_to_i2_5', 1648 > h_iz_to_i2(2,5,fb), 1649 > h_iz_to_i2(1,5,fb)) 1650 value = value.and. 1651 > BA_alloc_get(mt_int,(np+1), 1652 > 'h_i1_start_5', 1653 > h_i1_start(2,5,fb), 1654 > h_i1_start(1,5,fb)) 1655 value = value.and. 1656 > BA_alloc_get(mt_int,(np+1), 1657 > 'h_i2_start_5', 1658 > h_i2_start(2,5,fb), 1659 > h_i2_start(1,5,fb)) 1660 if (.not.value) 1661 > call errquit('C3dB_transpose_ijk_init:out of heap',0,MA_ERR) 1662 1663 index1 = 1 1664 index2 = 1 1665 index3 = 1 1666 do it=0,np-1 1667 proc_to = mod(taskid+it,np) 1668 proc_from = mod(taskid-it+np,np) 1669 int_mb(h_i1_start(1,5,fb)+it) = index1 1670 int_mb(h_i2_start(1,5,fb)+it) = index2 1671 1672 do k=1,nz(1) 1673 do j=1,ny(1) 1674 do i=1,nx(1) 1675 1676* **** packing scheme **** 1677 phere = int_mb(p_map1(1,1)+(j-1)+(k-1)*ny(1)) 1678 qhere = int_mb(q_map1(1,1)+(j-1)+(k-1)*ny(1)) 1679 1680 pto = int_mb(p_map3(1,1)+(i-1)+(j-1)*nx(1)) 1681 qto = int_mb(q_map3(1,1)+(i-1)+(j-1)*nx(1)) 1682 1683 1684 if ((phere.eq.taskid).and.(pto.eq.proc_to)) then 1685 itmp = i + (qhere-1)*nx(1) 1686 int_mb(h_iq_to_i1(1,5,fb)+index1-1) = itmp 1687 index1 = index1 + 1 1688 end if 1689 1690* **** unpacking scheme **** 1691 if ((pto.eq.taskid).and.(phere.eq.proc_from)) then 1692 itmp = k + (qto-1)*nz(1) 1693 int_mb(h_iq_to_i2(1,5,fb)+index2-1) = itmp 1694 index2 = index2 + 1 1695 end if 1696 1697 end do 1698 end do 1699 end do 1700 1701 end do 1702 int_mb(h_i1_start(1,5,fb)+np) = index1 1703 int_mb(h_i2_start(1,5,fb)+np) = index2 1704 h_iz_to_i2_count(5,fb) = index3 - 1 1705 1706 1707 1708 1709 1710 1711* ************************* 1712* **** map3to1 mapping **** 1713* ************************* 1714 1715* **** allocate trans_blk_ijk common block **** 1716 value = BA_alloc_get(mt_int,(nz(1)*nq3(1)), 1717 > 'h_iq_to_i1_6', 1718 > h_iq_to_i1(2,6,fb), 1719 > h_iq_to_i1(1,6,fb)) 1720 value = value.and. 1721 > BA_alloc_get(mt_int,(nx(1)*nq1(1)), 1722 > 'h_iq_to_i2_6', 1723 > h_iq_to_i2(2,6,fb), 1724 > h_iq_to_i2(1,6,fb)) 1725 value = value.and. 1726 > BA_alloc_get(mt_int,(nx(1)*nq1(1)), 1727 > 'h_iz_to_i2_6', 1728 > h_iz_to_i2(2,6,fb), 1729 > h_iz_to_i2(1,6,fb)) 1730 value = value.and. 1731 > BA_alloc_get(mt_int,(np+1), 1732 > 'h_i1_start_6', 1733 > h_i1_start(2,6,fb), 1734 > h_i1_start(1,6,fb)) 1735 value = value.and. 1736 > BA_alloc_get(mt_int,(np+1), 1737 > 'h_i2_start_6', 1738 > h_i2_start(2,6,fb), 1739 > h_i2_start(1,6,fb)) 1740 if (.not.value) 1741 > call errquit('C3dB_transpose_ijk_init:out of heap',0,MA_ERR) 1742 1743 index1 = 1 1744 index2 = 1 1745 index3 = 1 1746 do it=0,np-1 1747 proc_to = mod(taskid+it,np) 1748 proc_from = mod(taskid-it+np,np) 1749 int_mb(h_i1_start(1,6,fb)+it) = index1 1750 int_mb(h_i2_start(1,6,fb)+it) = index2 1751 1752 do k=1,nz(1) 1753 do j=1,ny(1) 1754 do i=1,nx(1) 1755 1756* **** packing scheme **** 1757 phere = int_mb(p_map3(1,1)+(i-1)+(j-1)*nx(1)) 1758 qhere = int_mb(q_map3(1,1)+(i-1)+(j-1)*nx(1)) 1759 1760 pto = int_mb(p_map1(1,1)+(j-1)+(k-1)*ny(1)) 1761 qto = int_mb(q_map1(1,1)+(j-1)+(k-1)*ny(1)) 1762 1763 1764 if ((phere.eq.taskid).and.(pto.eq.proc_to)) then 1765 itmp = k + (qhere-1)*nz(1) 1766 int_mb(h_iq_to_i1(1,6,fb)+index1-1) = itmp 1767 index1 = index1 + 1 1768 end if 1769 1770* **** unpacking scheme **** 1771 if ((pto.eq.taskid).and.(phere.eq.proc_from)) then 1772 itmp = i + (qto-1)*nx(1) 1773 int_mb(h_iq_to_i2(1,6,fb)+index2-1) = itmp 1774 index2 = index2 + 1 1775 end if 1776 1777 end do 1778 end do 1779 end do 1780 1781 end do 1782 int_mb(h_i1_start(1,6,fb)+np) = index1 1783 int_mb(h_i2_start(1,6,fb)+np) = index2 1784 h_iz_to_i2_count(6,fb) = index3 - 1 1785 1786 1787 return 1788 end 1789 1790 1791 1792* *********************************** 1793* * * 1794* * C3dB_pfftfx * 1795* * * 1796* *********************************** 1797* 1798* do fft along nx(1) dimension 1799* 1800* A(kx,ny(1),nz(1)) <- fft1d[A(nx(1),ny(1),nz(1))] 1801* 1802 1803 subroutine C3dB_pfftfx(nbb,A,tmp1,tmp2,request,reqcnt) 1804 implicit none 1805 integer nbb 1806 complex*16 A(*) 1807 complex*16 tmp1(*) 1808 complex*16 tmp2(*) 1809 integer request(*),reqcnt 1810 1811 1812#include "bafdecls.fh" 1813#include "errquit.fh" 1814 1815#include "C3dB.fh" 1816#include "C3dB_pfft.fh" 1817 1818 1819 integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS) 1820 common / C3dB_fft / tmpx,tmpy,tmpz 1821 1822 !*** local variables *** 1823 integer j,q,indx,fb 1824 1825 if (nbb.eq.0) then 1826 fb = 0 1827 else 1828 fb = 1 1829 end if 1830 1831 !********************** 1832 !**** slab mapping **** 1833 !********************** 1834 if (mapping.eq.1) then 1835 call nwpw_timing_start(31) 1836#ifdef MLIB 1837 indx = 1 1838 do q=1,nq(1) 1839 do j=1,ny(1) 1840 call z1dfft(A(indx),nx(1),dcpl_mb(tmpx(1,1)),1,ierr) 1841 indx = indx + nx(1) 1842 end do 1843 end do 1844 1845#else 1846 indx = 1 1847 do q=1,nq(1) 1848 do j=1,ny(1) 1849 call dcfftf(nx(1),A(indx),dcpl_mb(tmpx(1,1))) 1850 indx = indx + nx(1) 1851 end do 1852 end do 1853#endif 1854 call dcopy(2*nx(1)*ny(1)*nq(1),A,1,tmp1,1) 1855 call nwpw_timing_end(31) 1856 1857 1858 !************************* 1859 !**** hilbert mapping **** 1860 !************************* 1861 else 1862 call nwpw_timing_start(31) 1863 1864#ifdef MLIB 1865 indx = 1 1866 do q=1,nq1(1) 1867 call z1dfft(A(indx),nx(1),dcpl_mb(tmpx(1,1)),1,ierr) 1868 indx = indx + nx(1) 1869 end do 1870#else 1871 indx = 1 1872 do q=1,nq1(1) 1873 call dcfftf(nx(1),A(indx),dcpl_mb(tmpx(1,1))) 1874 indx = indx + nx(1) 1875 end do 1876#endif 1877 1878 call nwpw_timing_end(31) 1879 call nwpw_timing_start(32) 1880 call C3dB_c_ptranspose_ijk_start(fb,1,A,tmp1,tmp2, 1881 > request,reqcnt,40) 1882 call nwpw_timing_end(32) 1883 1884 end if 1885 1886 return 1887 end 1888 1889 1890* *********************************** 1891* * * 1892* * C3dB_pfftfy * 1893* * * 1894* *********************************** 1895* 1896* do fft along ny(1) dimension 1897* 1898* A(kx,ny(1),nz(1)) <- fft1d[A(nx(1),ny(1),nz(1))] 1899* 1900 1901 subroutine C3dB_pfftfy(nbb,tmp1,tmp2,request,reqcnt) 1902 implicit none 1903 integer nbb 1904 complex*16 tmp1(*) 1905 complex*16 tmp2(*) 1906 integer request(*),reqcnt 1907 1908 1909#include "bafdecls.fh" 1910#include "errquit.fh" 1911 1912#include "C3dB.fh" 1913#include "C3dB_pfft.fh" 1914 1915 1916 integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS) 1917 common / C3dB_fft / tmpx,tmpy,tmpz 1918 1919 1920 !*** local variables *** 1921 integer i,j,k,indx,indx2,q,fb 1922 1923 if (nbb.eq.0) then 1924 fb = 0 1925 else 1926 fb = 1 1927 end if 1928 1929 !********************** 1930 !**** slab mapping **** 1931 !********************** 1932 if (mapping.eq.1) then 1933 call nwpw_timing_start(31) 1934 1935* ******************************************** 1936* *** do fft along ny(1) dimension *** 1937* *** A(kx,ky,nz(1)) <- fft1d[A(kx,ny(1),nz(1))] *** 1938* ******************************************** 1939 1940#ifdef MLIB 1941 1942 indx2 = 0 1943 do q=1,nq(1) 1944 do i=1,nx(1) 1945 indx2 = indx2 + 1 1946 if (.not.log_mb(zero_row2(1,fb)+indx2-1)) then 1947 indx = i + (q-1)*nx(1)*ny(1) 1948 call zcopy(ny(1),tmp1(indx),nx(1),tmp2,1) 1949 call z1dfft(tmp2,ny(1),dcpl_mb(tmpy(1,1)),1,ierr) 1950 call zcopy(ny(1),tmp2,1,tmp1(indx),nx(1)) 1951 end if 1952 end do 1953 end do 1954#else 1955 indx2 = 0 1956 do q=1,nq(1) 1957 do i=1,nx(1) 1958 indx2 = indx2 + 1 1959 if (.not.log_mb(zero_row2(1,fb)+indx2-1)) then 1960 indx = i + (q-1)*nx(1)*ny(1) 1961 call zcopy(ny(1),tmp1(indx),nx(1),tmp2,1) 1962 call dcfftf(ny(1),tmp2,dcpl_mb(tmpy(1,1))) 1963 call zcopy(ny(1),tmp2,1,tmp1(indx),nx(1)) 1964 end if 1965 end do 1966 end do 1967#endif 1968 1969 1970* ******************************************** 1971* *** Do a ptranspose of A *** 1972* *** A(ky,nz(1),ky) <- A(kx,ky,nz(1)) *** 1973* ******************************************** 1974 call nwpw_timing_end(31) 1975 call nwpw_timing_start(32) 1976 call C3dB_c_ptranspose2_jk_start(fb,tmp1,tmp2,tmp1, 1977 > request,reqcnt,41) 1978 call nwpw_timing_end(32) 1979 1980 1981 1982 1983 !************************* 1984 !**** hilbert mapping **** 1985 !************************* 1986 else 1987 call nwpw_timing_start(32) 1988 call C3dB_c_ptranspose_ijk_end(fb,1,tmp1,tmp2,request,reqcnt) 1989 call nwpw_timing_end(32) 1990 call nwpw_timing_start(31) 1991 1992* ******************************************** 1993* *** do fft along ny(1) dimension *** 1994* *** A(ky,nz(1),kx) <- fft1d[A(ny(1),nz(1),kx)] *** 1995* ******************************************** 1996#ifdef MLIB 1997 indx = 1 1998 do q=1,nq2(1) 1999 if (.not.log_mb(zero_row2(1,fb)+q-1)) then 2000 call z1dfft(tmp1(indx),ny(1),dcpl_mb(tmpy(1,1)),1,ierr) 2001 end if 2002 indx = indx + ny(1) 2003 end do 2004#else 2005 indx = 1 2006 do q=1,nq2(1) 2007 if (.not.log_mb(zero_row2(1,fb)+q-1)) then 2008 call dcfftf(ny(1),tmp1(indx),dcpl_mb(tmpy(1,1))) 2009 end if 2010 indx = indx + ny(1) 2011 end do 2012#endif 2013 2014 call nwpw_timing_end(31) 2015 call nwpw_timing_start(32) 2016 call C3dB_c_ptranspose_ijk_start(fb,2,tmp1,tmp2,tmp1, 2017 > request,reqcnt,42) 2018 call nwpw_timing_end(32) 2019 2020 2021 end if 2022 2023 return 2024 end 2025 2026 2027 2028 2029 2030* *********************************** 2031* * * 2032* * C3dB_pfftfz * 2033* * * 2034* *********************************** 2035* 2036 2037 subroutine C3dB_pfftfz(nbb,tmp1,tmp2,request,reqcnt) 2038 implicit none 2039 integer nbb 2040 complex*16 tmp1(*) 2041 complex*16 tmp2(*) 2042 integer request(*),reqcnt 2043 2044 2045#include "bafdecls.fh" 2046#include "errquit.fh" 2047 2048#include "C3dB.fh" 2049#include "C3dB_pfft.fh" 2050 2051 2052 integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS) 2053 common / C3dB_fft / tmpx,tmpy,tmpz 2054 2055 2056 !*** local variables *** 2057 integer i,k,q,fb 2058 integer indx,indx2 2059 2060#ifdef MLIB 2061 integer ierr 2062#endif 2063 2064 if (nbb.eq.0) then 2065 fb = 0 2066 else 2067 fb = 1 2068 end if 2069 2070 !********************** 2071 !**** slab mapping **** 2072 !********************** 2073 if (mapping.eq.1) then 2074 2075 call nwpw_timing_start(32) 2076 call C3dB_c_ptranspose2_jk_end(fb,tmp2,tmp1,request,reqcnt) 2077 call nwpw_timing_end(32) 2078 call nwpw_timing_start(31) 2079 2080* ******************************************** 2081* *** do fft along nz(1) dimension *** 2082* *** A(kx,kz,ky) <- fft1d[A(kx,nz(1),ky)] *** 2083* ******************************************** 2084 2085#ifdef MLIB 2086 indx2 = 0 2087 do q=1,nq(1) 2088 do i=1,nx(1) 2089 indx2 = indx2 + 1 2090 if (.not.log_mb(zero_row3(1,fb)+indx2-1)) then 2091 indx = i + (q-1)*nx(1)*nz(1) 2092 call zcopy(nz(1),tmp2(indx),nx(1),tmp1,1) 2093 call z1dfft(tmp1,nz(1),dcpl_mb(tmpz(1,1)),1,ierr) 2094 call zcopy(nz(1),tmp1,1,tmp2(indx),nx(1)) 2095 end if 2096 end do 2097 end do 2098#else 2099 indx2 = 0 2100 do q=1,nq(1) 2101 do i=1,nx(1) 2102 indx2 = indx2 + 1 2103 if (.not.log_mb(zero_row3(1,fb)+indx2-1)) then 2104 indx = i + (q-1)*nx(1)*nz(1) 2105 call zcopy(nz(1),tmp2(indx),nx(1),tmp1,1) 2106 call dcfftf(nz(1),tmp1,dcpl_mb(tmpz(1,1))) 2107 call zcopy(nz(1),tmp1,1,tmp2(indx),nx(1)) 2108 end if 2109 end do 2110 end do 2111 call nwpw_timing_end(31) 2112 2113#endif 2114 2115 2116 !************************* 2117 !**** hilbert mapping **** 2118 !************************* 2119 else 2120 call nwpw_timing_start(32) 2121 call C3dB_c_ptranspose_ijk_end(fb,2,tmp2,tmp1,request,reqcnt) 2122 call nwpw_timing_end(32) 2123 call nwpw_timing_start(31) 2124 2125* ******************************************** 2126* *** do fft along nz(1) dimension *** 2127* *** A(kz,kx,ky) <- fft1d[A(nz(1),kx,ky)] *** 2128* ******************************************** 2129#ifdef MLIB 2130 indx = 1 2131 do q=1,nq3(1) 2132 if (.not.log_mb(zero_row3(1,fb)+q-1)) then 2133 call z1dfft(tmp2(indx),nz(1),dcpl_mb(tmpz(1,1)),1,ierr) 2134 end if 2135 indx = indx + nz(1) 2136 end do 2137#else 2138 indx = 1 2139 do q=1,nq3(1) 2140 if (.not.log_mb(zero_row3(1,fb)+q-1)) then 2141 call dcfftf(nz(1),tmp2(indx),dcpl_mb(tmpz(1,1))) 2142 end if 2143 indx = indx + nz(1) 2144 end do 2145#endif 2146 call nwpw_timing_end(31) 2147 2148 end if 2149 2150 2151 2152 call nwpw_timing_start(32) 2153 call Cram_c_pack_start(nbb,tmp2,tmp1,request,reqcnt,43) 2154 call nwpw_timing_end(32) 2155 2156 return 2157 end 2158 2159 2160 2161* *********************************** 2162* * * 2163* * C3dB_pfftf_final * 2164* * * 2165* *********************************** 2166* 2167 2168 subroutine C3dB_pfftf_final(nbb,tmp1,tmp2,request,reqcnt) 2169 implicit none 2170 integer nbb 2171 complex*16 tmp1(*) 2172 complex*16 tmp2(*) 2173 integer request(*),reqcnt 2174 2175#include "bafdecls.fh" 2176#include "errquit.fh" 2177 2178#include "C3dB.fh" 2179#include "C3dB_pfft.fh" 2180 2181 call nwpw_timing_start(32) 2182 call Cram_c_pack_end(nbb,tmp2,request,reqcnt) 2183 call nwpw_timing_end(32) 2184 2185 return 2186 end 2187 2188 2189 2190 2191* *********************************** 2192* * * 2193* * C3dB_pfftf_step * 2194* * * 2195* *********************************** 2196* 2197 2198 subroutine C3dB_pfftf_step(step,nbb,A,tmp1,tmp2,request,reqcnt) 2199 implicit none 2200 integer step 2201 integer nbb 2202 complex*16 A(*) 2203 complex*16 tmp1(*) 2204 complex*16 tmp2(*) 2205 integer request(*),reqcnt 2206 2207 2208 if (step.eq.1) then 2209 call C3dB_pfftfx(nbb,A,tmp1,tmp2,request,reqcnt) 2210 else if (step.eq.2) then 2211 call C3dB_pfftfy(nbb,tmp1,tmp2,request,reqcnt) 2212 else if (step.eq.3) then 2213 call C3dB_pfftfz(nbb,tmp1,tmp2,request,reqcnt) 2214 else if (step.eq.4) then 2215 call C3dB_pfftf_final(nbb,tmp1,tmp2,request,reqcnt) 2216 end if 2217 2218 return 2219 end 2220 2221 2222 2223 2224* *********************************** 2225* * * 2226* * C3dB_pfftbz * 2227* * * 2228* *********************************** 2229 2230* 2231* This routine performs the operation of 2232* a three dimensional complex to complex 2233* inverse fft 2234* A(nx,ny(1),nz(1)) <- FFT3^(-1)[A(kx,ky,kz)] 2235* 2236* Entry - 2237* A: a column distribuded 3d block 2238* tmp: tempory work space must be at 2239* least the size of (complex) 2240* (nfft*nfft + 1) + 10*nfft 2241* 2242* Exit - A is transformed and the imaginary 2243* part of A is set to zero 2244* uses - C3dB_c_ptranspose_jk, dcopy 2245* 2246 2247 2248 subroutine C3dB_pfftbz(nbb,tmp1,tmp2,request,reqcnt) 2249 implicit none 2250 integer nbb 2251 complex*16 tmp1(*) 2252 complex*16 tmp2(*) 2253 integer request(*),reqcnt 2254 2255 2256#include "bafdecls.fh" 2257#include "errquit.fh" 2258#include "C3dB.fh" 2259#include "C3dB_pfft.fh" 2260 2261 integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS) 2262 common / C3dB_fft / tmpx,tmpy,tmpz 2263 2264 2265 2266* *** local variables *** 2267 integer i,k,q,indx,ierr 2268 integer indx2,fb 2269 2270 if (nbb.eq.0) then 2271 fb = 0 2272 else 2273 fb = 1 2274 end if 2275 2276 !********************** 2277 !**** slab mapping **** 2278 !********************** 2279 if (mapping.eq.1) then 2280 2281 call nwpw_timing_start(31) 2282* ************************************************* 2283* *** do fft along kz dimension *** 2284* *** A(kx,nz(1),ky) <- fft1d^(-1)[A(kx,kz,ky)] *** 2285* ************************************************* 2286#ifdef MLIB 2287 indx2 = 0 2288 do q=1,nq(1) 2289 do i=1,nx(1) 2290 indx2 = indx2 + 1 2291 if (.not.log_mb(zero_row3(1,fb)+indx2-1)) then 2292 indx = i + (q-1)*nx(1)*nz(1) 2293 call zcopy(nz(1),tmp1(indx),nx(1),tmp2,1) 2294 call z1dfft(tmp2,nz(1),dcpl_mb(tmpz(1,1)),-2,ierr) 2295 call zcopy(nz(1),tmp2,1,tmp1(indx),nx(1)) 2296 end if 2297 end do 2298 end do 2299#else 2300 indx2 = 0 2301 do q=1,nq(1) 2302 do i=1,nx(1) 2303 indx2 = indx2 + 1 2304 if (.not.log_mb(zero_row3(1,fb)+indx2-1)) then 2305 indx = i + (q-1)*nx(1)*nz(1) 2306 call zcopy(nz(1),tmp1(indx),nx(1),tmp2,1) 2307 call dcfftb(nz(1),tmp2,dcpl_mb(tmpz(1,1))) 2308 call zcopy(nz(1),tmp2,1,tmp1(indx),nx(1)) 2309 end if 2310 end do 2311 end do 2312#endif 2313 2314* ******************************************** 2315* *** Do a ptranspose of A *** 2316* *** A(kx,ky,nz(1)) <- A(kx,nz(1),ky) *** 2317* ******************************************** 2318 call nwpw_timing_end(31) 2319 call nwpw_timing_start(32) 2320 call C3dB_c_ptranspose1_jk_start(fb,tmp1,tmp2,tmp1, 2321 > request,reqcnt,44) 2322 call nwpw_timing_end(32) 2323 2324 2325 !************************* 2326 !**** hilbert mapping **** 2327 !************************* 2328 else 2329 2330 2331 call nwpw_timing_start(31) 2332* ************************************************* 2333* *** do fft along kz dimension *** 2334* *** A(nz(1),kx,ky) <- fft1d^(-1)[A(kz,kx,ky)] *** 2335* ************************************************* 2336#ifdef MLIB 2337 indx = 1 2338 do q=1,nq3(1) 2339 if (.not.log_mb(zero_row3(1,fb)+q-1)) then 2340 call z1dfft(tmp1(indx),nz(1),dcpl_mb(tmpz(1,1)),-2,ierr) 2341 end if 2342 indx = indx + nz(1) 2343 end do 2344#else 2345 indx = 1 2346 do q=1,nq3(1) 2347 if (.not.log_mb(zero_row3(1,fb)+q-1)) then 2348 call dcfftb(nz(1),tmp1(indx),dcpl_mb(tmpz(1,1))) 2349 end if 2350 indx = indx + nz(1) 2351 end do 2352#endif 2353 2354 call nwpw_timing_end(31) 2355 call nwpw_timing_start(32) 2356 call C3dB_c_ptranspose_ijk_start(fb,3,tmp1,tmp2,tmp1, 2357 > request,reqcnt,45) 2358 call nwpw_timing_end(32) 2359 2360 2361 end if 2362 2363 return 2364 end 2365 2366 2367 2368 2369 2370* *********************************** 2371* * * 2372* * C3dB_pfftby * 2373* * * 2374* *********************************** 2375 2376 subroutine C3dB_pfftby(nbb,tmp1,tmp2,request,reqcnt) 2377 implicit none 2378 integer nbb 2379 complex*16 tmp1(*) 2380 complex*16 tmp2(*) 2381 integer request(*),reqcnt 2382 2383#include "bafdecls.fh" 2384#include "errquit.fh" 2385#include "C3dB.fh" 2386#include "C3dB_pfft.fh" 2387 2388 integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS) 2389 common / C3dB_fft / tmpx,tmpy,tmpz 2390 2391 2392* *** local variables *** 2393 integer i,j,q,indx,ierr 2394 integer indx2,fb 2395 2396 if (nbb.eq.0) then 2397 fb = 0 2398 else 2399 fb = 1 2400 end if 2401 2402 !********************** 2403 !**** slab mapping **** 2404 !********************** 2405 if (mapping.eq.1) then 2406 2407 2408* ******************************************** 2409* *** Do a ptranspose of A *** 2410* *** A(kx,ky,nz(1)) <- A(kx,nz(1),ky) *** 2411* ******************************************** 2412 call nwpw_timing_start(32) 2413 call C3dB_c_ptranspose1_jk_end(fb,tmp2,tmp1,request,reqcnt) 2414 call nwpw_timing_end(32) 2415 call nwpw_timing_start(31) 2416 2417* ************************************************* 2418* *** do fft along ky dimension *** 2419* *** A(kx,ny(1),nz(1)) <- fft1d^(-1)[A(kx,ky,nz(1))] *** 2420* ************************************************* 2421#ifdef MLIB 2422 2423 indx2 = 0 2424 do q=1,nq(1) 2425 do i=1,nx(1) 2426 indx2 = indx2 + 1 2427 if (.not.log_mb(zero_row2(1,fb)+indx2-1)) then 2428 indx = i + (q-1)*nx(1)*nz(1) 2429 call zcopy(ny(1),tmp2(indx),nx(1),tmp1,1) 2430 call z1dfft(tmp1,ny(1),dcpl_mb(tmpy(1,1)),-2,ierr) 2431 call zcopy(ny(1),tmp1,1,tmp2(indx),nx(1)) 2432 end if 2433 end do 2434 end do 2435#else 2436 indx2 = 0 2437 do q=1,nq(1) 2438 do i=1,nx(1) 2439 indx2 = indx2 + 1 2440 if (.not.log_mb(zero_row2(1,fb)+indx2-1)) then 2441 indx = i + (q-1)*nx(1)*nz(1) 2442 call zcopy(ny(1),tmp2(indx),nx(1),tmp1,1) 2443 call dcfftb(ny(1),tmp1,dcpl_mb(tmpy(1,1))) 2444 call zcopy(ny(1),tmp1,1,tmp2(indx),nx(1)) 2445 end if 2446 end do 2447 end do 2448#endif 2449 2450 call nwpw_timing_end(31) 2451 2452 2453 !************************* 2454 !**** hilbert mapping **** 2455 !************************* 2456 else 2457 2458 call nwpw_timing_start(32) 2459 call C3dB_c_ptranspose_ijk_end(fb,3,tmp2,tmp1,request,reqcnt) 2460 2461 call nwpw_timing_end(32) 2462 call nwpw_timing_start(31) 2463* ************************************************* 2464* *** do fft along ky dimension *** 2465* *** A(ny(1),nz(1),kx) <- fft1d^(-1)[A(ky,nz(1),kx)] *** 2466* ************************************************* 2467#ifdef MLIB 2468 indx = 1 2469 do q=1,nq2(1) 2470 if (.not.log_mb(zero_row2(1,fb)+q-1)) then 2471 call z1dfft(tmp2(indx),ny(1),dcpl_mb(tmpy(1,1)),-2,ierr) 2472 end if 2473 indx = indx + ny(1) 2474 end do 2475#else 2476 indx = 1 2477 do q=1,nq2(1) 2478 if (.not.log_mb(zero_row2(1,fb)+q-1)) then 2479 call dcfftb(ny(1),tmp2(indx),dcpl_mb(tmpy(1,1))) 2480 end if 2481 indx = indx + ny(1) 2482 end do 2483#endif 2484 2485 call nwpw_timing_end(31) 2486 call nwpw_timing_start(32) 2487 call C3dB_c_ptranspose_ijk_start(fb,4,tmp2,tmp1,tmp2, 2488 > request,reqcnt,46) 2489 call nwpw_timing_end(32) 2490 2491 2492 end if 2493 2494 return 2495 end 2496 2497 2498 2499 2500* *********************************** 2501* * * 2502* * C3dB_pfftbx * 2503* * * 2504* *********************************** 2505 2506 subroutine C3dB_pfftbx(nbb,tmp1,tmp2,request,reqcnt) 2507 implicit none 2508 integer nbb 2509 complex*16 tmp1(*) 2510 complex*16 tmp2(*) 2511 integer request(*),reqcnt 2512 2513#include "bafdecls.fh" 2514#include "errquit.fh" 2515#include "C3dB.fh" 2516#include "C3dB_pfft.fh" 2517 2518 integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS) 2519 common / C3dB_fft / tmpx,tmpy,tmpz 2520 2521* *** local variables *** 2522 integer j,q,indx,ierr,fb 2523 2524 2525 if (nbb.eq.0) then 2526 fb = 0 2527 else 2528 fb = 1 2529 endif 2530 2531 !********************** 2532 !**** slab mapping **** 2533 !********************** 2534 if (mapping.eq.1) then 2535 2536 call nwpw_timing_start(31) 2537* ************************************************* 2538* *** do fft along kx dimension *** 2539* *** A(nx(1),ny(1),nz(1)) <- fft1d^(-1)[A(kx,ny(1),nz(1))] *** 2540* ************************************************* 2541#ifdef MLIB 2542 indx = 1 2543 do q=1,nq(1) 2544 do j=1,ny(1) 2545 call z1dfft(tmp2(indx),nx(1),dcpl_mb(tmpx(1,1)),-2,ierr) 2546 indx = indx + nx(1) 2547 end do 2548 end do 2549 2550#else 2551 indx = 1 2552 do q=1,nq(1) 2553 do j=1,ny(1) 2554 call dcfftb(nx(1),tmp2(indx),dcpl_mb(tmpx(1,1))) 2555 indx = indx + nx(1) 2556 end do 2557 end do 2558#endif 2559 call dcopy(2*nx(1)*ny(1)*nq(1),tmp2,1,tmp1,1) 2560 2561 2562 2563 call nwpw_timing_end(31) 2564* ************************************************* 2565 !************************* 2566 !**** hilbert mapping **** 2567 !************************* 2568 else 2569 2570 call nwpw_timing_start(32) 2571 call C3dB_c_ptranspose_ijk_end(fb,4,tmp1,tmp2,request,reqcnt) 2572 call nwpw_timing_end(32) 2573 call nwpw_timing_start(31) 2574 2575* ************************************************* 2576* *** do fft along kx dimension *** 2577* *** A(nx(1),ny(1),nz(1)) <- fft1d^(-1)[A(kx,ny(1),nz(1))] *** 2578* ************************************************* 2579#ifdef MLIB 2580 indx = 1 2581 do q=1,nq1(1) 2582 call z1dfft(tmp1(indx),nx(1),dcpl_mb(tmpx(1,1)),-2,ierr) 2583 indx = indx + nx(1) 2584 end do 2585#else 2586 indx = 1 2587 do q=1,nq1(1) 2588 call dcfftb(nx(1),tmp1(indx),dcpl_mb(tmpx(1,1))) 2589 indx = indx + nx(1) 2590 end do 2591#endif 2592 call nwpw_timing_end(31) 2593 2594 end if 2595 2596 return 2597 end 2598 2599 2600* *********************************** 2601* * * 2602* * C3dB_pfftb_step * 2603* * * 2604* *********************************** 2605* 2606 2607 subroutine C3dB_pfftb_step(step,nbb,A,tmp1,tmp2,request,reqcnt) 2608 implicit none 2609 integer step 2610 integer nbb 2611 complex*16 A(*) 2612 complex*16 tmp1(*) 2613 complex*16 tmp2(*) 2614 integer request(*),reqcnt 2615 2616 2617#include "bafdecls.fh" 2618 2619 2620 if (step.eq.1) then 2621 call nwpw_timing_start(32) 2622 call Cram_c_unpack_start(nbb,A,tmp1,request,reqcnt,47) 2623 call nwpw_timing_end(32) 2624 else if (step.eq.2) then 2625 call nwpw_timing_start(32) 2626 call Cram_c_unpack_end(nbb,tmp1,tmp2,request,reqcnt) 2627 call nwpw_timing_end(32) 2628 else if (step.eq.3) then 2629 call C3dB_pfftbz(nbb,tmp1,tmp2,request,reqcnt) 2630 else if (step.eq.4) then 2631 call C3dB_pfftby(nbb,tmp1,tmp2,request,reqcnt) 2632 else if (step.eq.5) then 2633 call C3dB_pfftbx(nbb,tmp1,tmp2,request,reqcnt) 2634 end if 2635 2636 return 2637 end 2638 2639 2640* *********************************** 2641* * * 2642* * C3dB_pfft3_queue_init * 2643* * * 2644* *********************************** 2645 2646 subroutine C3dB_pfft3_queue_init(qmax_in) 2647 implicit none 2648 integer qmax_in 2649 2650 2651#include "bafdecls.fh" 2652#include "errquit.fh" 2653 2654#include "C3dB.fh" 2655#include "C3dB_pfft.fh" 2656 2657 2658 !**** pfft_queues **** 2659 integer aqindx(2),aqstatus(2) 2660 integer atmp(2),arequest(2),areqcnt(2),aqnbb(2) 2661 integer aqmax,aqsize,alast_index 2662 common / c_pfft_aqueue_common / aqindx,aqstatus,atmp,arequest, 2663 > areqcnt,aqnbb, 2664 > aqmax,aqsize,alast_index 2665 integer bqindx(2),bqstatus(2) 2666 integer btmp(2),brequest(2),breqcnt(2),bqnbb(2) 2667 integer bqmax,bqsize,blast_index 2668 common / c_pfft_bqueue_common / bqindx,bqstatus,btmp,brequest, 2669 > breqcnt,bqnbb, 2670 > bqmax,bqsize,blast_index 2671 2672 2673 logical value 2674 integer np,size 2675 2676 call Parallel3d_np_i(np) 2677 2678c **** allocate aqueue **** 2679 aqmax = qmax_in 2680 aqsize = 0 2681 alast_index = aqmax 2682 size = nfft3d(1)*aqmax*2 2683 value = BA_alloc_get(mt_dcpl,size,'atmp',atmp(2),atmp(1)) 2684 size = np*aqmax*2 2685 value = value.and. 2686 > BA_alloc_get(mt_int,size,'arequest',arequest(2),arequest(1)) 2687 size = aqmax 2688 value = value.and. 2689 > BA_alloc_get(mt_int,size,'aqindx',aqindx(2),aqindx(1)) 2690 value = value.and. 2691 > BA_alloc_get(mt_int,size,'aqstatus',aqstatus(2),aqstatus(1)) 2692 value = value.and. 2693 > BA_alloc_get(mt_int,size,'areqcnt',areqcnt(2),areqcnt(1)) 2694 value = value.and. 2695 > BA_alloc_get(mt_int,size,'aqnbb',aqnbb(2),aqnbb(1)) 2696 2697 2698c **** allocate bqueue **** 2699 bqmax = qmax_in 2700 bqsize = 0 2701 blast_index = bqmax 2702 size = nfft3d(1)*bqmax*2 2703 value = BA_alloc_get(mt_dcpl,size,'btmp',btmp(2),btmp(1)) 2704 size = np*bqmax*2 2705 value = value.and. 2706 > BA_alloc_get(mt_int,size,'brequest',brequest(2),brequest(1)) 2707 size = bqmax 2708 value = value.and. 2709 > BA_alloc_get(mt_int,size,'bqindx',bqindx(2),bqindx(1)) 2710 value = value.and. 2711 > BA_alloc_get(mt_int,size,'bqstatus',bqstatus(2),bqstatus(1)) 2712 value = value.and. 2713 > BA_alloc_get(mt_int,size,'breqcnt',breqcnt(2),breqcnt(1)) 2714 value = value.and. 2715 > BA_alloc_get(mt_int,size,'bqnbb',bqnbb(2),bqnbb(1)) 2716 2717 return 2718 end 2719 2720 2721 2722 2723* *********************************** 2724* * * 2725* * C3dB_pfft3_queue_end * 2726* * * 2727* *********************************** 2728 2729 subroutine C3dB_pfft3_queue_end() 2730 implicit none 2731 2732 2733#include "bafdecls.fh" 2734#include "errquit.fh" 2735 2736#include "C3dB.fh" 2737#include "C3dB_pfft.fh" 2738 2739 2740 2741 !**** pfft_queues **** 2742 integer aqindx(2),aqstatus(2) 2743 integer atmp(2),arequest(2),areqcnt(2),aqnbb(2) 2744 integer aqmax,aqsize,alast_index 2745 common / c_pfft_aqueue_common / aqindx,aqstatus,atmp,arequest, 2746 > areqcnt,aqnbb, 2747 > aqmax,aqsize,alast_index 2748 integer bqindx(2),bqstatus(2) 2749 integer btmp(2),brequest(2),breqcnt(2),bqnbb(2) 2750 integer bqmax,bqsize,blast_index 2751 common / c_pfft_bqueue_common / bqindx,bqstatus,btmp,brequest, 2752 > breqcnt,bqnbb, 2753 > bqmax,bqsize,blast_index 2754 2755 2756 2757 logical value 2758 2759 value = BA_free_heap(atmp(2)) 2760 value = value.and.BA_free_heap(arequest(2)) 2761 value = value.and.BA_free_heap(aqindx(2)) 2762 value = value.and.BA_free_heap(aqstatus(2)) 2763 value = value.and.BA_free_heap(areqcnt(2)) 2764 value = value.and.BA_free_heap(aqnbb(2)) 2765 2766 value = value.and.BA_free_heap(btmp(2)) 2767 value = value.and.BA_free_heap(brequest(2)) 2768 value = value.and.BA_free_heap(bqindx(2)) 2769 value = value.and.BA_free_heap(bqstatus(2)) 2770 value = value.and.BA_free_heap(breqcnt(2)) 2771 value = value.and.BA_free_heap(bqnbb(2)) 2772 2773 if (.not. value) call errquit('error freeing heap',0,MA_ERR) 2774 2775 return 2776 end 2777 2778 2779 2780 2781 2782 2783 2784* *********************************** 2785* * * 2786* * C3dB_cr_pfft3_queue_filled * 2787* * * 2788* *********************************** 2789 2790 logical function C3dB_cr_pfft3_queue_filled() 2791 implicit none 2792 2793#include "bafdecls.fh" 2794#include "errquit.fh" 2795 2796#include "C3dB.fh" 2797#include "C3dB_pfft.fh" 2798 2799 2800 2801 2802 !**** pfft_queue **** 2803 integer qindx(2),qstatus(2) 2804 integer tmp(2),request(2),reqcnt(2),qnbb(2) 2805 integer qmax,qsize,last_index 2806 common / c_pfft_aqueue_common / qindx,qstatus,tmp,request,reqcnt, 2807 > qnbb,qmax,qsize,last_index 2808 2809 C3dB_cr_pfft3_queue_filled = (qsize.ge.qmax) 2810 return 2811 end 2812 2813 2814 2815* *********************************** 2816* * * 2817* * C3dB_rc_pfft3_queue_filled * 2818* * * 2819* *********************************** 2820 2821 logical function C3dB_rc_pfft3_queue_filled() 2822 implicit none 2823 2824#include "bafdecls.fh" 2825#include "errquit.fh" 2826 2827#include "C3dB.fh" 2828#include "C3dB_pfft.fh" 2829 2830 2831 !**** pfft_queue **** 2832 integer qindx(2),qstatus(2) 2833 integer tmp(2),request(2),reqcnt(2),qnbb(2) 2834 integer qmax,qsize,last_index 2835 common / c_pfft_bqueue_common / qindx,qstatus,tmp,request,reqcnt, 2836 > qnbb,qmax,qsize,last_index 2837 2838 C3dB_rc_pfft3_queue_filled = (qsize.ge.qmax) 2839 return 2840 end 2841 2842 2843 2844 2845* *********************************** 2846* * * 2847* * C3dB_rc_pfft3f_queuein * 2848* * * 2849* *********************************** 2850 2851 subroutine C3dB_rc_pfft3f_queuein(nbb,A) 2852 implicit none 2853 integer nbb 2854 complex*16 A(*) 2855 2856 2857#include "bafdecls.fh" 2858#include "errquit.fh" 2859 2860#include "C3dB.fh" 2861#include "C3dB_pfft.fh" 2862 2863 2864 !**** pfft_queue **** 2865 integer qindx(2),qstatus(2) 2866 integer tmp(2),request(2),reqcnt(2),qnbb(2) 2867 integer qmax,qsize,last_index 2868 common / c_pfft_bqueue_common / qindx,qstatus,tmp,request,reqcnt, 2869 > qnbb,qmax,qsize,last_index 2870 2871 !*** local variables *** 2872 integer shift1,shift2,shift3 2873 integer q,indx,status,np 2874 2875 call nwpw_timing_start(30) 2876 call Parallel3d_np_i(np) 2877 2878 do q=1,qsize 2879 indx = int_mb(qindx(1)+q-1) 2880 int_mb(qstatus(1)+indx-1) = int_mb(qstatus(1)+indx-1) + 1 2881 status = int_mb(qstatus(1)+indx-1) 2882 shift1=nfft3d(1)*(2*(indx-1)) 2883 shift2=nfft3d(1)*(2*(indx-1)+1) 2884 shift3=2*np*(indx-1) 2885 call C3dB_pfftf_step(status, 2886 > int_mb(qnbb(1)+indx-1), 2887 > A, 2888 > dcpl_mb(tmp(1)+shift1), 2889 > dcpl_mb(tmp(1)+shift2), 2890 > int_mb(request(1)+shift3), 2891 > int_mb(reqcnt(1)+indx-1)) 2892 end do 2893 2894 qsize = qsize + 1 2895 last_index = last_index+1 2896 if (last_index.gt.qmax) last_index = 1 2897 int_mb(qindx(1)+qsize-1) = last_index 2898 int_mb(qstatus(1)+last_index-1) = 1 2899 status = 1 2900 int_mb(qnbb(1)+last_index-1) = nbb 2901 shift1=nfft3d(1)*(2*(last_index-1)) 2902 shift2=nfft3d(1)*(2*(last_index-1)+1) 2903 shift3=2*np*(last_index-1) 2904 2905 call C3dB_pfftf_step(status,nbb,A, 2906 > dcpl_mb(tmp(1)+shift1), 2907 > dcpl_mb(tmp(1)+shift2), 2908 > int_mb(request(1)+shift3), 2909 > int_mb(reqcnt(1)+last_index-1)) 2910 2911 2912 call nwpw_timing_end(30) 2913 return 2914 end 2915 2916 2917 2918 2919* *********************************** 2920* * * 2921* * C3dB_rc_pfft3f_queueout * 2922* * * 2923* *********************************** 2924 2925 subroutine C3dB_rc_pfft3f_queueout(nbb,A) 2926 implicit none 2927 integer nbb 2928 complex*16 A(*) 2929 2930 2931#include "bafdecls.fh" 2932#include "errquit.fh" 2933 2934#include "C3dB.fh" 2935#include "C3dB_pfft.fh" 2936 2937 2938 !**** pfft_queue **** 2939 integer qindx(2),qstatus(2) 2940 integer tmp(2),request(2),reqcnt(2),qnbb(2) 2941 integer qmax,qsize,last_index 2942 common / c_pfft_bqueue_common / qindx,qstatus,tmp,request,reqcnt, 2943 > qnbb,qmax,qsize,last_index 2944 2945 !*** local variables *** 2946 integer shift1,shift2,shift3 2947 integer q,indx,indx1,status,np 2948 2949 call nwpw_timing_start(30) 2950 call Parallel3d_np_i(np) 2951 2952 indx1 = int_mb(qindx(1)) 2953 do while (int_mb(qstatus(1)+indx1-1).lt.4) 2954 do q=1,qsize 2955 indx = int_mb(qindx(1)+q-1) 2956 int_mb(qstatus(1)+indx-1) = int_mb(qstatus(1)+indx-1) + 1 2957 status = int_mb(qstatus(1)+indx-1) 2958 shift1=nfft3d(1)*(2*(indx-1)) 2959 shift2=nfft3d(1)*(2*(indx-1)+1) 2960 shift3=2*np*(indx-1) 2961 call C3dB_pfftf_step(status, 2962 > int_mb(qnbb(1)+indx-1), 2963 > A, 2964 > dcpl_mb(tmp(1)+shift1), 2965 > dcpl_mb(tmp(1)+shift2), 2966 > int_mb(request(1)+shift3), 2967 > int_mb(reqcnt(1)+indx-1)) 2968 end do 2969 end do 2970 2971 qsize = qsize -1 2972 do q=1,qsize 2973 int_mb(qindx(1)+q-1) = int_mb(qindx(1)+q) 2974 end do 2975 2976 shift2=nfft3d(1)*(2*(indx1-1)+1) 2977 call Cram_c_Copy(nbb,dcpl_mb(tmp(1)+shift2),A) 2978 2979 call nwpw_timing_end(30) 2980 return 2981 end 2982 2983 2984 2985 2986 2987* *********************************** 2988* * * 2989* * C3dB_cr_pfft3b_queuein * 2990* * * 2991* *********************************** 2992 2993 subroutine C3dB_cr_pfft3b_queuein(nbb,A) 2994 implicit none 2995 integer nbb 2996 complex*16 A(*) 2997 2998 2999#include "bafdecls.fh" 3000#include "errquit.fh" 3001 3002#include "C3dB.fh" 3003#include "C3dB_pfft.fh" 3004 3005 3006 !**** pfft_queue **** 3007 integer qindx(2),qstatus(2) 3008 integer tmp(2),request(2),reqcnt(2),qnbb(2) 3009 integer qmax,qsize,last_index 3010 common / c_pfft_aqueue_common / qindx,qstatus,tmp,request,reqcnt, 3011 > qnbb,qmax,qsize,last_index 3012 3013 !*** local variables *** 3014 integer shift1,shift2,shift3 3015 integer q,indx,status,np 3016 3017 call nwpw_timing_start(30) 3018 call Parallel3d_np_i(np) 3019 3020 do q=1,qsize 3021 indx = int_mb(qindx(1)+q-1) 3022 int_mb(qstatus(1)+indx-1) = int_mb(qstatus(1)+indx-1) + 1 3023 status = int_mb(qstatus(1)+indx-1) 3024 shift1=nfft3d(1)*(2*(indx-1)) 3025 shift2=nfft3d(1)*(2*(indx-1)+1) 3026 shift3=2*np*(indx-1) 3027 call C3dB_pfftb_step(status, 3028 > int_mb(qnbb(1)+indx-1), 3029 > A, 3030 > dcpl_mb(tmp(1)+shift1), 3031 > dcpl_mb(tmp(1)+shift2), 3032 > int_mb(request(1)+shift3), 3033 > int_mb(reqcnt(1)+indx-1)) 3034 end do 3035 3036 qsize = qsize + 1 3037 last_index = last_index+1 3038 if (last_index.gt.qmax) last_index = 1 3039 int_mb(qindx(1)+qsize-1) = last_index 3040 int_mb(qstatus(1)+last_index-1) = 1 3041 status = 1 3042 int_mb(qnbb(1)+last_index-1) = nbb 3043 shift1=nfft3d(1)*(2*(last_index-1)) 3044 shift2=nfft3d(1)*(2*(last_index-1)+1) 3045 shift3=2*np*(last_index-1) 3046 3047 call C3dB_pfftb_step(status,nbb,A, 3048 > dcpl_mb(tmp(1)+shift1), 3049 > dcpl_mb(tmp(1)+shift2), 3050 > int_mb(request(1)+shift3), 3051 > int_mb(reqcnt(1)+last_index-1)) 3052 3053 call nwpw_timing_end(30) 3054 return 3055 end 3056 3057 3058 3059 3060* *********************************** 3061* * * 3062* * C3dB_cr_pfft3b_queueout * 3063* * * 3064* *********************************** 3065 3066 subroutine C3dB_cr_pfft3b_queueout(nbb,A) 3067 implicit none 3068 integer nbb 3069 complex*16 A(*) 3070 3071 3072#include "bafdecls.fh" 3073#include "errquit.fh" 3074 3075#include "C3dB.fh" 3076#include "C3dB_pfft.fh" 3077 3078 3079 !**** pfft_queue **** 3080 integer qindx(2),qstatus(2) 3081 integer tmp(2),request(2),reqcnt(2),qnbb(2) 3082 integer qmax,qsize,last_index 3083 common / c_pfft_aqueue_common / qindx,qstatus,tmp,request,reqcnt, 3084 > qnbb,qmax,qsize,last_index 3085 3086 !*** local variables *** 3087 integer shift1,shift2,shift3 3088 integer q,indx,indx1,status,np 3089 3090 call Parallel3d_np_i(np) 3091 3092 call nwpw_timing_start(30) 3093 3094 indx1 = int_mb(qindx(1)) 3095 do while (int_mb(qstatus(1)+indx1-1).lt.5) 3096 do q=1,qsize 3097 indx = int_mb(qindx(1)+q-1) 3098 int_mb(qstatus(1)+indx-1) = int_mb(qstatus(1)+indx-1) + 1 3099 status = int_mb(qstatus(1)+indx-1) 3100 shift1=nfft3d(1)*(2*(indx-1)) 3101 shift2=nfft3d(1)*(2*(indx-1)+1) 3102 shift3=2*np*(indx-1) 3103 call C3dB_pfftb_step(status, 3104 > int_mb(qnbb(1)+indx-1), 3105 > A, 3106 > dcpl_mb(tmp(1)+shift1), 3107 > dcpl_mb(tmp(1)+shift2), 3108 > int_mb(request(1)+shift3), 3109 > int_mb(reqcnt(1)+indx-1)) 3110 end do 3111 end do 3112 3113 qsize = qsize -1 3114 do q=1,qsize 3115 int_mb(qindx(1)+q-1) = int_mb(qindx(1)+q) 3116 end do 3117 3118 shift1=nfft3d(1)*(2*(indx1-1)) 3119 call dcopy(2*nfft3d(1),dcpl_mb(tmp(1)+shift1),1,A,1) 3120 3121 call nwpw_timing_end(30) 3122 return 3123 end 3124 3125