1ccccccccccc #define TCGMSG 2#define NBLOCKS 3 3 4* 5* $Id$ 6* 7 8* *********************************************************** 9* * * 10* * D3dB library * 11* * (MPI implemenation) * 12* * * 13* * Author - Eric Bylaska * 14* * date - 3/23/96 * 15* * * 16* *********************************************************** 17 18* The D3dB (distributed three-dimensional block) library is to 19* be used for handling three kinds of data structures. The first 20* data structure, denoted by "r", is a double precision array of 21* length (nx(nb)+2)*ny(nb)*nz. The second data structure, denoted by "c", is 22* a double complex array of length of (nx(nb)/2+1)*ny(nb)*nz. The third data 23 24* (nx(nb)/2+1)*ny(nb)*nz. 25* 26* The three data structures are distributed across threads, p, in 27* the k (i.e. nz(nb)) dimension using a cyclic decomposition. So that 28* a "r" array A is defined as double precision A(nx(nb)+2,ny(nb),nq(nb)) on 29* each thread. 30* 31* Where 32* np = number of threads 33* nq(nb) = ceil(nz(nb)/np). 34* 0 <= p < np 35* 1 <= q <= nq(nb) 36* 1 <= k <= nz(nb) 37* 38* The mapping of k -> q is defined as: 39* 40* k = ((q-1)*np + p) + 1 41* q = ((k-1) - p)/np + 1 42* p = (k-1) mod np 43* 44* Libraries used: mpi, blas, fftpack, and compressed_io 45* 46* common blocks used in this library: 47* 48* integer nq,nx(NBLOCKS),ny,nz(nb) 49* common / D3dB / nq,nx,ny,nz 50* 51* integer q_map(NFFT3),p_map(NFFT3),k_map(NFFT3) 52* common /D3dB_mapping / q_map,p_map,k_map 53* 54* integer iq_to_i1((NFFT1/2+1)*NFFT2*NSLABS) 55* integer iq_to_i2((NFFT1/2+1)*NFFT2*NSLABS) 56* integer i1_start(NPROCS+1) 57* integer i2_start(NPROCS+1) 58* common / trans_blk / iq_to_i1,iq_to_i2,i1_start,i2_start 59 60* **** local variables **** 61 62* *********************************** 63* * * 64* * Mapping_Init * 65* * * 66* *********************************** 67 68 subroutine Mapping_Init(nb) 69 implicit none 70 integer nb 71 72#include "bafdecls.fh" 73#include "errquit.fh" 74#include "D3dB.fh" 75 76 77 integer k,q,p 78* integer kn 79 integer taskid,np 80 logical value 81 integer tid,Parallel_threadid 82 external Parallel_threadid 83 84 call Parallel2d_np_i(np) 85 call Parallel2d_taskid_i(taskid) 86 87 88 89* ************************** 90* ****** Slab mapping ****** 91* ************************** 92 if (mapping.eq.1) then 93 94* **** allocate q_map,p_map,k_map 95 value = BA_alloc_get(mt_int,nz(nb),'q_map',q_map(2,nb), 96 > q_map(1,nb)) 97 value = value.and.BA_alloc_get(mt_int,nz(nb),'p_map',p_map(2,nb), 98 > p_map(1,nb)) 99 value = value.and.BA_alloc_get(mt_int,nz(nb),'k_map',k_map(2,nb), 100 > k_map(1,nb)) 101 if (.not. value) 102 > call errquit('Mapping_init:out of heap memory',0, MA_ERR) 103 104 105 106* ****** cyclic ****** 107 p = 0 108 q = 1 109 do k=1,nz(nb) 110 int_mb(q_map(1,nb)+k-1) = q 111 int_mb(p_map(1,nb)+k-1) = p 112 if (p .eq. taskid) nq(nb) = q 113 114 p = p+1 115 if (p .ge. np) then 116 p = 0 117 q = q + 1 118 end if 119 end do 120 121 122c if (nb.eq.1) then 123c p = 0 124c q = 1 125c do k=1,nz(nb) 126c int_mb(q_map(1,nb)+k-1) = q 127c int_mb(p_map(1,nb)+k-1) = p 128c if (p .eq. taskid) nq(nb) = q 129c 130c p = p+1 131c if (p .ge. np) then 132c p = 0 133c q = q + 1 134c end if 135c end do 136c else if (nb.eq.2) then 137c p = 0 138c q = 1 139c do k=1,nz(1) 140c int_mb(q_map(1,nb)+k-1) = int_mb(q_map(1,1)+k-1) 141c int_mb(q_map(1,nb)+k+nz(1)-1) = int_mb(q_map(1,1)+k-1)+nq(1) 142c int_mb(p_map(1,nb)+k-1) = int_mb(p_map(1,1)+k-1) 143c int_mb(p_map(1,nb)+k+nz(1)-1) = int_mb(p_map(1,1)+k-1) 144c end do 145c 146c end if 147 148* ***** block ****** 149* **** make sure nz(nb) is a multiple of np **** 150* kn = mod(nz(nb),np) 151* if (kn.ne.0) then 152* kn=(nz(nb)/np)+1 153* else 154* kn=(nz(nb)/np) 155* end if 156* 157* p=0 158* q=1 159* do k=1,nz(nb) 160* int_mb(q_map(1,nb)+k-1) = q 161* int_mb(p_map(1,nb)+k-1) = p 162* if (p .eq. taskid) nq(nb) = q 163* 164* q=q+1 165* if (q .gt. (kn)) then 166* q = 1 167* p = p + 1 168* end if 169* end do 170 171 !*** not used anymore!! **** 172 do k=1,nz(nb) 173 if (int_mb(p_map(1,nb)+k-1) .eq. taskid) then 174c k_map(q_map(k)) = k 175 int_mb(k_map(1,nb)+int_mb(q_map(1,nb)+k-1)-1) = k 176 end if 177 end do 178 179 nfft3d(nb) = (nx(nb)/2+1)*ny(nb)*nq(nb) 180 n2ft3d(nb) = 2*nfft3d(nb) 181 nfft3d_map(nb) = nfft3d(nb) 182 n2ft3d_map(nb) = n2ft3d(nb) 183 184 185* ****************************** 186* ****** Hilbert mappings ****** 187* ****************************** 188 else 189 190 191 192 193 194* **** allocate q_map1,p_map1,q_map2,p_map2,q_map3,p_map3 **** 195 value = BA_alloc_get(mt_int,ny(nb)*nz(nb), 196 > 'q_map1', 197 > q_map1(2,nb), 198 > q_map1(1,nb)) 199 200 201 value = value.and.BA_alloc_get(mt_int,ny(nb)*nz(nb), 202 > 'p_map1', 203 > p_map1(2,nb), 204 > p_map1(1,nb)) 205 206 value = value.and.BA_alloc_get(mt_int,nz(nb)*(nx(nb)/2+1), 207 > 'q_map2', 208 > q_map2(2,nb), 209 > q_map2(1,nb)) 210 value = value.and.BA_alloc_get(mt_int,nz(nb)*(nx(nb)/2+1), 211 > 'p_map2', 212 > p_map2(2,nb), 213 > p_map2(1,nb)) 214 215 value = value.and.BA_alloc_get(mt_int,ny(nb)*(nx(nb)/2+1), 216 > 'q_map3', 217 > q_map3(2,nb), 218 > q_map3(1,nb)) 219 value = value.and.BA_alloc_get(mt_int,ny(nb)*(nx(nb)/2+1), 220 > 'p_map3', 221 > p_map3(2,nb), 222 > p_map3(1,nb)) 223 if (.not. value) 224 > call errquit('Mapping_init:out of heap memory',1, MA_ERR) 225 226 227 !**** double grid map1 defined wrt to single grid **** 228 !**** makes expand and contract routines trivial parallel **** 229 230!MATHIAS 231 if (mapping2d.eq.1) then 232 if ((nb.eq.1).or.(nb.gt.2)) then 233 call hilbert2d_map(ny(nb),nz(nb),int_mb(p_map1(1,nb))) 234 end if 235 call hilbert2d_map(nz(nb),(nx(nb)/2+1),int_mb(p_map2(1,nb))) 236 call hilbert2d_map((nx(nb)/2+1),ny(nb),int_mb(p_map3(1,nb))) 237 else 238 if ((nb.eq.1).or.(nb.gt.2)) then 239 call hcurve_map(ny(nb),nz(nb),int_mb(p_map1(1,nb))) 240 end if 241 242 call hcurve_map(nz(nb),(nx(nb)/2+1),int_mb(p_map2(1,nb))) 243 call hcurve_map((nx(nb)/2+1),ny(nb),int_mb(p_map3(1,nb))) 244 end if 245 246 247 248c!$OMP critical 249c write(*,*) "checking p_map1,q_map1 ",Parallel_threadid() 250c do k=1,ny(nb)*nz(nb) 251c write(*,*) Parallel_threadid(),k, 252c > int_mb(p_map1(1,nb)+k-1) 253c end do 254c!$OMP end critical 255c!$OMP critical 256c write(*,*) "checking p_map2,q_map2 ",Parallel_threadid() 257c do k=1,(nx(nb)/2+1)*nz(nb) 258c write(*,*) Parallel_threadid(),k, 259c > int_mb(p_map2(1,nb)+k-1) 260c end do 261c!$OMP end critical 262c!$OMP critical 263c write(*,*) "checking p_map2,q_map2 ",Parallel_threadid() 264c do k=1,(nx(nb)/2+1)*ny(nb) 265c write(*,*) Parallel_threadid(),k, 266c > int_mb(p_map3(1,nb)+k-1) 267c end do 268c!$OMP end critical 269 270 271 !**** double grid map1 defined wrt to single grid **** 272 !**** makes expand and contract routines trivial parallel **** 273 if ((nb.eq.1).or.(nb.gt.2)) then 274 call generate_map_indexes(taskid,np, 275 > ny(nb),nz(nb), 276 > int_mb(p_map1(1,nb)), 277 > int_mb(q_map1(1,nb)),nq1(nb)) 278 else 279 nq1(2) = 4*nq1(1) 280 call expand_hilbert2d(np,ny(1),nz(1), 281 > int_mb(p_map1(1,1)),int_mb(q_map1(1,1)), 282 > int_mb(p_map1(1,2)),int_mb(q_map1(1,2))) 283 end if 284 call generate_map_indexes(taskid,np, 285 > nz(nb),nx(nb)/2+1, 286 > int_mb(p_map2(1,nb)), 287 > int_mb(q_map2(1,nb)),nq2(nb)) 288 call generate_map_indexes(taskid,np, 289 > nx(nb)/2+1,ny(nb), 290 > int_mb(p_map3(1,nb)), 291 > int_mb(q_map3(1,nb)),nq3(nb)) 292 293c if (taskid.eq.0) then 294c write(*,*) taskid,"nq2=",nq2(nb), ny(nb)*nq2(nb) 295c write(*,*) taskid,"nq1=",nq1(nb),(nx(nb)/2+1)*nq1(nb) 296c write(*,*) taskid,"nq3=",nq3(nb), nz(nb)*nq3(nb) 297c 298c write(*,*) 'hilbert map nb=',nb 299c do j=0,ny(nb)-1 300c write(*,'(A,80I4)') 'hilbert map:', 301c > j,(int_mb(p_map3(1,nb)+j*(nx(nb)/2+1))) 302c end do 303c write(*,*) 304c 305c write(*,*) 'hilbert map nb=',nb 306c do j=0,ny(nb)-1 307c write(*,'(A,80I4)') 'hilbert map:', 308c > (int_mb(p_map3(1,nb)+k+j*(nx(nb)/2+1)), k=0,nx(nb)/2) 309c end do 310c write(*,*) 311c end if 312 313 314 nfft3d(nb) = (nx(nb)/2+1)*nq1(nb) 315 if ((ny(nb)*nq2(nb)).gt.nfft3d(nb)) nfft3d(nb) = ny(nb)*nq2(nb) 316 if ((nz(nb)*nq3(nb)).gt.nfft3d(nb)) nfft3d(nb) = nz(nb)*nq3(nb) 317 n2ft3d(nb) = 2*nfft3d(nb) 318 319 nfft3d_map(nb) = nz(nb)*nq3(nb) 320 n2ft3d_map(nb) = (nx(nb)+2)*nq1(nb) 321 322 323 324 end if 325 326 327 return 328 end 329 330* *********************************** 331* * * 332* * D3dB_end * 333* * * 334* *********************************** 335 subroutine D3dB_end(nb) 336 implicit none 337 integer nb 338 339#include "bafdecls.fh" 340#include "errquit.fh" 341#include "D3dB.fh" 342 343 344* *** hilbert tranpose data structure **** 345 integer h_iq_to_i1(2,6,NBLOCKS) 346 integer h_iq_to_i2(2,6,NBLOCKS) 347 integer h_i1_start(2,6,NBLOCKS) 348 integer h_i2_start(2,6,NBLOCKS) 349 common / trans_blk_ijk / h_iq_to_i1, 350 > h_iq_to_i2, 351 > h_i1_start, 352 > h_i2_start 353 354 355c integer iq_to_i1((NFFT1/2+1)*NFFT2*NSLABS) 356c integer iq_to_i2((NFFT1/2+1)*NFFT2*NSLABS) 357c integer i1_start(NFFT3+1) 358c integer i2_start(NFFT3+1) 359 integer iq_to_i1(2,NBLOCKS) 360 integer iq_to_i2(2,NBLOCKS) 361 integer i1_start(2,NBLOCKS) 362 integer i2_start(2,NBLOCKS) 363 common / trans_blk / iq_to_i1,iq_to_i2,i1_start,i2_start 364 365#ifndef MPI 366 integer Nchannels(NBLOCKS) 367 integer channel_proc(2,NBLOCKS) 368 integer channel_type(2,NBLOCKS) 369 common / channel_blk / channel_proc,channel_type,Nchannels 370#endif 371 372 logical value 373 integer i 374 logical control_single_precision_on 375 external control_single_precision_on 376 377 call D3dB_timereverse_end(nb) 378 call D3dB_fft_end(nb) 379 if (control_single_precision_on()) call D3dBs_fft_end(nb) 380 381 value=.true. 382 383 !**** slab mapping **** 384 if (mapping.eq.1) then 385 value = value.and.BA_free_heap(q_map(2,nb)) 386 value = value.and.BA_free_heap(p_map(2,nb)) 387 value = value.and.BA_free_heap(k_map(2,nb)) 388 end if 389 390 !**** hilbert mappings **** 391 if (mapping.eq.2) then 392 value = value.and.BA_free_heap(q_map1(2,nb)) 393 value = value.and.BA_free_heap(p_map1(2,nb)) 394 value = value.and.BA_free_heap(q_map2(2,nb)) 395 value = value.and.BA_free_heap(p_map2(2,nb)) 396 value = value.and.BA_free_heap(q_map3(2,nb)) 397 value = value.and.BA_free_heap(p_map3(2,nb)) 398 end if 399 400 !**** slab transpose mappings **** 401 if (mapping.eq.1) then 402 value = value.and.BA_free_heap(i1_start(2,nb)) 403 value = value.and.BA_free_heap(i2_start(2,nb)) 404 value = value.and.BA_free_heap(iq_to_i1(2,nb)) 405 value = value.and.BA_free_heap(iq_to_i2(2,nb)) 406 end if 407 408 !**** hilbert transpose mappings **** 409 if (mapping.eq.2) then 410 do i=1,6 411 value = value.and.BA_free_heap(h_i1_start(2,i,nb)) 412 value = value.and.BA_free_heap(h_i2_start(2,i,nb)) 413 value = value.and.BA_free_heap(h_iq_to_i1(2,i,nb)) 414 value = value.and.BA_free_heap(h_iq_to_i2(2,i,nb)) 415 end do 416 end if 417 418#ifndef MPI 419 value = value.and.BA_free_heap(channel_proc(2,nb)) 420 value = value.and.BA_free_heap(channel_type(2,nb)) 421#endif 422 423 if (.not. value) 424 > call errquit('D3dB_end:freeing heap memory',0, MA_ERR) 425 426 427 428 429 return 430 end 431 432* *********************************** 433* * * 434* * D3dB_qtok * 435* * * 436* *********************************** 437 438 subroutine D3dB_qtok(nb,q,k) 439 implicit none 440 integer nb 441 integer q,k 442 443#include "bafdecls.fh" 444#include "D3dB.fh" 445 446 447 448c k = k_map(q) 449 k = int_mb(k_map(1,nb)+q-1) 450 451 return 452 end 453 454* *********************************** 455* * * 456* * D3dB_ktoqp * 457* * * 458* *********************************** 459 460 subroutine D3dB_ktoqp(nb,k,q,p) 461 implicit none 462 integer nb 463 integer k,q,p 464 465#include "bafdecls.fh" 466#include "D3dB.fh" 467 468 469 470c q = q_map(k) 471c p = p_map(k) 472 473 q = int_mb(q_map(1,nb)+k-1) 474 p = int_mb(p_map(1,nb)+k-1) 475 return 476 end 477 478 479* *********************************** 480* * * 481* * D3dB_ijktoindexp * 482* * * 483* *********************************** 484 485 subroutine D3dB_ijktoindexp(nb,i,j,k,indx,p) 486 implicit none 487 integer nb 488 integer i,j,k 489 integer indx,p 490 491#include "bafdecls.fh" 492#include "D3dB.fh" 493 494 integer q 495 496 !**** slab mapping *** 497 if (mapping.eq.1) then 498 q = int_mb(q_map(1,nb)+k-1) 499 p = int_mb(p_map(1,nb)+k-1) 500 501 indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 502 503 !**** hilbert mapping **** 504 else 505 q = int_mb(q_map3(1,nb)+(i-1)+(j-1)*(nx(nb)/2+1)) 506 p = int_mb(p_map3(1,nb)+(i-1)+(j-1)*(nx(nb)/2+1)) 507 508 indx = k + (q-1)*nz(nb) 509 end if 510 511 return 512 end 513 514 515 516* *********************************** 517* * * 518* * D3dB_ijktoindex1p * 519* * * 520* *********************************** 521 522 subroutine D3dB_ijktoindex1p(nb,i,j,k,indx,p) 523 implicit none 524 integer nb 525 integer i,j,k 526 integer indx,p 527 528#include "bafdecls.fh" 529#include "D3dB.fh" 530 531 integer q 532 533 !**** slab mapping *** 534 if (mapping.eq.1) then 535 q = int_mb(q_map(1,nb)+j-1) 536 p = int_mb(p_map(1,nb)+j-1) 537 538 indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*nz(nb) 539 540 !**** hilbert mapping **** 541 else 542 q = int_mb(q_map2(1,nb)+(k-1)+(i-1)*(nz(nb))) 543 p = int_mb(p_map2(1,nb)+(k-1)+(i-1)*(nz(nb))) 544 545 indx = j + (q-1)*ny(nb) 546 end if 547 548 return 549 end 550 551 552 553 554* *********************************** 555* * * 556* * D3dB_ijktoindex2p * 557* * * 558* *********************************** 559 560 subroutine D3dB_ijktoindex2p(nb,i,j,k,indx,p) 561 implicit none 562 integer nb 563 integer i,j,k 564 integer indx,p 565 566#include "bafdecls.fh" 567#include "D3dB.fh" 568 569 570 integer q 571 572 !**** slab mapping **** 573 if (mapping.eq.1) then 574 q = int_mb(q_map(1,nb)+j-1) 575 p = int_mb(p_map(1,nb)+j-1) 576 577 indx = i + (k-1)*(nx(nb)+2) + (q-1)*(nx(nb)+2)*ny(nb) 578 579 !**** hilbert mapping **** 580 else 581 q = int_mb(q_map1(1,nb)+(j-1)+(k-1)*ny(nb)) 582 p = int_mb(p_map1(1,nb)+(j-1)+(k-1)*ny(nb)) 583 584 indx = i + (q-1)*(nx(nb)+2) 585 end if 586 587 return 588 end 589 590 591 592 593 594* *********************************** 595* * * 596* * D3dB_nfft3d * 597* * * 598* *********************************** 599 600 subroutine D3dB_nfft3d(nb,nfft3d_out) 601 implicit none 602 integer nb 603 integer nfft3d_out 604 605#include "D3dB.fh" 606 nfft3d_out = nfft3d(nb) 607 return 608 end 609 610 611* *********************************** 612* * * 613* * D3dB_nfft3d_map * 614* * * 615* *********************************** 616 617 subroutine D3dB_nfft3d_map(nb,nfft3d_out) 618 implicit none 619 integer nb 620 integer nfft3d_out 621 622#include "D3dB.fh" 623 624 nfft3d_out = nfft3d_map(nb) 625 return 626 end 627 628 629* *********************************** 630* * * 631* * D3dB_n2ft3d * 632* * * 633* *********************************** 634 635 subroutine D3dB_n2ft3d(nb,n2ft3d_out) 636 implicit none 637 integer nb 638 integer n2ft3d_out 639 640#include "D3dB.fh" 641 642 n2ft3d_out = n2ft3d(nb) 643 return 644 end 645 646 647* *********************************** 648* * * 649* * D3dB_n2ft3d_map * 650* * * 651* *********************************** 652 653 subroutine D3dB_n2ft3d_map(nb,n2ft3d_out) 654 implicit none 655 integer nb 656 integer n2ft3d_out 657 658#include "D3dB.fh" 659 660 n2ft3d_out = n2ft3d_map(nb) 661 return 662 end 663 664 665* *********************************** 666* * * 667* * D3dB_nq * 668* * * 669* *********************************** 670 671 subroutine D3dB_nq(nb,nqtmp) 672 implicit none 673 integer nb 674 integer nqtmp 675 676#include "D3dB.fh" 677 678 679 nqtmp = nq(nb) 680 681 return 682 end 683 684* *********************************** 685* * * 686* * D3dB_nx * 687* * * 688* *********************************** 689 690 subroutine D3dB_nx(nb,nxtmp) 691 implicit none 692 integer nb 693 integer nxtmp 694 695#include "D3dB.fh" 696 697 698 nxtmp = nx(nb) 699 return 700 end 701 702* *********************************** 703* * * 704* * D3dB_ny * 705* * * 706* *********************************** 707 708 subroutine D3dB_ny(nb,nytmp) 709 implicit none 710 integer nb 711 integer nytmp 712 713#include "D3dB.fh" 714 715 716 nytmp = ny(nb) 717 return 718 end 719 720* *********************************** 721* * * 722* * D3dB_nz * 723* * * 724* *********************************** 725 726 subroutine D3dB_nz(nb,nztmp) 727 implicit none 728 integer nb 729 integer nztmp 730 731#include "D3dB.fh" 732 733 734 nztmp = nz(nb) 735 return 736 end 737 738* *********************************** 739* * * 740* * D3dB_zplane_size * 741* * * 742* *********************************** 743 744 subroutine D3dB_zplane_size(nb,zplane_sizetmp) 745 implicit none 746 integer nb 747 integer zplane_sizetmp 748 749#include "D3dB.fh" 750 751 zplane_sizetmp = zplane_size(nb) 752 return 753 end 754 755* *********************************** 756* * * 757* * D3dB_Init * 758* * * 759* *********************************** 760 761 subroutine D3dB_Init(nb,nx_in,ny_in,nz_in,map_in) 762 implicit none 763 integer nb 764 integer nx_in,ny_in,nz_in 765 integer map_in 766 logical value, MA_verify_allocator_stuff 767 external MA_verify_allocator_stuff 768 769#include "D3dB.fh" 770 771 772 !**** local variables **** 773 integer MASTER 774 parameter (MASTER=0) 775 integer taskid,np 776 integer Parallel_threadid 777 external Parallel_threadid 778 logical control_single_precision_on 779 external control_single_precision_on 780 781 782 783 784 785 call Parallel2d_np_i(np) 786 call Parallel_taskid(taskid) 787 788 !**** Make sure ngrid is consistent with mapping *** 789 if (map_in.eq.1) then 790 if ((np.gt.nz_in).or.(ny_in.ne.nz_in)) then 791 if (taskid.eq.MASTER) then 792 write(6,*) 'Error: for slab decomposition the', 793 > ' number of processors must ', 794 > ' be in the range ( 1 ...ngrid(3)=', 795 > nz_in,')' 796 write(6,*) ' and ngrid(2) == ngrid(3), ', 797 > ' ngrid(2)=',ny_in, 798 > ' ngrid(3)=',nz_in 799 end if 800 call errquit('D3dB_Init: mapping error',0,0) 801 end if 802 if (mod(nx_in,2).ne.0) then 803 if (taskid.eq.MASTER) then 804 write(6,*) 805 > 'Error: ngrid(1) must be even (ngrid(1) mod 2 == 0)' 806 write(6,*) 'Error: ngrid(1)=',nx_in 807 end if 808 call errquit('D3dB_Init: slab mapping error',0,0) 809 end if 810 end if 811 812 if (map_in.ge.2) then 813 if (np.gt.(ny_in*nz_in)) then 814 if (taskid.eq.MASTER) then 815 write(6,*) 'Error: np > MIN(ngrid(2)*ngrid(3),', 816 > ' (ngrid(1)/2+1)*ngrid(2),', 817 > ' (ngrid(1)/2+1)*ngrid(3))' 818 write(6,*) 'Error: np > ngrid(2)*ngrid(3)' 819 write(6,*) 'Error: for the Hilbert decomposition the', 820 > ' the number of processors must ', 821 > ' be in the range ( 1 ...', 822 > ny_in*nz_in,')' 823 end if 824 call errquit('D3dB_Init: Hilbert mapping error',0,0) 825 end if 826 if (np.gt.((nx_in/2+1)*ny_in)) then 827 if (taskid.eq.MASTER) then 828 write(6,*) 'Error: np > MIN(ngrid(2)*ngrid(3),', 829 > ' (ngrid(1)/2+1)*ngrid(2),', 830 > ' (ngrid(1)/2+1)*ngrid(3))' 831 write(6,*) 'Error: np > (ngrid(1)/2+1)*ngrid(2)' 832 write(6,*) 'Error: for the Hilbert decomposition the', 833 > ' the number of processors must ', 834 > ' be in the range ( 1 ...', 835 > (nx_in/2+1)*ny_in,')' 836 end if 837 call errquit('D3dB_Init: Hilbert mapping error',0,0) 838 end if 839 if (np.gt.((nx_in/2+1)*nz_in)) then 840 if (taskid.eq.MASTER) then 841 write(6,*) 'Error: np > MIN(ngrid(2)*ngrid(3),', 842 > ' (ngrid(1)/2+1)*ngrid(2),', 843 > ' (ngrid(1)/2+1)*ngrid(3))' 844 write(6,*) 'Error: np > (ngrid(1)/2+1)*ngrid(3)' 845 write(6,*) 'Error: for the Hilbert decomposition the', 846 > ' the number of processors must ', 847 > ' be in the range ( 1 ...', 848 > (nx_in/2+1)*nz_in,')' 849 end if 850 call errquit('D3dB_Init: Hilbert mapping error',0,0) 851 end if 852 if (mod(nx_in,2).ne.0) then 853 if (taskid.eq.MASTER) then 854 write(6,*) 855 > 'Error: ngrid(1) must be even (ngrid(1) mod 2 == 0)' 856 write(6,*) 'Error: ngrid(1)=',nx_in 857 end if 858 call errquit('D3dB_Init: Hilbert mapping error',0,0) 859 end if 860 end if 861 862 863* ***** initialize D3dB common block ***** 864 nx(nb) = nx_in 865 ny(nb) = ny_in 866 nz(nb) = nz_in 867 mapping = map_in 868 mapping2d = 1 869 if (mapping.eq.3) then 870 mapping = 2 871 mapping2d = 2 872 end if 873 874 875 876* **** do other initializations **** 877 call Mapping_Init(nb) 878 879 if (mapping.eq.1) call D3dB_c_transpose_jk_init(nb) 880 if (mapping.eq.2) call D3dB_c_transpose_ijk_init(nb) 881 882#ifndef MPI 883 call D3dB_channel_init(nb) 884#endif 885 886 call D3dB_c_timereverse_init(nb) 887 call D3dB_fft_init(nb) 888 if (control_single_precision_on()) call D3dBs_fft_init(nb) 889 890 return 891 end 892 893 894c* *********************************** 895c* * * 896c* * D3dB_SumAll * 897c* * * 898c* *********************************** 899c 900c subroutine D3dB_SumAll(sum) 901cc implicit none 902c real*8 sum 903c 904c 905c#include "tcgmsg.fh" 906c#include "msgtypesf.h" 907c 908c 909c integer MASTER 910c parameter (MASTER=0) 911c integer msglen 912c real*8 sumall,sumt 913c 914c* **** external functions **** 915c integer Parallel2d_comm_i 916c external Parallel2d_comm_i 917c 918cc msglen = 8 919c msglen = 1 920c 921c sumt = sum 922c 923c call GA_PGROUP_DGOP(Parallel2d_comm_i(), 924c > 9+MSGDBL,sumt,1,'+') 925c 926cc call GA_DGOP(9+MSGDBL,sumt,1,'+') 927cc call DGOP(9+MSGDBL,sumt,1,'+') 928c sumall=sumt 929c 930c 931c sum = sumall 932c return 933c end 934 935 936c* *********************************** 937c* * * 938c* * D3dB_ISumAll * 939c* * * 940c* *********************************** 941c 942c subroutine D3dB_ISumAll(sum) 943cc implicit none 944c integer sum 945c 946c 947c#include "tcgmsg.fh" 948c#include "msgtypesf.h" 949c 950c 951c integer MASTER 952c parameter (MASTER=0) 953c integer msglen 954c integer sumall,sumt 955c 956c* **** external functions **** 957c integer Parallel2d_comm_i 958c external Parallel2d_comm_i 959c 960c 961cc msglen = 8 962c msglen = 1 963c 964c sumt = sum 965c 966c call GA_PGROUP_IGOP(Parallel2d_comm_i(), 967c > 9+MSGINT,sumt,1,'+') 968cc call GA_IGOP(9+MSGINT,sumt,1,'+') 969c sumall=sumt 970c 971c 972c sum = sumall 973c return 974c end 975 976 977 978* *********************************** 979* * * 980* * D3dB_(c,r,t)_Zero * 981* * * 982* *********************************** 983 984 subroutine D3dB_c_Zero(nb,A) 985 implicit none 986 integer nb 987 complex*16 A(*) 988 989#include "D3dB.fh" 990 991 992 !call dcopy(n2ft3d_map(nb),0.0d0,0,A,1) 993 call Parallel_shared_vector_zero(.true.,n2ft3d_map(nb),A) 994 return 995 end 996 997 subroutine D3dB_c_nZero(nb,n,A) 998 implicit none 999 integer nb,n 1000 complex*16 A(*) 1001 1002#include "D3dB.fh" 1003 1004 1005 !call dcopy(n*n2ft3d_map(nb),0.0d0,0,A,1) 1006 call Parallel_shared_vector_zero(.true.,n*n2ft3d_map(nb),A) 1007 return 1008 end 1009 1010 1011 subroutine D3dB_r_Zero(nb,A) 1012 implicit none 1013 integer nb 1014 real*8 A(*) 1015 1016#include "D3dB.fh" 1017 1018c call dcopy(n2ft3d_map(nb),0.0d0,0,A,1) 1019 call Parallel_shared_vector_zero(.true.,n2ft3d_map(nb),A) 1020 return 1021 end 1022 1023 1024 subroutine D3dB_r_nZero(nb,n,A) 1025 implicit none 1026 integer nb,n 1027 real*8 A(*) 1028 1029#include "D3dB.fh" 1030 1031 !call dcopy(n*n2ft3d_map(nb),0.0d0,0,A,1) 1032 call Parallel_shared_vector_zero(.true.,n*n2ft3d_map(nb),A) 1033 return 1034 end 1035 1036 1037 1038* *********************************** 1039* * * 1040* * D3dB_(c,r,t)_Copy * 1041* * * 1042* *********************************** 1043 1044 subroutine D3dB_c_Copy(nb,A,B) 1045 implicit none 1046 integer nb 1047 complex*16 A(*) 1048 complex*16 B(*) 1049 1050#include "D3dB.fh" 1051 1052 !call dcopy(2*nfft3d_map(nb),A,1,B,1) 1053 call Parallel_shared_vector_copy(.true.,2*nfft3d_map(nb),A,B) 1054 return 1055 end 1056 1057 subroutine D3dB_c_nCopy(nb,n,A,B) 1058 implicit none 1059 integer nb,n 1060 complex*16 A(*) 1061 complex*16 B(*) 1062 1063#include "D3dB.fh" 1064 1065 !call dcopy(n*2*nfft3d_map(nb),A,1,B,1) 1066 call Parallel_shared_vector_copy(.true.,n*2*nfft3d_map(nb),A,B) 1067 return 1068 end 1069 1070 1071 subroutine D3dB_r_Copy(nb,A,B) 1072 implicit none 1073 integer nb 1074 real*8 A(*) 1075 real*8 B(*) 1076 1077#include "D3dB.fh" 1078 integer i 1079 !call dcopy(n2ft3d_map(nb),A,1,B,1) 1080 call Parallel_shared_vector_copy(.true.,n2ft3d_map(nb),A,B) 1081 1082 return 1083 end 1084 1085 1086 subroutine D3dB_r_nCopy(nb,n,A,B) 1087 implicit none 1088 integer nb,n 1089 real*8 A(*) 1090 real*8 B(*) 1091 1092#include "D3dB.fh" 1093 1094 !call dcopy(n*n2ft3d_map(nb),A,1,B,1) 1095 call Parallel_shared_vector_copy(.true.,n*n2ft3d_map(nb),A,B) 1096 return 1097 end 1098 1099 1100 subroutine D3dB_t_Copy(nb,A,B) 1101 implicit none 1102 integer nb 1103 real*8 A(*) 1104 real*8 B(*) 1105 1106#include "D3dB.fh" 1107 1108 !call dcopy(nfft3d_map(nb),A,1,B,1) 1109 call Parallel_shared_vector_copy(.true.,nfft3d_map(nb),A,B) 1110 return 1111 end 1112 1113 1114 subroutine D3dB_t_nCopy(nb,n,A,B) 1115 implicit none 1116 integer nb,n 1117 real*8 A(*) 1118 real*8 B(*) 1119 1120#include "D3dB.fh" 1121 1122 !call dcopy(n*nfft3d_map(nb),A,1,B,1) 1123 call Parallel_shared_vector_copy(.true.,n*nfft3d_map(nb),A,B) 1124 return 1125 end 1126 1127 1128 subroutine D3dB_tc_Copy(nb,A,B) 1129 implicit none 1130 integer nb 1131 real*8 A(*) 1132 complex*16 B(*) 1133 1134#include "D3dB.fh" 1135 1136 integer i 1137 1138!$OMP DO 1139 do i=1,nfft3d_map(nb) 1140 B(i) = dcmplx(A(i),0.0d0) 1141 end do 1142!$OMP END DO 1143 1144 return 1145 end 1146 1147 subroutine D3dB_ct_Copy(nb,A,B) 1148 implicit none 1149 integer nb 1150 complex*16 A(*) 1151 real*8 B(*) 1152 1153#include "D3dB.fh" 1154 1155 integer i 1156!$OMP DO 1157 do i=1,nfft3d_map(nb) 1158 B(i) = dble(A(i)) 1159 end do 1160!$OMP END DO 1161 1162 return 1163 end 1164 1165 1166 1167 1168* *********************************** 1169* * * 1170* * D3dB_fft_init * 1171* * * 1172* *********************************** 1173 1174 subroutine D3dB_fft_init(nb) 1175 implicit none 1176 integer nb 1177 1178#include "bafdecls.fh" 1179#include "errquit.fh" 1180 1181#include "D3dB.fh" 1182 1183 integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS) 1184 common / D3dB_fft / tmpx,tmpy,tmpz 1185 1186#ifdef FFTW3 1187#include "fftw3.fh" 1188 integer Atest(2),nxh,nxhz,nxhy 1189#endif 1190 1191 logical value 1192 integer i,mthr, Parallel_maxthreads 1193 external Parallel_maxthreads 1194 1195 mthr = Parallel_maxthreads() 1196 1197 1198 !call D3dB_nfft3d(nb,nfft3d) 1199 1200c value = BA_alloc_get(mt_dcpl,(nfft3d(nb)), 1201c > 'fttmpx',tmpx(2,nb),tmpx(1,nb)) 1202c value = value.and. 1203c > BA_alloc_get(mt_dcpl,(nfft3d(nb)), 1204c > 'fttmpy',tmpy(2,nb),tmpy(1,nb)) 1205c value = value.and. 1206c > BA_alloc_get(mt_dcpl,(nfft3d(nb)), 1207c > 'fttmpz',tmpz(2,nb),tmpz(1,nb)) 1208 value = BA_alloc_get(mt_dcpl,mthr*(2*nx(nb)+15), 1209 > 'fttmpx',tmpx(2,nb),tmpx(1,nb)) 1210 value = value.and. 1211 > BA_alloc_get(mt_dcpl,mthr*(2*ny(nb)+15), 1212 > 'fttmpy',tmpy(2,nb),tmpy(1,nb)) 1213 value = value.and. 1214 > BA_alloc_get(mt_dcpl,mthr*(2*nz(nb)+15), 1215 > 'fttmpz',tmpz(2,nb),tmpz(1,nb)) 1216 if (.not. value) 1217 > call errquit('D3dB_fft_init:out of heap memory',0, MA_ERR) 1218 1219 1220#ifdef MLIB 1221 call drc1ft(dcpl_mb(tmpx(1,nb)),nx(nb), 1222 > dcpl_mb(tmpx(1,nb)),-3,ierr) 1223 call z1dfft(dcpl_mb(tmpx(1,nb)),ny(nb), 1224 > dcpl_mb(tmpy(1,nb)),-3,ierr) 1225 call z1dfft(dcpl_mb(tmpx(1,nb)),nz(nb), 1226 > dcpl_mb(tmpz(1,nb)),-3,ierr) 1227 1228#else 1229 do i=1,mthr 1230 !write(*,*) "DEBUG init fft arrays of thread ", i-1 1231 call drffti(nx(nb),dcpl_mb(tmpx(1,nb)+(i-1)*(2*nx(nb)+15))) 1232 call dcffti(ny(nb),dcpl_mb(tmpy(1,nb)+(i-1)*(2*ny(nb)+15))) 1233 call dcffti(nz(nb),dcpl_mb(tmpz(1,nb)+(i-1)*(2*nz(nb)+15))) 1234 end do 1235#endif 1236 1237#ifdef FFTW3 1238 1239c call dfftw_init_threads() 1240c call dfftw_plan_with_nthreads(2) 1241 1242 iforward = FFTW_FORWARD 1243 ibackward = FFTW_BACKWARD 1244c iestimate = FFTW_PATIENT 1245c iestimate = FFTW_MEASURE 1246c iestimate = FFTW_ESTIMATE 1247c iestimate = FFTW_EXHAUSTIVE 1248 call icopy(nplans*NBLOCKS,0,0,plans,1) 1249 if (mapping.eq.1) then 1250 nxh = (nx(nb)/2+1) 1251 nxhz = nxh*nz(nb) 1252 nxhy = nxh*ny(nb) 1253 if (.not.BA_alloc_get(mt_dcpl,nx(nb)*ny(nb)*nq(nb), 1254 > 'Atest',Atest(2),Atest(1))) 1255 > call errquit('D3dB_fft_init:out of heap memory',0,MA_ERR) 1256 1257 call dfftw_plan_many_dft(plans(1,nb),1,nz(nb),nxh, 1258 > dcpl_mb(Atest(1)),nxhz,nxh,1, 1259 > dcpl_mb(Atest(1)),nxhz,nxh,1, 1260 > ibackward,FFTW_EXHAUSTIVE) 1261 call dfftw_plan_many_dft(plans(2,nb),1,ny(nb),nxh, 1262 > dcpl_mb(Atest(1)),nxhy,nxh,1, 1263 > dcpl_mb(Atest(1)),nxhy,nxh,1, 1264 > ibackward,FFTW_EXHAUSTIVE) 1265 call dfftw_plan_many_dft_c2r(plans(3,nb),1,nx(nb), 1266 > ny(nb)*nq(nb), 1267 > dcpl_mb(Atest(1)),nxh *ny(nb)*nq(nb),1,nxh, 1268 > dcpl_mb(Atest(1)),(nx(nb)+2)*ny(nb)*nq(nb),1,nx(nb)+2, 1269 > FFTW_EXHAUSTIVE) 1270 1271 call dfftw_plan_many_dft_r2c(plans(4,nb),1,nx(nb), 1272 > ny(nb)*nq(nb), 1273 > dcpl_mb(Atest(1)),(nx(nb)+2)*ny(nb)*nq(nb),1,nx(nb)+2, 1274 > dcpl_mb(Atest(1)),nxh *ny(nb)*nq(nb),1,nxh, 1275 > FFTW_EXHAUSTIVE) 1276 call dfftw_plan_many_dft(plans(5,nb),1,ny(nb),nxh, 1277 > dcpl_mb(Atest(1)),nxhy,nxh,1, 1278 > dcpl_mb(Atest(1)),nxhy,nxh,1, 1279 > iforward,FFTW_EXHAUSTIVE) 1280 call dfftw_plan_many_dft(plans(6,nb),1,nz(nb),nxh, 1281 > dcpl_mb(Atest(1)),nxhz,nxh,1, 1282 > dcpl_mb(Atest(1)),nxhz,nxh,1, 1283 > iforward,FFTW_EXHAUSTIVE) 1284 1285 call dfftw_plan_many_dft(plans(7,nb),1,nz(nb),1, 1286 > dcpl_mb(Atest(1)),nxhz,nxh,1, 1287 > dcpl_mb(Atest(1)),nxhz,nxh,1, 1288 > ibackward,FFTW_EXHAUSTIVE) 1289 call dfftw_plan_many_dft(plans(8,nb),1,ny(nb),1, 1290 > dcpl_mb(Atest(1)),nxhy,nxh,1, 1291 > dcpl_mb(Atest(1)),nxhy,nxh,1, 1292 > ibackward,FFTW_EXHAUSTIVE) 1293 1294 call dfftw_plan_many_dft(plans(9,nb),1,ny(nb),1, 1295 > dcpl_mb(Atest(1)),nxhy,nxh,1, 1296 > dcpl_mb(Atest(1)),nxhy,nxh,1, 1297 > iforward,FFTW_EXHAUSTIVE) 1298 call dfftw_plan_many_dft(plans(10,nb),1,nz(nb),1, 1299 > dcpl_mb(Atest(1)),nxhz,nxh,1, 1300 > dcpl_mb(Atest(1)),nxhz,nxh,1, 1301 > iforward,FFTW_EXHAUSTIVE) 1302 1303 1304 if (.not.BA_free_heap(Atest(2))) 1305 > call errquit('D3dB_fft_init:freeing heap',0,MA_ERR) 1306 1307 else 1308 1309 nxh = (nx(nb)/2+1) 1310 if (.not.BA_alloc_get(mt_dcpl,nfft3d(nb),'Atest', 1311 > Atest(2),Atest(1))) 1312 > call errquit('D3dB_fft_init:out of heap memory',0,MA_ERR) 1313 1314 call dfftw_plan_many_dft(plans(11,nb),1,nz(nb),nq3(nb), 1315 > dcpl_mb(Atest(1)),nz(nb)*nq3(nb),1,nz(nb), 1316 > dcpl_mb(Atest(1)),nz(nb)*nq3(nb),1,nz(nb), 1317 > ibackward,FFTW_EXHAUSTIVE) 1318 call dfftw_plan_many_dft(plans(12,nb),1,ny(nb),nq2(nb), 1319 > dcpl_mb(Atest(1)),ny(nb)*nq2(nb),1,ny(nb), 1320 > dcpl_mb(Atest(1)),ny(nb)*nq2(nb),1,ny(nb), 1321 > ibackward,FFTW_EXHAUSTIVE) 1322 call dfftw_plan_many_dft_c2r(plans(13,nb),1,nx(nb), 1323 > nq1(nb), 1324 > dcpl_mb(Atest(1)),nfft3d(nb),1,nxh, 1325 > dcpl_mb(Atest(1)),n2ft3d(nb),1,(nx(nb)+2), 1326 > FFTW_EXHAUSTIVE) 1327 call dfftw_plan_many_dft_r2c(plans(14,nb),1,nx(nb), 1328 > nq1(nb), 1329 > dcpl_mb(Atest(1)),n2ft3d(nb),1,nx(nb)+2, 1330 > dcpl_mb(Atest(1)),nfft3d(nb),1,nxh, 1331 > FFTW_EXHAUSTIVE) 1332 1333 call dfftw_plan_many_dft(plans(15,nb),1,ny(nb),nq2(nb), 1334 > dcpl_mb(Atest(1)),ny(nb)*nq2(nb),1,ny(nb), 1335 > dcpl_mb(Atest(1)),ny(nb)*nq2(nb),1,ny(nb), 1336 > iforward,FFTW_EXHAUSTIVE) 1337 call dfftw_plan_many_dft(plans(16,nb),1,nz(nb),nq3(nb), 1338 > dcpl_mb(Atest(1)),nz(nb)*nq3(nb),1,nz(nb), 1339 > dcpl_mb(Atest(1)),nz(nb)*nq3(nb),1,nz(nb), 1340 > iforward,FFTW_EXHAUSTIVE) 1341 1342 call dfftw_plan_many_dft(plans(17,nb),1,nz(nb),1, 1343 > dcpl_mb(Atest(1)),nz(nb),1,1, 1344 > dcpl_mb(Atest(1)),nz(nb),1,1, 1345 > ibackward,FFTW_EXHAUSTIVE) 1346 call dfftw_plan_many_dft(plans(18,nb),1,ny(nb),1, 1347 > dcpl_mb(Atest(1)),ny(nb),1,1, 1348 > dcpl_mb(Atest(1)),ny(nb),1,1, 1349 > ibackward,FFTW_EXHAUSTIVE) 1350 1351 call dfftw_plan_many_dft(plans(19,nb),1,ny(nb),1, 1352 > dcpl_mb(Atest(1)),ny(nb),1,1, 1353 > dcpl_mb(Atest(1)),ny(nb),1,1, 1354 > iforward,FFTW_EXHAUSTIVE) 1355 call dfftw_plan_many_dft(plans(20,nb),1,nz(nb),1, 1356 > dcpl_mb(Atest(1)),nz(nb),1,1, 1357 > dcpl_mb(Atest(1)),nz(nb),1,1, 1358 > iforward,FFTW_EXHAUSTIVE) 1359 1360 if (.not.BA_free_heap(Atest(2))) 1361 > call errquit('D3dB_fft_init:freeing heap',0,MA_ERR) 1362 1363 end if 1364#endif 1365 1366 return 1367 end 1368 1369* *********************************** 1370* * * 1371* * D3dB_fft_end * 1372* * * 1373* *********************************** 1374 1375 subroutine D3dB_fft_end(nb) 1376 implicit none 1377 integer nb 1378 1379#include "bafdecls.fh" 1380#include "errquit.fh" 1381 1382#include "D3dB.fh" 1383 1384 1385 integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS) 1386 common / D3dB_fft / tmpx,tmpy,tmpz 1387#ifdef USE_OPENMP 1388 logical is_par 1389 common / Debug_openmp / is_par 1390#endif 1391 1392 logical value 1393 integer i 1394 1395#ifdef FFTW3 1396 do i=1,nplans 1397 if (plans(i,nb).ne.0) call dfftw_destroy_plan(plans(i,nb)) 1398 end do 1399c call dfftw_cleanup_threads() 1400#endif 1401 1402 value = BA_free_heap(tmpx(2,nb)) 1403 value = value.and.BA_free_heap(tmpy(2,nb)) 1404 value = value.and.BA_free_heap(tmpz(2,nb)) 1405 if (.not.value) 1406 > call errquit( 1407 > 'D3dB_fft_end:error deallocatingof heap memory',0, MA_ERR) 1408 1409 return 1410 end 1411 1412 1413 1414 1415 1416 1417 1418c* *********************************** 1419c* * * 1420c* * D3dB_n_fft_init * 1421c* * * 1422c* *********************************** 1423c 1424c subroutine D3dB_n_fft_init(nb,ne) 1425c implicit none 1426c integer nb,ne 1427c 1428c#include "bafdecls.fh" 1429c#include "errquit.fh" 1430c 1431c integer tmp2(2,NBLOCKS),tmp3(2,NBLOCKS) 1432c common / D3dB_n_fft / tmp2,tmp3 1433c 1434c logical value 1435c integer nfft3d 1436c 1437c call D3dB_nfft3d(nb,nfft3d) 1438c value = BA_alloc_get(mt_dcpl,(ne*nfft3d), 1439c > 'fttmp2_h',tmp2(2,nb),tmp2(1,nb)) 1440c value = value.and. 1441c > BA_alloc_get(mt_dbl,(2*ne*nfft3d), 1442c > 'fttmp3_h',tmp3(2,nb),tmp3(1,nb)) 1443c if (.not. value) 1444c > call errquit('D3dB_n_fft_init:out of heap memory',0, MA_ERR) 1445c 1446c return 1447c end 1448c 1449c* *********************************** 1450c* * * 1451c* * D3dB_n_fft_end * 1452c* * * 1453c* *********************************** 1454c 1455c subroutine D3dB_n_fft_end(nb) 1456c implicit none 1457c integer nb 1458c 1459c#include "bafdecls.fh" 1460c#include "errquit.fh" 1461c 1462c integer tmp2(2,NBLOCKS),tmp3(2,NBLOCKS) 1463c common / D3dB_n_fft / tmp2,tmp3 1464c 1465c logical value 1466c 1467c value = BA_free_heap(tmp2(2,nb)) 1468c value = value.and.BA_free_heap(tmp3(2,nb)) 1469c if (.not.value) 1470c > call errquit( 1471c > 'D3dB_n_fft_end:error deallocatingof heap memory',0,MA_ERR) 1472c 1473c return 1474c end 1475 1476 1477 1478* *********************************** 1479* * * 1480* * D3dB_cr_fft3b * 1481* * * 1482* *********************************** 1483 1484 subroutine D3dB_cr_fft3b(nb,A) 1485 1486***************************************************** 1487* * 1488* This routine performs the operation of * 1489* a three dimensional complex to complex * 1490* inverse fft * 1491* A(nx,ny(nb),nz(nb)) <- FFT3^(-1)[A(kx,ky,kz)] * 1492* * 1493* Entry - * 1494* A: a column distribuded 3d block * 1495* tmp: tempory work space must be at * 1496* least the size of (complex) * 1497* (nfft*nfft + 1) + 10*nfft * 1498* * 1499* Exit - A is transformed and the imaginary * 1500* part of A is set to zero * 1501* uses - D3dB_c_transpose_jk, dcopy * 1502* * 1503***************************************************** 1504 1505 implicit none 1506 integer nb 1507 complex*16 A(*) 1508 1509 1510#include "bafdecls.fh" 1511#include "errquit.fh" 1512 1513#include "D3dB.fh" 1514 1515 1516 integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS) 1517 common / D3dB_fft / tmpx,tmpy,tmpz 1518 1519* *** local variables *** 1520 integer i,j,k,q,indx 1521 integer nxh,nxhy,nxhz,indx0,indx1 1522 1523 1524 !integer tmp1(2),tmp2(2),tmp3(2) 1525 integer tmp2(2),tmp3(2) 1526 logical value 1527 1528 integer tid,nthr,offset 1529 integer Parallel_threadid,Parallel_nthreads 1530 external Parallel_threadid,Parallel_nthreads 1531 1532 1533 tid = Parallel_threadid() 1534 nthr = Parallel_nthreads() 1535 1536 call nwpw_timing_start(1) 1537 1538* ***** allocate temporary space **** 1539 !call D3dB_nfft3d(nb,nfft3d) 1540 value = BA_push_get(mt_dcpl,(nfft3d(nb)),'ffttmp2', 1541 > tmp2(2),tmp2(1)) 1542 value = value.and. 1543 > BA_push_get(mt_dbl,(n2ft3d(nb)),'ffttmp3',tmp3(2),tmp3(1)) 1544 if (.not. value) call errquit('out of stack memory',0, MA_ERR) 1545 1546 nxh = (nx(nb)/2+1) 1547 nxhz = nxh*nz(nb) 1548 nxhy = nxh*ny(nb) 1549 1550 !********************** 1551 !**** slab mapping **** 1552 !********************** 1553 if (mapping.eq.1) then 1554* ******************************************** 1555* *** Do a transpose of A *** 1556* *** A(kx,kz,ky) <- A(kx,ky,kz) *** 1557* ******************************************** 1558c call D3dB_c_transpose_jk(nb,A,dcpl_mb(tmp2(1)),dbl_mb(tmp3(1))) 1559 1560* ************************************************* 1561* *** do fft along kz dimension *** 1562* *** A(kx,nz(nb),ky) <- fft1d^(-1)[A(kx,kz,ky)] *** 1563* ************************************************* 1564#ifdef MLIB 1565 !call z1dfft(dbl_mb(tmp3(1)),nz(nb),dcpl_mb(tmpz(1)),-3,ierr) 1566 do q=1,nq(nb) 1567 do i=1,(nx(nb)/2+1) 1568 do k=1,nz(nb) 1569 indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*nz(nb) 1570 dcpl_mb(tmp2(1)+k-1) = A(indx) 1571 end do 1572 call z1dfft(dcpl_mb(tmp2(1)),nz(nb), 1573 > dcpl_mb(tmpz(1,nb)),-2,ierr) 1574 do k=1,nz(nb) 1575 indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*nz(nb) 1576 A(indx) = dcpl_mb(tmp2(1)+k-1) 1577 end do 1578 end do 1579 end do 1580 !call dscal((nx(nb)+2)*ny(nb)*nq(nb),dble(nz(nb)),A,1) 1581 1582#else 1583 1584#ifdef FFTW3 1585 do q=1,nq(nb) 1586 indx = 1+(q-1)*nxhz 1587 call dfftw_execute_dft(plans(1,nb),A(indx),A(indx)) 1588 end do 1589#else 1590 !call dcffti(nz(nb),dcpl_mb(tmp1(1))) 1591 indx0 = 0 1592 do q=1,nq(nb) 1593 do i=1,nxh 1594 1595 indx = i + indx0 1596 indx1 = indx 1597 do k=1,nz(nb) 1598 dcpl_mb(tmp2(1)+k-1) = A(indx) 1599 indx = indx + nxh 1600 end do 1601 call dcfftb(nz(nb),dcpl_mb(tmp2(1)),dcpl_mb(tmpz(1,nb))) 1602 do k=1,nz(nb) 1603 A(indx1) = dcpl_mb(tmp2(1)+k-1) 1604 indx1 = indx1 + nxh 1605 end do 1606 1607 end do 1608 indx0 = indx0 + nxhz 1609 end do 1610#endif 1611#endif 1612 1613* ******************************************** 1614* *** Do a transpose of A *** 1615* *** A(kx,ky,nz(nb)) <- A(kx,nz(nb),ky) *** 1616* ******************************************** 1617 call D3dB_c_transpose_jk(nb,A,dcpl_mb(tmp2(1)),dbl_mb(tmp3(1))) 1618 1619* ************************************************* 1620* *** do fft along ky dimension *** 1621* *** A(kx,ny(nb),nz(nb)) <- fft1d^(-1)[A(kx,ky,nz(nb))] *** 1622* ************************************************* 1623#ifdef MLIB 1624 !call z1dfft(dbl_mb(tmp3(1)),ny(nb),dcpl_mb(tmp1(1)),-3,ierr) 1625 do q=1,nq(nb) 1626 do i=1,(nx(nb)/2+1) 1627 do j=1,ny(nb) 1628 indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 1629 dcpl_mb(tmp2(1)+j-1) = A(indx) 1630 end do 1631 call z1dfft(dcpl_mb(tmp2(1)),ny(nb), 1632 > dcpl_mb(tmpy(1,nb)),-2,ierr) 1633 do j=1,ny(nb) 1634 indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 1635 A(indx) = dcpl_mb(tmp2(1)+j-1) 1636 end do 1637 end do 1638 end do 1639 !call dscal((nx(nb)+2)*ny(nb)*nq(nb),dble(ny(nb)),A,1) 1640#else 1641 1642#ifdef FFTW3 1643 do q=1,nq(nb) 1644 indx = 1+(q-1)*nxhy 1645 call dfftw_execute_dft(plans(2,nb),A(indx),A(indx)) 1646 end do 1647#else 1648 !call dcffti(ny(nb),dcpl_mb(tmp1(1))) 1649 indx0 = 0 1650 do q=1,nq(nb) 1651 do i=1,(nx(nb)/2+1) 1652 1653 indx = i + indx0 1654 indx1 = indx 1655 do j=1,ny(nb) 1656 dcpl_mb(tmp2(1)+j-1) = A(indx) 1657 indx = indx + nxh 1658 end do 1659 call dcfftb(ny(nb),dcpl_mb(tmp2(1)),dcpl_mb(tmpy(1,nb))) 1660 do j=1,ny(nb) 1661 A(indx1) = dcpl_mb(tmp2(1)+j-1) 1662 indx1 = indx1 + nxh 1663 end do 1664 1665 end do 1666 indx0 = indx0 + nxhy 1667 end do 1668#endif 1669#endif 1670 1671* ************************************************* 1672* *** do fft along kx dimension *** 1673* *** A(nx(nb),ny(nb),nz(nb)) <- fft1d^(-1)[A(kx,ny(nb),nz(nb))] *** 1674* ************************************************* 1675#ifdef MLIB 1676 !call drc1ft (dbl_mb(tmp3(1)),nx(nb),dcpl_mb(tmp1(1)),-3,ierr) 1677 do q=1,nq(nb) 1678 do j=1,ny(nb) 1679 indx = 1 + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 1680 call drc1ft(A(indx),nx(nb),dcpl_mb(tmpx(1,nb)),-2,ierr) 1681 end do 1682 end do 1683c call drcfts(A,nx(nb),1,ny(nb)*nq(nb), 1684c > nx(nb)+2,-2,ierr) 1685c call dscal((nx(nb)+2)*ny(nb)*nq(nb),dble(nx(nb)),A,1) 1686 1687#else 1688 1689#ifdef FFTW3 1690 call dfftw_execute_dft_c2r(plans(3,nb),A,A) 1691 1692#else 1693 !call drffti(nx(nb),dcpl_mb(tmp1(1))) 1694 1695c do q=1,nq(nb) 1696c do j=1,ny(nb) 1697c indx = 1 + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 1698c call dcopy((nx(nb)+2),A(indx),1,dbl_mb(tmp3(1)),1) 1699c do i=2,nx(nb) 1700c dbl_mb(tmp3(1)+i-1) = dbl_mb(tmp3(1)+i) 1701c end do 1702c call drfftb(nx(nb),dbl_mb(tmp3(1)),dcpl_mb(tmp1(1))) 1703c dbl_mb(tmp3(1)+nx(nb)) = 0.0d0 1704c dbl_mb(tmp3(1)+nx(nb)+1) = 0.0d0 1705c call dcopy((nx(nb)+2),dbl_mb(tmp3(1)),1,A(indx),1) 1706c end do 1707c end do 1708 1709 call cshift1_fftb(nx(nb),ny(nb),nq(nb),1,A) 1710 indx = 1 1711 do q=1,nq(nb) 1712 do j=1,ny(nb) 1713 !indx = 1 + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 1714 call drfftb(nx(nb),A(indx),dcpl_mb(tmpx(1,nb))) 1715 indx = indx + nxh 1716 end do 1717 end do 1718 call zeroend_fftb(nx(nb),ny(nb),nq(nb),1,A) 1719#endif 1720 1721#endif 1722 1723 1724 !************************* 1725 !**** hilbert mapping **** 1726 !************************* 1727 else 1728 1729 1730* ************************************************* 1731* *** do fft along kz dimension *** 1732* *** A(nz(nb),kx,ky) <- fft1d^(-1)[A(kz,kx,ky)] *** 1733* ************************************************* 1734#ifdef MLIB 1735 indx = 1 1736 do q=1,nq3(nb) 1737 !indx = 1 + (q-1)*nz(nb) 1738 call z1dfft(A(indx),nz(nb),dcpl_mb(tmpz(1,nb)),-2,ierr) 1739 indx = indx + nz(nb) 1740 end do 1741#else 1742 1743#ifdef FFTW3 1744 call dfftw_execute_dft(plans(11,nb),A,A) 1745 1746#else 1747 1748 offset=tid*(2*nz(nb)+15) 1749 do i=tid+1,nq3(nb),nthr 1750 call dcfftb(nz(nb),A(1+(i-1)*nz(nb)),dcpl_mb(tmpz(1,nb)+offset)) 1751 end do 1752!$OMP BARRIER 1753 1754#endif 1755#endif 1756 1757 call D3dB_c_transpose_ijk(nb,3,A,dcpl_mb(tmp2(1)),dbl_mb(tmp3(1))) 1758 1759* ************************************************* 1760* *** do fft along ky dimension *** 1761* *** A(ny(nb),nz(nb),kx) <- fft1d^(-1)[A(ky,nz(nb),kx)] *** 1762* ************************************************* 1763#ifdef MLIB 1764 indx = 1 1765 do q=1,nq2(nb) 1766 !indx = 1 + (q-1)*ny(nb) 1767 call z1dfft(A(indx),ny(nb),dcpl_mb(tmpy(1,nb)),-2,ierr) 1768 indx = indx + ny(nb) 1769 end do 1770#else 1771 1772#ifdef FFTW3 1773 call dfftw_execute_dft(plans(12,nb),A,A) 1774 1775#else 1776 offset=tid*(2*ny(nb)+15) 1777 do i=tid+1,nq2(nb),nthr 1778 call dcfftb(ny(nb),A(1+(i-1)*ny(nb)),dcpl_mb(tmpy(1,nb)+offset)) 1779 end do 1780!$OMP BARRIER 1781#endif 1782#endif 1783 1784 call D3dB_c_transpose_ijk(nb,4,A,dcpl_mb(tmp2(1)),dbl_mb(tmp3(1))) 1785 1786* ************************************************* 1787* *** do fft along kx dimension *** 1788* *** A(nx(nb),ny(nb),nz(nb)) <- fft1d^(-1)[A(kx,ny(nb),nz(nb))] *** 1789* ************************************************* 1790#ifdef MLIB 1791 indx = 1 1792 do q=1,nq1(nb) 1793 !indx = 1 + (q-1)*(nx(nb)/2+1) 1794 call drc1ft(A(indx),nx(nb),dcpl_mb(tmpx(1,nb)),-2,ierr) 1795 indx = indx + nxh 1796 end do 1797#else 1798 1799#ifdef FFTW3 1800 call dfftw_execute_dft_c2r(plans(13,nb),A,A) 1801 1802#else 1803 offset=tid*(2*nx(nb)+15) 1804 call cshift1_fftb(nx(nb),nq1(nb),1,1,A) 1805 do i=tid+1,nq1(nb),nthr 1806 call drfftb(nx(nb),A(1+(i-1)*nxh),dcpl_mb(tmpx(1,nb)+offset)) 1807 end do 1808!$OMP BARRIER 1809 1810 call zeroend_fftb(nx(nb),nq1(nb),1,1,A) 1811 1812#endif 1813#endif 1814 1815 end if 1816 1817 1818* **** deallocate temporary space **** 1819 value = BA_pop_stack(tmp3(2)) 1820 value = value.and.BA_pop_stack(tmp2(2)) 1821 !value = BA_pop_stack(tmp1(2)) 1822 if (.not. value) call errquit('popping stack memory',0,MA_ERR) 1823 1824 call nwpw_timing_end(1) 1825 1826 return 1827 end 1828 1829 subroutine D3dB_fftbx_sub(n,nx,nxh,tmpx,A) 1830 implicit none 1831 integer n,nx,nxh 1832 real*8 tmpx(2*nx+15) 1833 complex*16 A(nxh,n) 1834 integer i 1835 1836 1837 1838 do i=1,n 1839 call drfftb(nx,A(1,i),tmpx) 1840 end do 1841 1842 1843 return 1844 end 1845 1846 subroutine D3dB_fftby_sub2(n,ny,tmpy,A) 1847 implicit none 1848 integer n,ny 1849 real*8 tmpy(4*ny+15) 1850 complex*16 A(ny,n) 1851 integer i 1852 1853 1854 1855 do i=1,n 1856 call dcfftb(ny,A(1,i),tmpy) 1857 end do 1858 1859 1860 return 1861 end 1862 1863 subroutine D3dB_fftbz_sub2(n,nz,tmpz,A) 1864 implicit none 1865 integer n,nz 1866 real*8 tmpz(4*nz+15) 1867 complex*16 A(nz,n) 1868 integer i 1869 1870 1871 1872 do i=1,n 1873 call dcfftb(nz,A(1,i),tmpz) 1874 end do 1875 1876 1877 return 1878 end 1879 1880 subroutine cshift1_fftb(nx,ny,nq,ne,A) 1881 implicit none 1882 integer nx,ny,nq,ne 1883 real*8 A(*) 1884 1885 integer i,j,indx 1886 1887!$OMP DO 1888 do j=1,(ny*nq*ne) 1889 indx = 1+(j-1)*(nx+2) 1890c indx = 1 + (j-1)*(nx+2) + (q-1)*(nx+2)*ny 1891c > + (n-1)*(nx+2)*ny*nq 1892 do i=2,nx 1893 A(indx+i-1) = A(indx+i) 1894 end do 1895 end do 1896!$OMP END DO 1897 1898c end do 1899c end do 1900 return 1901 end 1902 1903 1904 subroutine cshift1_fftb1(nx,A) 1905 implicit none 1906 integer nx 1907 real*8 A(*) 1908 integer i 1909 do i=2,nx 1910 A(i) = A(i+1) 1911 end do 1912 return 1913 end 1914 1915 1916 subroutine zeroend_fftb1(nx,A) 1917 implicit none 1918 integer nx 1919 real*8 A(*) 1920 integer i 1921 A(nx+1) = 0.0d0 1922 A(nx+2) = 0.0d0 1923 return 1924 end 1925 1926 1927 subroutine zeroend_fftb(nx,ny,nq,ne,A) 1928 implicit none 1929 integer nx,ny,nq,ne 1930 real*8 A(*) 1931 1932 integer i,indx 1933 1934!$OMP DO 1935 do i=1,(ny*nq*ne) 1936 indx = nx+1+(i-1)*(nx+2) 1937 A(indx) = 0.0d0 1938 A(indx+1) = 0.0d0 1939 end do 1940!$OMP END DO 1941 1942 return 1943 end 1944 1945c* *********************************** 1946c* * * 1947c* * D3dB_ncr_fft3b * 1948c* * * 1949c* *********************************** 1950c 1951c subroutine D3dB_ncr_fft3b(nb,ne,A) 1952c 1953c***************************************************** 1954c* * 1955c* This routine performs the operation of * 1956c* a three dimensional complex to complex * 1957c* inverse fft * 1958c* A(nx,ny(nb),nz(nb),n) <- FFT3^(-1)[A(kx,ky,kz,n)] * 1959c* * 1960c* Entry - * 1961c* A: a column distribuded 3d block * 1962c* tmp: tempory work space must be at * 1963c* least the size of (complex) * 1964c* (nfft*nfft + 1) + 10*nfft * 1965c* * 1966* Exit - A is transformed and the imaginary * 1967cc* part of A is set to zero * 1968c* uses - D3dB_c_transpose_jk, dcopy * 1969c* * 1970c***************************************************** 1971c 1972c implicit none 1973c integer nb,ne 1974c complex*16 A(*) 1975c 1976c#include "bafdecls.fh" 1977c#include "errquit.fh" 1978c 1979c#include "D3dB.fh" 1980c 1981c 1982c integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS) 1983c common / D3dB_fft / tmpx,tmpy,tmpz 1984c 1985c integer tmp2(2,NBLOCKS),tmp3(2,NBLOCKS) 1986c common / D3dB_n_fft / tmp2,tmp3 1987c 1988c* *** local variables *** 1989c integer i,j,k,q,n,indx,ierr 1990c 1991cc complex*16 tmp1(*) 1992cc complex*16 tmp2(*) 1993cc real*8 tmp3(*) 1994c !integer tmp1(2),tmp2(2),tmp3(2) 1995c logical value 1996c 1997c 1998c call nwpw_timing_start(1) 1999c 2000c* ***** allocate temporary space **** 2001c !call D3dB_nfft3d(nb,nfft3d) 2002c 2003c 2004c* ******************************************** 2005c* *** Do a transpose of A *** 2006c* *** A(kx,kz,ky) <- A(kx,ky,kz) *** 2007c* ******************************************** 2008cc call D3dB_c_transpose_jk(nb,A,dcpl_mb(tmp2(1)),dbl_mb(tmp3(1,nb))) 2009c 2010c* ************************************************* 2011c* *** do fft along kz dimension *** 2012c* *** A(kx,nz(nb),ky) <- fft1d^(-1)[A(kx,kz,ky)] *** 2013c* ************************************************* 2014c#ifdef MLIB 2015c do n=1,ne 2016c do q=1,nq(nb) 2017c do i=1,(nx(nb)/2+1) 2018c do k=1,nz(nb) 2019c indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*nz(nb) 2020c > + (n-1)*nfft3d(nb) 2021c dcpl_mb(tmp2(1,nb)+k-1) = A(indx) 2022c end do 2023c call z1dfft(dcpl_mb(tmp2(1,nb)),nz(nb), 2024c > dcpl_mb(tmpz(1,nb)),-2,ierr) 2025c do k=1,nz(nb) 2026c indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*nz(nb) 2027c > + (n-1)*nfft3d(nb) 2028c A(indx) = dcpl_mb(tmp2(1,nb)+k-1) 2029c end do 2030c end do 2031c end do 2032c end do 2033c 2034c#else 2035c do n=1,ne 2036c do q=1,nq(nb) 2037c do i=1,(nx(nb)/2+1) 2038c do k=1,nz(nb) 2039c indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*nz(nb) 2040c > + (n-1)*nfft3d(nb) 2041c dcpl_mb(tmp2(1,nb)+k-1) = A(indx) 2042c end do 2043c call dcfftb(nz(nb),dcpl_mb(tmp2(1,nb)),dcpl_mb(tmpz(1,nb))) 2044c do k=1,nz(nb) 2045c indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*nz(nb) 2046c > + (n-1)*nfft3d(nb) 2047c A(indx) = dcpl_mb(tmp2(1,nb)+k-1) 2048c end do 2049c end do 2050c end do 2051c end do 2052c#endif 2053c 2054c* ******************************************** 2055c* *** Do a transpose of A *** 2056c* *** A(kx,ky,nz(nb)) <- A(kx,nz(nb),ky) *** 2057c* ******************************************** 2058c do n=1,ne 2059c indx = 1 + (n-1)*nfft3d(nb) 2060c call D3dB_c_transpose_jk(nb,A(indx), 2061c > dcpl_mb(tmp2(1,nb)), 2062c > dbl_mb(tmp3(1,nb))) 2063c end do 2064cc call D3dB_nc_transpose_jk(nb,ne,A, 2065cc > dcpl_mb(tmp2(1,nb)), 2066cc > dbl_mb(tmp3(1,nb))) 2067c 2068c* ************************************************* 2069c* *** do fft along ky dimension *** 2070c* *** A(kx,ny(nb),nz(nb)) <- fft1d^(-1)[A(kx,ky,nz(nb))] *** 2071c* ************************************************* 2072c#ifdef MLIB 2073c do n=1,ne 2074c do q=1,nq(nb) 2075c do i=1,(nx(nb)/2+1) 2076c do j=1,ny(nb) 2077c indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2078c > + (n-1)*nfft3d(nb) 2079c dcpl_mb(tmp2(1,nb)+j-1) = A(indx) 2080c end do 2081c call z1dfft(dcpl_mb(tmp2(1,nb)),ny(nb), 2082c > dcpl_mb(tmpy(1,nb)),-2,ierr) 2083c do j=1,ny(nb) 2084c indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2085c > + (n-1)*nfft3d(nb) 2086c A(indx) = dcpl_mb(tmp2(1,nb)+j-1) 2087c end do 2088c end do 2089c end do 2090c end do 2091c !call dscal((nx(nb)+2)*ny(nb)*nq(nb),dble(ny(nb)),A,1) 2092c#else 2093c do n=1,ne 2094c do q=1,nq(nb) 2095c do i=1,(nx(nb)/2+1) 2096c do j=1,ny(nb) 2097c indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2098c > + (n-1)*nfft3d(nb) 2099c dcpl_mb(tmp2(1,nb)+j-1) = A(indx) 2100c end do 2101c call dcfftb(ny(nb),dcpl_mb(tmp2(1,nb)),dcpl_mb(tmpy(1,nb))) 2102c do j=1,ny(nb) 2103c indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2104c > + (n-1)*nfft3d(nb) 2105c A(indx) = dcpl_mb(tmp2(1,nb)+j-1) 2106c end do 2107c end do 2108c end do 2109c end do 2110c#endif 2111c 2112c* ************************************************* 2113c* *** do fft along kx dimension *** 2114c* *** A(nx(nb),ny(nb),nz(nb)) <- fft1d^(-1)[A(kx,ny(nb),nz(nb))] *** 2115c* ************************************************* 2116c#ifdef MLIB 2117c do n=1,ne 2118c do q=1,nq(nb) 2119c do j=1,ny(nb) 2120c indx = 1 + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2121c > + (n-1)*nfft3d(nb) 2122c call drc1ft(A(indx),nx(nb),dcpl_mb(tmpx(1,nb)),-2,ierr) 2123c end do 2124c end do 2125c end do 2126c 2127c#else 2128c 2129c call cshift1_fftb(nx(nb),ny(nb),nq(nb),ne,A) 2130c do n=1,ne 2131c do q=1,nq(nb) 2132c do j=1,ny(nb) 2133c indx = 1 + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2134c > + (n-1)*nfft3d(nb) 2135c call drfftb(nx(nb),A(indx),dcpl_mb(tmpx(1,nb))) 2136c end do 2137c end do 2138c end do 2139c call zeroend_fftb(nx(nb),ny(nb),nq(nb),ne,A) 2140c#endif 2141c 2142c call nwpw_timing_end(1) 2143c return 2144c end 2145 2146 2147 2148 2149* *********************************** 2150* * * 2151* * D3dB_rc_fft3f * 2152* * * 2153* *********************************** 2154 2155 subroutine D3dB_rc_fft3f(nb,A) 2156 2157***************************************************** 2158* * 2159* This routine performs the operation of * 2160* a three dimensional complex to complex fft * 2161* A(kx,ky,kz) <- FFT3[A(nx(nb),ny(nb),nz(nb))] * 2162* * 2163* Entry - * 2164* A: a column distribuded 3d block * 2165* tmp: tempory work space must be at * 2166* least the size of (complex) * 2167* (nfft*nfft + 1) + 10*nfft * 2168* * 2169* Exit - A is transformed * 2170* * 2171* uses - transpose1 subroutine * 2172* * 2173***************************************************** 2174 2175 implicit none 2176 integer nb 2177 complex*16 A(*) 2178 2179#include "bafdecls.fh" 2180#include "errquit.fh" 2181 2182#include "D3dB.fh" 2183 2184 integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS) 2185 common / D3dB_fft / tmpx,tmpy,tmpz 2186 2187 2188* *** local variables *** 2189 integer i,j,k,q,indx,indx1 2190 integer nxh,nxhy,nxhz 2191 2192 !integer tmp1(2),tmp2(2),tmp3(2) 2193 integer tmp2(2),tmp3(2) 2194 logical value 2195 2196 integer tid,nthr,offset,nnn 2197 integer Parallel_threadid,Parallel_nthreads 2198 external Parallel_threadid,Parallel_nthreads 2199 2200 tid = Parallel_threadid() 2201 nthr = Parallel_nthreads() 2202 2203 call nwpw_timing_start(1) 2204 2205 2206* ***** allocate temporary space **** 2207 !call D3dB_nfft3d(nb,nfft3d) 2208 nnn = nfft3d(nb) 2209 if ((nthr*ny(nb)).gt.nnn) nnn = nthr*ny(nb) 2210 if ((nthr*nz(nb)).gt.nnn) nnn = nthr*nz(nb) 2211 value = BA_push_get(mt_dcpl,nnn,'tmp2',tmp2(2),tmp2(1)) 2212c value = BA_push_get(mt_dcpl,(nfft3d(nb)),'tmp2',tmp2(2),tmp2(1)) 2213 value = value.and. 2214 > BA_push_get(mt_dbl,(n2ft3d(nb)),'tmp3',tmp3(2),tmp3(1)) 2215 if (.not. value) call errquit('out of stack memory',0, MA_ERR) 2216 2217 nxh = (nx(nb)/2+1) 2218 nxhz = nxh*nz(nb) 2219 nxhy = nxh*ny(nb) 2220 2221 !********************** 2222 !**** slab mapping **** 2223 !********************** 2224 if (mapping.eq.1) then 2225* ******************************************** 2226* *** do fft along nx(nb) dimension *** 2227* *** A(kx,ny(nb),nz(nb)) <- fft1d[A(nx(nb),ny(nb),nz(nb))] *** 2228* ******************************************** 2229#ifdef MLIB 2230c do q=1,nq(nb) 2231c do j=1,ny(nb) 2232c indx = 1 + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2233c call drc1ft(A(indx),nx(nb),dcpl_mb(tmpx(1,nb)),1,ierr) 2234c end do 2235c end do 2236 call drcfts(A,nx(nb),1,ny(nb)*nq(nb), 2237 > nx(nb)+2,1,ierr) 2238 2239#else 2240 2241#ifdef FFTW3 2242 call dfftw_execute_dft_r2c(plans(4,nb),A,A) 2243 2244#else 2245 offset=tid*(2*nx(nb)+15) 2246 do j=tid+1,ny(nb)*nq(nb),nthr 2247 call drfftf(nx(nb),A((j-1)*nxh+1),dcpl_mb(tmpx(1,nb)+offset)) 2248 end do 2249 call cshift_fftf(nx(nb),ny(nb),nq(nb),1,A) 2250#endif 2251#endif 2252 2253 2254* ******************************************** 2255* *** do fft along ny(nb) dimension *** 2256* *** A(kx,ky,nz(nb)) <- fft1d[A(kx,ny(nb),nz(nb))] *** 2257* ******************************************** 2258 2259#ifdef MLIB 2260 !call z1dfft(dbl_mb(tmp3(1)),ny(nb),dcpl_mb(tmp1(1)),-3,ierr) 2261 do q=1,nq(nb) 2262 do i=1,(nx(nb)/2+1) 2263 do j=1,ny(nb) 2264 indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2265 dcpl_mb(tmp2(1)+j-1) = A(indx) 2266 end do 2267 !indx = i + (q-1)*(nx(nb)/2+1)*ny(nb) 2268 !call zcopy(ny(nb),A(indx),(nx(nb)/2+1),dcpl_mb(tmp2(1)),1) 2269 call z1dfft(dcpl_mb(tmp2(1)),ny(nb),dcpl_mb(tmpy(1,nb)),1,ierr) 2270 do j=1,ny(nb) 2271 indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2272 A(indx) = dcpl_mb(tmp2(1)+j-1) 2273 end do 2274 !call zcopy(ny(nb),dcpl_mb(tmp2(1)),1,A(indx),(nx(nb)/2+1)) 2275 end do 2276 end do 2277ccc *** this should be faster but isn't *** 2278c do i=1,(nx(nb)/2+1) 2279c !indx = 1 + (q-1)*(nx(nb)/2+1)*ny(nb) 2280c call zffts(A(i),ny(nb),(nx(nb)/2+1),nq(nb), 2281c > (nx(nb)/2+1)*nq(nb),1,ierr) 2282c end do 2283#else 2284 2285#ifdef FFTW3 2286 do q=1,nq(nb) 2287 indx = 1+(q-1)*nxhy 2288 call dfftw_execute_dft(plans(5,nb),A(indx),A(indx)) 2289 end do 2290 2291#else 2292 offset=tid*(2*ny(nb)+15) 2293 do i=tid+1,nxh,nthr 2294 indx = i 2295 indx1= i 2296 do q=1,nq(nb) 2297 do j=1,ny(nb) 2298 !indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2299 dcpl_mb(tmp2(1)+j-1+tid*ny(nb)) = A(indx) 2300 indx = indx + nxh 2301 end do 2302 2303 call dcfftf(ny(nb),dcpl_mb(tmp2(1)+tid*ny(nb)), 2304 > dcpl_mb(tmpy(1,nb)+offset)) 2305 2306 do j=1,ny(nb) 2307 !indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2308 A(indx1) = dcpl_mb(tmp2(1)+j-1+tid*ny(nb)) 2309 indx1 = indx1 + nxh 2310 end do 2311 end do 2312 end do 2313 2314#endif 2315#endif 2316 2317* ******************************************** 2318* *** Do a transpose of A *** 2319* *** A(ky,nz(nb),ky) <- A(kx,ky,nz(nb)) *** 2320* ******************************************** 2321 call D3dB_c_transpose_jk(nb,A,dcpl_mb(tmp2(1)),dbl_mb(tmp3(1))) 2322 2323 2324* ******************************************** 2325* *** do fft along nz(nb) dimension *** 2326* *** A(kx,kz,ky) <- fft1d[A(kx,nz(nb),ky)] *** 2327* ******************************************** 2328#ifdef MLIB 2329 !call z1dfft(dbl_mb(tmp3(1)),nz(nb),dcpl_mb(tmp1(1)),-3,ierr) 2330 do q=1,nq(nb) 2331 do i=1,(nx(nb)/2+1) 2332 do k=1,nz(nb) 2333 indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2334 dcpl_mb(tmp2(1)+k-1) = A(indx) 2335 end do 2336 call z1dfft(dcpl_mb(tmp2(1)),nz(nb),dcpl_mb(tmpz(1,nb)),1,ierr) 2337 do k=1,nz(nb) 2338 indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2339 A(indx) = dcpl_mb(tmp2(1)+k-1) 2340 end do 2341 end do 2342 end do 2343#else 2344 2345#ifdef FFTW3 2346 do q=1,nq(nb) 2347 indx = 1+(q-1)*nxhz 2348 call dfftw_execute_dft(plans(6,nb),A(indx),A(indx)) 2349 end do 2350 2351#else 2352 offset=tid*(2*nz(nb)+15) 2353 do i=tid+1,nxh,nthr 2354 indx = i 2355 indx1 = i 2356 do q=1,nq(nb) 2357 2358 do k=1,nz(nb) 2359 !indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2360 dcpl_mb(tmp2(1)+k-1+tid*nz(nb)) = A(indx) 2361 indx = indx + nxh 2362 end do 2363 call dcfftf(nz(nb),dcpl_mb(tmp2(1)+tid*nz(nb)), 2364 > dcpl_mb(tmpz(1,nb)+offset)) 2365 do k=1,nz(nb) 2366 !indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2367 A(indx1) = dcpl_mb(tmp2(1)+k-1+tid*nz(nb)) 2368 indx1 = indx1 + nxh 2369 end do 2370 2371 end do 2372 end do 2373 2374#endif 2375#endif 2376 2377* ******************************************** 2378* *** Do a transpose of A *** 2379* *** A(kx,ky,kz) <- A(kx,kz,ky) *** 2380* ******************************************** 2381c call D3dB_c_transpose_jk(nb,A,dcpl_mb(tmp2(1)),dbl_mb(tmp3(1))) 2382 2383 2384 2385 2386 2387 !************************* 2388 !**** hilbert mapping **** 2389 !************************* 2390 else 2391 2392* ******************************************** 2393* *** do fft along nx(nb) dimension *** 2394* *** A(kx,ny(nb),nz(nb)) <- fft1d[A(nx(nb),ny(nb),nz(nb))] *** 2395* ******************************************** 2396#ifdef MLIB 2397 call drcfts(A,nx(nb),1,nq1(nb), 2398 > nx(nb)+2,1,ierr) 2399#else 2400 2401#ifdef FFTW3 2402 call dfftw_execute_dft_r2c(plans(14,nb),A,A) 2403#else 2404 2405 offset=tid*(2*nx(nb)+15) 2406 do i=tid+1,nq1(nb),nthr 2407 call drfftf(nx(nb),A(1+(i-1)*nxh),dcpl_mb(tmpx(1,nb)+offset)) 2408 end do 2409 call cshift_fftf(nx(nb),nq1(nb),1,1,A) 2410 2411 2412#endif 2413#endif 2414 2415 call D3dB_c_transpose_ijk(nb,1,A,dcpl_mb(tmp2(1)),dbl_mb(tmp3(1))) 2416 2417* ******************************************** 2418* *** do fft along ny(nb) dimension *** 2419* *** A(ky,nz(nb),kx) <- fft1d[A(ny(nb),nz(nb),kx)] *** 2420* ******************************************** 2421#ifdef MLIB 2422 indx = 1 2423 do q=1,nq2(nb) 2424 !indx = 1 + (q-1)*ny(nb) 2425 call z1dfft(A(indx),ny(nb),dcpl_mb(tmpy(1,nb)),1,ierr) 2426 indx = indx + ny(nb) 2427 end do 2428#else 2429 2430#ifdef FFTW3 2431 call dfftw_execute_dft(plans(15,nb),A,A) 2432 2433#else 2434 offset=tid*(2*ny(nb)+15) 2435 do i=tid+1,nq2(nb),nthr 2436 call dcfftf(ny(nb),A(1+(i-1)*ny(nb)),dcpl_mb(tmpy(1,nb)+offset)) 2437 end do 2438!$OMP BARRIER 2439#endif 2440#endif 2441 2442 call D3dB_c_transpose_ijk(nb,2,A,dcpl_mb(tmp2(1)),dbl_mb(tmp3(1))) 2443 2444* ******************************************** 2445* *** do fft along nz(nb) dimension *** 2446* *** A(kz,kx,ky) <- fft1d[A(nz(nb),kx,ky)] *** 2447* ******************************************** 2448#ifdef MLIB 2449 indx = 1 2450 do q=1,nq3(nb) 2451 !indx = 1 + (q-1)*nz(nb) 2452 call z1dfft(A(indx),nz(nb),dcpl_mb(tmpz(1,nb)),1,ierr) 2453 indx = indx + nz(nb) 2454 end do 2455#else 2456 2457#ifdef FFTW3 2458 call dfftw_execute_dft(plans(16,nb),A,A) 2459#else 2460 offset=tid*(2*nz(nb)+15) 2461 do i=tid+1,nq3(nb),nthr 2462 call dcfftf(nz(nb),A(1+(i-1)*nz(nb)),dcpl_mb(tmpz(1,nb)+offset)) 2463 end do 2464!$OMP BARRIER 2465 2466#endif 2467#endif 2468 2469 end if 2470 2471* **** deallocate temporary space **** 2472 value = BA_pop_stack(tmp3(2)) 2473 value = BA_pop_stack(tmp2(2)) 2474 !value = BA_pop_stack(tmp1(2)) 2475 2476 call nwpw_timing_end(1) 2477 2478 return 2479 end 2480 2481 subroutine D3dB_fftfx_sub(n,nx,nxh,tmpx,A) 2482 implicit none 2483 integer n,nx,nxh 2484 real*8 tmpx(2*nx+15) 2485 complex*16 A(nxh,n) 2486 integer i 2487 2488 2489 2490 do i=1,n 2491 call drfftf(nx,A(1,i),tmpx) 2492 end do 2493 2494 2495 return 2496 end 2497 2498 subroutine D3dB_fftfy_sub2(n,ny,tmpy,A) 2499 implicit none 2500 integer n,ny 2501 real*8 tmpy(4*ny+15) 2502 complex*16 A(ny,n) 2503 integer i,indx 2504 2505 2506 2507 do i=1,n 2508 call dcfftf(ny,A(1,i),tmpy) 2509 end do 2510 2511 2512 return 2513 end 2514 2515 subroutine D3dB_fftfz_sub2(n,nz,tmpz,A) 2516 implicit none 2517 integer n,nz 2518 real*8 tmpz(4*nz+15) 2519 complex*16 A(nz,n) 2520 integer i 2521 2522 2523 2524 do i=1,n 2525 call dcfftf(nz,A(1,i),tmpz) 2526 end do 2527 2528 2529 return 2530 end 2531 2532 2533 2534 2535 2536 subroutine cshift_fftf(nx,ny,nq,ne,A) 2537 implicit none 2538 integer nx,ny,nq,ne 2539 real*8 A(*) 2540 2541 integer i,j,indx 2542 2543!$OMP BARRIER 2544!$OMP DO 2545 do j=1,(ny*nq*ne) 2546 indx = 1+(j-1)*(nx+2) 2547c indx = 1 + (j-1)*(nx+2) + (q-1)*(nx+2)*ny 2548c > + (n-1)*(nx+2)*ny*nq 2549 2550 do i=nx,2,-1 2551 A(indx+i) = A(indx+i-1) 2552 end do 2553 A(indx+1) = 0.0d0 2554 A(indx+nx+1) = 0.0d0 2555! indx = indx + (nx+2) 2556 end do 2557!$OMP END DO 2558 2559 return 2560 end 2561 2562 2563 subroutine cshift_fftf_ab(nx,ny,nq,ne,A,B) 2564 implicit none 2565 integer nx,ny,nq,ne 2566 real*8 A(*) 2567 real*8 B(*) 2568 2569 integer i,j,indx 2570 2571 indx = 1 2572 do j=1,(ny*nq*ne) 2573CDIR$ NOVECTOR 2574 do i=nx,2,-1 2575 B(indx+i) = A(indx+i-1) 2576 end do 2577 B(indx+1) = 0.0d0 2578 B(indx+nx+1) = 0.0d0 2579 indx = indx + (nx+2) 2580 end do 2581 2582 return 2583 end 2584 2585 2586 2587 2588c* *********************************** 2589c* * * 2590c* * D3dB_nrc_fft3f * 2591c* * * 2592c* *********************************** 2593c 2594c subroutine D3dB_nrc_fft3f(nb,ne,A) 2595c 2596c***************************************************** 2597c* * 2598c* This routine performs the operation of * 2599c* a three dimensional complex to complex fft * 2600c* A(kx,ky,kz) <- FFT3[A(nx(nb),ny(nb),nz(nb))] * 2601c* * 2602c* Entry - * 2603c* A: a column distribuded 3d block * 2604c* tmp: tempory work space must be at * 2605c* least the size of (complex) * 2606c* (nfft*nfft + 1) + 10*nfft * 2607c* * 2608c* Exit - A is transformed * 2609c* * 2610c* uses - transpose1 subroutine * 2611c* * 2612c***************************************************** 2613c 2614c implicit none 2615c integer nb,ne 2616c complex*16 A(*) 2617c 2618c#include "bafdecls.fh" 2619c#include "errquit.fh" 2620c 2621c#include "D3dB.fh" 2622c 2623c integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS) 2624c common / D3dB_fft / tmpx,tmpy,tmpz 2625c 2626c* *** local variables *** 2627c integer i,j,k,q,n,indx,ierr 2628c 2629c !integer tmp1(2),tmp2(2),tmp3(2) 2630c integer tmp2(2),tmp3(2) 2631c logical value 2632c 2633c 2634c call nwpw_timing_start(1) 2635c 2636c 2637c* ***** allocate temporary space **** 2638c !call D3dB_nfft3d(nb,nfft3d) 2639c value = BA_push_get(mt_dcpl,(nfft3d(nb)),'tmp2',tmp2(2),tmp2(1)) 2640c value = value.and. 2641c > BA_push_get(mt_dbl,(n2ft3d(nb)),'tmp3',tmp3(2),tmp3(1)) 2642c if (.not. value) call errquit('out of stack memory',0, MA_ERR) 2643c 2644c 2645c* ******************************************** 2646c* *** do fft along nx(nb) dimension *** 2647c* *** A(kx,ny(nb),nz(nb)) <- fft1d[A(nx(nb),ny(nb),nz(nb))] *** 2648c* ******************************************** 2649c#ifdef MLIB 2650c call drcfts(A,nx(nb),1,ny(nb)*nq(nb)*ne, 2651c > nx(nb)+2,1,ierr) 2652c 2653c#else 2654c !call drffti(nx(nb),dcpl_mb(tmp1(1))) 2655c do n=1,ne 2656c do q=1,nq(nb) 2657c do j=1,ny(nb) 2658c indx = 1 + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2659c > + (n-1)*nfft3d(nb) 2660c 2661c 2662c call drfftf(nx(nb),A(indx),dcpl_mb(tmpx(1,nb))) 2663c end do 2664c end do 2665c end do 2666c call cshift_fftf(nx(nb),ny(nb),nq(nb),ne,A) 2667c#endif 2668c 2669c 2670c* ******************************************** 2671c* *** do fft along ny(nb) dimension *** 2672c* *** A(kx,ky,nz(nb)) <- fft1d[A(kx,ny(nb),nz(nb))] *** 2673c* ******************************************** 2674c 2675c#ifdef MLIB 2676c !call z1dfft(dbl_mb(tmp3(1)),ny(nb),dcpl_mb(tmp1(1)),-3,ierr) 2677c do n=1,ne 2678c do q=1,nq(nb) 2679c do i=1,(nx(nb)/2+1) 2680c do j=1,ny(nb) 2681c indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2682c > + (n-1)*nfft3d(nb) 2683c dcpl_mb(tmp2(1)+j-1) = A(indx) 2684c end do 2685c !indx = i + (q-1)*(nx(nb)/2+1)*ny(nb) 2686c !call zcopy(ny(nb),A(indx),(nx(nb)/2+1),dcpl_mb(tmp2(1)),1) 2687c call z1dfft(dcpl_mb(tmp2(1)),ny(nb),dcpl_mb(tmpy(1,nb)),1,ierr) 2688c do j=1,ny(nb) 2689c indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2690c > + (n-1)*nfft3d(nb) 2691c A(indx) = dcpl_mb(tmp2(1)+j-1) 2692c end do 2693c !call zcopy(ny(nb),dcpl_mb(tmp2(1)),1,A(indx),(nx(nb)/2+1)) 2694c end do 2695c end do 2696c end do 2697cccc *** this should be faster but isn't *** 2698cc do i=1,(nx(nb)/2+1) 2699cc !indx = 1 + (q-1)*(nx(nb)/2+1)*ny(nb) 2700cc call zffts(A(i),ny(nb),(nx(nb)/2+1),nq(nb), 2701cc > (nx(nb)/2+1)*nq(nb),1,ierr) 2702cc end do 2703c#else 2704c !call dcffti(ny(nb),dcpl_mb(tmp1(1))) 2705c do n=1,ne 2706c do q=1,nq(nb) 2707c do i=1,(nx(nb)/2+1) 2708c do j=1,ny(nb) 2709c indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2710c > + (n-1)*nfft3d(nb) 2711c dcpl_mb(tmp2(1)+j-1) = A(indx) 2712c end do 2713c call dcfftf(ny(nb),dcpl_mb(tmp2(1)),dcpl_mb(tmpy(1,nb))) 2714c do j=1,ny(nb) 2715c indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2716c > + (n-1)*nfft3d(nb) 2717c A(indx) = dcpl_mb(tmp2(1)+j-1) 2718c end do 2719c end do 2720c end do 2721c end do 2722c#endif 2723c 2724c 2725c* ******************************************** 2726c* *** Do a transpose of A *** 2727c* *** A(ky,nz(nb),ky) <- A(kx,ky,nz(nb)) *** 2728c* ******************************************** 2729c do n=1,ne 2730c indx = 1 + (n-1)*nfft3d(nb) 2731c call D3dB_c_transpose_jk(nb,A(indx), 2732c > dcpl_mb(tmp2(1)), 2733c > dbl_mb(tmp3(1))) 2734c end do 2735c 2736c 2737c* ******************************************** 2738c* *** do fft along nz(nb) dimension *** 2739c* *** A(kx,kz,ky) <- fft1d[A(kx,nz(nb),ky)] *** 2740c* ******************************************** 2741c#ifdef MLIB 2742c !call z1dfft(dbl_mb(tmp3(1)),nz(nb),dcpl_mb(tmp1(1)),-3,ierr) 2743c do n=1,ne 2744c do q=1,nq(nb) 2745c do i=1,(nx(nb)/2+1) 2746c do k=1,nz(nb) 2747c indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2748c > + (n-1)*nfft3d(nb) 2749c dcpl_mb(tmp2(1)+k-1) = A(indx) 2750c end do 2751c call z1dfft(dcpl_mb(tmp2(1)),nz(nb),dcpl_mb(tmpz(1,nb)),1,ierr) 2752c do k=1,nz(nb) 2753c indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2754c > + (n-1)*nfft3d(nb) 2755c A(indx) = dcpl_mb(tmp2(1)+k-1) 2756c end do 2757c end do 2758c end do 2759c end do 2760c#else 2761c !call dcffti(nz(nb),dcpl_mb(tmp1(1))) 2762c do n=1,ne 2763c do q=1,nq(nb) 2764c do i=1,(nx(nb)/2+1) 2765c do k=1,nz(nb) 2766c indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2767c > + (n-1)*nfft3d(nb) 2768c dcpl_mb(tmp2(1)+k-1) = A(indx) 2769c end do 2770c call dcfftf(nz(nb),dcpl_mb(tmp2(1)),dcpl_mb(tmpz(1,nb))) 2771c do k=1,nz(nb) 2772c indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb) 2773c > + (n-1)*nfft3d(nb) 2774c A(indx) = dcpl_mb(tmp2(1)+k-1) 2775c end do 2776c end do 2777c end do 2778c end do 2779c#endif 2780c 2781c* ******************************************** 2782c* *** Do a transpose of A *** 2783c* *** A(kx,ky,kz) <- A(kx,kz,ky) *** 2784c* ******************************************** 2785cc call D3dB_c_transpose_jk(nb,A,dcpl_mb(tmp2(1)),dbl_mb(tmp3(1))) 2786c 2787c 2788c* **** deallocate temporary space **** 2789c value = BA_pop_stack(tmp3(2)) 2790c value = BA_pop_stack(tmp2(2)) 2791c !value = BA_pop_stack(tmp1(2)) 2792 2793c call nwpw_timing_end(1) 2794c return 2795c end 2796 2797 2798 2799 2800* *********************************** 2801* * * 2802* * D3dB_(c,r,t)_SMul * 2803* * * 2804* *********************************** 2805 2806* This routine performs the operation C = scale * A 2807* where scale is a real*8 number. 2808 2809 subroutine D3dB_c_SMul(nb,scale,A,C) 2810 implicit none 2811 integer nb 2812 real*8 scale 2813 complex*16 A(*) 2814 complex*16 C(*) 2815 2816#include "D3dB.fh" 2817 2818 integer i 2819 2820 do i=1,nfft3d_map(nb) 2821 C(i) = scale*A(i) 2822 end do 2823 return 2824 end 2825 2826 2827 subroutine D3dB_c_SMul1(nb,scale,A) 2828 implicit none 2829 integer nb 2830 real*8 scale 2831 complex*16 A(*) 2832 2833#include "D3dB.fh" 2834 2835 integer i 2836 2837 do i=1,nfft3d_map(nb) 2838 A(i) = scale*A(i) 2839 end do 2840 return 2841 end 2842 2843 2844 2845 subroutine D3dB_r_SMul(nb,scale,A,C) 2846 implicit none 2847 integer nb 2848 real*8 scale 2849 real*8 A(*) 2850 real*8 C(*) 2851 2852#include "D3dB.fh" 2853 2854 integer i 2855 2856!$OMP DO 2857 do i=1,n2ft3d_map(nb) 2858 C(i) = scale*A(i) 2859 end do 2860!$OMP END DO 2861 return 2862 end 2863 2864 subroutine D3dB_r_SMul1(nb,scale,A) 2865 implicit none 2866 integer nb 2867 real*8 scale 2868 real*8 A(*) 2869 2870#include "D3dB.fh" 2871 2872 integer i 2873 2874!$OMP DO 2875 do i=1,n2ft3d_map(nb) 2876 A(i) = scale*A(i) 2877 end do 2878!$OMP END DO 2879 return 2880 end 2881 2882 2883 subroutine D3dB_t_SMul(nb,scale,A,C) 2884 implicit none 2885 integer nb 2886 real*8 scale 2887 real*8 A(*) 2888 real*8 C(*) 2889 2890#include "D3dB.fh" 2891 2892 integer i 2893 2894 do i=1,nfft3d_map(nb) 2895 C(i) = scale*A(i) 2896 end do 2897 return 2898 end 2899 2900 2901 subroutine D3dB_t_SMul1(nb,scale,A) 2902 implicit none 2903 integer nb 2904 real*8 scale 2905 real*8 A(*) 2906 2907#include "D3dB.fh" 2908 2909 integer i 2910 2911 do i=1,nfft3d_map(nb) 2912 A(i) = scale*A(i) 2913 end do 2914 return 2915 end 2916 2917 2918 2919 subroutine D3dB_c_ZMul(nb,scale,A,C) 2920 implicit none 2921 integer nb 2922 complex*16 scale 2923 complex*16 A(*) 2924 complex*16 C(*) 2925 2926#include "D3dB.fh" 2927 2928 2929 integer i 2930 2931 do i=1,nfft3d_map(nb) 2932 C(i) = scale*A(i) 2933 end do 2934 return 2935 end 2936 2937 subroutine D3dB_r_Power1(nb,y,A) 2938 implicit none 2939 integer nb 2940 real*8 y 2941 real*8 A(*) 2942 2943#include "D3dB.fh" 2944 2945 integer ii 2946 2947 do ii=1,n2ft3d_map(nb) 2948 A(ii) = A(ii)**y 2949 end do 2950 2951 return 2952 end 2953 2954 2955 subroutine D3dB_rr_Power(nb,y,A,B) 2956 implicit none 2957 integer nb 2958 real*8 y 2959 real*8 A(*) 2960 real*8 B(*) 2961 2962#include "D3dB.fh" 2963 2964 integer i 2965 2966 do i=1,n2ft3d_map(nb) 2967 B(i) = A(i)**y 2968 end do 2969 return 2970 end 2971 2972 2973 2974 2975 2976* *********************************** 2977* * * 2978* * D3dB_ct_Sqr * 2979* * * 2980* *********************************** 2981 2982* This routine performs the operation C = A * A 2983 2984 subroutine D3dB_ct_Sqr(nb,A,C) 2985 implicit none 2986 integer nb 2987 complex*16 A(*) 2988 real*8 C(*) 2989 2990#include "D3dB.fh" 2991 2992 integer i 2993 2994 do i=1,nfft3d_map(nb) 2995 C(i) = dble(A(i))**2 + dimag(A(i))**2 2996 end do 2997 return 2998 end 2999 3000* *********************************** 3001* * * 3002* * D3dB_rr_Sqr * 3003* * * 3004* *********************************** 3005 3006 subroutine D3dB_rr_Sqr(nb,A,C) 3007 implicit none 3008 integer nb 3009 real*8 A(*) 3010 real*8 C(*) 3011 3012#include "D3dB.fh" 3013 3014 3015 integer i 3016 3017!$OMP DO 3018 do i=1,n2ft3d_map(nb) 3019 C(i) = A(i)**2 3020 end do 3021!$OMP END DO 3022 return 3023 end 3024 3025 3026* *********************************** 3027* * * 3028* * D3dB_rr_SqrAdd * 3029* * * 3030* *********************************** 3031 3032 subroutine D3dB_rr_SqrAdd(nb,A,C) 3033 implicit none 3034 integer nb 3035 real*8 A(*) 3036 real*8 C(*) 3037 3038#include "D3dB.fh" 3039 3040 3041 integer i 3042 3043!$OMP DO 3044 do i=1,n2ft3d_map(nb) 3045 C(i) = C(i) + A(i)**2 3046 end do 3047!$OMP END DO 3048 return 3049 end 3050 3051 3052 3053 3054* *********************************** 3055* * * 3056* * D3dB_rr_Sqr1 * 3057* * * 3058* *********************************** 3059 3060 subroutine D3dB_rr_Sqr1(nb,A) 3061 implicit none 3062 integer nb 3063 real*8 A(*) 3064 3065#include "D3dB.fh" 3066 3067 integer i 3068 3069!$OMP DO 3070 do i=1,n2ft3d_map(nb) 3071 A(i) = A(i)**2 3072 end do 3073!$OMP END DO 3074 return 3075 end 3076 3077 3078* *********************************** 3079* * * 3080* * D3dB_rr_Sqrt * 3081* * * 3082* *********************************** 3083 3084 subroutine D3dB_rr_Sqrt(nb,A,C) 3085 implicit none 3086 integer nb 3087 real*8 A(*) 3088 real*8 C(*) 3089 3090#include "D3dB.fh" 3091 3092 integer i 3093 3094 do i=1,n2ft3d_map(nb) 3095 C(i) = dsqrt(A(i)) 3096 end do 3097 return 3098 end 3099 3100 3101 3102* *********************************** 3103* * * 3104* * D3dB_rr_Sqrt1 * 3105* * * 3106* *********************************** 3107 3108 subroutine D3dB_rr_Sqrt1(nb,A) 3109 implicit none 3110 integer nb 3111 real*8 A(*) 3112 3113#include "D3dB.fh" 3114 3115 integer i 3116 3117!$OMP DO 3118 do i=1,n2ft3d_map(nb) 3119 A(i) = dsqrt(A(i)) 3120 end do 3121!$OMP END DO 3122 return 3123 end 3124 3125 3126* *********************************** 3127* * * 3128* * D3dB_tt_Sqr * 3129* * * 3130* *********************************** 3131 3132 subroutine D3dB_tt_Sqr(nb,A,C) 3133 implicit none 3134 integer nb 3135 real*8 A(*) 3136 real*8 C(*) 3137 3138#include "D3dB.fh" 3139 3140 integer i 3141 3142 do i=1,nfft3d_map(nb) 3143 C(i) = A(i)**2 3144 end do 3145 return 3146 end 3147 3148 3149 3150* *********************************** 3151* * * 3152* * D3dB_c_transpose_jk_init * 3153* * * 3154* *********************************** 3155 3156 subroutine D3dB_c_transpose_jk_init(nb) 3157 implicit none 3158 integer nb 3159 3160#include "bafdecls.fh" 3161#include "errquit.fh" 3162#include "D3dB.fh" 3163 3164 3165c integer iq_to_i1((NFFT1/2+1)*NFFT2*NSLABS) 3166c integer iq_to_i2((NFFT1/2+1)*NFFT2*NSLABS) 3167c integer i1_start(NFFT3+1) 3168c integer i2_start(NFFT3+1) 3169 integer iq_to_i1(2,NBLOCKS) 3170 integer iq_to_i2(2,NBLOCKS) 3171 integer i1_start(2,NBLOCKS) 3172 integer i2_start(2,NBLOCKS) 3173 common / trans_blk / iq_to_i1,iq_to_i2,i1_start,i2_start 3174 3175 3176* **** local variables **** 3177 integer proc_to,proc_from 3178 integer pto,qto,np,taskid 3179 integer pfrom,qfrom 3180 integer phere,qhere 3181 integer index1,index2,itmp 3182 integer i,j,k,it 3183 logical value 3184 3185 3186* **** allocate trans_blk common block **** 3187 value = BA_alloc_get(mt_int,((nx(nb)/2+1)*ny(nb)*nq(nb)), 3188 > 'iq_to_i1',iq_to_i1(2,nb),iq_to_i1(1,nb)) 3189 value=value.and.BA_alloc_get(mt_int,((nx(nb)/2+1)*ny(nb)*nq(nb)), 3190 > 'iq_to_i2',iq_to_i2(2,nb),iq_to_i2(1,nb)) 3191 value = value.and.BA_alloc_get(mt_int,(nz(nb)+1), 3192 > 'i1_start',i1_start(2,nb),i1_start(1,nb)) 3193 value = value.and.BA_alloc_get(mt_int,(nz(nb)+1), 3194 > 'i2_start',i2_start(2,nb),i2_start(1,nb)) 3195 if (.not. value) 3196 > call errquit('D3dB_transpose_jk_init:out of heap',0,MA_ERR) 3197 3198 call Parallel2d_taskid_i(taskid) 3199 call Parallel2d_np_i(np) 3200 3201!MATHIAS 3202 index1 = 1 3203 index2 = 1 3204 do it=0,np-1 3205 proc_to = mod(taskid+it,np) 3206 proc_from = mod(taskid-it+np,np) 3207c i1_start(it+1) = index1 3208c i2_start(it+1) = index2 3209 int_mb(i1_start(1,nb)+it) = index1 3210 int_mb(i2_start(1,nb)+it) = index2 3211 3212 do k=1,nz(nb) 3213 do j=1,ny(nb) 3214 3215* **** packing scheme **** 3216 call D3dB_ktoqp(nb,k,qhere,phere) 3217 call D3dB_ktoqp(nb,j,qto,pto) 3218 if ((phere.eq.taskid).and.(pto.eq.proc_to)) then 3219 do i=1,(nx(nb)/2+1) 3220 itmp = i + (j-1)*(nx(nb)/2+1) 3221 > + (qhere-1)*(nx(nb)/2+1)*ny(nb) 3222c iq_to_i1(itmp) = index1 3223 int_mb(iq_to_i1(1,nb)+itmp-1) = index1 3224 index1 = index1 + 1 3225 end do 3226 end if 3227 3228* **** unpacking scheme **** 3229 call D3dB_ktoqp(nb,j,qhere,phere) 3230 call D3dB_ktoqp(nb,k,qfrom,pfrom) 3231 if ((phere.eq.taskid).and.(pfrom.eq.proc_from)) then 3232 do i=1,(nx(nb)/2+1) 3233 itmp = i + (k-1)*(nx(nb)/2+1) 3234 > + (qhere-1)*(nx(nb)/2+1)*ny(nb) 3235c iq_to_i2(itmp) = index2 3236 int_mb(iq_to_i2(1,nb)+itmp-1) = index2 3237 index2 = index2 + 1 3238 end do 3239 end if 3240 end do 3241 end do 3242 end do 3243c i1_start(np+1) = index1 3244c i2_start(np+1) = index2 3245 int_mb(i1_start(1,nb)+np) = index1 3246 int_mb(i2_start(1,nb)+np) = index2 3247 3248 3249 return 3250 end 3251 3252 3253 3254* *********************************** 3255* * * 3256* * D3dB_cc_dot * 3257* * * 3258* *********************************** 3259 3260 subroutine D3dB_cc_dot(nb,A,B,sumall) 3261 implicit none 3262 integer nb 3263 complex*16 A(*) 3264 complex*16 B(*) 3265 real*8 sumall 3266 3267 3268#include "D3dB.fh" 3269 3270 integer i,j,k,q,index,np,taskid,p 3271 real*8 sum 3272 3273 3274 call nwpw_timing_start(2) 3275 3276 call Parallel2d_np_i(np) 3277 3278* **** sum up dot product on this node **** 3279 sum = 0.0d0 3280 3281 !********************** 3282 !**** slab mapping **** 3283 !********************** 3284 if (mapping.eq.1) then 3285* ***** kx!=0 plane, so double count ***** 3286 do q=1,nq(nb) 3287 do j=1,ny(nb) 3288 do i=2,(nx(nb)/2+1) 3289 index = (q-1)*(nx(nb)/2+1)*ny(nb) 3290 > + (j-1)*(nx(nb)/2+1) + i 3291 sum = sum + dble(A(index)) * dble(B(index)) 3292 > + dimag(A(index)) * dimag(B(index)) 3293 end do 3294 end do 3295 end do 3296 sum = sum*2.0d0 3297 3298* ***** kx==0 plane, so single count ***** 3299 do q=1,nq(nb) 3300 do j=1,ny(nb) 3301 i=1 3302 index = (q-1)*(nx(nb)/2+1)*ny(nb) + (j-1)*(nx(nb)/2+1) + 1 3303 sum = sum + dble(A(index)) * dble(B(index)) 3304 > + dimag(A(index)) * dimag(B(index)) 3305 end do 3306 end do 3307 3308 !************************* 3309 !**** hilbert mapping **** 3310 !************************* 3311 else 3312 call Parallel2d_taskid_i(taskid) 3313* ***** kx!=0 plane, so double count ***** 3314 do index=1,nfft3d_map(nb) 3315 sum = sum + dble(A(index)) * dble(B(index)) 3316 > + dimag(A(index)) * dimag(B(index)) 3317 end do 3318 sum = sum*2.0d0 3319 3320* ***** kx==0 plane, so single count ***** 3321 do k=1,nz(nb) 3322 do j=1,ny(nb) 3323 i=1 3324 call D3dB_ijktoindexp(1,i,j,k,index,p) 3325 if (p.eq.taskid) then 3326 sum = sum - dble(A(index)) * dble(B(index)) 3327 > - dimag(A(index)) * dimag(B(index)) 3328 end if 3329 end do 3330 end do 3331 3332 end if 3333 3334 3335* **** add up sums from other nodes **** 3336 if (np.gt.1) then 3337 call D3dB_SumAll(sum) 3338 end if 3339 3340 call nwpw_timing_end(2) 3341 3342 sumall = sum 3343 return 3344 end 3345 3346* *********************************** 3347* * * 3348* * D3dB_cc_idot * 3349* * * 3350* *********************************** 3351 3352 subroutine D3dB_cc_idot(nb,A,B,sumall) 3353 implicit none 3354 integer nb 3355 complex*16 A(*) 3356 complex*16 B(*) 3357 real*8 sumall 3358 3359 3360#include "D3dB.fh" 3361 3362 integer i,j,k,q,index,np,taskid,p 3363 real*8 sum 3364 3365 3366 call nwpw_timing_start(2) 3367 3368c call Parallel2d_np_i(np) 3369 3370* **** sum up dot product on this node **** 3371 sum = 0.0d0 3372 3373 !********************** 3374 !**** slab mapping **** 3375 !********************** 3376 if (mapping.eq.1) then 3377* ***** kx!=0 plane, so double count ***** 3378 do q=1,nq(nb) 3379 do j=1,ny(nb) 3380 do i=2,(nx(nb)/2+1) 3381 index = (q-1)*(nx(nb)/2+1)*ny(nb) 3382 > + (j-1)*(nx(nb)/2+1) + i 3383 sum = sum + dble(A(index)) * dble(B(index)) 3384 > + dimag(A(index)) * dimag(B(index)) 3385 end do 3386 end do 3387 end do 3388 sum = sum*2.0d0 3389 3390* ***** kx==0 plane, so single count ***** 3391 do q=1,nq(nb) 3392 do j=1,ny(nb) 3393 i=1 3394 index = (q-1)*(nx(nb)/2+1)*ny(nb) + (j-1)*(nx(nb)/2+1) + 1 3395 sum = sum + dble(A(index)) * dble(B(index)) 3396 > + dimag(A(index)) * dimag(B(index)) 3397 end do 3398 end do 3399 3400 3401 !************************* 3402 !**** hilbert mapping **** 3403 !************************* 3404 else 3405 call Parallel2d_taskid_i(taskid) 3406* ***** kx!=0 plane, so double count ***** 3407 do index=1,nfft3d_map(nb) 3408 sum = sum + dble(A(index)) * dble(B(index)) 3409 > + dimag(A(index)) * dimag(B(index)) 3410 end do 3411 sum = sum*2.0d0 3412 3413* ***** kx==0 plane, so single count ***** 3414 do k=1,nz(nb) 3415 do j=1,ny(nb) 3416 i=1 3417 call D3dB_ijktoindexp(1,i,j,k,index,p) 3418 if (p.eq.taskid) then 3419 sum = sum - dble(A(index)) * dble(B(index)) 3420 > - dimag(A(index)) * dimag(B(index)) 3421 end if 3422 end do 3423 end do 3424 3425 end if 3426 3427 3428* **** do not add up sums from other nodes **** 3429 3430 call nwpw_timing_end(2) 3431 3432 sumall = sum 3433 return 3434 end 3435 3436* *********************************** 3437* * * 3438* * D3dB_tt_dot * 3439* * * 3440* *********************************** 3441 3442 subroutine D3dB_tt_dot(nb,A,B,sumall) 3443 implicit none 3444 integer nb 3445 real*8 A(*) 3446 real*8 B(*) 3447 real*8 sumall 3448 3449#include "D3dB.fh" 3450 3451 integer i,j,k,q,index,np,nxh,taskid,p 3452 real*8 sum 3453 3454 nxh=nx(nb)/2 3455 call Parallel2d_np_i(np) 3456 3457* **** sum up dot product on this node **** 3458 sum = 0.0d0 3459 3460 !********************** 3461 !**** slab mapping **** 3462 !********************** 3463 if (mapping.eq.1) then 3464* ***** k!=0 plane, so double count ***** 3465 do q=1,nq(nb) 3466 do j=1,ny(nb) 3467 do i=2,(nxh+1) 3468 index = (q-1)*(nxh+1)*ny(nb) + (j-1)*(nxh+1) + i 3469 sum = sum + A(index)*B(index) 3470 end do 3471 end do 3472 end do 3473 sum = sum*2.0d0 3474 3475* **** kx==0 plane, so single count ***** 3476 do q=1,nq(nb) 3477 do j=1,ny(nb) 3478 i=1 3479 index = (q-1)*(nxh+1)*ny(nb) + (j-1)*(nxh+1) + 1 3480 sum = sum + A(index)*B(index) 3481 end do 3482 end do 3483 3484 3485 !************************* 3486 !**** hilbert mapping **** 3487 !************************* 3488 else 3489 call Parallel2d_taskid_i(taskid) 3490* ***** kx!=0 plane, so double count ***** 3491 do index=1,nfft3d_map(nb) 3492 sum = sum + A(index)*B(index) 3493 end do 3494 sum = sum*2.0d0 3495 3496* ***** kx==0 plane, so single count ***** 3497 do k=1,nz(nb) 3498 do j=1,ny(nb) 3499 i=1 3500 call D3dB_ijktoindexp(1,i,j,k,index,p) 3501 if (p.eq.taskid) then 3502 sum = sum - A(index)*B(index) 3503 end if 3504 end do 3505 end do 3506 3507 end if 3508 3509 3510* **** add up sums from other nodes **** 3511 if (np.gt.1) then 3512 call D3dB_SumAll(sum) 3513 end if 3514 3515 sumall = sum 3516 return 3517 end 3518 3519 3520* *********************************** 3521* * * 3522* * D3dB_tt_idot * 3523* * * 3524* *********************************** 3525 3526 subroutine D3dB_tt_idot(nb,A,B,sumall) 3527 implicit none 3528 integer nb 3529 real*8 A(*) 3530 real*8 B(*) 3531 real*8 sumall 3532 3533 3534#include "D3dB.fh" 3535 3536 integer i,j,k,q,index,np,nxh,taskid,p 3537 real*8 sum 3538 3539 3540 call nwpw_timing_start(2) 3541 3542 nxh=nx(nb)/2 3543c call Parallel2d_np_i(np) 3544 3545* **** sum up dot product on this node **** 3546 sum = 0.0d0 3547 3548 !********************** 3549 !**** slab mapping **** 3550 !********************** 3551 if (mapping.eq.1) then 3552* ***** k!=0 plane, so double count ***** 3553 do q=1,nq(nb) 3554 do j=1,ny(nb) 3555 do i=2,(nxh+1) 3556 index = (q-1)*(nxh+1)*ny(nb) + (j-1)*(nxh+1) + i 3557 sum = sum + A(index)*B(index) 3558 end do 3559 end do 3560 end do 3561 sum = sum*2.0d0 3562 3563* **** kx==0 plane, so single count ***** 3564 do q=1,nq(nb) 3565 do j=1,ny(nb) 3566 i=1 3567 index = (q-1)*(nxh+1)*ny(nb) + (j-1)*(nxh+1) + 1 3568 sum = sum + A(index)*B(index) 3569 end do 3570 end do 3571 3572 3573 3574 !************************* 3575 !**** hilbert mapping **** 3576 !************************* 3577 else 3578 call Parallel2d_taskid_i(taskid) 3579* ***** kx!=0 plane, so double count ***** 3580 do index=1,nfft3d_map(nb) 3581 sum = sum + A(index)*B(index) 3582 end do 3583 sum = sum*2.0d0 3584 3585* ***** kx==0 plane, so single count ***** 3586 do k=1,nz(nb) 3587 do j=1,ny(nb) 3588 i=1 3589 call D3dB_ijktoindexp(1,i,j,k,index,p) 3590 if (p.eq.taskid) then 3591 sum = sum - A(index)*B(index) 3592 end if 3593 end do 3594 end do 3595 3596 end if 3597 3598 3599 3600* **** !!!! do not add up sums from other nodes **** 3601 3602 call nwpw_timing_end(2) 3603 3604 sumall = sum 3605 return 3606 end 3607 3608 3609 3610* *********************************** 3611* * * 3612* * D3dB_rr_dot * 3613* * * 3614* *********************************** 3615* shared memory output 3616* - sumall 3617 3618 subroutine D3dB_rr_dot(nb,A,B,sumall) 3619 implicit none 3620 integer nb 3621 real*8 A(*) 3622 real*8 B(*) 3623 real*8 sumall 3624 3625#include "D3dB.fh" 3626 3627 integer i,np 3628 real*8 sum 3629 common /D3dB_RR_TSUM/ sum 3630 3631 call Parallel2d_np_i(np) 3632 3633* **** sum up dot product on this node **** 3634!$OMP MASTER 3635 sum = 0.0d0 3636!$OMP END MASTER 3637!$OMP BARRIER 3638!$OMP DO REDUCTION(+:sum) 3639 do i=1,n2ft3d_map(nb) 3640 sum = sum + A(i)*B(i) 3641 end do 3642!$OMP END DO 3643 3644 3645* **** add up sums from other nodes **** 3646 if (np.gt.1) then 3647 call D3dB_SumAll(sum) 3648 end if 3649 3650!$OMP MASTER 3651 sumall = sum 3652!$OMP END MASTER 3653!$OMP BARRIER 3654 return 3655 end 3656 3657* *********************************** 3658* * * 3659* * D3dB_rr_idot * 3660* * * 3661* *********************************** 3662 3663 subroutine D3dB_rr_idot(nb,A,B,sumall) 3664 implicit none 3665 integer nb 3666 real*8 A(*) 3667 real*8 B(*) 3668 real*8 sumall 3669 3670#include "D3dB.fh" 3671 3672 integer i,np 3673 real*8 sum 3674 common /D3dB_RR_TSUM/ sum 3675 3676 3677* **** sum up dot product on this node **** 3678!$OMP MASTER 3679 sum = 0.0d0 3680!$OMP END MASTER 3681!$OMP BARRIER 3682!$OMP DO reduction(+:sum) 3683 do i=1,n2ft3d_map(nb) 3684 sum = sum + A(i)*B(i) 3685 end do 3686!$OMP END DO 3687 3688* **** add up sums from other nodes **** 3689* if (np.gt.1) then 3690* call D3dB_SumAll(sum) 3691* end if 3692 3693!$OMP MASTER 3694 sumall = sum 3695!$OMP END MASTER 3696!$OMP BARRIER 3697 return 3698 end 3699 3700* *********************************** 3701* * * 3702* * D3dB_rrm_sym_dot * 3703* * * 3704* *********************************** 3705 3706 subroutine D3dB_rrm_sym_dot(nb,n,A,B,matrix) 3707 implicit none 3708 integer nb,n 3709 real*8 A(*) 3710 real*8 B(*) 3711 real*8 matrix(n,n) 3712 3713#include "D3dB.fh" 3714 3715* **** local variables **** 3716 integer j,k 3717 integer np 3718 3719 call nwpw_timing_start(2) 3720 call Parallel2d_np_i(np) 3721 3722 do k=1,n 3723 call DGEMM_OMP('T','N',k,1,n2ft3d(nb), 3724 > 1.0d0, 3725 > A,n2ft3d(nb), 3726 > B(1+(k-1)*n2ft3d(nb)),n2ft3d(nb), 3727 > 0.0d0, 3728 > matrix(1,k),k) 3729 end do 3730 3731!$OMP DO 3732 do k=1,n 3733 do j=k+1,n 3734 matrix(j,k) = matrix(k,j) 3735 end do 3736 end do 3737!$OMP END DO 3738 3739 if (np.gt.1) call D3dB_Vector_SumAll(n*n,matrix) 3740 call nwpw_timing_end(2) 3741 return 3742 end 3743 3744 3745 3746 3747 3748* *********************************** 3749* * * 3750* * D3dB_cc_Mul * 3751* * * 3752* *********************************** 3753 3754 subroutine D3dB_cc_Mul(nb,A,B,C) 3755 implicit none 3756 integer nb 3757 complex*16 A(*) 3758 complex*16 B(*) 3759 complex*16 C(*) 3760 3761#include "D3dB.fh" 3762 3763 integer i 3764 3765 do i=1,nfft3d_map(nb) 3766 C(i) = dconjg(A(i)) * B(i) 3767 end do 3768 3769 return 3770 end 3771 3772* *********************************** 3773* * * 3774* * D3dB_cc_Mul2 * 3775* * * 3776* *********************************** 3777 3778 subroutine D3dB_cc_Mul2(nb,A,B) 3779 implicit none 3780 integer nb 3781 complex*16 A(*) 3782 complex*16 B(*) 3783 3784#include "D3dB.fh" 3785 3786 integer i 3787 3788 do i=1,nfft3d_map(nb) 3789 B(i) = A(i) * B(i) 3790 end do 3791 3792 return 3793 end 3794 3795* *********************************** 3796* * * 3797* * D3dB_lc_Mask * 3798* * * 3799* *********************************** 3800 3801 subroutine D3dB_lc_Mask(nb,masker,A) 3802 implicit none 3803 integer nb 3804 logical masker(*) 3805 complex*16 A(*) 3806 3807#include "D3dB.fh" 3808 3809 integer i 3810 3811 do i=1,nfft3d_map(nb) 3812 if (masker(i)) A(i) = dcmplx(0.0d0,0.0d0) 3813 end do 3814 return 3815 end 3816 3817* *********************************** 3818* * * 3819* * D3dB_lr_Mask * 3820* * * 3821* *********************************** 3822 3823 subroutine D3dB_lr_Mask(nb,masker,A) 3824 implicit none 3825 integer nb 3826 logical masker(*) 3827 real*8 A(*) 3828 3829#include "D3dB.fh" 3830 3831 integer i 3832 3833 do i=1,nfft3d_map(nb) 3834 if (masker(i)) A(i) = 0.0d0 3835 end do 3836 return 3837 end 3838 3839 3840* *********************************** 3841* * * 3842* * D3dB_tc_Mul * 3843* * * 3844* *********************************** 3845 3846 subroutine D3dB_tc_Mul(nb,A,B,C) 3847 implicit none 3848 integer nb 3849 real*8 A(*) 3850 complex*16 B(*) 3851 complex*16 C(*) 3852 3853#include "D3dB.fh" 3854 3855 integer i 3856 3857 do i=1,nfft3d_map(nb) 3858 C(i) = A(i) * B(i) 3859 end do 3860 3861 return 3862 end 3863 3864 3865 3866* *********************************** 3867* * * 3868* * D3dB_tc_Mul2 * 3869* * * 3870* *********************************** 3871 3872 subroutine D3dB_tc_Mul2(nb,A,B) 3873 implicit none 3874 integer nb 3875 real*8 A(*) 3876 complex*16 B(*) 3877 3878#include "D3dB.fh" 3879 3880 integer i 3881 3882!$OMP DO 3883 do i=1,nfft3d_map(nb) 3884 B(i) = B(i) * A(i) 3885 end do 3886!$OMP END DO 3887 3888 return 3889 end 3890 3891* *********************************** 3892* * * 3893* * D3dB_rr_Mul * 3894* * * 3895* *********************************** 3896 3897 subroutine D3dB_rr_Mul(nb,A,B,C) 3898 implicit none 3899 3900#include "D3dB.fh" 3901 3902 integer nb 3903 real*8 A(*) 3904 real*8 B(*) 3905 real*8 C(*) 3906 integer i,n 3907 3908!$OMP DO 3909 do i=1,n2ft3d_map(nb) 3910 C(i) = A(i) * B(i) 3911 end do 3912!$OMP END DO 3913 3914 return 3915 end 3916 3917* *********************************** 3918* * * 3919* * D3dB_rr_Mul2 * 3920* * * 3921* *********************************** 3922 3923 subroutine D3dB_rr_Mul2(nb,A,B) 3924 implicit none 3925 integer nb 3926 real*8 A(*) 3927 real*8 B(*) 3928 3929#include "D3dB.fh" 3930 3931 integer i 3932 3933!$OMP DO 3934 do i=1,n2ft3d_map(nb) 3935 B(i) = B(i) * A(i) 3936 end do 3937!$OMP END DO 3938 3939 return 3940 end 3941 3942 3943 3944* *********************************** 3945* * * 3946* * D3dB_cc_Sum * 3947* * * 3948* *********************************** 3949 3950 subroutine D3dB_cc_Sum(nb,A,B,C) 3951 implicit none 3952 integer nb 3953 complex*16 A(*) 3954 complex*16 B(*) 3955 complex*16 C(*) 3956 3957#include "D3dB.fh" 3958 3959 integer i 3960 3961!$OMP DO 3962 do i=1,nfft3d_map(nb) 3963 C(i) = A(i) + B(i) 3964 end do 3965!$OMP END DO 3966 3967 return 3968 end 3969 3970 3971* *********************************** 3972* * * 3973* * D3dB_cc_Sum2 * 3974* * * 3975* *********************************** 3976 3977 subroutine D3dB_cc_Sum2(nb,A,B) 3978 implicit none 3979 integer nb 3980 complex*16 A(*) 3981 complex*16 B(*) 3982 3983#include "D3dB.fh" 3984 3985 integer i 3986 3987!$OMP DO 3988 do i=1,nfft3d_map(nb) 3989 B(i) = B(i) + A(i) 3990 end do 3991!$OMP END DO 3992 3993 return 3994 end 3995 3996 3997* *********************************** 3998* * * 3999* * D3dB_rr_Sum * 4000* * * 4001* *********************************** 4002 4003 subroutine D3dB_rr_Sum(nb,A,B,C) 4004 implicit none 4005 integer nb 4006 real*8 A(*) 4007 real*8 B(*) 4008 real*8 C(*) 4009 4010#include "D3dB.fh" 4011 4012 integer i 4013 4014!$OMP DO 4015 do i=1,n2ft3d_map(nb) 4016 C(i) = B(i)+A(i) 4017 end do 4018!$OMP END DO 4019 4020 return 4021 end 4022 4023 4024* *********************************** 4025* * * 4026* * D3dB_rr_Sum2 * 4027* * * 4028* *********************************** 4029 subroutine D3dB_rr_Sum2(nb,A,B) 4030 implicit none 4031 integer nb 4032 real*8 A(*) 4033 real*8 B(*) 4034 4035#include "D3dB.fh" 4036 4037 integer i 4038 4039!$OMP DO 4040 do i=1,n2ft3d_map(nb) 4041 B(i) = B(i) + A(i) 4042 end do 4043!$OMP END DO 4044 return 4045 end 4046 4047 4048* *********************************** 4049* * * 4050* * D3dB_cc_Sub * 4051* * * 4052* *********************************** 4053 4054 subroutine D3dB_cc_Sub(nb,A,B,C) 4055 implicit none 4056 integer nb 4057 complex*16 A(*) 4058 complex*16 B(*) 4059 complex*16 C(*) 4060 4061#include "D3dB.fh" 4062 4063 integer i 4064 4065 do i=1,nfft3d_map(nb) 4066 C(i) = A(i) - B(i) 4067 end do 4068 4069 return 4070 end 4071 4072 4073* *********************************** 4074* * * 4075* * D3dB_rr_Sub * 4076* * * 4077* *********************************** 4078 4079 subroutine D3dB_rr_Sub(nb,A,B,C) 4080 implicit none 4081 4082#include "D3dB.fh" 4083 4084 integer nb 4085 real*8 A(n2ft3d_map(nb)) 4086 real*8 B(n2ft3d_map(nb)) 4087 real*8 C(n2ft3d_map(nb)) 4088 integer i 4089 4090 4091!$OMP DO 4092 do i=1,n2ft3d_map(nb) 4093 C(i) = A(i) - B(i) 4094 end do 4095!$OMP END DO 4096 4097 return 4098 end 4099 4100 4101 4102* *********************************** 4103* * * 4104* * D3dB_rr_Sub2 * 4105* * * 4106* *********************************** 4107 4108 subroutine D3dB_rr_Sub2(nb,A,B) 4109 implicit none 4110 4111#include "D3dB.fh" 4112 4113 integer nb 4114 real*8 A(n2ft3d_map(nb)) 4115 real*8 B(n2ft3d_map(nb)) 4116 4117 integer i 4118 4119!$OMP DO 4120 do i=1,n2ft3d_map(nb) 4121 B(i) = B(i) - A(i) 4122 end do 4123!$OMP END DO 4124 4125 4126 return 4127 end 4128 4129* *********************************** 4130* * * 4131* * D3dB_rr_Multiply2 * 4132* * * 4133* *********************************** 4134 subroutine D3dB_rr_Multiply2(nb,A,B) 4135 implicit none 4136 integer nb 4137 real*8 A(*) 4138 real*8 B(*) 4139 4140#include "D3dB.fh" 4141 4142 integer i 4143 4144!$OMP DO 4145 do i=1,n2ft3d_map(nb) 4146 B(i) = B(i)*A(i) 4147 end do 4148!$OMP END DO 4149 4150 return 4151 end 4152 4153 4154 4155* *********************************** 4156* * * 4157* * D3dB_rrr_MultiplyAdd * 4158* * * 4159* *********************************** 4160 subroutine D3dB_rrr_MultiplyAdd(nb,A,B,C) 4161 implicit none 4162 integer nb 4163 real*8 A(*) 4164 real*8 B(*) 4165 real*8 C(*) 4166 4167#include "D3dB.fh" 4168 4169 integer i 4170 4171!$OMP DO 4172 do i=1,n2ft3d_map(nb) 4173 C(i) = C(i) + B(i)*A(i) 4174 end do 4175!$OMP END DO 4176 4177 return 4178 end 4179 4180 4181 4182* *********************************** 4183* * * 4184* * D3dB_cc_zaxpy * 4185* * * 4186* *********************************** 4187 4188 subroutine D3dB_cc_zaxpy(nb,alpha,A,B) 4189 implicit none 4190 integer nb 4191 complex*16 alpha 4192 complex*16 A(*) 4193 complex*16 B(*) 4194 4195#include "D3dB.fh" 4196 4197 integer i 4198 4199!$OMP DO 4200 do i=1,nfft3d_map(nb) 4201 B(i) = B(i) + alpha*A(i) 4202 end do 4203!$OMP END DO 4204 4205 return 4206 end 4207 4208 4209 4210* *********************************** 4211* * * 4212* * D3dB_cc_daxpy * 4213* * * 4214* *********************************** 4215 4216 subroutine D3dB_cc_daxpy(nb,alpha,A,B) 4217 implicit none 4218 integer nb 4219 real*8 alpha 4220 complex*16 A(*) 4221 complex*16 B(*) 4222 4223#include "D3dB.fh" 4224 4225 integer i 4226 4227!$OMP DO 4228 do i=1,nfft3d_map(nb) 4229 B(i) = B(i) + alpha*A(i) 4230 end do 4231!$OMP END DO 4232 4233 return 4234 end 4235 4236* *********************************** 4237* * * 4238* * D3dB_rr_daxpy * 4239* * * 4240* *********************************** 4241 4242 subroutine D3dB_rr_daxpy(nb,alpha,A,B) 4243 implicit none 4244 integer nb 4245 real*8 alpha 4246 real*8 A(*) 4247 real*8 B(*) 4248 4249#include "D3dB.fh" 4250 4251 integer i 4252 4253!$OMP DO 4254 do i=1,n2ft3d_map(nb) 4255 B(i) = B(i) + alpha* A(i) 4256 end do 4257!$OMP END DO 4258 4259 return 4260 end 4261 4262* *********************************** 4263* * * 4264* * D3dB_rr_Divide * 4265* * * 4266* *********************************** 4267 4268 subroutine D3dB_rr_Divide(nb,A,B,C) 4269 implicit none 4270 integer nb 4271 real*8 A(*) 4272 real*8 B(*) 4273 real*8 C(*) 4274 4275#include "D3dB.fh" 4276 4277 real*8 eta 4278 parameter (eta=1.0d-9) 4279 4280 integer index 4281 4282 !do q=1,nq(nb) 4283 !do j=1,ny(nb) 4284 !do i=1,nx(nb) 4285CDIR$ NOVECTOR 4286!$OMP DO 4287 do index = 1,n2ft3d_map(nb) 4288 !index = i + (j-1)*(nx(nb)+2) + (q-1)*(nx(nb)+2)*ny(nb) 4289 if (dabs(B(index)) .le. eta) then 4290 C(index) = 0.0d0 4291 else 4292 C(index) = A(index) / B(index) 4293 end if 4294 end do 4295!$OMP END DO 4296 !end do 4297 !end do 4298 !end do 4299 4300 return 4301 end 4302 4303 4304 4305* *********************************** 4306* * * 4307* * D3dB_rr_Divide2 * 4308* * * 4309* *********************************** 4310 4311 subroutine D3dB_rr_Divide2(nb,A,B) 4312 implicit none 4313 integer nb 4314 real*8 A(*) 4315 real*8 B(*) 4316 4317#include "D3dB.fh" 4318 4319 real*8 eta 4320 parameter (eta=1.0d-9) 4321 4322 integer index 4323 4324 !do q=1,nq(nb) 4325 !do j=1,ny(nb) 4326 !do i=1,nx(nb) 4327CDIR$ NOVECTOR 4328!$OMP DO 4329 do index = 1,n2ft3d_map(nb) 4330 !index = i + (j-1)*(nx(nb)+2) + (q-1)*(nx(nb)+2)*ny(nb) 4331 if (dabs(A(index)) .le. eta) then 4332 B(index) = 0.0d0 4333 else 4334 B(index) = B(index) / A(index) 4335 end if 4336 end do 4337!$OMP END DO 4338 !end do 4339 !end do 4340 !end do 4341 4342 return 4343 end 4344 4345 4346 4347* *********************************** 4348* * * 4349* * D3dB_r_ABS * 4350* * * 4351* *********************************** 4352 4353 subroutine D3dB_r_ABS(nb,A,C) 4354 implicit none 4355 integer nb 4356 real*8 A(*) 4357 real*8 C(*) 4358 4359#include "D3dB.fh" 4360 4361 4362 integer index 4363 4364 !do q=1,nq(nb) 4365 !do j=1,ny(nb) 4366 !do i=1,nx(nb) 4367!$OMP DO 4368 do index=1,n2ft3d_map(nb) 4369 !index = i + (j-1)*(nx(nb)+2) + (q-1)*(nx(nb)+2)*ny(nb) 4370 C(index) = dabs(A(index)) 4371 end do 4372!$OMP END DO 4373 !end do 4374 !end do 4375 !end do 4376 4377 return 4378 end 4379 4380 subroutine D3dB_r_abs1(nb,A) 4381 implicit none 4382 integer nb 4383 real*8 A(*) 4384 4385#include "D3dB.fh" 4386 4387 integer index 4388 4389!$OMP DO 4390 do index=1,n2ft3d_map(nb) 4391 !index = i + (j-1)*(nx(nb)+2) + (q-1)*(nx(nb)+2)*ny(nb) 4392 A(index) = dabs(A(index)) 4393 end do 4394!$OMP END DO 4395 return 4396 end 4397 4398* *********************************** 4399* * * 4400* * D3dB_r_ABSMAX * 4401* * * 4402* *********************************** 4403 4404 subroutine D3dB_r_ABSMAX(nb,A,amax) 4405 implicit none 4406 integer nb 4407 real*8 A(*) 4408 real*8 amax 4409 4410#include "D3dB.fh" 4411 4412 integer index 4413 real*8 aa 4414 4415 amax = 0.0d0 4416 do index=1,n2ft3d_map(nb) 4417 !index = i + (j-1)*(nx(nb)+2) + (q-1)*(nx(nb)+2)*ny(nb) 4418 aa = dabs(A(index)) 4419 if (aa.gt.amax) amax = aa 4420 end do 4421 call D3dB_MaxAll(amax) 4422 4423 return 4424 end 4425 4426 4427* *********************************** 4428* * * 4429* * D3dB_r_ZeroNegative * 4430* * * 4431* *********************************** 4432 4433 subroutine D3dB_r_ZeroNegative(nb,A) 4434 implicit none 4435 integer nb 4436 real*8 A(*) 4437 4438#include "D3dB.fh" 4439 4440 integer i 4441 4442 do i=1,n2ft3d_map(nb) 4443 if (A(i).lt.0.0d0) A(i) = 0.0d0 4444 end do 4445 4446 return 4447 end 4448 4449* *********************************** 4450* * * 4451* * D3dB_rr_Minus * 4452* * * 4453* *********************************** 4454 subroutine D3dB_rr_Minus(nb,A,B,C) 4455 implicit none 4456 integer nb 4457 real*8 A(*) 4458 real*8 B(*) 4459 real*8 C(*) 4460 4461#include "D3dB.fh" 4462 4463 integer i 4464 4465!$OMP DO 4466 do i=1,n2ft3d_map(nb) 4467 C(i) = A(i) - B(i) 4468 end do 4469!$OMP END DO 4470 4471 return 4472 end 4473 4474* *********************************** 4475* * * 4476* * D3dB_r_Zero_Ends0 * 4477* * * 4478* *********************************** 4479 4480 subroutine D3dB_r_Zero_Ends0(nb,A) 4481 integer nb 4482 real*8 A(*) 4483 4484#include "D3dB.fh" 4485 4486 integer j,k,q,index,taskid,p 4487 4488 !**** slab mapping **** 4489 if (mapping.eq.1) then 4490 do q=1,nq(nb) 4491 do j=1,ny(nb) 4492 index = (nx(nb)+1) + (j-1)*(nx(nb)+2) 4493 > + (q-1)*(nx(nb)+2)*(ny(nb)) 4494 4495 A(index) = 0.0d0 4496 A(index+1) = 0.0d0 4497 end do 4498 end do 4499 4500 4501 !**** hilbert mapping **** 4502 else 4503 call Parallel2d_taskid_i(taskid) 4504 do k=1,nz(nb) 4505 do j=1,ny(nb) 4506 4507 call D3dB_ijktoindex2p(nb,(nx(nb)+1),j,k,index,p) 4508 if (p.eq.taskid) A(index) = 0.0d0 4509 4510 call D3dB_ijktoindex2p(nb,(nx(nb)+2),j,k,index,p) 4511 if (p.eq.taskid) A(index) = 0.0d0 4512 4513 end do 4514 end do 4515 end if 4516 4517 if (n2ft3d_map(nb).lt.n2ft3d(nb)) then 4518 call dcopy((n2ft3d(nb)-n2ft3d_map(nb)), 4519 > 0.0d0,0,A(n2ft3d_map(nb)+1),1) 4520 end if 4521 4522 return 4523 end 4524 4525 4526 4527 4528 4529* *********************************** 4530* * * 4531* * D3dB_r_Zero_Ends * 4532* * * 4533* *********************************** 4534 4535 subroutine D3dB_r_Zero_Ends(nb,A) 4536 integer nb 4537 real*8 A(*) 4538 4539#include "D3dB.fh" 4540 4541 integer j,k,q,index,taskid,p 4542 4543 !**** slab mapping **** 4544 if (mapping.eq.1) then 4545!$OMP DO 4546 do q=1,nq(nb) 4547 do j=1,ny(nb) 4548 index = (nx(nb)+1) + (j-1)*(nx(nb)+2) 4549 > + (q-1)*(nx(nb)+2)*(ny(nb)) 4550 4551 A(index) = 0.0d0 4552 A(index+1) = 0.0d0 4553 end do 4554 end do 4555!$OMP END DO 4556 4557 4558 !**** hilbert mapping **** 4559 else 4560 call Parallel2d_taskid_i(taskid) 4561!$OMP DO 4562 do k=1,nz(nb) 4563 do j=1,ny(nb) 4564 4565 call D3dB_ijktoindex2p(nb,(nx(nb)+1),j,k,index,p) 4566 if (p.eq.taskid) A(index) = 0.0d0 4567 4568 call D3dB_ijktoindex2p(nb,(nx(nb)+2),j,k,index,p) 4569 if (p.eq.taskid) A(index) = 0.0d0 4570 4571 end do 4572 end do 4573!$OMP end DO 4574 end if 4575 4576!$OMP MASTER 4577 if (n2ft3d_map(nb).lt.n2ft3d(nb)) then 4578 call dcopy((n2ft3d(nb)-n2ft3d_map(nb)), 4579 > 0.0d0,0,A(n2ft3d_map(nb)+1),1) 4580 end if 4581!$OMP END MASTER 4582!$OMP BARRIER 4583 4584 return 4585 end 4586 4587 4588 4589* *********************************** 4590* * * 4591* * D3dB_r_notZero_Ends * 4592* * * 4593* *********************************** 4594 4595 subroutine D3dB_r_notZero_Ends(nb,A) 4596 integer nb 4597 real*8 A(*) 4598 4599#include "D3dB.fh" 4600 4601 integer j,k,q,index,taskid,p 4602 4603 !**** slab mapping **** 4604 if (mapping.eq.1) then 4605 do q=1,nq(nb) 4606 do j=1,ny(nb) 4607 index = (nx(nb)+1) + (j-1)*(nx(nb)+2) 4608 > + (q-1)*(nx(nb)+2)*(ny(nb)) 4609 A(index) = 1.0d0 4610 A(index+1) = 1.0d0 4611 end do 4612 end do 4613 4614 4615 !**** hilbert mapping **** 4616 else 4617 call Parallel2d_taskid_i(taskid) 4618 do k=1,nz(nb) 4619 do j=1,ny(nb) 4620 4621 call D3dB_ijktoindex2p(nb,(nx(nb)+1),j,k,index,p) 4622 if (p.eq.taskid) A(index) = 1.0d0 4623 4624 call D3dB_ijktoindex2p(nb,(nx(nb)+2),j,k,index,p) 4625 if (p.eq.taskid) A(index) = 1.0d0 4626 4627 end do 4628 end do 4629 end if 4630 4631 return 4632 end 4633 4634 4635 4636 4637* *********************************** 4638* * * 4639* * D3dB_r_dsum * 4640* * * 4641* *********************************** 4642 4643 subroutine D3dB_r_dsum(nb,A,sumall) 4644 implicit none 4645 integer nb 4646 real*8 A(*) 4647 real*8 sumall 4648 4649#include "D3dB.fh" 4650 4651 integer i,np 4652 real*8 sum 4653 4654 call Parallel2d_np_i(np) 4655 4656* **** sum up dot product on this node **** 4657 sum = 0.0d0 4658 do i=1,n2ft3d_map(nb) 4659 sum = sum + A(i) 4660 end do 4661 4662* **** add up sums from other nodes **** 4663 if (np.gt.1) then 4664 call D3dB_SumAll(sum) 4665 end if 4666 4667 sumall = sum 4668 4669 return 4670 end 4671 4672* *********************************** 4673* * * 4674* * D3dB_t_dsum * 4675* * * 4676* *********************************** 4677 4678 subroutine D3dB_t_dsum(nb,A,sumall) 4679 implicit none 4680 integer nb 4681 real*8 A(*) 4682 real*8 sumall 4683 4684#include "D3dB.fh" 4685 4686 integer i,j,k,q,np,nxh,index,taskid,p 4687 real*8 sum 4688 4689 nxh = nx(nb)/2 4690 call Parallel2d_np_i(np) 4691 4692* **** sum up dot product on this node **** 4693 sum = 0.0d0 4694 4695 !********************** 4696 !**** slab mapping **** 4697 !********************** 4698 if (mapping.eq.1) then 4699* ***** k!=0 plane so double count ***** 4700 do q=1,nq(nb) 4701 do j=1,ny(nb) 4702 do i=2,(nxh+1) 4703 index = (q-1)*(nxh+1)*ny(nb) + (j-1)*(nxh+1) + i 4704 sum = sum + A(index) 4705 end do 4706 end do 4707 end do 4708 sum = sum*2.0d0 4709 4710* ***** k==0 plane, so single count ***** 4711 do q=1,nq(nb) 4712 do j=1,ny(nb) 4713 index = (q-1)*(nxh+1)*ny(nb) + (j-1)*(nxh+1) + 1 4714 sum = sum + A(index) 4715 end do 4716 end do 4717 4718 4719 !************************* 4720 !**** hilbert mapping **** 4721 !************************* 4722 else 4723 call Parallel2d_taskid_i(taskid) 4724* ***** kx!=0 plane, so double count ***** 4725 do index=1,nfft3d_map(nb) 4726 sum = sum + A(index) 4727 end do 4728 sum = sum*2.0d0 4729 4730* ***** kx==0 plane, so single count ***** 4731 do k=1,nz(nb) 4732 do j=1,ny(nb) 4733 i=1 4734 call D3dB_ijktoindexp(nb,i,j,k,index,p) 4735 if (p.eq.taskid) then 4736 sum = sum - A(index) 4737 end if 4738 end do 4739 end do 4740 4741 end if 4742 4743 4744 4745 4746* **** add up sums from other nodes **** 4747 if (np.gt.1) then 4748 call D3dB_SumAll(sum) 4749 end if 4750 4751 sumall = sum 4752 4753 return 4754 end 4755 4756 4757* *********************************** 4758* * * 4759* * D3dB_cc_Vector_dot * 4760* * * 4761* *********************************** 4762 4763 subroutine D3dB_cc_Vector_dot(nb,nnfft3d,nn,ne,A,B,sumall) 4764 implicit none 4765 integer nb 4766 integer nnfft3d,nn,ne 4767 complex*16 A(*) 4768 complex*16 B(*) 4769 real*8 sumall(nn,nn) 4770 4771 4772#include "D3dB.fh" 4773 4774 integer i,j,k,q,index,np,taskid,p 4775 integer index1,index2 4776 integer n,m,shift1,shift2 4777 real*8 sum 4778 4779 4780 call nwpw_timing_start(2) 4781 4782 call Parallel2d_np_i(np) 4783 4784 !********************** 4785 !**** slab mapping **** 4786 !********************** 4787 if (mapping.eq.1) then 4788* **** sum up dot product on this node **** 4789 do n=1,ne 4790 do m=n,ne 4791 4792 shift1 = (n-1)*nnfft3d 4793 shift2 = (m-1)*nnfft3d 4794 sum = 0.0d0 4795 4796* ***** kx!=0 plane, so double count ***** 4797 do q=1,nq(nb) 4798 do j=1,ny(nb) 4799 do i=2,(nx(nb)/2+1) 4800 index = (q-1)*(nx(nb)/2+1)*ny(nb) 4801 > + (j-1)*(nx(nb)/2+1) + i 4802 index1 = index+shift1 4803 index2 = index+shift2 4804 sum = sum + dble(A(index1)) * dble(B(index2)) 4805 > + dimag(A(index1)) * dimag(B(index2)) 4806 end do 4807 end do 4808 end do 4809 sum = sum*2.0d0 4810 4811* ***** kx==0 plane, so single count ***** 4812 do q=1,nq(nb) 4813 do j=1,ny(nb) 4814 i=1 4815 index = (q-1)*(nx(nb)/2+1)*ny(nb) 4816 > + (j-1)*(nx(nb)/2+1) + 1 4817 index1 = index+shift1 4818 index2 = index+shift2 4819 sum = sum + dble(A(index1)) * dble(B(index2)) 4820 > + dimag(A(index1)) * dimag(B(index2)) 4821 end do 4822 end do 4823 4824 sumall(n,m) = sum 4825 sumall(m,n) = sum 4826 end do 4827 end do 4828 4829 4830 !************************* 4831 !**** hilbert mapping **** 4832 !************************* 4833 else 4834 call Parallel2d_taskid_i(taskid) 4835* **** sum up dot product on this node **** 4836 do n=1,ne 4837 do m=n,ne 4838 4839 shift1 = (n-1)*nnfft3d 4840 shift2 = (m-1)*nnfft3d 4841 sum = 0.0d0 4842 4843* ***** kx!=0 plane, so double count ***** 4844 do index=1,nfft3d_map(nb) 4845 index1 = index+shift1 4846 index2 = index+shift2 4847 sum = sum + dble(A(index1)) * dble(B(index2)) 4848 > + dimag(A(index1)) * dimag(B(index2)) 4849 end do 4850 sum = sum*2.0d0 4851 4852* ***** kx==0 plane, so single count ***** 4853 do k=1,nz(nb) 4854 do j=1,ny(nb) 4855 i=1 4856 call D3dB_ijktoindexp(nb,i,j,k,index,p) 4857 if (p.eq.taskid) then 4858 index1 = index+shift1 4859 index2 = index+shift2 4860 sum = sum - dble(A(index1)) * dble(B(index2)) 4861 > - dimag(A(index1)) * dimag(B(index2)) 4862 end if 4863 end do 4864 end do 4865 4866 4867 sumall(n,m) = sum 4868 sumall(m,n) = sum 4869 end do 4870 end do 4871 4872 end if 4873 4874 4875* **** add up sums from other nodes **** 4876 if (np.gt.1) then 4877 call D3dB_Vector_SumAll(nn*ne,sumall) 4878 end if 4879 4880 call nwpw_timing_end(2) 4881 4882 return 4883 end 4884 4885 4886 4887* *********************************** 4888* * * 4889* * D3dB_cc_Vector_ndot * 4890* * * 4891* *********************************** 4892 4893 subroutine D3dB_cc_Vector_ndot(nb,nnfft3d,ne,A,B,sumall) 4894 implicit none 4895 integer nb 4896 integer nnfft3d,ne 4897 complex*16 A(*) 4898 complex*16 B(*) 4899 real*8 sumall(ne) 4900 4901 4902#include "D3dB.fh" 4903 4904 integer i,j,k,q,index,np,taskid,p 4905 integer index1,index2 4906 integer n,shift1,shift2 4907 real*8 sum 4908 4909 4910 call nwpw_timing_start(2) 4911 4912 call Parallel2d_np_i(np) 4913 4914 !********************** 4915 !**** slab mapping **** 4916 !********************** 4917 if (mapping.eq.1) then 4918* **** sum up dot product on this node **** 4919 do n=1,ne 4920 4921 shift1 = (n-1)*nnfft3d 4922 shift2 = 0 4923 sum = 0.0d0 4924* ***** kx!=0 plane, so double count ***** 4925 do q=1,nq(nb) 4926 do j=1,ny(nb) 4927 do i=2,(nx(nb)/2+1) 4928 index = (q-1)*(nx(nb)/2+1)*ny(nb) 4929 > + (j-1)*(nx(nb)/2+1) + i 4930 index1 = index+shift1 4931 index2 = index+shift2 4932 sum = sum + dble(A(index1)) * dble(B(index2)) 4933 > + dimag(A(index1)) * dimag(B(index2)) 4934 end do 4935 end do 4936 end do 4937 sum = sum*2.0d0 4938 4939* ***** kx==0 plane, so single count ***** 4940 do q=1,nq(nb) 4941 do j=1,ny(nb) 4942 i=1 4943 index = (q-1)*(nx(nb)/2+1)*ny(nb) 4944 > + (j-1)*(nx(nb)/2+1) + 1 4945 index1 = index+shift1 4946 index2 = index+shift2 4947 sum = sum + dble(A(index1)) * dble(B(index2)) 4948 > + dimag(A(index1)) * dimag(B(index2)) 4949 end do 4950 end do 4951 4952 sumall(n) = sum 4953 end do 4954 4955 4956 !************************* 4957 !**** hilbert mapping **** 4958 !************************* 4959 else 4960 call Parallel2d_taskid_i(taskid) 4961* **** sum up dot product on this node **** 4962 do n=1,ne 4963 4964 shift1 = (n-1)*nnfft3d 4965 shift2 = 0 4966 sum = 0.0d0 4967 4968* ***** kx!=0 plane, so double count ***** 4969 do index=1,nfft3d_map(nb) 4970 index1 = index+shift1 4971 index2 = index+shift2 4972 sum = sum + dble(A(index1)) * dble(B(index2)) 4973 > + dimag(A(index1)) * dimag(B(index2)) 4974 end do 4975 sum = sum*2.0d0 4976 4977* ***** kx==0 plane, so single count ***** 4978 do k=1,nz(nb) 4979 do j=1,ny(nb) 4980 i=1 4981 call D3dB_ijktoindexp(nb,i,j,k,index,p) 4982 if (p.eq.taskid) then 4983 index1 = index+shift1 4984 index2 = index+shift2 4985 sum = sum - dble(A(index1)) * dble(B(index2)) 4986 > - dimag(A(index1)) * dimag(B(index2)) 4987 end if 4988 end do 4989 end do 4990 4991 sumall(n) = sum 4992 end do 4993 4994 end if 4995 4996 4997* **** add up sums from other nodes **** 4998 if (np.gt.1) then 4999 call D3dB_Vector_SumAll(ne,sumall) 5000 end if 5001 5002 call nwpw_timing_end(2) 5003 5004 return 5005 end 5006 5007 5008 5009 5010 5011c* *********************************** 5012c* * * 5013c* * D3dB_Vector_SumAll * 5014c* * * 5015c* *********************************** 5016c 5017c subroutine D3dB_Vector_SumAll(n,sum) 5018cc implicit none 5019c integer n 5020c real*8 sum(*) 5021c 5022c#include "bafdecls.fh" 5023c 5024c#include "tcgmsg.fh" 5025c#include "msgtypesf.h" 5026c#include "errquit.fh" 5027c 5028c 5029c 5030c logical value 5031c integer MASTER 5032c parameter (MASTER=0) 5033c integer msglen 5034c 5035c* **** temporary workspace **** 5036c integer sumall(2),np 5037c 5038c* **** external functions **** 5039c integer Parallel2d_comm_i 5040c external Parallel2d_comm_i 5041c 5042c 5043c 5044c call Parallel_np(np) 5045c call nwpw_timing_start(2) 5046c if (np.gt.1) then 5047c 5048c* ***** allocate temporary space **** 5049c value = BA_push_get(mt_dbl,n,'sumall',sumall(2),sumall(1)) 5050c if (.not. value) call errquit('out of stack memory',0, MA_ERR) 5051c 5052c msglen = n 5053c 5054c 5055c call dcopy(n,sum,1,dbl_mb(sumall(1)),1) 5056c 5057c call GA_PGROUP_DGOP(Parallel2d_comm_i(), 5058c > 9+MSGDBL,dbl_mb(sumall(1)),n,'+') 5059cc call GA_DGOP(9+MSGDBL,dbl_mb(sumall(1)),n,'+') 5060cc call DGOP(9+MSGDBL,dbl_mb(sumall(1)),n,'+') 5061c 5062c 5063c call dcopy(n,dbl_mb(sumall(1)),1,sum,1) 5064c value = BA_pop_stack(sumall(2)) 5065c 5066c end if 5067c call nwpw_timing_end(2) 5068c return 5069c end 5070c 5071c 5072c* *********************************** 5073c* * * 5074c* * D3dB_Vector_ISumAll * 5075c* * * 5076c* *********************************** 5077c 5078c subroutine D3dB_Vector_ISumAll(n,sum) 5079cc implicit none 5080c integer n 5081c integer sum(*) 5082c 5083c#include "bafdecls.fh" 5084c#include "errquit.fh" 5085c 5086c 5087c 5088c#include "tcgmsg.fh" 5089c#include "msgtypesf.h" 5090c 5091c 5092c 5093c integer MASTER 5094c parameter (MASTER=0) 5095c integer msglen 5096c logical value 5097c 5098c* **** temporary workspace **** 5099c integer sumall(2) 5100c 5101c* **** external functions **** 5102c integer Parallel2d_comm_i 5103c external Parallel2d_comm_i 5104c 5105c 5106c call nwpw_timing_start(2) 5107c 5108c* ***** allocate temporary space **** 5109c value = BA_push_get(mt_int,n,'sumall',sumall(2),sumall(1)) 5110c if (.not. value) call errquit('out of stack memory',0, MA_ERR) 5111c 5112c msglen = n 5113c 5114c 5115c call icopy(n,sum,1,int_mb(sumall(1)),1) 5116c call GA_PGROUP_IGOP(Parallel2d_comm_i(), 5117c > 9+MSGINT,int_mb(sumall(1)),n,'+') 5118cc call GA_IGOP(9+MSGINT,int_mb(sumall(1)),n,'+') 5119c 5120c 5121c call icopy(n,int_mb(sumall(1)),1,sum,1) 5122c value = BA_pop_stack(sumall(2)) 5123c 5124c call nwpw_timing_end(2) 5125c return 5126c end 5127 5128 5129c *** icopy define in src/util directory!!! 5130c subroutine icopy(n,a,inca,b,incb) 5131c integer n 5132c integer a(*),inca 5133c integer b(*),incb 5134c 5135c integer i,shifta,shiftb 5136c 5137c shifta = 1 5138c shiftb = 1 5139c do i=1,n 5140c b(shiftb)=a(shifta) 5141c shifta=shifta+inca 5142c shiftb=shiftb+incb 5143c end do 5144c 5145c return 5146c end 5147 5148 5149* *********************************** 5150* * * 5151* * D3dB_ic_Mul * 5152* * * 5153* *********************************** 5154cpgi$r opt=1 5155 subroutine D3dB_ic_Mul(nb,A,B,C) 5156 implicit none 5157 integer nb 5158 real*8 A(*) 5159 complex*16 B(*) 5160 complex*16 C(*) 5161 5162#include "D3dB.fh" 5163 5164 integer i 5165 5166!$OMP DO 5167 do i=1,nfft3d_map(nb) 5168 C(i) = dcmplx(0.0d0,A(i)) * B(i) 5169 end do 5170!$OMP END DO 5171 5172 return 5173 end 5174 5175* *********************************** 5176* * * 5177* * D3dB_ic_Mul2 * 5178* * * 5179* *********************************** 5180cpgi$r opt=1 5181 subroutine D3dB_ic_Mul2(nb,A,B) 5182 implicit none 5183 integer nb 5184 real*8 A(*) 5185 complex*16 B(*) 5186 5187#include "D3dB.fh" 5188 5189 integer i 5190 5191!$OMP DO 5192 do i=1,nfft3d_map(nb) 5193 B(i) = dcmplx(0.0d0,A(i)) * B(i) 5194 end do 5195!$OMP END DO 5196 5197 return 5198 end 5199 5200 5201* *********************************** 5202* * * 5203* * D3dB_r_Expand * 5204* * * 5205* *********************************** 5206 5207 subroutine D3dB_r_Expand(nb1,A,nb2,B) 5208 implicit none 5209 integer nb1 5210 real*8 A(*) 5211 integer nb2 5212 real*8 B(*) 5213 5214#include "D3dB.fh" 5215 5216 integer i,j,q,index1,index2 5217 5218 if (mapping.eq.1) then 5219c call dcopy(nq(nb2)*ny(nb2)*(nx(nb2)+2),0.0d0,0,B,1) 5220 call Parallel_shared_vector_zero(.true., 5221 > nq(nb2)*ny(nb2)*(nx(nb2)+2),B) 5222!$OMP DO 5223 do j=1,ny(nb1) 5224 do q=1,nq(nb1) 5225 do i=1,nx(nb1) 5226 index1 = i + (j-1)*(nx(nb1)+2) + (q-1)*(nx(nb1)+2)*ny(nb1) 5227 index2 = i + (j-1)*(nx(nb2)+2) + (q-1)*(nx(nb2)+2)*ny(nb2) 5228 B(index2) = A(index1) 5229 end do 5230 end do 5231 end do 5232!$OMP END DO 5233 5234 else 5235c call dcopy(n2ft3d(nb2),0.0d0,0,B,1) 5236 call Parallel_shared_vector_zero(.true.,n2ft3d(nb2),B) 5237!$OMP DO 5238 do q=1,nq1(nb1) 5239 do i=1,nx(nb1) 5240 index1 = i + (q-1)*(nx(nb1)+2) 5241 index2 = i + (q-1)*(nx(nb2)+2) 5242 B(index2) = A(index1) 5243 end do 5244 end do 5245!$OMP END DO 5246 end if 5247 return 5248 end 5249 5250* *********************************** 5251* * * 5252* * D3dB_r_Contract * 5253* * * 5254* *********************************** 5255 5256 subroutine D3dB_r_Contract(nb2,B,nb1,A) 5257 implicit none 5258 integer nb2 5259 real*8 B(*) 5260 integer nb1 5261 real*8 A(*) 5262 5263#include "D3dB.fh" 5264 5265 integer i,j,q,index1,index2 5266 5267 if (mapping.eq.1) then 5268c call dcopy(nq(nb1)*ny(nb1)*(nx(nb1)+2),0.0d0,0,A,1) 5269 call Parallel_shared_vector_zero(.true., 5270 > nq(nb1)*ny(nb1)*(nx(nb1)+2),A) 5271!$OMP DO 5272 do j=1,ny(nb1) 5273 do q=1,nq(nb1) 5274 do i=1,nx(nb1) 5275 index1 = i + (j-1)*(nx(nb1)+2) + (q-1)*(nx(nb1)+2)*ny(nb1) 5276 index2 = i + (j-1)*(nx(nb2)+2) + (q-1)*(nx(nb2)+2)*ny(nb2) 5277 A(index1) = B(index2) 5278 end do 5279 end do 5280 end do 5281!$OMP END DO 5282 5283 else 5284c call dcopy(n2ft3d(nb1),0.0d0,0,A,1) 5285 call Parallel_shared_vector_zero(.true.,n2ft3d(nb1),A) 5286!$OMP DO 5287 do q=1,nq1(nb1) 5288 do i=1,nx(nb1) 5289 index1 = i + (q-1)*(nx(nb1)+2) 5290 index2 = i + (q-1)*(nx(nb2)+2) 5291 A(index1) = B(index2) 5292 end do 5293 end do 5294!$OMP END DO 5295 end if 5296 5297 return 5298 end 5299 5300* *********************************** 5301* * * 5302* * D3dB_timereverse_size * 5303* * * 5304* *********************************** 5305 5306 integer function D3dB_timereverse_size(nb) 5307 implicit none 5308 integer nb 5309 5310#include "D3dB.fh" 5311 5312* **** local variables **** 5313 integer proc_to,proc_from 5314 integer pto,np,taskid 5315 integer phere,itmp,itmp2 5316 integer index1,index2 5317 integer it,size 5318 integer i2,j2,k2 5319 integer i3,j3,k3 5320 integer nyh,nzh 5321 5322 call Parallel2d_taskid_i(taskid) 5323 call Parallel2d_np_i(np) 5324 5325 nyh = ny(nb)/2 5326 nzh = nz(nb)/2 5327 5328 index1 = 1 5329 index2 = 1 5330 do it=0,np-1 5331 proc_to = mod(taskid+it,np) 5332 proc_from = mod(taskid-it+np,np) 5333 5334* ********************* 5335* **** K=(0,0,k3) **** 5336* ********************* 5337 do k3=1,(nzh-1) 5338 i3 = k3 5339 j3 = -k3 5340 if (i3.lt.0) i3 = i3 + nz(nb) 5341 if (j3.lt.0) j3 = j3 + nz(nb) 5342 i2 = 1 5343 i3 = i3 + 1 5344 j2 = 1 5345 j3 = j3 + 1 5346 5347* **** packing scheme **** 5348 call D3dB_ijktoindexp(nb,1,1,i3,itmp,phere) 5349 call D3dB_ijktoindexp(nb,1,1,j3,itmp2,pto) 5350 if ((phere.eq.taskid).and.(pto.eq.proc_to)) then 5351 index1 = index1 + 1 5352 end if 5353 5354* **** unpacking scheme **** 5355 if ((pto.eq.taskid).and.(phere.eq.proc_from)) then 5356 index2 = index2 + 1 5357 end if 5358 5359 end do 5360 5361* ********************* 5362* **** k=(0,k2,k3) **** 5363* ********************* 5364 do k3=(-nzh+1),(nzh-1) 5365 do k2=1,(nyh-1) 5366 i2 = k2 5367 i3 = k3 5368 j2 = -k2 5369 j3 = -k3 5370 if (i2.lt.0) i2 = i2 + ny(nb) 5371 if (i3.lt.0) i3 = i3 + nz(nb) 5372 if (j2.lt.0) j2 = j2 + ny(nb) 5373 if (j3.lt.0) j3 = j3 + nz(nb) 5374 i2 = i2 + 1 5375 i3 = i3 + 1 5376 j2 = j2 + 1 5377 j3 = j3 + 1 5378 5379* **** packing scheme **** 5380 call D3dB_ijktoindexp(nb,1,i2,i3,itmp,phere) 5381 call D3dB_ijktoindexp(nb,1,j2,j3,itmp2,pto) 5382 if ((phere.eq.taskid).and.(pto.eq.proc_to)) then 5383 index1 = index1 + 1 5384 end if 5385 5386* **** unpacking scheme **** 5387 if ((pto.eq.taskid).and.(phere.eq.proc_from)) then 5388 index2 = index2 + 1 5389 end if 5390 end do 5391 end do 5392 5393 end do 5394 size = index1 5395 if (size.lt.index2) size = index2 5396 5397 D3dB_timereverse_size = size 5398 return 5399 end 5400 5401* *********************************** 5402* * * 5403* * D3dB_c_timereverse_init * 5404* * * 5405* *********************************** 5406 5407 subroutine D3dB_c_timereverse_init(nb) 5408 implicit none 5409 integer nb 5410 5411#include "bafdecls.fh" 5412#include "D3dB.fh" 5413#include "errquit.fh" 5414 5415c integer iq_to_i1(2*NFFT2*NSLABS) 5416c integer iq_to_i2(2*NFFT2*NSLABS) 5417c integer i1_start(NFFT3+1) 5418c integer i2_start(NFFT3+1) 5419 integer iq_to_i1(2,NBLOCKS) 5420 integer iq_to_i2(2,NBLOCKS) 5421 integer i1_start(2,NBLOCKS) 5422 integer i2_start(2,NBLOCKS) 5423 common / timereverse_blk / iq_to_i1,iq_to_i2,i1_start,i2_start 5424 5425* **** local variables **** 5426 integer proc_to,proc_from 5427 integer pto,np,taskid 5428 integer phere 5429 integer index1,index2,itmp,itmp2 5430 integer it 5431 integer i2,j2,k2 5432 integer i3,j3,k3 5433 integer nyh,nzh 5434 logical value 5435 5436 !**** external functions **** 5437 integer D3dB_timereverse_size 5438 external D3dB_timereverse_size 5439 5440 call Parallel2d_taskid_i(taskid) 5441 call Parallel2d_np_i(np) 5442 5443 nyh = ny(nb)/2 5444 nzh = nz(nb)/2 5445 5446* **** set zplane_size **** 5447 zplane_size(nb) = D3dB_timereverse_size(nb) 5448 5449* **** allocate timereverse_blk common block **** 5450 value = BA_alloc_get(mt_int,zplane_size(nb), 5451 > 'iq_to_i1',iq_to_i1(2,nb),iq_to_i1(1,nb)) 5452 value = value.and. 5453 > BA_alloc_get(mt_int,zplane_size(nb), 5454 > 'iq_to_i2',iq_to_i2(2,nb),iq_to_i2(1,nb)) 5455 5456 value = value.and. 5457 > BA_alloc_get(mt_int,(np+1), 5458 > 'i1_start',i1_start(2,nb),i1_start(1,nb)) 5459 value = value.and. 5460 > BA_alloc_get(mt_int,(np+1), 5461 > 'i2_start',i2_start(2,nb),i2_start(1,nb)) 5462 if (.not.value) call errquit('out of heap memory',0, MA_ERR) 5463 5464 5465 5466!MATHIAS 5467 index1 = 1 5468 index2 = 1 5469 do it=0,np-1 5470 proc_to = mod(taskid+it,np) 5471 proc_from = mod(taskid-it+np,np) 5472c i1_start(it+1) = index1 5473c i2_start(it+1) = index2 5474 int_mb(i1_start(1,nb)+it) = index1 5475 int_mb(i2_start(1,nb)+it) = index2 5476 5477* ********************* 5478* **** K=(0,0,k3) **** 5479* ********************* 5480 do k3=1,(nzh-1) 5481 i3 = k3 5482 j3 = -k3 5483 if (i3.lt.0) i3 = i3 + nz(nb) 5484 if (j3.lt.0) j3 = j3 + nz(nb) 5485 i2 = 1 5486 i3 = i3 + 1 5487 j2 = 1 5488 j3 = j3 + 1 5489 5490* **** packing scheme **** 5491 call D3dB_ijktoindexp(nb,1,1,i3,itmp,phere) 5492 call D3dB_ijktoindexp(nb,1,1,j3,itmp2,pto) 5493 !call D3dB_ktoqp(nb,i3,qhere,phere) 5494 !call D3dB_ktoqp(nb,j3,qto,pto) 5495 if ((phere.eq.taskid).and.(pto.eq.proc_to)) then 5496c itmp = 1 + (i2-1)*(nx(nb)/2+1) 5497c > + (qhere-1)*(nx(nb)/2+1)*ny(nb) 5498c iq_to_i1(index1) = itmp 5499 int_mb(iq_to_i1(1,nb)+index1-1)=itmp 5500 index1 = index1 + 1 5501 end if 5502 5503* **** unpacking scheme **** 5504 !call D3dB_ktoqp(nb,j3,qhere,phere) 5505 !call D3dB_ktoqp(nb,i3,qfrom,pfrom) 5506 if ((pto.eq.taskid).and.(phere.eq.proc_from)) then 5507c itmp = 1 + (j2-1)*(nx(nb)/2+1) 5508c > + (qhere-1)*(nx(nb)/2+1)*ny(nb) 5509c iq_to_i2(index2) = itmp 5510 int_mb(iq_to_i2(1,nb)+index2-1) = itmp2 5511 index2 = index2 + 1 5512 end if 5513 5514 end do 5515 5516* ********************* 5517* **** k=(0,k2,k3) **** 5518* ********************* 5519 do k3=(-nzh+1),(nzh-1) 5520 do k2=1,(nyh-1) 5521 i2 = k2 5522 i3 = k3 5523 j2 = -k2 5524 j3 = -k3 5525 if (i2.lt.0) i2 = i2 + ny(nb) 5526 if (i3.lt.0) i3 = i3 + nz(nb) 5527 if (j2.lt.0) j2 = j2 + ny(nb) 5528 if (j3.lt.0) j3 = j3 + nz(nb) 5529 i2 = i2 + 1 5530 i3 = i3 + 1 5531 j2 = j2 + 1 5532 j3 = j3 + 1 5533 5534* **** packing scheme **** 5535 call D3dB_ijktoindexp(nb,1,i2,i3,itmp,phere) 5536 call D3dB_ijktoindexp(nb,1,j2,j3,itmp2,pto) 5537 !call D3dB_ktoqp(nb,i3,qhere,phere) 5538 !call D3dB_ktoqp(nb,j3,qto,pto) 5539 if ((phere.eq.taskid).and.(pto.eq.proc_to)) then 5540c itmp = 1 + (i2-1)*(nx(nb)/2+1) 5541c > + (qhere-1)*(nx(nb)/2+1)*ny(nb) 5542c iq_to_i1(index1) = itmp 5543 int_mb(iq_to_i1(1,nb)+index1-1) = itmp 5544 index1 = index1 + 1 5545 end if 5546 5547* **** unpacking scheme **** 5548 !call D3dB_ktoqp(nb,j3,qhere,phere) 5549 !call D3dB_ktoqp(nb,i3,qfrom,pfrom) 5550 if ((pto.eq.taskid).and.(phere.eq.proc_from)) then 5551c itmp = 1 + (j2-1)*(nx(nb)/2+1) 5552c > + (qhere-1)*(nx(nb)/2+1)*ny(nb) 5553c iq_to_i2(index2) = itmp 5554 int_mb(iq_to_i2(1,nb)+index2-1)=itmp2 5555 index2 = index2 + 1 5556 end if 5557 end do 5558 end do 5559 5560 end do 5561c i1_start(np+1) = index1 5562c i2_start(np+1) = index2 5563 int_mb(i1_start(1,nb)+np) = index1 5564 int_mb(i2_start(1,nb)+np) = index2 5565 5566 return 5567 end 5568 5569 5570 5571* *********************************** 5572* * * 5573* * D3dB_timereverse_end * 5574* * * 5575* *********************************** 5576 subroutine D3dB_timereverse_end(nb) 5577 implicit none 5578 integer nb 5579 5580#include "bafdecls.fh" 5581#include "errquit.fh" 5582 5583c integer iq_to_i1((NFFT1/2+1)*NFFT2*NSLABS) 5584c integer iq_to_i2((NFFT1/2+1)*NFFT2*NSLABS) 5585c integer i1_start(NFFT3+1) 5586c integer i2_start(NFFT3+1) 5587 integer iq_to_i1(2,NBLOCKS) 5588 integer iq_to_i2(2,NBLOCKS) 5589 integer i1_start(2,NBLOCKS) 5590 integer i2_start(2,NBLOCKS) 5591 common / timereverse_blk / iq_to_i1,iq_to_i2,i1_start,i2_start 5592 5593 logical value 5594 5595 value = BA_free_heap(i1_start(2,nb)) 5596 value = value.and. 5597 > BA_free_heap(i2_start(2,nb)) 5598 value = value.and. 5599 > BA_free_heap(iq_to_i1(2,nb)) 5600 value = value.and. 5601 > BA_free_heap(iq_to_i2(2,nb)) 5602 if (.not.value) call errquit('error freeing heap',0, MA_ERR) 5603 return 5604 end 5605 5606 5607 5608 subroutine D3dB_pfft_index1_copy(n,index,a,b) 5609 implicit none 5610 integer n 5611 integer index(*) 5612 complex*16 a(*),b(*) 5613 integer i 5614 5615#ifndef CRAY 5616!DIR$ ivdep 5617#endif 5618!$OMP DO 5619 do i=1,n 5620 b(i) = a(index(i)) 5621 end do 5622!$OMP END DO 5623 5624 return 5625 end 5626 5627 subroutine D3dB_pfft_index2_copy(n,index,a,b) 5628 implicit none 5629 integer n 5630 integer index(*) 5631 complex*16 a(*),b(*) 5632 integer i 5633#ifndef CRAY 5634!DIR$ ivdep 5635#endif 5636!$OMP DO 5637 do i=1,n 5638 b(index(i)) = a(i) 5639 end do 5640!$OMP END DO NOWAIT 5641 return 5642 end 5643 5644 5645 subroutine D3dB_pfft_index2_zero(n,index,a) 5646 implicit none 5647 integer n 5648 integer index(*) 5649 complex*16 a(*) 5650 integer i 5651#ifndef CRAY 5652!DIR$ ivdep 5653#endif 5654!$OMP DO 5655 do i=1,n 5656 a(index(i)) = 0.0d0 5657 end do 5658!$OMP END DO 5659 return 5660 end 5661 5662 5663 5664 subroutine D3dB_pfft_index2_copy_conjg(n,index,a,b) 5665 implicit none 5666 integer n 5667 integer index(*) 5668 complex*16 a(*),b(*) 5669 integer i 5670#ifndef CRAY 5671!DIR$ ivdep 5672#endif 5673!$OMP DO 5674 do i=1,n 5675 b(index(i)) = dconjg(a(i)) 5676 end do 5677!$OMP END DO NOWAIT 5678 return 5679 end 5680 5681 5682 5683