1ccccccccccc  #define TCGMSG
2#define NBLOCKS 3
3
4*
5* $Id$
6*
7
8*     ***********************************************************
9*     *								*
10*     *   		   D3dB library				*
11*     *			(MPI implemenation)			*
12*     *								*
13*     *   Author - Eric Bylaska					*
14*     *   date   - 3/23/96					*
15*     *								*
16*     ***********************************************************
17
18*	The D3dB (distributed three-dimensional block) library is to
19* be used for handling three kinds of data structures.  The first
20* data structure, denoted by "r", is a double precision array of
21* length (nx(nb)+2)*ny(nb)*nz.  The second data structure, denoted by "c", is
22* a double complex array of length of (nx(nb)/2+1)*ny(nb)*nz.  The third data
23
24* (nx(nb)/2+1)*ny(nb)*nz.
25*
26*	The three data structures are distributed across threads, p, in
27* the k (i.e. nz(nb)) dimension using a cyclic decomposition.  So that
28* a "r" array A is defined as double precision A(nx(nb)+2,ny(nb),nq(nb)) on
29* each thread.
30*
31*	Where
32*		np = number of threads
33*		nq(nb) = ceil(nz(nb)/np).
34*		0 <= p < np
35*		1 <= q <= nq(nb)
36*		1 <= k <= nz(nb)
37*
38* 	The mapping of k -> q is defined as:
39*
40*		k = ((q-1)*np + p) + 1
41*		q = ((k-1) - p)/np + 1
42*		p = (k-1) mod np
43*
44*  Libraries used: mpi, blas, fftpack, and compressed_io
45*
46*  common blocks used in this library:
47*
48*       integer nq,nx(NBLOCKS),ny,nz(nb)
49*	common	/ D3dB / nq,nx,ny,nz
50*
51*	integer q_map(NFFT3),p_map(NFFT3),k_map(NFFT3)
52* 	common /D3dB_mapping / q_map,p_map,k_map
53*
54*     integer iq_to_i1((NFFT1/2+1)*NFFT2*NSLABS)
55*     integer iq_to_i2((NFFT1/2+1)*NFFT2*NSLABS)
56*     integer i1_start(NPROCS+1)
57*     integer i2_start(NPROCS+1)
58*     common / trans_blk / iq_to_i1,iq_to_i2,i1_start,i2_start
59
60*     **** local variables ****
61
62*     ***********************************
63*     *					*
64*     *	       Mapping_Init		*
65*     *					*
66*     ***********************************
67
68      subroutine Mapping_Init(nb)
69      implicit none
70      integer nb
71
72#include "bafdecls.fh"
73#include "errquit.fh"
74#include "D3dB.fh"
75
76
77      integer k,q,p
78*     integer kn
79      integer taskid,np
80      logical value
81      integer  tid,Parallel_threadid
82      external Parallel_threadid
83
84      call Parallel2d_np_i(np)
85      call Parallel2d_taskid_i(taskid)
86
87
88
89*     **************************
90*     ****** Slab mapping ******
91*     **************************
92      if (mapping.eq.1) then
93
94*     **** allocate q_map,p_map,k_map
95      value = BA_alloc_get(mt_int,nz(nb),'q_map',q_map(2,nb),
96     >                                       q_map(1,nb))
97      value = value.and.BA_alloc_get(mt_int,nz(nb),'p_map',p_map(2,nb),
98     >                                       p_map(1,nb))
99      value = value.and.BA_alloc_get(mt_int,nz(nb),'k_map',k_map(2,nb),
100     >                                       k_map(1,nb))
101      if (.not. value)
102     > call errquit('Mapping_init:out of heap memory',0, MA_ERR)
103
104
105
106*     ****** cyclic ******
107       p = 0
108       q = 1
109       do k=1,nz(nb)
110         int_mb(q_map(1,nb)+k-1) = q
111         int_mb(p_map(1,nb)+k-1) = p
112         if (p .eq. taskid) nq(nb) = q
113
114         p        = p+1
115         if (p .ge. np) then
116            p = 0
117            q = q + 1
118         end if
119       end do
120
121
122c      if (nb.eq.1) then
123c       p = 0
124c       q = 1
125c       do k=1,nz(nb)
126c         int_mb(q_map(1,nb)+k-1) = q
127c         int_mb(p_map(1,nb)+k-1) = p
128c         if (p .eq. taskid) nq(nb) = q
129c
130c         p        = p+1
131c         if (p .ge. np) then
132c            p = 0
133c            q = q + 1
134c         end if
135c       end do
136c      else if (nb.eq.2) then
137c       p = 0
138c       q = 1
139c       do k=1,nz(1)
140c         int_mb(q_map(1,nb)+k-1)       = int_mb(q_map(1,1)+k-1)
141c         int_mb(q_map(1,nb)+k+nz(1)-1) = int_mb(q_map(1,1)+k-1)+nq(1)
142c         int_mb(p_map(1,nb)+k-1)       = int_mb(p_map(1,1)+k-1)
143c         int_mb(p_map(1,nb)+k+nz(1)-1) = int_mb(p_map(1,1)+k-1)
144c       end do
145c
146c      end if
147
148*     ***** block  ******
149*     **** make sure nz(nb) is a multiple of np ****
150*     kn = mod(nz(nb),np)
151*     if (kn.ne.0) then
152*        kn=(nz(nb)/np)+1
153*     else
154*        kn=(nz(nb)/np)
155*     end if
156*
157*     p=0
158*     q=1
159*     do k=1,nz(nb)
160*        int_mb(q_map(1,nb)+k-1) = q
161*        int_mb(p_map(1,nb)+k-1) = p
162*        if (p .eq. taskid) nq(nb) = q
163*
164*        q=q+1
165*        if (q .gt. (kn)) then
166*           q = 1
167*           p = p + 1
168*        end if
169*     end do
170
171      !*** not used anymore!! ****
172      do k=1,nz(nb)
173         if (int_mb(p_map(1,nb)+k-1) .eq. taskid) then
174c           k_map(q_map(k)) = k
175            int_mb(k_map(1,nb)+int_mb(q_map(1,nb)+k-1)-1) = k
176         end if
177      end do
178
179      nfft3d(nb)     = (nx(nb)/2+1)*ny(nb)*nq(nb)
180      n2ft3d(nb)     = 2*nfft3d(nb)
181      nfft3d_map(nb) = nfft3d(nb)
182      n2ft3d_map(nb) = n2ft3d(nb)
183
184
185*     ******************************
186*     ****** Hilbert mappings ******
187*     ******************************
188      else
189
190
191
192
193
194*     **** allocate q_map1,p_map1,q_map2,p_map2,q_map3,p_map3 ****
195      value =           BA_alloc_get(mt_int,ny(nb)*nz(nb),
196     >                              'q_map1',
197     >                               q_map1(2,nb),
198     >                               q_map1(1,nb))
199
200
201      value = value.and.BA_alloc_get(mt_int,ny(nb)*nz(nb),
202     >                              'p_map1',
203     >                               p_map1(2,nb),
204     >                               p_map1(1,nb))
205
206      value = value.and.BA_alloc_get(mt_int,nz(nb)*(nx(nb)/2+1),
207     >                              'q_map2',
208     >                               q_map2(2,nb),
209     >                               q_map2(1,nb))
210      value = value.and.BA_alloc_get(mt_int,nz(nb)*(nx(nb)/2+1),
211     >                              'p_map2',
212     >                               p_map2(2,nb),
213     >                               p_map2(1,nb))
214
215      value = value.and.BA_alloc_get(mt_int,ny(nb)*(nx(nb)/2+1),
216     >                              'q_map3',
217     >                               q_map3(2,nb),
218     >                               q_map3(1,nb))
219      value = value.and.BA_alloc_get(mt_int,ny(nb)*(nx(nb)/2+1),
220     >                              'p_map3',
221     >                               p_map3(2,nb),
222     >                               p_map3(1,nb))
223      if (.not. value)
224     > call errquit('Mapping_init:out of heap memory',1, MA_ERR)
225
226
227      !**** double grid map1 defined wrt to single grid         ****
228      !**** makes expand and contract routines trivial parallel ****
229
230!MATHIAS
231      if (mapping2d.eq.1) then
232         if ((nb.eq.1).or.(nb.gt.2)) then
233           call hilbert2d_map(ny(nb),nz(nb),int_mb(p_map1(1,nb)))
234         end if
235         call hilbert2d_map(nz(nb),(nx(nb)/2+1),int_mb(p_map2(1,nb)))
236         call hilbert2d_map((nx(nb)/2+1),ny(nb),int_mb(p_map3(1,nb)))
237      else
238         if ((nb.eq.1).or.(nb.gt.2)) then
239           call hcurve_map(ny(nb),nz(nb),int_mb(p_map1(1,nb)))
240         end if
241
242         call hcurve_map(nz(nb),(nx(nb)/2+1),int_mb(p_map2(1,nb)))
243         call hcurve_map((nx(nb)/2+1),ny(nb),int_mb(p_map3(1,nb)))
244      end if
245
246
247
248c!$OMP critical
249c      write(*,*) "checking p_map1,q_map1 ",Parallel_threadid()
250c      do k=1,ny(nb)*nz(nb)
251c         write(*,*) Parallel_threadid(),k,
252c     >              int_mb(p_map1(1,nb)+k-1)
253c      end do
254c!$OMP end critical
255c!$OMP critical
256c      write(*,*) "checking p_map2,q_map2 ",Parallel_threadid()
257c      do k=1,(nx(nb)/2+1)*nz(nb)
258c         write(*,*) Parallel_threadid(),k,
259c     >              int_mb(p_map2(1,nb)+k-1)
260c      end do
261c!$OMP end critical
262c!$OMP critical
263c      write(*,*) "checking p_map2,q_map2 ",Parallel_threadid()
264c      do k=1,(nx(nb)/2+1)*ny(nb)
265c         write(*,*) Parallel_threadid(),k,
266c     >              int_mb(p_map3(1,nb)+k-1)
267c      end do
268c!$OMP end critical
269
270
271      !**** double grid map1 defined wrt to single grid         ****
272      !**** makes expand and contract routines trivial parallel ****
273      if ((nb.eq.1).or.(nb.gt.2)) then
274      call generate_map_indexes(taskid,np,
275     >                          ny(nb),nz(nb),
276     >                          int_mb(p_map1(1,nb)),
277     >                          int_mb(q_map1(1,nb)),nq1(nb))
278      else
279        nq1(2) = 4*nq1(1)
280        call expand_hilbert2d(np,ny(1),nz(1),
281     >                        int_mb(p_map1(1,1)),int_mb(q_map1(1,1)),
282     >                        int_mb(p_map1(1,2)),int_mb(q_map1(1,2)))
283      end if
284      call generate_map_indexes(taskid,np,
285     >                          nz(nb),nx(nb)/2+1,
286     >                          int_mb(p_map2(1,nb)),
287     >                          int_mb(q_map2(1,nb)),nq2(nb))
288      call generate_map_indexes(taskid,np,
289     >                          nx(nb)/2+1,ny(nb),
290     >                          int_mb(p_map3(1,nb)),
291     >                          int_mb(q_map3(1,nb)),nq3(nb))
292
293c      if (taskid.eq.0) then
294c      write(*,*) taskid,"nq2=",nq2(nb), ny(nb)*nq2(nb)
295c      write(*,*) taskid,"nq1=",nq1(nb),(nx(nb)/2+1)*nq1(nb)
296c      write(*,*) taskid,"nq3=",nq3(nb), nz(nb)*nq3(nb)
297c
298c      write(*,*) 'hilbert map nb=',nb
299c      do j=0,ny(nb)-1
300c        write(*,'(A,80I4)') 'hilbert map:',
301c     >   j,(int_mb(p_map3(1,nb)+j*(nx(nb)/2+1)))
302c      end do
303c      write(*,*)
304c
305c      write(*,*) 'hilbert map nb=',nb
306c      do j=0,ny(nb)-1
307c        write(*,'(A,80I4)') 'hilbert map:',
308c     >   (int_mb(p_map3(1,nb)+k+j*(nx(nb)/2+1)), k=0,nx(nb)/2)
309c      end do
310c      write(*,*)
311c      end if
312
313
314      nfft3d(nb) = (nx(nb)/2+1)*nq1(nb)
315      if ((ny(nb)*nq2(nb)).gt.nfft3d(nb)) nfft3d(nb) = ny(nb)*nq2(nb)
316      if ((nz(nb)*nq3(nb)).gt.nfft3d(nb)) nfft3d(nb) = nz(nb)*nq3(nb)
317      n2ft3d(nb) = 2*nfft3d(nb)
318
319      nfft3d_map(nb) = nz(nb)*nq3(nb)
320      n2ft3d_map(nb) = (nx(nb)+2)*nq1(nb)
321
322
323
324      end if
325
326
327      return
328      end
329
330*     ***********************************
331*     *					*
332*     *	          D3dB_end   		*
333*     *					*
334*     ***********************************
335      subroutine D3dB_end(nb)
336      implicit none
337      integer nb
338
339#include "bafdecls.fh"
340#include "errquit.fh"
341#include "D3dB.fh"
342
343
344*     *** hilbert tranpose data structure ****
345      integer h_iq_to_i1(2,6,NBLOCKS)
346      integer h_iq_to_i2(2,6,NBLOCKS)
347      integer h_i1_start(2,6,NBLOCKS)
348      integer h_i2_start(2,6,NBLOCKS)
349      common / trans_blk_ijk / h_iq_to_i1,
350     >                         h_iq_to_i2,
351     >                         h_i1_start,
352     >                         h_i2_start
353
354
355c     integer iq_to_i1((NFFT1/2+1)*NFFT2*NSLABS)
356c     integer iq_to_i2((NFFT1/2+1)*NFFT2*NSLABS)
357c     integer i1_start(NFFT3+1)
358c     integer i2_start(NFFT3+1)
359      integer iq_to_i1(2,NBLOCKS)
360      integer iq_to_i2(2,NBLOCKS)
361      integer i1_start(2,NBLOCKS)
362      integer i2_start(2,NBLOCKS)
363      common / trans_blk / iq_to_i1,iq_to_i2,i1_start,i2_start
364
365#ifndef MPI
366      integer Nchannels(NBLOCKS)
367      integer channel_proc(2,NBLOCKS)
368      integer channel_type(2,NBLOCKS)
369      common / channel_blk / channel_proc,channel_type,Nchannels
370#endif
371
372      logical value
373      integer i
374      logical  control_single_precision_on
375      external control_single_precision_on
376
377      call D3dB_timereverse_end(nb)
378      call D3dB_fft_end(nb)
379      if (control_single_precision_on()) call D3dBs_fft_end(nb)
380
381      value=.true.
382
383      !**** slab mapping ****
384      if (mapping.eq.1) then
385      value = value.and.BA_free_heap(q_map(2,nb))
386      value = value.and.BA_free_heap(p_map(2,nb))
387      value = value.and.BA_free_heap(k_map(2,nb))
388      end if
389
390      !**** hilbert mappings ****
391      if (mapping.eq.2) then
392      value = value.and.BA_free_heap(q_map1(2,nb))
393      value = value.and.BA_free_heap(p_map1(2,nb))
394      value = value.and.BA_free_heap(q_map2(2,nb))
395      value = value.and.BA_free_heap(p_map2(2,nb))
396      value = value.and.BA_free_heap(q_map3(2,nb))
397      value = value.and.BA_free_heap(p_map3(2,nb))
398      end if
399
400      !**** slab transpose mappings ****
401      if (mapping.eq.1) then
402      value = value.and.BA_free_heap(i1_start(2,nb))
403      value = value.and.BA_free_heap(i2_start(2,nb))
404      value = value.and.BA_free_heap(iq_to_i1(2,nb))
405      value = value.and.BA_free_heap(iq_to_i2(2,nb))
406      end if
407
408      !**** hilbert transpose mappings ****
409      if (mapping.eq.2) then
410      do i=1,6
411      value = value.and.BA_free_heap(h_i1_start(2,i,nb))
412      value = value.and.BA_free_heap(h_i2_start(2,i,nb))
413      value = value.and.BA_free_heap(h_iq_to_i1(2,i,nb))
414      value = value.and.BA_free_heap(h_iq_to_i2(2,i,nb))
415      end do
416      end if
417
418#ifndef MPI
419      value = value.and.BA_free_heap(channel_proc(2,nb))
420      value = value.and.BA_free_heap(channel_type(2,nb))
421#endif
422
423      if (.not. value)
424     > call errquit('D3dB_end:freeing heap memory',0, MA_ERR)
425
426
427
428
429      return
430      end
431
432*     ***********************************
433*     *					*
434*     *	          D3dB_qtok   		*
435*     *					*
436*     ***********************************
437
438      subroutine D3dB_qtok(nb,q,k)
439      implicit none
440      integer nb
441      integer q,k
442
443#include "bafdecls.fh"
444#include "D3dB.fh"
445
446
447
448c     k = k_map(q)
449      k = int_mb(k_map(1,nb)+q-1)
450
451      return
452      end
453
454*     ***********************************
455*     *					*
456*     *	          D3dB_ktoqp  		*
457*     *					*
458*     ***********************************
459
460      subroutine D3dB_ktoqp(nb,k,q,p)
461      implicit none
462      integer nb
463      integer k,q,p
464
465#include "bafdecls.fh"
466#include "D3dB.fh"
467
468
469
470c     q = q_map(k)
471c     p = p_map(k)
472
473      q = int_mb(q_map(1,nb)+k-1)
474      p = int_mb(p_map(1,nb)+k-1)
475      return
476      end
477
478
479*     ***********************************
480*     *					*
481*     *	          D3dB_ijktoindexp	*
482*     *					*
483*     ***********************************
484
485      subroutine D3dB_ijktoindexp(nb,i,j,k,indx,p)
486      implicit none
487      integer nb
488      integer i,j,k
489      integer indx,p
490
491#include "bafdecls.fh"
492#include "D3dB.fh"
493
494      integer q
495
496      !**** slab mapping ***
497      if (mapping.eq.1) then
498      q = int_mb(q_map(1,nb)+k-1)
499      p = int_mb(p_map(1,nb)+k-1)
500
501      indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
502
503      !**** hilbert mapping ****
504      else
505      q = int_mb(q_map3(1,nb)+(i-1)+(j-1)*(nx(nb)/2+1))
506      p = int_mb(p_map3(1,nb)+(i-1)+(j-1)*(nx(nb)/2+1))
507
508      indx = k + (q-1)*nz(nb)
509      end if
510
511      return
512      end
513
514
515
516*     ***********************************
517*     *                                 *
518*     *           D3dB_ijktoindex1p     *
519*     *                                 *
520*     ***********************************
521
522      subroutine D3dB_ijktoindex1p(nb,i,j,k,indx,p)
523      implicit none
524      integer nb
525      integer i,j,k
526      integer indx,p
527
528#include "bafdecls.fh"
529#include "D3dB.fh"
530
531      integer q
532
533      !**** slab mapping ***
534      if (mapping.eq.1) then
535      q = int_mb(q_map(1,nb)+j-1)
536      p = int_mb(p_map(1,nb)+j-1)
537
538      indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*nz(nb)
539
540      !**** hilbert mapping ****
541      else
542      q = int_mb(q_map2(1,nb)+(k-1)+(i-1)*(nz(nb)))
543      p = int_mb(p_map2(1,nb)+(k-1)+(i-1)*(nz(nb)))
544
545      indx = j + (q-1)*ny(nb)
546      end if
547
548      return
549      end
550
551
552
553
554*     ***********************************
555*     *                                 *
556*     *           D3dB_ijktoindex2p     *
557*     *                                 *
558*     ***********************************
559
560      subroutine D3dB_ijktoindex2p(nb,i,j,k,indx,p)
561      implicit none
562      integer nb
563      integer i,j,k
564      integer indx,p
565
566#include "bafdecls.fh"
567#include "D3dB.fh"
568
569
570      integer q
571
572      !**** slab mapping ****
573      if (mapping.eq.1) then
574      q = int_mb(q_map(1,nb)+j-1)
575      p = int_mb(p_map(1,nb)+j-1)
576
577      indx = i + (k-1)*(nx(nb)+2) + (q-1)*(nx(nb)+2)*ny(nb)
578
579      !**** hilbert mapping ****
580      else
581      q = int_mb(q_map1(1,nb)+(j-1)+(k-1)*ny(nb))
582      p = int_mb(p_map1(1,nb)+(j-1)+(k-1)*ny(nb))
583
584      indx = i + (q-1)*(nx(nb)+2)
585      end if
586
587      return
588      end
589
590
591
592
593
594*     ***********************************
595*     *					*
596*     *	        D3dB_nfft3d		*
597*     *					*
598*     ***********************************
599
600      subroutine D3dB_nfft3d(nb,nfft3d_out)
601      implicit none
602      integer nb
603      integer nfft3d_out
604
605#include "D3dB.fh"
606      nfft3d_out = nfft3d(nb)
607      return
608      end
609
610
611*     ***********************************
612*     *                                 *
613*     *         D3dB_nfft3d_map         *
614*     *                                 *
615*     ***********************************
616
617      subroutine D3dB_nfft3d_map(nb,nfft3d_out)
618      implicit none
619      integer nb
620      integer nfft3d_out
621
622#include "D3dB.fh"
623
624      nfft3d_out = nfft3d_map(nb)
625      return
626      end
627
628
629*     ***********************************
630*     *					*
631*     *	        D3dB_n2ft3d		*
632*     *					*
633*     ***********************************
634
635      subroutine D3dB_n2ft3d(nb,n2ft3d_out)
636      implicit none
637      integer nb
638      integer n2ft3d_out
639
640#include "D3dB.fh"
641
642      n2ft3d_out = n2ft3d(nb)
643      return
644      end
645
646
647*     ***********************************
648*     *                                 *
649*     *         D3dB_n2ft3d_map         *
650*     *                                 *
651*     ***********************************
652
653      subroutine D3dB_n2ft3d_map(nb,n2ft3d_out)
654      implicit none
655      integer nb
656      integer n2ft3d_out
657
658#include "D3dB.fh"
659
660      n2ft3d_out = n2ft3d_map(nb)
661      return
662      end
663
664
665*     ***********************************
666*     *					*
667*     *	        D3dB_nq			*
668*     *					*
669*     ***********************************
670
671      subroutine D3dB_nq(nb,nqtmp)
672      implicit none
673      integer nb
674      integer nqtmp
675
676#include "D3dB.fh"
677
678
679      nqtmp = nq(nb)
680
681      return
682      end
683
684*     ***********************************
685*     *					*
686*     *	        D3dB_nx			*
687*     *					*
688*     ***********************************
689
690      subroutine D3dB_nx(nb,nxtmp)
691      implicit none
692      integer nb
693      integer nxtmp
694
695#include "D3dB.fh"
696
697
698      nxtmp = nx(nb)
699      return
700      end
701
702*     ***********************************
703*     *					*
704*     *	        D3dB_ny			*
705*     *					*
706*     ***********************************
707
708      subroutine D3dB_ny(nb,nytmp)
709      implicit none
710      integer nb
711      integer nytmp
712
713#include "D3dB.fh"
714
715
716      nytmp = ny(nb)
717      return
718      end
719
720*     ***********************************
721*     *					*
722*     *	        D3dB_nz			*
723*     *					*
724*     ***********************************
725
726      subroutine D3dB_nz(nb,nztmp)
727      implicit none
728      integer nb
729      integer nztmp
730
731#include "D3dB.fh"
732
733
734      nztmp = nz(nb)
735      return
736      end
737
738*     ***********************************
739*     *                                 *
740*     *         D3dB_zplane_size        *
741*     *                                 *
742*     ***********************************
743
744      subroutine D3dB_zplane_size(nb,zplane_sizetmp)
745      implicit none
746      integer nb
747      integer zplane_sizetmp
748
749#include "D3dB.fh"
750
751      zplane_sizetmp = zplane_size(nb)
752      return
753      end
754
755*     ***********************************
756*     *					*
757*     *	        D3dB_Init		*
758*     *					*
759*     ***********************************
760
761      subroutine D3dB_Init(nb,nx_in,ny_in,nz_in,map_in)
762      implicit none
763      integer nb
764      integer nx_in,ny_in,nz_in
765      integer map_in
766      logical value, MA_verify_allocator_stuff
767      external MA_verify_allocator_stuff
768
769#include "D3dB.fh"
770
771
772      !**** local variables ****
773      integer MASTER
774      parameter (MASTER=0)
775      integer taskid,np
776      integer  Parallel_threadid
777      external Parallel_threadid
778      logical  control_single_precision_on
779      external control_single_precision_on
780
781
782
783
784
785      call Parallel2d_np_i(np)
786      call Parallel_taskid(taskid)
787
788      !**** Make sure ngrid is consistent with mapping ***
789      if (map_in.eq.1) then
790        if ((np.gt.nz_in).or.(ny_in.ne.nz_in)) then
791          if (taskid.eq.MASTER) then
792            write(6,*) 'Error: for slab decomposition the',
793     >                 ' number of processors must ',
794     >                 ' be in the range ( 1 ...ngrid(3)=',
795     >                   nz_in,')'
796           write(6,*) ' and ngrid(2) == ngrid(3), ',
797     >                ' ngrid(2)=',ny_in,
798     >                ' ngrid(3)=',nz_in
799          end if
800          call errquit('D3dB_Init: mapping error',0,0)
801        end if
802        if (mod(nx_in,2).ne.0) then
803          if (taskid.eq.MASTER) then
804           write(6,*)
805     >      'Error: ngrid(1) must be even (ngrid(1) mod 2 == 0)'
806           write(6,*) 'Error: ngrid(1)=',nx_in
807          end if
808          call errquit('D3dB_Init: slab mapping error',0,0)
809        end if
810      end if
811
812      if (map_in.ge.2) then
813        if (np.gt.(ny_in*nz_in)) then
814          if (taskid.eq.MASTER) then
815           write(6,*) 'Error: np > MIN(ngrid(2)*ngrid(3),',
816     >                ' (ngrid(1)/2+1)*ngrid(2),',
817     >                ' (ngrid(1)/2+1)*ngrid(3))'
818           write(6,*) 'Error: np > ngrid(2)*ngrid(3)'
819           write(6,*) 'Error: for the Hilbert decomposition the',
820     >                 ' the number of processors must ',
821     >                 ' be in the range ( 1 ...',
822     >                   ny_in*nz_in,')'
823          end if
824          call errquit('D3dB_Init: Hilbert mapping error',0,0)
825        end if
826        if (np.gt.((nx_in/2+1)*ny_in)) then
827          if (taskid.eq.MASTER) then
828           write(6,*) 'Error: np > MIN(ngrid(2)*ngrid(3),',
829     >                ' (ngrid(1)/2+1)*ngrid(2),',
830     >                ' (ngrid(1)/2+1)*ngrid(3))'
831           write(6,*) 'Error: np > (ngrid(1)/2+1)*ngrid(2)'
832           write(6,*) 'Error: for the Hilbert decomposition the',
833     >                 ' the number of processors must ',
834     >                 ' be in the range ( 1 ...',
835     >                   (nx_in/2+1)*ny_in,')'
836          end if
837          call errquit('D3dB_Init: Hilbert mapping error',0,0)
838        end if
839        if (np.gt.((nx_in/2+1)*nz_in)) then
840          if (taskid.eq.MASTER) then
841           write(6,*) 'Error: np > MIN(ngrid(2)*ngrid(3),',
842     >                ' (ngrid(1)/2+1)*ngrid(2),',
843     >                ' (ngrid(1)/2+1)*ngrid(3))'
844           write(6,*) 'Error: np > (ngrid(1)/2+1)*ngrid(3)'
845           write(6,*) 'Error: for the Hilbert decomposition the',
846     >                 ' the number of processors must ',
847     >                 ' be in the range ( 1 ...',
848     >                   (nx_in/2+1)*nz_in,')'
849          end if
850          call errquit('D3dB_Init: Hilbert mapping error',0,0)
851        end if
852        if (mod(nx_in,2).ne.0) then
853          if (taskid.eq.MASTER) then
854           write(6,*)
855     >      'Error: ngrid(1) must be even (ngrid(1) mod 2 == 0)'
856           write(6,*) 'Error: ngrid(1)=',nx_in
857          end if
858          call errquit('D3dB_Init: Hilbert mapping error',0,0)
859        end if
860      end if
861
862
863*     ***** initialize D3dB common block *****
864      nx(nb)     = nx_in
865      ny(nb)     = ny_in
866      nz(nb)     = nz_in
867      mapping    = map_in
868      mapping2d  = 1
869      if (mapping.eq.3) then
870         mapping   = 2
871         mapping2d = 2
872      end if
873
874
875
876*     **** do other initializations ****
877      call Mapping_Init(nb)
878
879      if (mapping.eq.1) call D3dB_c_transpose_jk_init(nb)
880      if (mapping.eq.2) call D3dB_c_transpose_ijk_init(nb)
881
882#ifndef MPI
883      call D3dB_channel_init(nb)
884#endif
885
886      call D3dB_c_timereverse_init(nb)
887      call D3dB_fft_init(nb)
888      if (control_single_precision_on()) call D3dBs_fft_init(nb)
889
890      return
891      end
892
893
894c*     ***********************************
895c*     *					*
896c*     *	        D3dB_SumAll		*
897c*     *					*
898c*     ***********************************
899c
900c      subroutine D3dB_SumAll(sum)
901cc     implicit none
902c      real*8  sum
903c
904c
905c#include "tcgmsg.fh"
906c#include "msgtypesf.h"
907c
908c
909c      integer MASTER
910c      parameter (MASTER=0)
911c      integer msglen
912c      real*8 sumall,sumt
913c
914c*     **** external functions ****
915c      integer  Parallel2d_comm_i
916c      external Parallel2d_comm_i
917c
918cc     msglen = 8
919c      msglen = 1
920c
921c      sumt = sum
922c
923c      call GA_PGROUP_DGOP(Parallel2d_comm_i(),
924c     >                    9+MSGDBL,sumt,1,'+')
925c
926cc      call GA_DGOP(9+MSGDBL,sumt,1,'+')
927cc     call DGOP(9+MSGDBL,sumt,1,'+')
928c      sumall=sumt
929c
930c
931c      sum = sumall
932c      return
933c      end
934
935
936c*     ***********************************
937c*     *					*
938c*     *	        D3dB_ISumAll		*
939c*     *					*
940c*     ***********************************
941c
942c      subroutine D3dB_ISumAll(sum)
943cc     implicit none
944c      integer  sum
945c
946c
947c#include "tcgmsg.fh"
948c#include "msgtypesf.h"
949c
950c
951c      integer MASTER
952c      parameter (MASTER=0)
953c      integer msglen
954c      integer sumall,sumt
955c
956c*     **** external functions ****
957c      integer  Parallel2d_comm_i
958c      external Parallel2d_comm_i
959c
960c
961cc     msglen = 8
962c      msglen = 1
963c
964c      sumt = sum
965c
966c      call GA_PGROUP_IGOP(Parallel2d_comm_i(),
967c     >                    9+MSGINT,sumt,1,'+')
968cc     call GA_IGOP(9+MSGINT,sumt,1,'+')
969c      sumall=sumt
970c
971c
972c      sum = sumall
973c      return
974c      end
975
976
977
978*     ***********************************
979*     *					*
980*     *	        D3dB_(c,r,t)_Zero	*
981*     *					*
982*     ***********************************
983
984      subroutine D3dB_c_Zero(nb,A)
985      implicit none
986      integer nb
987      complex*16 A(*)
988
989#include "D3dB.fh"
990
991
992      !call dcopy(n2ft3d_map(nb),0.0d0,0,A,1)
993      call Parallel_shared_vector_zero(.true.,n2ft3d_map(nb),A)
994      return
995      end
996
997      subroutine D3dB_c_nZero(nb,n,A)
998      implicit none
999      integer nb,n
1000      complex*16 A(*)
1001
1002#include "D3dB.fh"
1003
1004
1005      !call dcopy(n*n2ft3d_map(nb),0.0d0,0,A,1)
1006      call Parallel_shared_vector_zero(.true.,n*n2ft3d_map(nb),A)
1007      return
1008      end
1009
1010
1011      subroutine D3dB_r_Zero(nb,A)
1012      implicit none
1013      integer nb
1014      real*8  A(*)
1015
1016#include "D3dB.fh"
1017
1018c      call dcopy(n2ft3d_map(nb),0.0d0,0,A,1)
1019      call Parallel_shared_vector_zero(.true.,n2ft3d_map(nb),A)
1020      return
1021      end
1022
1023
1024      subroutine D3dB_r_nZero(nb,n,A)
1025      implicit none
1026      integer nb,n
1027      real*8  A(*)
1028
1029#include "D3dB.fh"
1030
1031      !call dcopy(n*n2ft3d_map(nb),0.0d0,0,A,1)
1032      call Parallel_shared_vector_zero(.true.,n*n2ft3d_map(nb),A)
1033      return
1034      end
1035
1036
1037
1038*     ***********************************
1039*     *					*
1040*     *	        D3dB_(c,r,t)_Copy	*
1041*     *					*
1042*     ***********************************
1043
1044      subroutine D3dB_c_Copy(nb,A,B)
1045      implicit none
1046      integer nb
1047      complex*16 A(*)
1048      complex*16 B(*)
1049
1050#include "D3dB.fh"
1051
1052      !call dcopy(2*nfft3d_map(nb),A,1,B,1)
1053      call Parallel_shared_vector_copy(.true.,2*nfft3d_map(nb),A,B)
1054      return
1055      end
1056
1057      subroutine D3dB_c_nCopy(nb,n,A,B)
1058      implicit none
1059      integer nb,n
1060      complex*16 A(*)
1061      complex*16 B(*)
1062
1063#include "D3dB.fh"
1064
1065      !call dcopy(n*2*nfft3d_map(nb),A,1,B,1)
1066      call Parallel_shared_vector_copy(.true.,n*2*nfft3d_map(nb),A,B)
1067      return
1068      end
1069
1070
1071      subroutine D3dB_r_Copy(nb,A,B)
1072      implicit none
1073      integer nb
1074      real*8 A(*)
1075      real*8 B(*)
1076
1077#include "D3dB.fh"
1078      integer i
1079      !call dcopy(n2ft3d_map(nb),A,1,B,1)
1080      call Parallel_shared_vector_copy(.true.,n2ft3d_map(nb),A,B)
1081
1082      return
1083      end
1084
1085
1086      subroutine D3dB_r_nCopy(nb,n,A,B)
1087      implicit none
1088      integer nb,n
1089      real*8 A(*)
1090      real*8 B(*)
1091
1092#include "D3dB.fh"
1093
1094      !call dcopy(n*n2ft3d_map(nb),A,1,B,1)
1095      call Parallel_shared_vector_copy(.true.,n*n2ft3d_map(nb),A,B)
1096      return
1097      end
1098
1099
1100      subroutine D3dB_t_Copy(nb,A,B)
1101      implicit none
1102      integer nb
1103      real*8 A(*)
1104      real*8 B(*)
1105
1106#include "D3dB.fh"
1107
1108      !call dcopy(nfft3d_map(nb),A,1,B,1)
1109      call Parallel_shared_vector_copy(.true.,nfft3d_map(nb),A,B)
1110      return
1111      end
1112
1113
1114      subroutine D3dB_t_nCopy(nb,n,A,B)
1115      implicit none
1116      integer nb,n
1117      real*8 A(*)
1118      real*8 B(*)
1119
1120#include "D3dB.fh"
1121
1122      !call dcopy(n*nfft3d_map(nb),A,1,B,1)
1123      call Parallel_shared_vector_copy(.true.,n*nfft3d_map(nb),A,B)
1124      return
1125      end
1126
1127
1128      subroutine D3dB_tc_Copy(nb,A,B)
1129      implicit none
1130      integer nb
1131      real*8     A(*)
1132      complex*16 B(*)
1133
1134#include "D3dB.fh"
1135
1136      integer i
1137
1138!$OMP DO
1139      do i=1,nfft3d_map(nb)
1140         B(i) = dcmplx(A(i),0.0d0)
1141      end do
1142!$OMP END DO
1143
1144      return
1145      end
1146
1147      subroutine D3dB_ct_Copy(nb,A,B)
1148      implicit none
1149      integer nb
1150      complex*16 A(*)
1151      real*8     B(*)
1152
1153#include "D3dB.fh"
1154
1155      integer i
1156!$OMP DO
1157      do i=1,nfft3d_map(nb)
1158         B(i) = dble(A(i))
1159      end do
1160!$OMP END DO
1161
1162      return
1163      end
1164
1165
1166
1167
1168*     ***********************************
1169*     *					*
1170*     *	        D3dB_fft_init		*
1171*     *					*
1172*     ***********************************
1173
1174      subroutine D3dB_fft_init(nb)
1175      implicit none
1176      integer nb
1177
1178#include "bafdecls.fh"
1179#include "errquit.fh"
1180
1181#include "D3dB.fh"
1182
1183      integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS)
1184      common    / D3dB_fft / tmpx,tmpy,tmpz
1185
1186#ifdef FFTW3
1187#include "fftw3.fh"
1188       integer Atest(2),nxh,nxhz,nxhy
1189#endif
1190
1191      logical value
1192      integer i,mthr, Parallel_maxthreads
1193      external Parallel_maxthreads
1194
1195      mthr = Parallel_maxthreads()
1196
1197
1198      !call D3dB_nfft3d(nb,nfft3d)
1199
1200c      value = BA_alloc_get(mt_dcpl,(nfft3d(nb)),
1201c     >        'fttmpx',tmpx(2,nb),tmpx(1,nb))
1202c      value = value.and.
1203c     >        BA_alloc_get(mt_dcpl,(nfft3d(nb)),
1204c     >        'fttmpy',tmpy(2,nb),tmpy(1,nb))
1205c      value = value.and.
1206c     >        BA_alloc_get(mt_dcpl,(nfft3d(nb)),
1207c     >        'fttmpz',tmpz(2,nb),tmpz(1,nb))
1208      value = BA_alloc_get(mt_dcpl,mthr*(2*nx(nb)+15),
1209     >        'fttmpx',tmpx(2,nb),tmpx(1,nb))
1210      value = value.and.
1211     >        BA_alloc_get(mt_dcpl,mthr*(2*ny(nb)+15),
1212     >        'fttmpy',tmpy(2,nb),tmpy(1,nb))
1213      value = value.and.
1214     >        BA_alloc_get(mt_dcpl,mthr*(2*nz(nb)+15),
1215     >        'fttmpz',tmpz(2,nb),tmpz(1,nb))
1216      if (.not. value)
1217     >   call errquit('D3dB_fft_init:out of heap memory',0, MA_ERR)
1218
1219
1220#ifdef MLIB
1221      call drc1ft(dcpl_mb(tmpx(1,nb)),nx(nb),
1222     >            dcpl_mb(tmpx(1,nb)),-3,ierr)
1223      call z1dfft(dcpl_mb(tmpx(1,nb)),ny(nb),
1224     >            dcpl_mb(tmpy(1,nb)),-3,ierr)
1225      call z1dfft(dcpl_mb(tmpx(1,nb)),nz(nb),
1226     >            dcpl_mb(tmpz(1,nb)),-3,ierr)
1227
1228#else
1229      do i=1,mthr
1230        !write(*,*) "DEBUG init fft arrays of thread ", i-1
1231        call drffti(nx(nb),dcpl_mb(tmpx(1,nb)+(i-1)*(2*nx(nb)+15)))
1232        call dcffti(ny(nb),dcpl_mb(tmpy(1,nb)+(i-1)*(2*ny(nb)+15)))
1233        call dcffti(nz(nb),dcpl_mb(tmpz(1,nb)+(i-1)*(2*nz(nb)+15)))
1234      end do
1235#endif
1236
1237#ifdef FFTW3
1238
1239c       call dfftw_init_threads()
1240c       call dfftw_plan_with_nthreads(2)
1241
1242       iforward  = FFTW_FORWARD
1243       ibackward = FFTW_BACKWARD
1244c       iestimate = FFTW_PATIENT
1245c       iestimate = FFTW_MEASURE
1246c       iestimate = FFTW_ESTIMATE
1247c       iestimate = FFTW_EXHAUSTIVE
1248       call icopy(nplans*NBLOCKS,0,0,plans,1)
1249      if (mapping.eq.1) then
1250         nxh = (nx(nb)/2+1)
1251         nxhz = nxh*nz(nb)
1252         nxhy = nxh*ny(nb)
1253         if (.not.BA_alloc_get(mt_dcpl,nx(nb)*ny(nb)*nq(nb),
1254     >                       'Atest',Atest(2),Atest(1)))
1255     >     call errquit('D3dB_fft_init:out of heap memory',0,MA_ERR)
1256
1257         call dfftw_plan_many_dft(plans(1,nb),1,nz(nb),nxh,
1258     >                        dcpl_mb(Atest(1)),nxhz,nxh,1,
1259     >                        dcpl_mb(Atest(1)),nxhz,nxh,1,
1260     >                        ibackward,FFTW_EXHAUSTIVE)
1261         call dfftw_plan_many_dft(plans(2,nb),1,ny(nb),nxh,
1262     >                        dcpl_mb(Atest(1)),nxhy,nxh,1,
1263     >                        dcpl_mb(Atest(1)),nxhy,nxh,1,
1264     >                        ibackward,FFTW_EXHAUSTIVE)
1265         call dfftw_plan_many_dft_c2r(plans(3,nb),1,nx(nb),
1266     >        ny(nb)*nq(nb),
1267     >        dcpl_mb(Atest(1)),nxh       *ny(nb)*nq(nb),1,nxh,
1268     >        dcpl_mb(Atest(1)),(nx(nb)+2)*ny(nb)*nq(nb),1,nx(nb)+2,
1269     >        FFTW_EXHAUSTIVE)
1270
1271         call dfftw_plan_many_dft_r2c(plans(4,nb),1,nx(nb),
1272     >        ny(nb)*nq(nb),
1273     >        dcpl_mb(Atest(1)),(nx(nb)+2)*ny(nb)*nq(nb),1,nx(nb)+2,
1274     >        dcpl_mb(Atest(1)),nxh       *ny(nb)*nq(nb),1,nxh,
1275     >        FFTW_EXHAUSTIVE)
1276         call dfftw_plan_many_dft(plans(5,nb),1,ny(nb),nxh,
1277     >                        dcpl_mb(Atest(1)),nxhy,nxh,1,
1278     >                        dcpl_mb(Atest(1)),nxhy,nxh,1,
1279     >                        iforward,FFTW_EXHAUSTIVE)
1280         call dfftw_plan_many_dft(plans(6,nb),1,nz(nb),nxh,
1281     >                        dcpl_mb(Atest(1)),nxhz,nxh,1,
1282     >                        dcpl_mb(Atest(1)),nxhz,nxh,1,
1283     >                        iforward,FFTW_EXHAUSTIVE)
1284
1285         call dfftw_plan_many_dft(plans(7,nb),1,nz(nb),1,
1286     >                        dcpl_mb(Atest(1)),nxhz,nxh,1,
1287     >                        dcpl_mb(Atest(1)),nxhz,nxh,1,
1288     >                        ibackward,FFTW_EXHAUSTIVE)
1289         call dfftw_plan_many_dft(plans(8,nb),1,ny(nb),1,
1290     >                        dcpl_mb(Atest(1)),nxhy,nxh,1,
1291     >                        dcpl_mb(Atest(1)),nxhy,nxh,1,
1292     >                        ibackward,FFTW_EXHAUSTIVE)
1293
1294          call dfftw_plan_many_dft(plans(9,nb),1,ny(nb),1,
1295     >                        dcpl_mb(Atest(1)),nxhy,nxh,1,
1296     >                        dcpl_mb(Atest(1)),nxhy,nxh,1,
1297     >                        iforward,FFTW_EXHAUSTIVE)
1298          call dfftw_plan_many_dft(plans(10,nb),1,nz(nb),1,
1299     >                        dcpl_mb(Atest(1)),nxhz,nxh,1,
1300     >                        dcpl_mb(Atest(1)),nxhz,nxh,1,
1301     >                        iforward,FFTW_EXHAUSTIVE)
1302
1303
1304         if (.not.BA_free_heap(Atest(2)))
1305     >   call errquit('D3dB_fft_init:freeing heap',0,MA_ERR)
1306
1307      else
1308
1309         nxh  = (nx(nb)/2+1)
1310         if (.not.BA_alloc_get(mt_dcpl,nfft3d(nb),'Atest',
1311     >                         Atest(2),Atest(1)))
1312     >     call errquit('D3dB_fft_init:out of heap memory',0,MA_ERR)
1313
1314         call dfftw_plan_many_dft(plans(11,nb),1,nz(nb),nq3(nb),
1315     >                        dcpl_mb(Atest(1)),nz(nb)*nq3(nb),1,nz(nb),
1316     >                        dcpl_mb(Atest(1)),nz(nb)*nq3(nb),1,nz(nb),
1317     >                        ibackward,FFTW_EXHAUSTIVE)
1318         call dfftw_plan_many_dft(plans(12,nb),1,ny(nb),nq2(nb),
1319     >                        dcpl_mb(Atest(1)),ny(nb)*nq2(nb),1,ny(nb),
1320     >                        dcpl_mb(Atest(1)),ny(nb)*nq2(nb),1,ny(nb),
1321     >                        ibackward,FFTW_EXHAUSTIVE)
1322         call dfftw_plan_many_dft_c2r(plans(13,nb),1,nx(nb),
1323     >        nq1(nb),
1324     >        dcpl_mb(Atest(1)),nfft3d(nb),1,nxh,
1325     >        dcpl_mb(Atest(1)),n2ft3d(nb),1,(nx(nb)+2),
1326     >        FFTW_EXHAUSTIVE)
1327         call dfftw_plan_many_dft_r2c(plans(14,nb),1,nx(nb),
1328     >        nq1(nb),
1329     >        dcpl_mb(Atest(1)),n2ft3d(nb),1,nx(nb)+2,
1330     >        dcpl_mb(Atest(1)),nfft3d(nb),1,nxh,
1331     >        FFTW_EXHAUSTIVE)
1332
1333         call dfftw_plan_many_dft(plans(15,nb),1,ny(nb),nq2(nb),
1334     >                        dcpl_mb(Atest(1)),ny(nb)*nq2(nb),1,ny(nb),
1335     >                        dcpl_mb(Atest(1)),ny(nb)*nq2(nb),1,ny(nb),
1336     >                        iforward,FFTW_EXHAUSTIVE)
1337         call dfftw_plan_many_dft(plans(16,nb),1,nz(nb),nq3(nb),
1338     >                        dcpl_mb(Atest(1)),nz(nb)*nq3(nb),1,nz(nb),
1339     >                        dcpl_mb(Atest(1)),nz(nb)*nq3(nb),1,nz(nb),
1340     >                        iforward,FFTW_EXHAUSTIVE)
1341
1342         call dfftw_plan_many_dft(plans(17,nb),1,nz(nb),1,
1343     >                        dcpl_mb(Atest(1)),nz(nb),1,1,
1344     >                        dcpl_mb(Atest(1)),nz(nb),1,1,
1345     >                        ibackward,FFTW_EXHAUSTIVE)
1346         call dfftw_plan_many_dft(plans(18,nb),1,ny(nb),1,
1347     >                        dcpl_mb(Atest(1)),ny(nb),1,1,
1348     >                        dcpl_mb(Atest(1)),ny(nb),1,1,
1349     >                        ibackward,FFTW_EXHAUSTIVE)
1350
1351          call dfftw_plan_many_dft(plans(19,nb),1,ny(nb),1,
1352     >                        dcpl_mb(Atest(1)),ny(nb),1,1,
1353     >                        dcpl_mb(Atest(1)),ny(nb),1,1,
1354     >                        iforward,FFTW_EXHAUSTIVE)
1355          call dfftw_plan_many_dft(plans(20,nb),1,nz(nb),1,
1356     >                        dcpl_mb(Atest(1)),nz(nb),1,1,
1357     >                        dcpl_mb(Atest(1)),nz(nb),1,1,
1358     >                        iforward,FFTW_EXHAUSTIVE)
1359
1360         if (.not.BA_free_heap(Atest(2)))
1361     >   call errquit('D3dB_fft_init:freeing heap',0,MA_ERR)
1362
1363      end if
1364#endif
1365
1366      return
1367      end
1368
1369*     ***********************************
1370*     *                                 *
1371*     *         D3dB_fft_end            *
1372*     *                                 *
1373*     ***********************************
1374
1375      subroutine D3dB_fft_end(nb)
1376      implicit none
1377      integer nb
1378
1379#include "bafdecls.fh"
1380#include "errquit.fh"
1381
1382#include "D3dB.fh"
1383
1384
1385      integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS)
1386      common    / D3dB_fft / tmpx,tmpy,tmpz
1387#ifdef USE_OPENMP
1388      logical is_par
1389      common    / Debug_openmp / is_par
1390#endif
1391
1392      logical value
1393      integer i
1394
1395#ifdef FFTW3
1396      do i=1,nplans
1397         if (plans(i,nb).ne.0) call dfftw_destroy_plan(plans(i,nb))
1398      end do
1399c      call dfftw_cleanup_threads()
1400#endif
1401
1402      value =           BA_free_heap(tmpx(2,nb))
1403      value = value.and.BA_free_heap(tmpy(2,nb))
1404      value = value.and.BA_free_heap(tmpz(2,nb))
1405      if (.not.value)
1406     >   call errquit(
1407     >   'D3dB_fft_end:error deallocatingof heap memory',0, MA_ERR)
1408
1409      return
1410      end
1411
1412
1413
1414
1415
1416
1417
1418c*     ***********************************
1419c*     *                                 *
1420c*     *         D3dB_n_fft_init         *
1421c*     *                                 *
1422c*     ***********************************
1423c
1424c      subroutine D3dB_n_fft_init(nb,ne)
1425c      implicit none
1426c      integer nb,ne
1427c
1428c#include "bafdecls.fh"
1429c#include "errquit.fh"
1430c
1431c      integer tmp2(2,NBLOCKS),tmp3(2,NBLOCKS)
1432c      common    / D3dB_n_fft / tmp2,tmp3
1433c
1434c      logical value
1435c      integer nfft3d
1436c
1437c      call D3dB_nfft3d(nb,nfft3d)
1438c      value = BA_alloc_get(mt_dcpl,(ne*nfft3d),
1439c     >        'fttmp2_h',tmp2(2,nb),tmp2(1,nb))
1440c      value = value.and.
1441c     >        BA_alloc_get(mt_dbl,(2*ne*nfft3d),
1442c     >        'fttmp3_h',tmp3(2,nb),tmp3(1,nb))
1443c      if (.not. value)
1444c     > call errquit('D3dB_n_fft_init:out of heap memory',0, MA_ERR)
1445c
1446c      return
1447c      end
1448c
1449c*     ***********************************
1450c*     *                                 *
1451c*     *         D3dB_n_fft_end          *
1452c*     *                                 *
1453c*     ***********************************
1454c
1455c      subroutine D3dB_n_fft_end(nb)
1456c      implicit none
1457c      integer nb
1458c
1459c#include "bafdecls.fh"
1460c#include "errquit.fh"
1461c
1462c      integer tmp2(2,NBLOCKS),tmp3(2,NBLOCKS)
1463c      common    / D3dB_n_fft / tmp2,tmp3
1464c
1465c      logical value
1466c
1467c      value =           BA_free_heap(tmp2(2,nb))
1468c      value = value.and.BA_free_heap(tmp3(2,nb))
1469c      if (.not.value)
1470c     > call errquit(
1471c     > 'D3dB_n_fft_end:error deallocatingof heap memory',0,MA_ERR)
1472c
1473c      return
1474c      end
1475
1476
1477
1478*     ***********************************
1479*     *					*
1480*     *	        D3dB_cr_fft3b		*
1481*     *					*
1482*     ***********************************
1483
1484      subroutine D3dB_cr_fft3b(nb,A)
1485
1486*****************************************************
1487*                                                   *
1488*      This routine performs the operation of       *
1489*      a three dimensional complex to complex       *
1490*      inverse fft                                  *
1491*           A(nx,ny(nb),nz(nb)) <- FFT3^(-1)[A(kx,ky,kz)]   *
1492*                                                   *
1493*      Entry - 					    *
1494*              A: a column distribuded 3d block     *
1495*              tmp: tempory work space must be at   *
1496*                    least the size of (complex)    *
1497*                    (nfft*nfft + 1) + 10*nfft      *
1498*                                                   *
1499*       Exit - A is transformed and the imaginary   *
1500*              part of A is set to zero             *
1501*       uses - D3dB_c_transpose_jk, dcopy           *
1502*                                                   *
1503*****************************************************
1504
1505      implicit none
1506      integer nb
1507      complex*16  A(*)
1508
1509
1510#include "bafdecls.fh"
1511#include "errquit.fh"
1512
1513#include "D3dB.fh"
1514
1515
1516      integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS)
1517      common    / D3dB_fft / tmpx,tmpy,tmpz
1518
1519*     *** local variables ***
1520      integer i,j,k,q,indx
1521      integer nxh,nxhy,nxhz,indx0,indx1
1522
1523
1524      !integer tmp1(2),tmp2(2),tmp3(2)
1525      integer tmp2(2),tmp3(2)
1526      logical value
1527
1528      integer  tid,nthr,offset
1529      integer  Parallel_threadid,Parallel_nthreads
1530      external Parallel_threadid,Parallel_nthreads
1531
1532
1533      tid  = Parallel_threadid()
1534      nthr = Parallel_nthreads()
1535
1536      call nwpw_timing_start(1)
1537
1538*     ***** allocate temporary space ****
1539      !call D3dB_nfft3d(nb,nfft3d)
1540      value = BA_push_get(mt_dcpl,(nfft3d(nb)),'ffttmp2',
1541     >                    tmp2(2),tmp2(1))
1542      value = value.and.
1543     >      BA_push_get(mt_dbl,(n2ft3d(nb)),'ffttmp3',tmp3(2),tmp3(1))
1544      if (.not. value) call errquit('out of stack memory',0, MA_ERR)
1545
1546       nxh = (nx(nb)/2+1)
1547       nxhz = nxh*nz(nb)
1548       nxhy = nxh*ny(nb)
1549
1550      !**********************
1551      !**** slab mapping ****
1552      !**********************
1553      if (mapping.eq.1) then
1554*     ********************************************
1555*     ***         Do a transpose of A          ***
1556*     ***      A(kx,kz,ky) <- A(kx,ky,kz)      ***
1557*     ********************************************
1558c     call D3dB_c_transpose_jk(nb,A,dcpl_mb(tmp2(1)),dbl_mb(tmp3(1)))
1559
1560*     *************************************************
1561*     ***     do fft along kz dimension             ***
1562*     ***   A(kx,nz(nb),ky) <- fft1d^(-1)[A(kx,kz,ky)]  ***
1563*     *************************************************
1564#ifdef MLIB
1565      !call z1dfft(dbl_mb(tmp3(1)),nz(nb),dcpl_mb(tmpz(1)),-3,ierr)
1566      do q=1,nq(nb)
1567      do i=1,(nx(nb)/2+1)
1568         do k=1,nz(nb)
1569            indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*nz(nb)
1570            dcpl_mb(tmp2(1)+k-1) = A(indx)
1571         end do
1572         call z1dfft(dcpl_mb(tmp2(1)),nz(nb),
1573     >               dcpl_mb(tmpz(1,nb)),-2,ierr)
1574         do k=1,nz(nb)
1575            indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*nz(nb)
1576            A(indx) = dcpl_mb(tmp2(1)+k-1)
1577         end do
1578      end do
1579      end do
1580      !call dscal((nx(nb)+2)*ny(nb)*nq(nb),dble(nz(nb)),A,1)
1581
1582#else
1583
1584#ifdef FFTW3
1585      do q=1,nq(nb)
1586        indx = 1+(q-1)*nxhz
1587        call dfftw_execute_dft(plans(1,nb),A(indx),A(indx))
1588      end do
1589#else
1590      !call dcffti(nz(nb),dcpl_mb(tmp1(1)))
1591      indx0 = 0
1592      do q=1,nq(nb)
1593      do i=1,nxh
1594
1595         indx  = i + indx0
1596         indx1 = indx
1597         do k=1,nz(nb)
1598            dcpl_mb(tmp2(1)+k-1) = A(indx)
1599            indx = indx + nxh
1600         end do
1601         call dcfftb(nz(nb),dcpl_mb(tmp2(1)),dcpl_mb(tmpz(1,nb)))
1602         do k=1,nz(nb)
1603            A(indx1) = dcpl_mb(tmp2(1)+k-1)
1604            indx1 = indx1 + nxh
1605         end do
1606
1607      end do
1608      indx0 = indx0 + nxhz
1609      end do
1610#endif
1611#endif
1612
1613*     ********************************************
1614*     ***         Do a transpose of A          ***
1615*     ***      A(kx,ky,nz(nb)) <- A(kx,nz(nb),ky)      ***
1616*     ********************************************
1617      call D3dB_c_transpose_jk(nb,A,dcpl_mb(tmp2(1)),dbl_mb(tmp3(1)))
1618
1619*     *************************************************
1620*     ***     do fft along ky dimension             ***
1621*     ***   A(kx,ny(nb),nz(nb)) <- fft1d^(-1)[A(kx,ky,nz(nb))]  ***
1622*     *************************************************
1623#ifdef MLIB
1624      !call z1dfft(dbl_mb(tmp3(1)),ny(nb),dcpl_mb(tmp1(1)),-3,ierr)
1625      do q=1,nq(nb)
1626      do i=1,(nx(nb)/2+1)
1627         do j=1,ny(nb)
1628            indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
1629            dcpl_mb(tmp2(1)+j-1) = A(indx)
1630         end do
1631         call z1dfft(dcpl_mb(tmp2(1)),ny(nb),
1632     >               dcpl_mb(tmpy(1,nb)),-2,ierr)
1633         do j=1,ny(nb)
1634            indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
1635            A(indx) = dcpl_mb(tmp2(1)+j-1)
1636         end do
1637      end do
1638      end do
1639      !call dscal((nx(nb)+2)*ny(nb)*nq(nb),dble(ny(nb)),A,1)
1640#else
1641
1642#ifdef FFTW3
1643      do q=1,nq(nb)
1644         indx = 1+(q-1)*nxhy
1645         call dfftw_execute_dft(plans(2,nb),A(indx),A(indx))
1646      end do
1647#else
1648      !call dcffti(ny(nb),dcpl_mb(tmp1(1)))
1649      indx0 = 0
1650      do q=1,nq(nb)
1651      do i=1,(nx(nb)/2+1)
1652
1653         indx  = i + indx0
1654         indx1 = indx
1655         do j=1,ny(nb)
1656            dcpl_mb(tmp2(1)+j-1) = A(indx)
1657            indx = indx + nxh
1658         end do
1659         call dcfftb(ny(nb),dcpl_mb(tmp2(1)),dcpl_mb(tmpy(1,nb)))
1660         do j=1,ny(nb)
1661            A(indx1) = dcpl_mb(tmp2(1)+j-1)
1662            indx1 = indx1 + nxh
1663         end do
1664
1665      end do
1666      indx0 = indx0 + nxhy
1667      end do
1668#endif
1669#endif
1670
1671*     *************************************************
1672*     ***     do fft along kx dimension             ***
1673*     ***   A(nx(nb),ny(nb),nz(nb)) <- fft1d^(-1)[A(kx,ny(nb),nz(nb))]  ***
1674*     *************************************************
1675#ifdef MLIB
1676      !call drc1ft (dbl_mb(tmp3(1)),nx(nb),dcpl_mb(tmp1(1)),-3,ierr)
1677      do q=1,nq(nb)
1678      do j=1,ny(nb)
1679         indx = 1 + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
1680         call drc1ft(A(indx),nx(nb),dcpl_mb(tmpx(1,nb)),-2,ierr)
1681      end do
1682      end do
1683c     call drcfts(A,nx(nb),1,ny(nb)*nq(nb),
1684c    >                  nx(nb)+2,-2,ierr)
1685c     call dscal((nx(nb)+2)*ny(nb)*nq(nb),dble(nx(nb)),A,1)
1686
1687#else
1688
1689#ifdef FFTW3
1690      call dfftw_execute_dft_c2r(plans(3,nb),A,A)
1691
1692#else
1693      !call drffti(nx(nb),dcpl_mb(tmp1(1)))
1694
1695c      do q=1,nq(nb)
1696c      do j=1,ny(nb)
1697c         indx = 1 + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
1698c         call dcopy((nx(nb)+2),A(indx),1,dbl_mb(tmp3(1)),1)
1699c         do i=2,nx(nb)
1700c            dbl_mb(tmp3(1)+i-1) = dbl_mb(tmp3(1)+i)
1701c         end do
1702c         call drfftb(nx(nb),dbl_mb(tmp3(1)),dcpl_mb(tmp1(1)))
1703c         dbl_mb(tmp3(1)+nx(nb)) = 0.0d0
1704c         dbl_mb(tmp3(1)+nx(nb)+1) = 0.0d0
1705c         call dcopy((nx(nb)+2),dbl_mb(tmp3(1)),1,A(indx),1)
1706c      end do
1707c      end do
1708
1709      call cshift1_fftb(nx(nb),ny(nb),nq(nb),1,A)
1710      indx = 1
1711      do q=1,nq(nb)
1712      do j=1,ny(nb)
1713         !indx = 1 + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
1714         call drfftb(nx(nb),A(indx),dcpl_mb(tmpx(1,nb)))
1715         indx = indx + nxh
1716      end do
1717      end do
1718      call zeroend_fftb(nx(nb),ny(nb),nq(nb),1,A)
1719#endif
1720
1721#endif
1722
1723
1724      !*************************
1725      !**** hilbert mapping ****
1726      !*************************
1727      else
1728
1729
1730*     *************************************************
1731*     ***     do fft along kz dimension             ***
1732*     ***   A(nz(nb),kx,ky) <- fft1d^(-1)[A(kz,kx,ky)]  ***
1733*     *************************************************
1734#ifdef MLIB
1735      indx = 1
1736      do q=1,nq3(nb)
1737         !indx = 1 + (q-1)*nz(nb)
1738         call z1dfft(A(indx),nz(nb),dcpl_mb(tmpz(1,nb)),-2,ierr)
1739         indx = indx + nz(nb)
1740      end do
1741#else
1742
1743#ifdef FFTW3
1744      call dfftw_execute_dft(plans(11,nb),A,A)
1745
1746#else
1747
1748      offset=tid*(2*nz(nb)+15)
1749      do i=tid+1,nq3(nb),nthr
1750        call dcfftb(nz(nb),A(1+(i-1)*nz(nb)),dcpl_mb(tmpz(1,nb)+offset))
1751      end do
1752!$OMP BARRIER
1753
1754#endif
1755#endif
1756
1757      call D3dB_c_transpose_ijk(nb,3,A,dcpl_mb(tmp2(1)),dbl_mb(tmp3(1)))
1758
1759*     *************************************************
1760*     ***     do fft along ky dimension             ***
1761*     ***   A(ny(nb),nz(nb),kx) <- fft1d^(-1)[A(ky,nz(nb),kx)]  ***
1762*     *************************************************
1763#ifdef MLIB
1764      indx = 1
1765      do q=1,nq2(nb)
1766         !indx = 1 + (q-1)*ny(nb)
1767         call z1dfft(A(indx),ny(nb),dcpl_mb(tmpy(1,nb)),-2,ierr)
1768         indx = indx + ny(nb)
1769      end do
1770#else
1771
1772#ifdef FFTW3
1773      call dfftw_execute_dft(plans(12,nb),A,A)
1774
1775#else
1776      offset=tid*(2*ny(nb)+15)
1777      do i=tid+1,nq2(nb),nthr
1778        call dcfftb(ny(nb),A(1+(i-1)*ny(nb)),dcpl_mb(tmpy(1,nb)+offset))
1779      end do
1780!$OMP BARRIER
1781#endif
1782#endif
1783
1784      call D3dB_c_transpose_ijk(nb,4,A,dcpl_mb(tmp2(1)),dbl_mb(tmp3(1)))
1785
1786*     *************************************************
1787*     ***     do fft along kx dimension             ***
1788*     ***   A(nx(nb),ny(nb),nz(nb)) <- fft1d^(-1)[A(kx,ny(nb),nz(nb))]  ***
1789*     *************************************************
1790#ifdef MLIB
1791      indx = 1
1792      do q=1,nq1(nb)
1793         !indx = 1 + (q-1)*(nx(nb)/2+1)
1794         call drc1ft(A(indx),nx(nb),dcpl_mb(tmpx(1,nb)),-2,ierr)
1795         indx = indx + nxh
1796      end do
1797#else
1798
1799#ifdef FFTW3
1800      call dfftw_execute_dft_c2r(plans(13,nb),A,A)
1801
1802#else
1803      offset=tid*(2*nx(nb)+15)
1804      call cshift1_fftb(nx(nb),nq1(nb),1,1,A)
1805      do i=tid+1,nq1(nb),nthr
1806        call drfftb(nx(nb),A(1+(i-1)*nxh),dcpl_mb(tmpx(1,nb)+offset))
1807      end do
1808!$OMP BARRIER
1809
1810      call zeroend_fftb(nx(nb),nq1(nb),1,1,A)
1811
1812#endif
1813#endif
1814
1815      end if
1816
1817
1818*     **** deallocate temporary space  ****
1819      value = BA_pop_stack(tmp3(2))
1820      value = value.and.BA_pop_stack(tmp2(2))
1821      !value = BA_pop_stack(tmp1(2))
1822      if (.not. value) call errquit('popping stack memory',0,MA_ERR)
1823
1824      call nwpw_timing_end(1)
1825
1826      return
1827      end
1828
1829      subroutine D3dB_fftbx_sub(n,nx,nxh,tmpx,A)
1830      implicit none
1831      integer n,nx,nxh
1832      real*8     tmpx(2*nx+15)
1833      complex*16 A(nxh,n)
1834      integer i
1835
1836
1837
1838      do i=1,n
1839         call drfftb(nx,A(1,i),tmpx)
1840      end do
1841
1842
1843      return
1844      end
1845
1846      subroutine D3dB_fftby_sub2(n,ny,tmpy,A)
1847      implicit none
1848      integer n,ny
1849      real*8     tmpy(4*ny+15)
1850      complex*16 A(ny,n)
1851      integer i
1852
1853
1854
1855      do i=1,n
1856         call dcfftb(ny,A(1,i),tmpy)
1857      end do
1858
1859
1860      return
1861      end
1862
1863      subroutine D3dB_fftbz_sub2(n,nz,tmpz,A)
1864      implicit none
1865      integer n,nz
1866      real*8     tmpz(4*nz+15)
1867      complex*16 A(nz,n)
1868      integer i
1869
1870
1871
1872      do i=1,n
1873         call dcfftb(nz,A(1,i),tmpz)
1874      end do
1875
1876
1877      return
1878      end
1879
1880      subroutine cshift1_fftb(nx,ny,nq,ne,A)
1881      implicit none
1882      integer nx,ny,nq,ne
1883      real*8 A(*)
1884
1885      integer i,j,indx
1886
1887!$OMP DO
1888      do j=1,(ny*nq*ne)
1889        indx = 1+(j-1)*(nx+2)
1890c        indx = 1 + (j-1)*(nx+2) + (q-1)*(nx+2)*ny
1891c    >            + (n-1)*(nx+2)*ny*nq
1892         do i=2,nx
1893            A(indx+i-1) = A(indx+i)
1894         end do
1895      end do
1896!$OMP END DO
1897
1898c     end do
1899c     end do
1900      return
1901      end
1902
1903
1904      subroutine cshift1_fftb1(nx,A)
1905      implicit none
1906      integer nx
1907      real*8 A(*)
1908      integer i
1909      do i=2,nx
1910         A(i) = A(i+1)
1911      end do
1912      return
1913      end
1914
1915
1916      subroutine zeroend_fftb1(nx,A)
1917      implicit none
1918      integer nx
1919      real*8 A(*)
1920      integer i
1921      A(nx+1) = 0.0d0
1922      A(nx+2) = 0.0d0
1923      return
1924      end
1925
1926
1927      subroutine zeroend_fftb(nx,ny,nq,ne,A)
1928      implicit none
1929      integer nx,ny,nq,ne
1930      real*8 A(*)
1931
1932      integer i,indx
1933
1934!$OMP DO
1935      do i=1,(ny*nq*ne)
1936         indx = nx+1+(i-1)*(nx+2)
1937         A(indx)   = 0.0d0
1938         A(indx+1) = 0.0d0
1939      end do
1940!$OMP END DO
1941
1942      return
1943      end
1944
1945c*     ***********************************
1946c*     *					*
1947c*     *	        D3dB_ncr_fft3b		*
1948c*     *					*
1949c*     ***********************************
1950c
1951c      subroutine D3dB_ncr_fft3b(nb,ne,A)
1952c
1953c*****************************************************
1954c*                                                   *
1955c*      This routine performs the operation of       *
1956c*      a three dimensional complex to complex       *
1957c*      inverse fft                                  *
1958c* A(nx,ny(nb),nz(nb),n) <- FFT3^(-1)[A(kx,ky,kz,n)] *
1959c*                                                   *
1960c*      Entry - 					    *
1961c*              A: a column distribuded 3d block     *
1962c*              tmp: tempory work space must be at   *
1963c*                    least the size of (complex)    *
1964c*                    (nfft*nfft + 1) + 10*nfft      *
1965c*                                                   *
1966*       Exit - A is transformed and the imaginary   *
1967cc*              part of A is set to zero             *
1968c*       uses - D3dB_c_transpose_jk, dcopy           *
1969c*                                                   *
1970c*****************************************************
1971c
1972c      implicit none
1973c      integer nb,ne
1974c      complex*16  A(*)
1975c
1976c#include "bafdecls.fh"
1977c#include "errquit.fh"
1978c
1979c#include "D3dB.fh"
1980c
1981c
1982c      integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS)
1983c      common    / D3dB_fft / tmpx,tmpy,tmpz
1984c
1985c      integer tmp2(2,NBLOCKS),tmp3(2,NBLOCKS)
1986c      common    / D3dB_n_fft / tmp2,tmp3
1987c
1988c*     *** local variables ***
1989c      integer i,j,k,q,n,indx,ierr
1990c
1991cc     complex*16  tmp1(*)
1992cc     complex*16  tmp2(*)
1993cc     real*8      tmp3(*)
1994c      !integer tmp1(2),tmp2(2),tmp3(2)
1995c      logical value
1996c
1997c
1998c      call nwpw_timing_start(1)
1999c
2000c*     ***** allocate temporary space ****
2001c      !call D3dB_nfft3d(nb,nfft3d)
2002c
2003c
2004c*     ********************************************
2005c*     ***         Do a transpose of A          ***
2006c*     ***      A(kx,kz,ky) <- A(kx,ky,kz)      ***
2007c*     ********************************************
2008cc     call D3dB_c_transpose_jk(nb,A,dcpl_mb(tmp2(1)),dbl_mb(tmp3(1,nb)))
2009c
2010c*     *************************************************
2011c*     ***     do fft along kz dimension             ***
2012c*     ***   A(kx,nz(nb),ky) <- fft1d^(-1)[A(kx,kz,ky)]  ***
2013c*     *************************************************
2014c#ifdef MLIB
2015c      do n=1,ne
2016c      do q=1,nq(nb)
2017c      do i=1,(nx(nb)/2+1)
2018c         do k=1,nz(nb)
2019c            indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*nz(nb)
2020c     >               + (n-1)*nfft3d(nb)
2021c            dcpl_mb(tmp2(1,nb)+k-1) = A(indx)
2022c         end do
2023c         call z1dfft(dcpl_mb(tmp2(1,nb)),nz(nb),
2024c     >               dcpl_mb(tmpz(1,nb)),-2,ierr)
2025c         do k=1,nz(nb)
2026c            indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*nz(nb)
2027c     >               + (n-1)*nfft3d(nb)
2028c            A(indx) = dcpl_mb(tmp2(1,nb)+k-1)
2029c         end do
2030c      end do
2031c      end do
2032c      end do
2033c
2034c#else
2035c      do n=1,ne
2036c      do q=1,nq(nb)
2037c      do i=1,(nx(nb)/2+1)
2038c         do k=1,nz(nb)
2039c            indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*nz(nb)
2040c     >               + (n-1)*nfft3d(nb)
2041c            dcpl_mb(tmp2(1,nb)+k-1) = A(indx)
2042c         end do
2043c         call dcfftb(nz(nb),dcpl_mb(tmp2(1,nb)),dcpl_mb(tmpz(1,nb)))
2044c         do k=1,nz(nb)
2045c            indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*nz(nb)
2046c     >               + (n-1)*nfft3d(nb)
2047c            A(indx) = dcpl_mb(tmp2(1,nb)+k-1)
2048c         end do
2049c      end do
2050c      end do
2051c      end do
2052c#endif
2053c
2054c*     ********************************************
2055c*     ***         Do a transpose of A          ***
2056c*     ***      A(kx,ky,nz(nb)) <- A(kx,nz(nb),ky)      ***
2057c*     ********************************************
2058c      do n=1,ne
2059c        indx = 1 + (n-1)*nfft3d(nb)
2060c        call D3dB_c_transpose_jk(nb,A(indx),
2061c     >                           dcpl_mb(tmp2(1,nb)),
2062c     >                           dbl_mb(tmp3(1,nb)))
2063c      end do
2064cc     call D3dB_nc_transpose_jk(nb,ne,A,
2065cc    >                          dcpl_mb(tmp2(1,nb)),
2066cc    >                          dbl_mb(tmp3(1,nb)))
2067c
2068c*     *************************************************
2069c*     ***     do fft along ky dimension             ***
2070c*     ***   A(kx,ny(nb),nz(nb)) <- fft1d^(-1)[A(kx,ky,nz(nb))]  ***
2071c*     *************************************************
2072c#ifdef MLIB
2073c      do n=1,ne
2074c      do q=1,nq(nb)
2075c      do i=1,(nx(nb)/2+1)
2076c         do j=1,ny(nb)
2077c            indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2078c     >               + (n-1)*nfft3d(nb)
2079c            dcpl_mb(tmp2(1,nb)+j-1) = A(indx)
2080c         end do
2081c         call z1dfft(dcpl_mb(tmp2(1,nb)),ny(nb),
2082c     >               dcpl_mb(tmpy(1,nb)),-2,ierr)
2083c         do j=1,ny(nb)
2084c            indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2085c     >               + (n-1)*nfft3d(nb)
2086c            A(indx) = dcpl_mb(tmp2(1,nb)+j-1)
2087c         end do
2088c      end do
2089c      end do
2090c      end do
2091c      !call dscal((nx(nb)+2)*ny(nb)*nq(nb),dble(ny(nb)),A,1)
2092c#else
2093c      do n=1,ne
2094c      do q=1,nq(nb)
2095c      do i=1,(nx(nb)/2+1)
2096c         do j=1,ny(nb)
2097c            indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2098c     >               + (n-1)*nfft3d(nb)
2099c            dcpl_mb(tmp2(1,nb)+j-1) = A(indx)
2100c         end do
2101c         call dcfftb(ny(nb),dcpl_mb(tmp2(1,nb)),dcpl_mb(tmpy(1,nb)))
2102c         do j=1,ny(nb)
2103c            indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2104c     >               + (n-1)*nfft3d(nb)
2105c            A(indx) = dcpl_mb(tmp2(1,nb)+j-1)
2106c         end do
2107c      end do
2108c      end do
2109c      end do
2110c#endif
2111c
2112c*     *************************************************
2113c*     ***     do fft along kx dimension             ***
2114c*     ***   A(nx(nb),ny(nb),nz(nb)) <- fft1d^(-1)[A(kx,ny(nb),nz(nb))]  ***
2115c*     *************************************************
2116c#ifdef MLIB
2117c      do n=1,ne
2118c      do q=1,nq(nb)
2119c      do j=1,ny(nb)
2120c         indx = 1 + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2121c     >            + (n-1)*nfft3d(nb)
2122c         call drc1ft(A(indx),nx(nb),dcpl_mb(tmpx(1,nb)),-2,ierr)
2123c      end do
2124c      end do
2125c      end do
2126c
2127c#else
2128c
2129c      call cshift1_fftb(nx(nb),ny(nb),nq(nb),ne,A)
2130c      do n=1,ne
2131c      do q=1,nq(nb)
2132c      do j=1,ny(nb)
2133c         indx = 1 + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2134c     >            + (n-1)*nfft3d(nb)
2135c         call drfftb(nx(nb),A(indx),dcpl_mb(tmpx(1,nb)))
2136c      end do
2137c      end do
2138c      end do
2139c      call zeroend_fftb(nx(nb),ny(nb),nq(nb),ne,A)
2140c#endif
2141c
2142c      call nwpw_timing_end(1)
2143c      return
2144c      end
2145
2146
2147
2148
2149*     ***********************************
2150*     *					*
2151*     *	        D3dB_rc_fft3f		*
2152*     *					*
2153*     ***********************************
2154
2155      subroutine D3dB_rc_fft3f(nb,A)
2156
2157*****************************************************
2158*                                                   *
2159*      This routine performs the operation of       *
2160*      a three dimensional complex to complex fft   *
2161*           A(kx,ky,kz) <- FFT3[A(nx(nb),ny(nb),nz(nb))]        *
2162*                                                   *
2163*      Entry - 					    *
2164*              A: a column distribuded 3d block     *
2165*              tmp: tempory work space must be at   *
2166*                    least the size of (complex)    *
2167*                    (nfft*nfft + 1) + 10*nfft      *
2168*                                                   *
2169*       Exit - A is transformed                     *
2170*                                                   *
2171*       uses - transpose1 subroutine                *
2172*                                                   *
2173*****************************************************
2174
2175      implicit none
2176      integer nb
2177      complex*16  A(*)
2178
2179#include "bafdecls.fh"
2180#include "errquit.fh"
2181
2182#include "D3dB.fh"
2183
2184      integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS)
2185      common    / D3dB_fft / tmpx,tmpy,tmpz
2186
2187
2188*     *** local variables ***
2189      integer i,j,k,q,indx,indx1
2190      integer nxh,nxhy,nxhz
2191
2192      !integer tmp1(2),tmp2(2),tmp3(2)
2193      integer tmp2(2),tmp3(2)
2194      logical value
2195
2196      integer  tid,nthr,offset,nnn
2197      integer  Parallel_threadid,Parallel_nthreads
2198      external Parallel_threadid,Parallel_nthreads
2199
2200      tid  = Parallel_threadid()
2201      nthr = Parallel_nthreads()
2202
2203      call nwpw_timing_start(1)
2204
2205
2206*     ***** allocate temporary space ****
2207      !call D3dB_nfft3d(nb,nfft3d)
2208      nnn = nfft3d(nb)
2209      if ((nthr*ny(nb)).gt.nnn) nnn = nthr*ny(nb)
2210      if ((nthr*nz(nb)).gt.nnn) nnn = nthr*nz(nb)
2211      value = BA_push_get(mt_dcpl,nnn,'tmp2',tmp2(2),tmp2(1))
2212c      value = BA_push_get(mt_dcpl,(nfft3d(nb)),'tmp2',tmp2(2),tmp2(1))
2213      value = value.and.
2214     >        BA_push_get(mt_dbl,(n2ft3d(nb)),'tmp3',tmp3(2),tmp3(1))
2215      if (.not. value) call errquit('out of stack memory',0, MA_ERR)
2216
2217      nxh = (nx(nb)/2+1)
2218      nxhz = nxh*nz(nb)
2219      nxhy = nxh*ny(nb)
2220
2221      !**********************
2222      !**** slab mapping ****
2223      !**********************
2224      if (mapping.eq.1) then
2225*     ********************************************
2226*     ***     do fft along nx(nb) dimension        ***
2227*     ***   A(kx,ny(nb),nz(nb)) <- fft1d[A(nx(nb),ny(nb),nz(nb))]  ***
2228*     ********************************************
2229#ifdef MLIB
2230c     do q=1,nq(nb)
2231c     do j=1,ny(nb)
2232c        indx = 1 + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2233c        call drc1ft(A(indx),nx(nb),dcpl_mb(tmpx(1,nb)),1,ierr)
2234c     end do
2235c     end do
2236      call drcfts(A,nx(nb),1,ny(nb)*nq(nb),
2237     >                  nx(nb)+2,1,ierr)
2238
2239#else
2240
2241#ifdef FFTW3
2242      call dfftw_execute_dft_r2c(plans(4,nb),A,A)
2243
2244#else
2245      offset=tid*(2*nx(nb)+15)
2246      do j=tid+1,ny(nb)*nq(nb),nthr
2247         call drfftf(nx(nb),A((j-1)*nxh+1),dcpl_mb(tmpx(1,nb)+offset))
2248      end do
2249      call cshift_fftf(nx(nb),ny(nb),nq(nb),1,A)
2250#endif
2251#endif
2252
2253
2254*     ********************************************
2255*     ***     do fft along ny(nb) dimension        ***
2256*     ***   A(kx,ky,nz(nb)) <- fft1d[A(kx,ny(nb),nz(nb))]  ***
2257*     ********************************************
2258
2259#ifdef MLIB
2260      !call z1dfft(dbl_mb(tmp3(1)),ny(nb),dcpl_mb(tmp1(1)),-3,ierr)
2261      do q=1,nq(nb)
2262      do i=1,(nx(nb)/2+1)
2263         do j=1,ny(nb)
2264            indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2265            dcpl_mb(tmp2(1)+j-1) = A(indx)
2266         end do
2267         !indx = i + (q-1)*(nx(nb)/2+1)*ny(nb)
2268         !call zcopy(ny(nb),A(indx),(nx(nb)/2+1),dcpl_mb(tmp2(1)),1)
2269         call z1dfft(dcpl_mb(tmp2(1)),ny(nb),dcpl_mb(tmpy(1,nb)),1,ierr)
2270         do j=1,ny(nb)
2271            indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2272            A(indx) = dcpl_mb(tmp2(1)+j-1)
2273         end do
2274         !call zcopy(ny(nb),dcpl_mb(tmp2(1)),1,A(indx),(nx(nb)/2+1))
2275      end do
2276      end do
2277ccc   *** this should be faster but isn't  ***
2278c      do i=1,(nx(nb)/2+1)
2279c        !indx = 1 + (q-1)*(nx(nb)/2+1)*ny(nb)
2280c        call zffts(A(i),ny(nb),(nx(nb)/2+1),nq(nb),
2281c     >           (nx(nb)/2+1)*nq(nb),1,ierr)
2282c      end do
2283#else
2284
2285#ifdef FFTW3
2286      do q=1,nq(nb)
2287         indx = 1+(q-1)*nxhy
2288         call dfftw_execute_dft(plans(5,nb),A(indx),A(indx))
2289      end do
2290
2291#else
2292      offset=tid*(2*ny(nb)+15)
2293      do i=tid+1,nxh,nthr
2294      indx = i
2295      indx1= i
2296      do q=1,nq(nb)
2297         do j=1,ny(nb)
2298            !indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2299            dcpl_mb(tmp2(1)+j-1+tid*ny(nb)) = A(indx)
2300            indx = indx + nxh
2301         end do
2302
2303         call dcfftf(ny(nb),dcpl_mb(tmp2(1)+tid*ny(nb)),
2304     >                      dcpl_mb(tmpy(1,nb)+offset))
2305
2306         do j=1,ny(nb)
2307            !indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2308            A(indx1) = dcpl_mb(tmp2(1)+j-1+tid*ny(nb))
2309            indx1 = indx1 + nxh
2310         end do
2311      end do
2312      end do
2313
2314#endif
2315#endif
2316
2317*     ********************************************
2318*     ***         Do a transpose of A          ***
2319*     ***      A(ky,nz(nb),ky) <- A(kx,ky,nz(nb))      ***
2320*     ********************************************
2321      call D3dB_c_transpose_jk(nb,A,dcpl_mb(tmp2(1)),dbl_mb(tmp3(1)))
2322
2323
2324*     ********************************************
2325*     ***     do fft along nz(nb) dimension        ***
2326*     ***   A(kx,kz,ky) <- fft1d[A(kx,nz(nb),ky)]  ***
2327*     ********************************************
2328#ifdef MLIB
2329      !call z1dfft(dbl_mb(tmp3(1)),nz(nb),dcpl_mb(tmp1(1)),-3,ierr)
2330      do q=1,nq(nb)
2331      do i=1,(nx(nb)/2+1)
2332         do k=1,nz(nb)
2333            indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2334            dcpl_mb(tmp2(1)+k-1) = A(indx)
2335         end do
2336         call z1dfft(dcpl_mb(tmp2(1)),nz(nb),dcpl_mb(tmpz(1,nb)),1,ierr)
2337         do k=1,nz(nb)
2338            indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2339            A(indx) = dcpl_mb(tmp2(1)+k-1)
2340         end do
2341      end do
2342      end do
2343#else
2344
2345#ifdef FFTW3
2346      do q=1,nq(nb)
2347         indx = 1+(q-1)*nxhz
2348         call dfftw_execute_dft(plans(6,nb),A(indx),A(indx))
2349      end do
2350
2351#else
2352      offset=tid*(2*nz(nb)+15)
2353      do i=tid+1,nxh,nthr
2354      indx  = i
2355      indx1 = i
2356      do q=1,nq(nb)
2357
2358         do k=1,nz(nb)
2359            !indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2360            dcpl_mb(tmp2(1)+k-1+tid*nz(nb)) = A(indx)
2361            indx = indx + nxh
2362         end do
2363         call dcfftf(nz(nb),dcpl_mb(tmp2(1)+tid*nz(nb)),
2364     >                      dcpl_mb(tmpz(1,nb)+offset))
2365         do k=1,nz(nb)
2366            !indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2367            A(indx1) = dcpl_mb(tmp2(1)+k-1+tid*nz(nb))
2368            indx1 = indx1 + nxh
2369         end do
2370
2371      end do
2372      end do
2373
2374#endif
2375#endif
2376
2377*     ********************************************
2378*     ***         Do a transpose of A          ***
2379*     ***      A(kx,ky,kz) <- A(kx,kz,ky)      ***
2380*     ********************************************
2381c     call D3dB_c_transpose_jk(nb,A,dcpl_mb(tmp2(1)),dbl_mb(tmp3(1)))
2382
2383
2384
2385
2386
2387      !*************************
2388      !**** hilbert mapping ****
2389      !*************************
2390      else
2391
2392*     ********************************************
2393*     ***     do fft along nx(nb) dimension        ***
2394*     ***   A(kx,ny(nb),nz(nb)) <- fft1d[A(nx(nb),ny(nb),nz(nb))]  ***
2395*     ********************************************
2396#ifdef MLIB
2397      call drcfts(A,nx(nb),1,nq1(nb),
2398     >                  nx(nb)+2,1,ierr)
2399#else
2400
2401#ifdef FFTW3
2402      call dfftw_execute_dft_r2c(plans(14,nb),A,A)
2403#else
2404
2405      offset=tid*(2*nx(nb)+15)
2406      do i=tid+1,nq1(nb),nthr
2407        call drfftf(nx(nb),A(1+(i-1)*nxh),dcpl_mb(tmpx(1,nb)+offset))
2408      end do
2409      call cshift_fftf(nx(nb),nq1(nb),1,1,A)
2410
2411
2412#endif
2413#endif
2414
2415      call D3dB_c_transpose_ijk(nb,1,A,dcpl_mb(tmp2(1)),dbl_mb(tmp3(1)))
2416
2417*     ********************************************
2418*     ***     do fft along ny(nb) dimension        ***
2419*     ***   A(ky,nz(nb),kx) <- fft1d[A(ny(nb),nz(nb),kx)]  ***
2420*     ********************************************
2421#ifdef MLIB
2422      indx = 1
2423      do q=1,nq2(nb)
2424         !indx = 1 + (q-1)*ny(nb)
2425         call z1dfft(A(indx),ny(nb),dcpl_mb(tmpy(1,nb)),1,ierr)
2426         indx = indx + ny(nb)
2427      end do
2428#else
2429
2430#ifdef FFTW3
2431      call dfftw_execute_dft(plans(15,nb),A,A)
2432
2433#else
2434      offset=tid*(2*ny(nb)+15)
2435      do i=tid+1,nq2(nb),nthr
2436        call dcfftf(ny(nb),A(1+(i-1)*ny(nb)),dcpl_mb(tmpy(1,nb)+offset))
2437      end do
2438!$OMP BARRIER
2439#endif
2440#endif
2441
2442      call D3dB_c_transpose_ijk(nb,2,A,dcpl_mb(tmp2(1)),dbl_mb(tmp3(1)))
2443
2444*     ********************************************
2445*     ***     do fft along nz(nb) dimension        ***
2446*     ***   A(kz,kx,ky) <- fft1d[A(nz(nb),kx,ky)]  ***
2447*     ********************************************
2448#ifdef MLIB
2449      indx = 1
2450      do q=1,nq3(nb)
2451         !indx = 1 + (q-1)*nz(nb)
2452         call z1dfft(A(indx),nz(nb),dcpl_mb(tmpz(1,nb)),1,ierr)
2453         indx = indx + nz(nb)
2454      end do
2455#else
2456
2457#ifdef FFTW3
2458      call dfftw_execute_dft(plans(16,nb),A,A)
2459#else
2460      offset=tid*(2*nz(nb)+15)
2461      do i=tid+1,nq3(nb),nthr
2462        call dcfftf(nz(nb),A(1+(i-1)*nz(nb)),dcpl_mb(tmpz(1,nb)+offset))
2463      end do
2464!$OMP BARRIER
2465
2466#endif
2467#endif
2468
2469      end if
2470
2471*     **** deallocate temporary space  ****
2472      value = BA_pop_stack(tmp3(2))
2473      value = BA_pop_stack(tmp2(2))
2474      !value = BA_pop_stack(tmp1(2))
2475
2476      call nwpw_timing_end(1)
2477
2478      return
2479      end
2480
2481      subroutine D3dB_fftfx_sub(n,nx,nxh,tmpx,A)
2482      implicit none
2483      integer n,nx,nxh
2484      real*8     tmpx(2*nx+15)
2485      complex*16 A(nxh,n)
2486      integer i
2487
2488
2489
2490      do i=1,n
2491         call drfftf(nx,A(1,i),tmpx)
2492      end do
2493
2494
2495      return
2496      end
2497
2498      subroutine  D3dB_fftfy_sub2(n,ny,tmpy,A)
2499      implicit none
2500      integer n,ny
2501      real*8     tmpy(4*ny+15)
2502      complex*16 A(ny,n)
2503      integer i,indx
2504
2505
2506
2507      do i=1,n
2508         call dcfftf(ny,A(1,i),tmpy)
2509      end do
2510
2511
2512      return
2513      end
2514
2515      subroutine  D3dB_fftfz_sub2(n,nz,tmpz,A)
2516      implicit none
2517      integer n,nz
2518      real*8     tmpz(4*nz+15)
2519      complex*16 A(nz,n)
2520      integer i
2521
2522
2523
2524      do i=1,n
2525         call dcfftf(nz,A(1,i),tmpz)
2526      end do
2527
2528
2529      return
2530      end
2531
2532
2533
2534
2535
2536      subroutine cshift_fftf(nx,ny,nq,ne,A)
2537      implicit none
2538      integer nx,ny,nq,ne
2539      real*8 A(*)
2540
2541      integer i,j,indx
2542
2543!$OMP BARRIER
2544!$OMP DO
2545      do j=1,(ny*nq*ne)
2546        indx = 1+(j-1)*(nx+2)
2547c        indx = 1 + (j-1)*(nx+2) + (q-1)*(nx+2)*ny
2548c     >        + (n-1)*(nx+2)*ny*nq
2549
2550         do i=nx,2,-1
2551            A(indx+i) = A(indx+i-1)
2552         end do
2553         A(indx+1)    = 0.0d0
2554         A(indx+nx+1) = 0.0d0
2555!         indx = indx + (nx+2)
2556      end do
2557!$OMP END DO
2558
2559      return
2560      end
2561
2562
2563      subroutine cshift_fftf_ab(nx,ny,nq,ne,A,B)
2564      implicit none
2565      integer nx,ny,nq,ne
2566      real*8 A(*)
2567      real*8 B(*)
2568
2569      integer i,j,indx
2570
2571      indx = 1
2572      do j=1,(ny*nq*ne)
2573CDIR$ NOVECTOR
2574         do i=nx,2,-1
2575            B(indx+i) = A(indx+i-1)
2576         end do
2577         B(indx+1)    = 0.0d0
2578         B(indx+nx+1) = 0.0d0
2579         indx = indx + (nx+2)
2580      end do
2581
2582      return
2583      end
2584
2585
2586
2587
2588c*     ***********************************
2589c*     *					*
2590c*     *	        D3dB_nrc_fft3f		*
2591c*     *					*
2592c*     ***********************************
2593c
2594c      subroutine D3dB_nrc_fft3f(nb,ne,A)
2595c
2596c*****************************************************
2597c*                                                   *
2598c*      This routine performs the operation of       *
2599c*      a three dimensional complex to complex fft   *
2600c*           A(kx,ky,kz) <- FFT3[A(nx(nb),ny(nb),nz(nb))]        *
2601c*                                                   *
2602c*      Entry - 					    *
2603c*              A: a column distribuded 3d block     *
2604c*              tmp: tempory work space must be at   *
2605c*                    least the size of (complex)    *
2606c*                    (nfft*nfft + 1) + 10*nfft      *
2607c*                                                   *
2608c*       Exit - A is transformed                     *
2609c*                                                   *
2610c*       uses - transpose1 subroutine                *
2611c*                                                   *
2612c*****************************************************
2613c
2614c      implicit none
2615c      integer nb,ne
2616c      complex*16  A(*)
2617c
2618c#include "bafdecls.fh"
2619c#include "errquit.fh"
2620c
2621c#include "D3dB.fh"
2622c
2623c      integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS)
2624c      common    / D3dB_fft / tmpx,tmpy,tmpz
2625c
2626c*     *** local variables ***
2627c      integer i,j,k,q,n,indx,ierr
2628c
2629c      !integer tmp1(2),tmp2(2),tmp3(2)
2630c      integer tmp2(2),tmp3(2)
2631c      logical value
2632c
2633c
2634c      call nwpw_timing_start(1)
2635c
2636c
2637c*     ***** allocate temporary space ****
2638c      !call D3dB_nfft3d(nb,nfft3d)
2639c      value = BA_push_get(mt_dcpl,(nfft3d(nb)),'tmp2',tmp2(2),tmp2(1))
2640c      value = value.and.
2641c     >        BA_push_get(mt_dbl,(n2ft3d(nb)),'tmp3',tmp3(2),tmp3(1))
2642c      if (.not. value) call errquit('out of stack memory',0, MA_ERR)
2643c
2644c
2645c*     ********************************************
2646c*     ***     do fft along nx(nb) dimension        ***
2647c*     ***   A(kx,ny(nb),nz(nb)) <- fft1d[A(nx(nb),ny(nb),nz(nb))]  ***
2648c*     ********************************************
2649c#ifdef MLIB
2650c      call drcfts(A,nx(nb),1,ny(nb)*nq(nb)*ne,
2651c     >                  nx(nb)+2,1,ierr)
2652c
2653c#else
2654c      !call drffti(nx(nb),dcpl_mb(tmp1(1)))
2655c      do n=1,ne
2656c      do q=1,nq(nb)
2657c      do j=1,ny(nb)
2658c         indx = 1 + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2659c     >        + (n-1)*nfft3d(nb)
2660c
2661c
2662c         call drfftf(nx(nb),A(indx),dcpl_mb(tmpx(1,nb)))
2663c      end do
2664c      end do
2665c      end do
2666c      call cshift_fftf(nx(nb),ny(nb),nq(nb),ne,A)
2667c#endif
2668c
2669c
2670c*     ********************************************
2671c*     ***     do fft along ny(nb) dimension        ***
2672c*     ***   A(kx,ky,nz(nb)) <- fft1d[A(kx,ny(nb),nz(nb))]  ***
2673c*     ********************************************
2674c
2675c#ifdef MLIB
2676c      !call z1dfft(dbl_mb(tmp3(1)),ny(nb),dcpl_mb(tmp1(1)),-3,ierr)
2677c      do n=1,ne
2678c      do q=1,nq(nb)
2679c      do i=1,(nx(nb)/2+1)
2680c         do j=1,ny(nb)
2681c            indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2682c     >               + (n-1)*nfft3d(nb)
2683c            dcpl_mb(tmp2(1)+j-1) = A(indx)
2684c         end do
2685c         !indx = i + (q-1)*(nx(nb)/2+1)*ny(nb)
2686c         !call zcopy(ny(nb),A(indx),(nx(nb)/2+1),dcpl_mb(tmp2(1)),1)
2687c         call z1dfft(dcpl_mb(tmp2(1)),ny(nb),dcpl_mb(tmpy(1,nb)),1,ierr)
2688c         do j=1,ny(nb)
2689c            indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2690c     >               + (n-1)*nfft3d(nb)
2691c            A(indx) = dcpl_mb(tmp2(1)+j-1)
2692c         end do
2693c         !call zcopy(ny(nb),dcpl_mb(tmp2(1)),1,A(indx),(nx(nb)/2+1))
2694c      end do
2695c      end do
2696c      end do
2697cccc   *** this should be faster but isn't  ***
2698cc      do i=1,(nx(nb)/2+1)
2699cc        !indx = 1 + (q-1)*(nx(nb)/2+1)*ny(nb)
2700cc        call zffts(A(i),ny(nb),(nx(nb)/2+1),nq(nb),
2701cc     >           (nx(nb)/2+1)*nq(nb),1,ierr)
2702cc      end do
2703c#else
2704c      !call dcffti(ny(nb),dcpl_mb(tmp1(1)))
2705c      do n=1,ne
2706c      do q=1,nq(nb)
2707c      do i=1,(nx(nb)/2+1)
2708c         do j=1,ny(nb)
2709c            indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2710c     >               + (n-1)*nfft3d(nb)
2711c            dcpl_mb(tmp2(1)+j-1) = A(indx)
2712c         end do
2713c         call dcfftf(ny(nb),dcpl_mb(tmp2(1)),dcpl_mb(tmpy(1,nb)))
2714c         do j=1,ny(nb)
2715c            indx = i + (j-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2716c     >               + (n-1)*nfft3d(nb)
2717c            A(indx) = dcpl_mb(tmp2(1)+j-1)
2718c         end do
2719c      end do
2720c      end do
2721c      end do
2722c#endif
2723c
2724c
2725c*     ********************************************
2726c*     ***         Do a transpose of A          ***
2727c*     ***      A(ky,nz(nb),ky) <- A(kx,ky,nz(nb))      ***
2728c*     ********************************************
2729c      do n=1,ne
2730c        indx = 1 + (n-1)*nfft3d(nb)
2731c        call D3dB_c_transpose_jk(nb,A(indx),
2732c     >                           dcpl_mb(tmp2(1)),
2733c     >                           dbl_mb(tmp3(1)))
2734c      end do
2735c
2736c
2737c*     ********************************************
2738c*     ***     do fft along nz(nb) dimension        ***
2739c*     ***   A(kx,kz,ky) <- fft1d[A(kx,nz(nb),ky)]  ***
2740c*     ********************************************
2741c#ifdef MLIB
2742c      !call z1dfft(dbl_mb(tmp3(1)),nz(nb),dcpl_mb(tmp1(1)),-3,ierr)
2743c      do n=1,ne
2744c      do q=1,nq(nb)
2745c      do i=1,(nx(nb)/2+1)
2746c         do k=1,nz(nb)
2747c            indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2748c     >               + (n-1)*nfft3d(nb)
2749c            dcpl_mb(tmp2(1)+k-1) = A(indx)
2750c         end do
2751c         call z1dfft(dcpl_mb(tmp2(1)),nz(nb),dcpl_mb(tmpz(1,nb)),1,ierr)
2752c         do k=1,nz(nb)
2753c            indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2754c     >               + (n-1)*nfft3d(nb)
2755c            A(indx) = dcpl_mb(tmp2(1)+k-1)
2756c         end do
2757c      end do
2758c      end do
2759c      end do
2760c#else
2761c      !call dcffti(nz(nb),dcpl_mb(tmp1(1)))
2762c      do n=1,ne
2763c      do q=1,nq(nb)
2764c      do i=1,(nx(nb)/2+1)
2765c         do k=1,nz(nb)
2766c            indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2767c     >               + (n-1)*nfft3d(nb)
2768c            dcpl_mb(tmp2(1)+k-1) = A(indx)
2769c         end do
2770c         call dcfftf(nz(nb),dcpl_mb(tmp2(1)),dcpl_mb(tmpz(1,nb)))
2771c         do k=1,nz(nb)
2772c            indx = i + (k-1)*(nx(nb)/2+1) + (q-1)*(nx(nb)/2+1)*ny(nb)
2773c     >               + (n-1)*nfft3d(nb)
2774c            A(indx) = dcpl_mb(tmp2(1)+k-1)
2775c         end do
2776c      end do
2777c      end do
2778c      end do
2779c#endif
2780c
2781c*     ********************************************
2782c*     ***         Do a transpose of A          ***
2783c*     ***      A(kx,ky,kz) <- A(kx,kz,ky)      ***
2784c*     ********************************************
2785cc     call D3dB_c_transpose_jk(nb,A,dcpl_mb(tmp2(1)),dbl_mb(tmp3(1)))
2786c
2787c
2788c*     **** deallocate temporary space  ****
2789c      value = BA_pop_stack(tmp3(2))
2790c      value = BA_pop_stack(tmp2(2))
2791c      !value = BA_pop_stack(tmp1(2))
2792
2793c      call nwpw_timing_end(1)
2794c      return
2795c      end
2796
2797
2798
2799
2800*     ***********************************
2801*     *					*
2802*     *	       D3dB_(c,r,t)_SMul 	*
2803*     *					*
2804*     ***********************************
2805
2806*  This routine performs the operation	C = scale * A
2807* where scale is a real*8 number.
2808
2809      subroutine D3dB_c_SMul(nb,scale,A,C)
2810      implicit none
2811      integer    nb
2812      real*8     scale
2813      complex*16 A(*)
2814      complex*16 C(*)
2815
2816#include "D3dB.fh"
2817
2818      integer i
2819
2820      do i=1,nfft3d_map(nb)
2821         C(i) = scale*A(i)
2822      end do
2823      return
2824      end
2825
2826
2827      subroutine D3dB_c_SMul1(nb,scale,A)
2828      implicit none
2829      integer    nb
2830      real*8     scale
2831      complex*16 A(*)
2832
2833#include "D3dB.fh"
2834
2835      integer i
2836
2837      do i=1,nfft3d_map(nb)
2838         A(i) = scale*A(i)
2839      end do
2840      return
2841      end
2842
2843
2844
2845      subroutine D3dB_r_SMul(nb,scale,A,C)
2846      implicit none
2847      integer nb
2848      real*8     scale
2849      real*8 A(*)
2850      real*8 C(*)
2851
2852#include "D3dB.fh"
2853
2854      integer i
2855
2856!$OMP DO
2857      do i=1,n2ft3d_map(nb)
2858         C(i) = scale*A(i)
2859      end do
2860!$OMP END DO
2861      return
2862      end
2863
2864      subroutine D3dB_r_SMul1(nb,scale,A)
2865      implicit none
2866      integer nb
2867      real*8     scale
2868      real*8 A(*)
2869
2870#include "D3dB.fh"
2871
2872      integer i
2873
2874!$OMP DO
2875      do i=1,n2ft3d_map(nb)
2876         A(i) = scale*A(i)
2877      end do
2878!$OMP END DO
2879      return
2880      end
2881
2882
2883      subroutine D3dB_t_SMul(nb,scale,A,C)
2884      implicit none
2885      integer nb
2886      real*8 scale
2887      real*8 A(*)
2888      real*8 C(*)
2889
2890#include "D3dB.fh"
2891
2892      integer i
2893
2894      do i=1,nfft3d_map(nb)
2895         C(i) = scale*A(i)
2896      end do
2897      return
2898      end
2899
2900
2901      subroutine D3dB_t_SMul1(nb,scale,A)
2902      implicit none
2903      integer nb
2904      real*8 scale
2905      real*8 A(*)
2906
2907#include "D3dB.fh"
2908
2909      integer i
2910
2911      do i=1,nfft3d_map(nb)
2912         A(i) = scale*A(i)
2913      end do
2914      return
2915      end
2916
2917
2918
2919      subroutine D3dB_c_ZMul(nb,scale,A,C)
2920      implicit none
2921      integer    nb
2922      complex*16 scale
2923      complex*16 A(*)
2924      complex*16 C(*)
2925
2926#include "D3dB.fh"
2927
2928
2929      integer i
2930
2931      do i=1,nfft3d_map(nb)
2932         C(i) = scale*A(i)
2933      end do
2934      return
2935      end
2936
2937      subroutine D3dB_r_Power1(nb,y,A)
2938      implicit none
2939      integer nb
2940      real*8  y
2941      real*8 A(*)
2942
2943#include "D3dB.fh"
2944
2945      integer ii
2946
2947      do ii=1,n2ft3d_map(nb)
2948         A(ii) = A(ii)**y
2949      end do
2950
2951      return
2952      end
2953
2954
2955      subroutine D3dB_rr_Power(nb,y,A,B)
2956      implicit none
2957      integer nb
2958      real*8  y
2959      real*8 A(*)
2960      real*8 B(*)
2961
2962#include "D3dB.fh"
2963
2964      integer i
2965
2966      do i=1,n2ft3d_map(nb)
2967         B(i) = A(i)**y
2968      end do
2969      return
2970      end
2971
2972
2973
2974
2975
2976*     ***********************************
2977*     *					*
2978*     *	       D3dB_ct_Sqr	 	*
2979*     *					*
2980*     ***********************************
2981
2982*  This routine performs the operation	C = A * A
2983
2984      subroutine D3dB_ct_Sqr(nb,A,C)
2985      implicit none
2986      integer    nb
2987      complex*16 A(*)
2988      real*8     C(*)
2989
2990#include "D3dB.fh"
2991
2992      integer i
2993
2994      do i=1,nfft3d_map(nb)
2995         C(i) = dble(A(i))**2 + dimag(A(i))**2
2996      end do
2997      return
2998      end
2999
3000*     ***********************************
3001*     *					*
3002*     *	       D3dB_rr_Sqr	 	*
3003*     *					*
3004*     ***********************************
3005
3006      subroutine D3dB_rr_Sqr(nb,A,C)
3007      implicit none
3008      integer nb
3009      real*8 A(*)
3010      real*8 C(*)
3011
3012#include "D3dB.fh"
3013
3014
3015      integer i
3016
3017!$OMP DO
3018      do i=1,n2ft3d_map(nb)
3019         C(i) = A(i)**2
3020      end do
3021!$OMP END DO
3022      return
3023      end
3024
3025
3026*     ***********************************
3027*     *                                 *
3028*     *        D3dB_rr_SqrAdd           *
3029*     *                                 *
3030*     ***********************************
3031
3032      subroutine D3dB_rr_SqrAdd(nb,A,C)
3033      implicit none
3034      integer nb
3035      real*8 A(*)
3036      real*8 C(*)
3037
3038#include "D3dB.fh"
3039
3040
3041      integer i
3042
3043!$OMP DO
3044      do i=1,n2ft3d_map(nb)
3045         C(i) = C(i) + A(i)**2
3046      end do
3047!$OMP END DO
3048      return
3049      end
3050
3051
3052
3053
3054*     ***********************************
3055*     *                                 *
3056*     *        D3dB_rr_Sqr1             *
3057*     *                                 *
3058*     ***********************************
3059
3060      subroutine D3dB_rr_Sqr1(nb,A)
3061      implicit none
3062      integer nb
3063      real*8 A(*)
3064
3065#include "D3dB.fh"
3066
3067      integer i
3068
3069!$OMP DO
3070      do i=1,n2ft3d_map(nb)
3071         A(i) = A(i)**2
3072      end do
3073!$OMP END DO
3074      return
3075      end
3076
3077
3078*     ***********************************
3079*     *					*
3080*     *	       D3dB_rr_Sqrt	 	*
3081*     *					*
3082*     ***********************************
3083
3084      subroutine D3dB_rr_Sqrt(nb,A,C)
3085      implicit none
3086      integer nb
3087      real*8 A(*)
3088      real*8 C(*)
3089
3090#include "D3dB.fh"
3091
3092      integer i
3093
3094      do i=1,n2ft3d_map(nb)
3095         C(i) = dsqrt(A(i))
3096      end do
3097      return
3098      end
3099
3100
3101
3102*     ***********************************
3103*     *                                 *
3104*     *        D3dB_rr_Sqrt1            *
3105*     *                                 *
3106*     ***********************************
3107
3108      subroutine D3dB_rr_Sqrt1(nb,A)
3109      implicit none
3110      integer nb
3111      real*8 A(*)
3112
3113#include "D3dB.fh"
3114
3115      integer i
3116
3117!$OMP DO
3118      do i=1,n2ft3d_map(nb)
3119         A(i) = dsqrt(A(i))
3120      end do
3121!$OMP END DO
3122      return
3123      end
3124
3125
3126*     ***********************************
3127*     *					*
3128*     *	       D3dB_tt_Sqr	 	*
3129*     *					*
3130*     ***********************************
3131
3132      subroutine D3dB_tt_Sqr(nb,A,C)
3133      implicit none
3134      integer nb
3135      real*8 A(*)
3136      real*8 C(*)
3137
3138#include "D3dB.fh"
3139
3140      integer i
3141
3142      do i=1,nfft3d_map(nb)
3143         C(i) = A(i)**2
3144      end do
3145      return
3146      end
3147
3148
3149
3150*     ***********************************
3151*     *					*
3152*     *	   D3dB_c_transpose_jk_init	*
3153*     *					*
3154*     ***********************************
3155
3156      subroutine D3dB_c_transpose_jk_init(nb)
3157      implicit none
3158      integer nb
3159
3160#include "bafdecls.fh"
3161#include "errquit.fh"
3162#include "D3dB.fh"
3163
3164
3165c     integer iq_to_i1((NFFT1/2+1)*NFFT2*NSLABS)
3166c     integer iq_to_i2((NFFT1/2+1)*NFFT2*NSLABS)
3167c     integer i1_start(NFFT3+1)
3168c     integer i2_start(NFFT3+1)
3169      integer iq_to_i1(2,NBLOCKS)
3170      integer iq_to_i2(2,NBLOCKS)
3171      integer i1_start(2,NBLOCKS)
3172      integer i2_start(2,NBLOCKS)
3173      common / trans_blk / iq_to_i1,iq_to_i2,i1_start,i2_start
3174
3175
3176*     **** local variables ****
3177      integer proc_to,proc_from
3178      integer pto,qto,np,taskid
3179      integer pfrom,qfrom
3180      integer phere,qhere
3181      integer index1,index2,itmp
3182      integer i,j,k,it
3183      logical value
3184
3185
3186*     **** allocate trans_blk common block ****
3187      value = BA_alloc_get(mt_int,((nx(nb)/2+1)*ny(nb)*nq(nb)),
3188     >                     'iq_to_i1',iq_to_i1(2,nb),iq_to_i1(1,nb))
3189      value=value.and.BA_alloc_get(mt_int,((nx(nb)/2+1)*ny(nb)*nq(nb)),
3190     >                     'iq_to_i2',iq_to_i2(2,nb),iq_to_i2(1,nb))
3191      value = value.and.BA_alloc_get(mt_int,(nz(nb)+1),
3192     >                     'i1_start',i1_start(2,nb),i1_start(1,nb))
3193      value = value.and.BA_alloc_get(mt_int,(nz(nb)+1),
3194     >                     'i2_start',i2_start(2,nb),i2_start(1,nb))
3195      if (.not. value)
3196     > call errquit('D3dB_transpose_jk_init:out of heap',0,MA_ERR)
3197
3198      call Parallel2d_taskid_i(taskid)
3199      call Parallel2d_np_i(np)
3200
3201!MATHIAS
3202      index1 = 1
3203      index2 = 1
3204      do it=0,np-1
3205         proc_to   = mod(taskid+it,np)
3206         proc_from = mod(taskid-it+np,np)
3207c        i1_start(it+1) = index1
3208c        i2_start(it+1) = index2
3209         int_mb(i1_start(1,nb)+it) = index1
3210         int_mb(i2_start(1,nb)+it) = index2
3211
3212         do k=1,nz(nb)
3213         do j=1,ny(nb)
3214
3215*           **** packing scheme ****
3216            call D3dB_ktoqp(nb,k,qhere,phere)
3217            call D3dB_ktoqp(nb,j,qto,pto)
3218            if ((phere.eq.taskid).and.(pto.eq.proc_to)) then
3219               do i=1,(nx(nb)/2+1)
3220                  itmp = i + (j-1)*(nx(nb)/2+1)
3221     >                     + (qhere-1)*(nx(nb)/2+1)*ny(nb)
3222c                 iq_to_i1(itmp) = index1
3223                  int_mb(iq_to_i1(1,nb)+itmp-1) = index1
3224                  index1 = index1 + 1
3225               end do
3226            end if
3227
3228*           **** unpacking scheme ****
3229            call D3dB_ktoqp(nb,j,qhere,phere)
3230            call D3dB_ktoqp(nb,k,qfrom,pfrom)
3231            if ((phere.eq.taskid).and.(pfrom.eq.proc_from)) then
3232               do i=1,(nx(nb)/2+1)
3233                  itmp = i + (k-1)*(nx(nb)/2+1)
3234     >                     + (qhere-1)*(nx(nb)/2+1)*ny(nb)
3235c                 iq_to_i2(itmp) = index2
3236                  int_mb(iq_to_i2(1,nb)+itmp-1) = index2
3237                  index2 = index2 + 1
3238               end do
3239            end if
3240         end do
3241         end do
3242      end do
3243c     i1_start(np+1) = index1
3244c     i2_start(np+1) = index2
3245      int_mb(i1_start(1,nb)+np) = index1
3246      int_mb(i2_start(1,nb)+np) = index2
3247
3248
3249      return
3250      end
3251
3252
3253
3254*     ***********************************
3255*     *					*
3256*     *	         D3dB_cc_dot  	 	*
3257*     *					*
3258*     ***********************************
3259
3260      subroutine D3dB_cc_dot(nb,A,B,sumall)
3261      implicit none
3262      integer nb
3263      complex*16 A(*)
3264      complex*16 B(*)
3265      real*8     sumall
3266
3267
3268#include "D3dB.fh"
3269
3270      integer i,j,k,q,index,np,taskid,p
3271      real*8  sum
3272
3273
3274      call nwpw_timing_start(2)
3275
3276      call Parallel2d_np_i(np)
3277
3278*     **** sum up dot product on this node ****
3279      sum = 0.0d0
3280
3281      !**********************
3282      !**** slab mapping ****
3283      !**********************
3284      if (mapping.eq.1) then
3285*     ***** kx!=0 plane, so double count *****
3286      do q=1,nq(nb)
3287         do j=1,ny(nb)
3288         do i=2,(nx(nb)/2+1)
3289            index = (q-1)*(nx(nb)/2+1)*ny(nb)
3290     >            + (j-1)*(nx(nb)/2+1) + i
3291            sum = sum + dble(A(index))  * dble(B(index))
3292     >                + dimag(A(index)) * dimag(B(index))
3293         end do
3294         end do
3295      end do
3296      sum = sum*2.0d0
3297
3298*     ***** kx==0 plane, so single count *****
3299      do q=1,nq(nb)
3300         do j=1,ny(nb)
3301            i=1
3302            index = (q-1)*(nx(nb)/2+1)*ny(nb) + (j-1)*(nx(nb)/2+1) + 1
3303            sum = sum + dble(A(index))  * dble(B(index))
3304     >                + dimag(A(index)) * dimag(B(index))
3305         end do
3306      end do
3307
3308      !*************************
3309      !**** hilbert mapping ****
3310      !*************************
3311      else
3312      call Parallel2d_taskid_i(taskid)
3313*     ***** kx!=0 plane, so double count *****
3314      do index=1,nfft3d_map(nb)
3315            sum = sum + dble(A(index))  * dble(B(index))
3316     >                + dimag(A(index)) * dimag(B(index))
3317      end do
3318      sum = sum*2.0d0
3319
3320*     ***** kx==0 plane, so single count *****
3321      do k=1,nz(nb)
3322      do j=1,ny(nb)
3323         i=1
3324         call D3dB_ijktoindexp(1,i,j,k,index,p)
3325         if (p.eq.taskid) then
3326         sum = sum - dble(A(index))  * dble(B(index))
3327     >             - dimag(A(index)) * dimag(B(index))
3328         end if
3329      end do
3330      end do
3331
3332      end if
3333
3334
3335*     **** add up sums from other nodes ****
3336      if (np.gt.1) then
3337         call D3dB_SumAll(sum)
3338      end if
3339
3340      call nwpw_timing_end(2)
3341
3342      sumall = sum
3343      return
3344      end
3345
3346*     ***********************************
3347*     *					*
3348*     *	         D3dB_cc_idot  	 	*
3349*     *					*
3350*     ***********************************
3351
3352      subroutine D3dB_cc_idot(nb,A,B,sumall)
3353      implicit none
3354      integer nb
3355      complex*16 A(*)
3356      complex*16 B(*)
3357      real*8     sumall
3358
3359
3360#include "D3dB.fh"
3361
3362      integer i,j,k,q,index,np,taskid,p
3363      real*8  sum
3364
3365
3366      call nwpw_timing_start(2)
3367
3368c      call Parallel2d_np_i(np)
3369
3370*     **** sum up dot product on this node ****
3371      sum = 0.0d0
3372
3373      !**********************
3374      !**** slab mapping ****
3375      !**********************
3376      if (mapping.eq.1) then
3377*     ***** kx!=0 plane, so double count *****
3378      do q=1,nq(nb)
3379         do j=1,ny(nb)
3380         do i=2,(nx(nb)/2+1)
3381            index = (q-1)*(nx(nb)/2+1)*ny(nb)
3382     >            + (j-1)*(nx(nb)/2+1) + i
3383            sum = sum + dble(A(index))  * dble(B(index))
3384     >                + dimag(A(index)) * dimag(B(index))
3385         end do
3386         end do
3387      end do
3388      sum = sum*2.0d0
3389
3390*     ***** kx==0 plane, so single count *****
3391      do q=1,nq(nb)
3392         do j=1,ny(nb)
3393            i=1
3394            index = (q-1)*(nx(nb)/2+1)*ny(nb) + (j-1)*(nx(nb)/2+1) + 1
3395            sum = sum + dble(A(index))  * dble(B(index))
3396     >                + dimag(A(index)) * dimag(B(index))
3397         end do
3398      end do
3399
3400
3401      !*************************
3402      !**** hilbert mapping ****
3403      !*************************
3404      else
3405      call Parallel2d_taskid_i(taskid)
3406*     ***** kx!=0 plane, so double count *****
3407      do index=1,nfft3d_map(nb)
3408            sum = sum + dble(A(index))  * dble(B(index))
3409     >                + dimag(A(index)) * dimag(B(index))
3410      end do
3411      sum = sum*2.0d0
3412
3413*     ***** kx==0 plane, so single count *****
3414      do k=1,nz(nb)
3415      do j=1,ny(nb)
3416         i=1
3417         call D3dB_ijktoindexp(1,i,j,k,index,p)
3418         if (p.eq.taskid) then
3419         sum = sum - dble(A(index))  * dble(B(index))
3420     >             - dimag(A(index)) * dimag(B(index))
3421         end if
3422      end do
3423      end do
3424
3425      end if
3426
3427
3428*     **** do not add up sums from other nodes ****
3429
3430      call nwpw_timing_end(2)
3431
3432      sumall = sum
3433      return
3434      end
3435
3436*     ***********************************
3437*     *					*
3438*     *	         D3dB_tt_dot  	 	*
3439*     *					*
3440*     ***********************************
3441
3442      subroutine D3dB_tt_dot(nb,A,B,sumall)
3443      implicit none
3444      integer nb
3445      real*8 A(*)
3446      real*8 B(*)
3447      real*8 sumall
3448
3449#include "D3dB.fh"
3450
3451      integer i,j,k,q,index,np,nxh,taskid,p
3452      real*8  sum
3453
3454      nxh=nx(nb)/2
3455      call Parallel2d_np_i(np)
3456
3457*     **** sum up dot product on this node ****
3458      sum = 0.0d0
3459
3460      !**********************
3461      !**** slab mapping ****
3462      !**********************
3463      if (mapping.eq.1) then
3464*     ***** k!=0 plane, so double count *****
3465      do q=1,nq(nb)
3466         do j=1,ny(nb)
3467         do i=2,(nxh+1)
3468            index = (q-1)*(nxh+1)*ny(nb) + (j-1)*(nxh+1) + i
3469            sum = sum + A(index)*B(index)
3470         end do
3471         end do
3472      end do
3473      sum = sum*2.0d0
3474
3475*     **** kx==0 plane, so single count *****
3476      do q=1,nq(nb)
3477         do j=1,ny(nb)
3478            i=1
3479            index = (q-1)*(nxh+1)*ny(nb) + (j-1)*(nxh+1) + 1
3480            sum = sum + A(index)*B(index)
3481         end do
3482      end do
3483
3484
3485      !*************************
3486      !**** hilbert mapping ****
3487      !*************************
3488      else
3489      call Parallel2d_taskid_i(taskid)
3490*     ***** kx!=0 plane, so double count *****
3491      do index=1,nfft3d_map(nb)
3492            sum = sum + A(index)*B(index)
3493      end do
3494      sum = sum*2.0d0
3495
3496*     ***** kx==0 plane, so single count *****
3497      do k=1,nz(nb)
3498      do j=1,ny(nb)
3499         i=1
3500         call D3dB_ijktoindexp(1,i,j,k,index,p)
3501         if (p.eq.taskid) then
3502         sum = sum - A(index)*B(index)
3503         end if
3504      end do
3505      end do
3506
3507      end if
3508
3509
3510*     **** add up sums from other nodes ****
3511      if (np.gt.1) then
3512         call D3dB_SumAll(sum)
3513      end if
3514
3515      sumall = sum
3516      return
3517      end
3518
3519
3520*     ***********************************
3521*     *					*
3522*     *	         D3dB_tt_idot  	 	*
3523*     *					*
3524*     ***********************************
3525
3526      subroutine D3dB_tt_idot(nb,A,B,sumall)
3527      implicit none
3528      integer nb
3529      real*8 A(*)
3530      real*8 B(*)
3531      real*8 sumall
3532
3533
3534#include "D3dB.fh"
3535
3536      integer i,j,k,q,index,np,nxh,taskid,p
3537      real*8  sum
3538
3539
3540      call nwpw_timing_start(2)
3541
3542      nxh=nx(nb)/2
3543c      call Parallel2d_np_i(np)
3544
3545*     **** sum up dot product on this node ****
3546      sum = 0.0d0
3547
3548      !**********************
3549      !**** slab mapping ****
3550      !**********************
3551      if (mapping.eq.1) then
3552*     ***** k!=0 plane, so double count *****
3553      do q=1,nq(nb)
3554         do j=1,ny(nb)
3555         do i=2,(nxh+1)
3556            index = (q-1)*(nxh+1)*ny(nb) + (j-1)*(nxh+1) + i
3557            sum = sum + A(index)*B(index)
3558         end do
3559         end do
3560      end do
3561      sum = sum*2.0d0
3562
3563*     **** kx==0 plane, so single count *****
3564      do q=1,nq(nb)
3565         do j=1,ny(nb)
3566            i=1
3567            index = (q-1)*(nxh+1)*ny(nb) + (j-1)*(nxh+1) + 1
3568            sum = sum + A(index)*B(index)
3569         end do
3570      end do
3571
3572
3573
3574      !*************************
3575      !**** hilbert mapping ****
3576      !*************************
3577      else
3578      call Parallel2d_taskid_i(taskid)
3579*     ***** kx!=0 plane, so double count *****
3580      do index=1,nfft3d_map(nb)
3581            sum = sum + A(index)*B(index)
3582      end do
3583      sum = sum*2.0d0
3584
3585*     ***** kx==0 plane, so single count *****
3586      do k=1,nz(nb)
3587      do j=1,ny(nb)
3588         i=1
3589         call D3dB_ijktoindexp(1,i,j,k,index,p)
3590         if (p.eq.taskid) then
3591         sum = sum - A(index)*B(index)
3592         end if
3593      end do
3594      end do
3595
3596      end if
3597
3598
3599
3600*     **** !!!! do not add up sums from other nodes ****
3601
3602      call nwpw_timing_end(2)
3603
3604      sumall = sum
3605      return
3606      end
3607
3608
3609
3610*     ***********************************
3611*     *					*
3612*     *	         D3dB_rr_dot  	 	*
3613*     *					*
3614*     ***********************************
3615*     shared memory output
3616*     - sumall
3617
3618      subroutine D3dB_rr_dot(nb,A,B,sumall)
3619      implicit none
3620      integer nb
3621      real*8  A(*)
3622      real*8  B(*)
3623      real*8  sumall
3624
3625#include "D3dB.fh"
3626
3627      integer i,np
3628      real*8  sum
3629      common /D3dB_RR_TSUM/ sum
3630
3631      call Parallel2d_np_i(np)
3632
3633*     **** sum up dot product on this node ****
3634!$OMP MASTER
3635      sum = 0.0d0
3636!$OMP END MASTER
3637!$OMP BARRIER
3638!$OMP DO REDUCTION(+:sum)
3639      do i=1,n2ft3d_map(nb)
3640         sum = sum + A(i)*B(i)
3641      end do
3642!$OMP END DO
3643
3644
3645*     **** add up sums from other nodes ****
3646      if (np.gt.1) then
3647         call D3dB_SumAll(sum)
3648      end if
3649
3650!$OMP MASTER
3651      sumall = sum
3652!$OMP END MASTER
3653!$OMP BARRIER
3654      return
3655      end
3656
3657*     ***********************************
3658*     *					*
3659*     *	         D3dB_rr_idot  	 	*
3660*     *					*
3661*     ***********************************
3662
3663      subroutine D3dB_rr_idot(nb,A,B,sumall)
3664      implicit none
3665      integer nb
3666      real*8  A(*)
3667      real*8  B(*)
3668      real*8  sumall
3669
3670#include "D3dB.fh"
3671
3672      integer i,np
3673      real*8  sum
3674      common /D3dB_RR_TSUM/ sum
3675
3676
3677*     **** sum up dot product on this node ****
3678!$OMP MASTER
3679      sum = 0.0d0
3680!$OMP END MASTER
3681!$OMP BARRIER
3682!$OMP DO reduction(+:sum)
3683      do i=1,n2ft3d_map(nb)
3684         sum = sum + A(i)*B(i)
3685      end do
3686!$OMP END DO
3687
3688*     **** add up sums from other nodes ****
3689*     if (np.gt.1) then
3690*        call D3dB_SumAll(sum)
3691*     end if
3692
3693!$OMP MASTER
3694      sumall = sum
3695!$OMP END MASTER
3696!$OMP BARRIER
3697      return
3698      end
3699
3700*     ***********************************
3701*     *                                 *
3702*     *          D3dB_rrm_sym_dot       *
3703*     *                                 *
3704*     ***********************************
3705
3706      subroutine D3dB_rrm_sym_dot(nb,n,A,B,matrix)
3707      implicit none
3708      integer nb,n
3709      real*8  A(*)
3710      real*8  B(*)
3711      real*8  matrix(n,n)
3712
3713#include "D3dB.fh"
3714
3715*     **** local variables ****
3716      integer j,k
3717      integer np
3718
3719      call nwpw_timing_start(2)
3720      call Parallel2d_np_i(np)
3721
3722      do k=1,n
3723        call DGEMM_OMP('T','N',k,1,n2ft3d(nb),
3724     >             1.0d0,
3725     >             A,n2ft3d(nb),
3726     >             B(1+(k-1)*n2ft3d(nb)),n2ft3d(nb),
3727     >             0.0d0,
3728     >             matrix(1,k),k)
3729      end do
3730
3731!$OMP DO
3732      do k=1,n
3733      do j=k+1,n
3734        matrix(j,k) = matrix(k,j)
3735      end do
3736      end do
3737!$OMP END DO
3738
3739      if (np.gt.1) call D3dB_Vector_SumAll(n*n,matrix)
3740      call nwpw_timing_end(2)
3741      return
3742      end
3743
3744
3745
3746
3747
3748*     ***********************************
3749*     *					*
3750*     *	         D3dB_cc_Mul  	 	*
3751*     *					*
3752*     ***********************************
3753
3754      subroutine D3dB_cc_Mul(nb,A,B,C)
3755      implicit none
3756      integer    nb
3757      complex*16 A(*)
3758      complex*16 B(*)
3759      complex*16 C(*)
3760
3761#include "D3dB.fh"
3762
3763      integer i
3764
3765      do i=1,nfft3d_map(nb)
3766            C(i) = dconjg(A(i)) * B(i)
3767         end do
3768
3769      return
3770      end
3771
3772*     ***********************************
3773*     *                                 *
3774*     *          D3dB_cc_Mul2           *
3775*     *                                 *
3776*     ***********************************
3777
3778      subroutine D3dB_cc_Mul2(nb,A,B)
3779      implicit none
3780      integer    nb
3781      complex*16 A(*)
3782      complex*16 B(*)
3783
3784#include "D3dB.fh"
3785
3786      integer i
3787
3788      do i=1,nfft3d_map(nb)
3789         B(i) = A(i) * B(i)
3790      end do
3791
3792      return
3793      end
3794
3795*     ***********************************
3796*     *					*
3797*     *	         D3dB_lc_Mask  	 	*
3798*     *					*
3799*     ***********************************
3800
3801      subroutine D3dB_lc_Mask(nb,masker,A)
3802      implicit none
3803      integer    nb
3804      logical    masker(*)
3805      complex*16 A(*)
3806
3807#include "D3dB.fh"
3808
3809      integer i
3810
3811      do i=1,nfft3d_map(nb)
3812         if (masker(i)) A(i) = dcmplx(0.0d0,0.0d0)
3813      end do
3814      return
3815      end
3816
3817*     ***********************************
3818*     *					*
3819*     *	         D3dB_lr_Mask  	 	*
3820*     *					*
3821*     ***********************************
3822
3823      subroutine D3dB_lr_Mask(nb,masker,A)
3824      implicit none
3825      integer   nb
3826      logical   masker(*)
3827      real*8    A(*)
3828
3829#include "D3dB.fh"
3830
3831      integer i
3832
3833      do i=1,nfft3d_map(nb)
3834         if (masker(i)) A(i) = 0.0d0
3835      end do
3836      return
3837      end
3838
3839
3840*     ***********************************
3841*     *					*
3842*     *	         D3dB_tc_Mul  	 	*
3843*     *					*
3844*     ***********************************
3845
3846      subroutine D3dB_tc_Mul(nb,A,B,C)
3847      implicit none
3848      integer    nb
3849      real*8     A(*)
3850      complex*16 B(*)
3851      complex*16 C(*)
3852
3853#include "D3dB.fh"
3854
3855      integer i
3856
3857      do i=1,nfft3d_map(nb)
3858            C(i) = A(i) * B(i)
3859      end do
3860
3861      return
3862      end
3863
3864
3865
3866*     ***********************************
3867*     *                                 *
3868*     *          D3dB_tc_Mul2           *
3869*     *                                 *
3870*     ***********************************
3871
3872      subroutine D3dB_tc_Mul2(nb,A,B)
3873      implicit none
3874      integer    nb
3875      real*8     A(*)
3876      complex*16 B(*)
3877
3878#include "D3dB.fh"
3879
3880      integer i
3881
3882!$OMP DO
3883      do i=1,nfft3d_map(nb)
3884            B(i) = B(i) * A(i)
3885      end do
3886!$OMP END DO
3887
3888      return
3889      end
3890
3891*     ***********************************
3892*     *					*
3893*     *	         D3dB_rr_Mul  	 	*
3894*     *					*
3895*     ***********************************
3896
3897      subroutine D3dB_rr_Mul(nb,A,B,C)
3898      implicit none
3899
3900#include "D3dB.fh"
3901
3902      integer nb
3903      real*8 A(*)
3904      real*8 B(*)
3905      real*8 C(*)
3906      integer i,n
3907
3908!$OMP DO
3909      do i=1,n2ft3d_map(nb)
3910         C(i) = A(i) * B(i)
3911      end do
3912!$OMP END DO
3913
3914      return
3915      end
3916
3917*     ***********************************
3918*     *                                 *
3919*     *          D3dB_rr_Mul2           *
3920*     *                                 *
3921*     ***********************************
3922
3923      subroutine D3dB_rr_Mul2(nb,A,B)
3924      implicit none
3925      integer nb
3926      real*8 A(*)
3927      real*8 B(*)
3928
3929#include "D3dB.fh"
3930
3931      integer i
3932
3933!$OMP DO
3934      do i=1,n2ft3d_map(nb)
3935         B(i) = B(i) * A(i)
3936      end do
3937!$OMP END DO
3938
3939      return
3940      end
3941
3942
3943
3944*     ***********************************
3945*     *					*
3946*     *	         D3dB_cc_Sum  	 	*
3947*     *					*
3948*     ***********************************
3949
3950      subroutine D3dB_cc_Sum(nb,A,B,C)
3951      implicit none
3952      integer    nb
3953      complex*16 A(*)
3954      complex*16 B(*)
3955      complex*16 C(*)
3956
3957#include "D3dB.fh"
3958
3959      integer i
3960
3961!$OMP DO
3962      do i=1,nfft3d_map(nb)
3963         C(i) = A(i) + B(i)
3964      end do
3965!$OMP END DO
3966
3967      return
3968      end
3969
3970
3971*     ***********************************
3972*     *                                 *
3973*     *          D3dB_cc_Sum2           *
3974*     *                                 *
3975*     ***********************************
3976
3977      subroutine D3dB_cc_Sum2(nb,A,B)
3978      implicit none
3979      integer    nb
3980      complex*16 A(*)
3981      complex*16 B(*)
3982
3983#include "D3dB.fh"
3984
3985      integer i
3986
3987!$OMP DO
3988      do i=1,nfft3d_map(nb)
3989         B(i) = B(i) + A(i)
3990      end do
3991!$OMP END DO
3992
3993      return
3994      end
3995
3996
3997*     ***********************************
3998*     *					*
3999*     *	         D3dB_rr_Sum  	 	*
4000*     *					*
4001*     ***********************************
4002
4003      subroutine D3dB_rr_Sum(nb,A,B,C)
4004      implicit none
4005      integer nb
4006      real*8  A(*)
4007      real*8  B(*)
4008      real*8  C(*)
4009
4010#include "D3dB.fh"
4011
4012      integer i
4013
4014!$OMP DO
4015      do i=1,n2ft3d_map(nb)
4016         C(i) = B(i)+A(i)
4017      end do
4018!$OMP END DO
4019
4020      return
4021      end
4022
4023
4024*     ***********************************
4025*     *                                 *
4026*     *          D3dB_rr_Sum2           *
4027*     *                                 *
4028*     ***********************************
4029      subroutine D3dB_rr_Sum2(nb,A,B)
4030      implicit none
4031      integer nb
4032      real*8  A(*)
4033      real*8  B(*)
4034
4035#include "D3dB.fh"
4036
4037      integer i
4038
4039!$OMP DO
4040      do i=1,n2ft3d_map(nb)
4041         B(i) = B(i) + A(i)
4042      end do
4043!$OMP END DO
4044      return
4045      end
4046
4047
4048*     ***********************************
4049*     *					*
4050*     *	         D3dB_cc_Sub  	 	*
4051*     *					*
4052*     ***********************************
4053
4054      subroutine D3dB_cc_Sub(nb,A,B,C)
4055      implicit none
4056      integer    nb
4057      complex*16 A(*)
4058      complex*16 B(*)
4059      complex*16 C(*)
4060
4061#include "D3dB.fh"
4062
4063      integer i
4064
4065      do i=1,nfft3d_map(nb)
4066         C(i) = A(i) - B(i)
4067      end do
4068
4069      return
4070      end
4071
4072
4073*     ***********************************
4074*     *					*
4075*     *	         D3dB_rr_Sub   		*
4076*     *					*
4077*     ***********************************
4078
4079      subroutine D3dB_rr_Sub(nb,A,B,C)
4080      implicit none
4081
4082#include "D3dB.fh"
4083
4084      integer nb
4085      real*8  A(n2ft3d_map(nb))
4086      real*8  B(n2ft3d_map(nb))
4087      real*8  C(n2ft3d_map(nb))
4088      integer i
4089
4090
4091!$OMP DO
4092      do i=1,n2ft3d_map(nb)
4093         C(i) = A(i) - B(i)
4094      end do
4095!$OMP END DO
4096
4097      return
4098      end
4099
4100
4101
4102*     ***********************************
4103*     *                                 *
4104*     *          D3dB_rr_Sub2           *
4105*     *                                 *
4106*     ***********************************
4107
4108      subroutine D3dB_rr_Sub2(nb,A,B)
4109      implicit none
4110
4111#include "D3dB.fh"
4112
4113      integer nb
4114      real*8  A(n2ft3d_map(nb))
4115      real*8  B(n2ft3d_map(nb))
4116
4117      integer i
4118
4119!$OMP DO
4120      do i=1,n2ft3d_map(nb)
4121         B(i) = B(i) - A(i)
4122      end do
4123!$OMP END DO
4124
4125
4126      return
4127      end
4128
4129*     ***********************************
4130*     *                                 *
4131*     *          D3dB_rr_Multiply2      *
4132*     *                                 *
4133*     ***********************************
4134      subroutine D3dB_rr_Multiply2(nb,A,B)
4135      implicit none
4136      integer nb
4137      real*8  A(*)
4138      real*8  B(*)
4139
4140#include "D3dB.fh"
4141
4142      integer i
4143
4144!$OMP DO
4145      do i=1,n2ft3d_map(nb)
4146         B(i) = B(i)*A(i)
4147      end do
4148!$OMP END DO
4149
4150      return
4151      end
4152
4153
4154
4155*     ***********************************
4156*     *                                 *
4157*     *       D3dB_rrr_MultiplyAdd      *
4158*     *                                 *
4159*     ***********************************
4160      subroutine D3dB_rrr_MultiplyAdd(nb,A,B,C)
4161      implicit none
4162      integer nb
4163      real*8  A(*)
4164      real*8  B(*)
4165      real*8  C(*)
4166
4167#include "D3dB.fh"
4168
4169      integer i
4170
4171!$OMP DO
4172      do i=1,n2ft3d_map(nb)
4173         C(i) = C(i) + B(i)*A(i)
4174      end do
4175!$OMP END DO
4176
4177      return
4178      end
4179
4180
4181
4182*     ***********************************
4183*     *					*
4184*     *	         D3dB_cc_zaxpy 	 	*
4185*     *					*
4186*     ***********************************
4187
4188      subroutine D3dB_cc_zaxpy(nb,alpha,A,B)
4189      implicit none
4190      integer    nb
4191      complex*16 alpha
4192      complex*16 A(*)
4193      complex*16 B(*)
4194
4195#include "D3dB.fh"
4196
4197      integer i
4198
4199!$OMP DO
4200      do i=1,nfft3d_map(nb)
4201         B(i) = B(i) + alpha*A(i)
4202      end do
4203!$OMP END DO
4204
4205      return
4206      end
4207
4208
4209
4210*     ***********************************
4211*     *					*
4212*     *	         D3dB_cc_daxpy 	 	*
4213*     *					*
4214*     ***********************************
4215
4216      subroutine D3dB_cc_daxpy(nb,alpha,A,B)
4217      implicit none
4218      integer    nb
4219      real*8     alpha
4220      complex*16 A(*)
4221      complex*16 B(*)
4222
4223#include "D3dB.fh"
4224
4225      integer i
4226
4227!$OMP DO
4228      do i=1,nfft3d_map(nb)
4229         B(i) = B(i) + alpha*A(i)
4230      end do
4231!$OMP END DO
4232
4233      return
4234      end
4235
4236*     ***********************************
4237*     *					*
4238*     *	         D3dB_rr_daxpy 	 	*
4239*     *					*
4240*     ***********************************
4241
4242      subroutine D3dB_rr_daxpy(nb,alpha,A,B)
4243      implicit none
4244      integer nb
4245      real*8  alpha
4246      real*8  A(*)
4247      real*8  B(*)
4248
4249#include "D3dB.fh"
4250
4251      integer i
4252
4253!$OMP DO
4254      do i=1,n2ft3d_map(nb)
4255         B(i) = B(i) + alpha* A(i)
4256      end do
4257!$OMP END DO
4258
4259      return
4260      end
4261
4262*     ***********************************
4263*     *                                 *
4264*     *          D3dB_rr_Divide         *
4265*     *                                 *
4266*     ***********************************
4267
4268      subroutine D3dB_rr_Divide(nb,A,B,C)
4269      implicit none
4270      integer nb
4271      real*8 A(*)
4272      real*8 B(*)
4273      real*8 C(*)
4274
4275#include "D3dB.fh"
4276
4277      real*8 eta
4278      parameter (eta=1.0d-9)
4279
4280      integer index
4281
4282      !do q=1,nq(nb)
4283      !do j=1,ny(nb)
4284      !do i=1,nx(nb)
4285CDIR$ NOVECTOR
4286!$OMP DO
4287      do index = 1,n2ft3d_map(nb)
4288         !index = i + (j-1)*(nx(nb)+2) + (q-1)*(nx(nb)+2)*ny(nb)
4289         if (dabs(B(index)) .le. eta) then
4290           C(index) = 0.0d0
4291         else
4292           C(index) = A(index) / B(index)
4293         end if
4294      end do
4295!$OMP END DO
4296      !end do
4297      !end do
4298      !end do
4299
4300      return
4301      end
4302
4303
4304
4305*     ***********************************
4306*     *                                 *
4307*     *          D3dB_rr_Divide2         *
4308*     *                                 *
4309*     ***********************************
4310
4311      subroutine D3dB_rr_Divide2(nb,A,B)
4312      implicit none
4313      integer nb
4314      real*8 A(*)
4315      real*8 B(*)
4316
4317#include "D3dB.fh"
4318
4319      real*8 eta
4320      parameter (eta=1.0d-9)
4321
4322      integer index
4323
4324      !do q=1,nq(nb)
4325      !do j=1,ny(nb)
4326      !do i=1,nx(nb)
4327CDIR$ NOVECTOR
4328!$OMP DO
4329      do index = 1,n2ft3d_map(nb)
4330         !index = i + (j-1)*(nx(nb)+2) + (q-1)*(nx(nb)+2)*ny(nb)
4331         if (dabs(A(index)) .le. eta) then
4332           B(index) = 0.0d0
4333         else
4334           B(index) = B(index) / A(index)
4335         end if
4336      end do
4337!$OMP END DO
4338      !end do
4339      !end do
4340      !end do
4341
4342      return
4343      end
4344
4345
4346
4347*     ***********************************
4348*     *                                 *
4349*     *          D3dB_r_ABS             *
4350*     *                                 *
4351*     ***********************************
4352
4353      subroutine D3dB_r_ABS(nb,A,C)
4354      implicit none
4355      integer nb
4356      real*8 A(*)
4357      real*8 C(*)
4358
4359#include "D3dB.fh"
4360
4361
4362      integer index
4363
4364      !do q=1,nq(nb)
4365      !do j=1,ny(nb)
4366      !do i=1,nx(nb)
4367!$OMP DO
4368      do index=1,n2ft3d_map(nb)
4369         !index = i + (j-1)*(nx(nb)+2) + (q-1)*(nx(nb)+2)*ny(nb)
4370         C(index) = dabs(A(index))
4371      end do
4372!$OMP END DO
4373      !end do
4374      !end do
4375      !end do
4376
4377      return
4378      end
4379
4380      subroutine D3dB_r_abs1(nb,A)
4381      implicit none
4382      integer nb
4383      real*8 A(*)
4384
4385#include "D3dB.fh"
4386
4387      integer index
4388
4389!$OMP DO
4390      do index=1,n2ft3d_map(nb)
4391         !index = i + (j-1)*(nx(nb)+2) + (q-1)*(nx(nb)+2)*ny(nb)
4392         A(index) = dabs(A(index))
4393      end do
4394!$OMP END DO
4395      return
4396      end
4397
4398*     ***********************************
4399*     *                                 *
4400*     *          D3dB_r_ABSMAX          *
4401*     *                                 *
4402*     ***********************************
4403
4404      subroutine D3dB_r_ABSMAX(nb,A,amax)
4405      implicit none
4406      integer nb
4407      real*8 A(*)
4408      real*8 amax
4409
4410#include "D3dB.fh"
4411
4412      integer index
4413      real*8 aa
4414
4415      amax = 0.0d0
4416      do index=1,n2ft3d_map(nb)
4417         !index = i + (j-1)*(nx(nb)+2) + (q-1)*(nx(nb)+2)*ny(nb)
4418         aa = dabs(A(index))
4419         if (aa.gt.amax) amax = aa
4420      end do
4421      call D3dB_MaxAll(amax)
4422
4423      return
4424      end
4425
4426
4427*     ***********************************
4428*     *                                 *
4429*     *          D3dB_r_ZeroNegative    *
4430*     *                                 *
4431*     ***********************************
4432
4433      subroutine D3dB_r_ZeroNegative(nb,A)
4434      implicit none
4435      integer nb
4436      real*8 A(*)
4437
4438#include "D3dB.fh"
4439
4440      integer i
4441
4442      do i=1,n2ft3d_map(nb)
4443         if (A(i).lt.0.0d0) A(i) = 0.0d0
4444      end do
4445
4446      return
4447      end
4448
4449*     ***********************************
4450*     *                                 *
4451*     *          D3dB_rr_Minus          *
4452*     *                                 *
4453*     ***********************************
4454      subroutine D3dB_rr_Minus(nb,A,B,C)
4455      implicit none
4456      integer nb
4457      real*8  A(*)
4458      real*8  B(*)
4459      real*8  C(*)
4460
4461#include "D3dB.fh"
4462
4463      integer i
4464
4465!$OMP DO
4466      do i=1,n2ft3d_map(nb)
4467         C(i) = A(i) - B(i)
4468      end do
4469!$OMP END DO
4470
4471      return
4472      end
4473
4474*     ***********************************
4475*     *					*
4476*     *          D3dB_r_Zero_Ends0	*
4477*     *					*
4478*     ***********************************
4479
4480      subroutine D3dB_r_Zero_Ends0(nb,A)
4481      integer nb
4482      real*8 A(*)
4483
4484#include "D3dB.fh"
4485
4486      integer j,k,q,index,taskid,p
4487
4488      !**** slab mapping ****
4489      if (mapping.eq.1) then
4490      do q=1,nq(nb)
4491         do j=1,ny(nb)
4492            index = (nx(nb)+1) + (j-1)*(nx(nb)+2)
4493     >                         + (q-1)*(nx(nb)+2)*(ny(nb))
4494
4495            A(index)   = 0.0d0
4496            A(index+1) = 0.0d0
4497         end do
4498      end do
4499
4500
4501      !**** hilbert mapping ****
4502      else
4503        call Parallel2d_taskid_i(taskid)
4504        do k=1,nz(nb)
4505        do j=1,ny(nb)
4506
4507         call D3dB_ijktoindex2p(nb,(nx(nb)+1),j,k,index,p)
4508         if (p.eq.taskid) A(index) = 0.0d0
4509
4510         call D3dB_ijktoindex2p(nb,(nx(nb)+2),j,k,index,p)
4511         if (p.eq.taskid) A(index) = 0.0d0
4512
4513        end do
4514        end do
4515      end if
4516
4517      if (n2ft3d_map(nb).lt.n2ft3d(nb)) then
4518         call dcopy((n2ft3d(nb)-n2ft3d_map(nb)),
4519     >              0.0d0,0,A(n2ft3d_map(nb)+1),1)
4520      end if
4521
4522      return
4523      end
4524
4525
4526
4527
4528
4529*     ***********************************
4530*     *					*
4531*     *          D3dB_r_Zero_Ends	*
4532*     *					*
4533*     ***********************************
4534
4535      subroutine D3dB_r_Zero_Ends(nb,A)
4536      integer nb
4537      real*8 A(*)
4538
4539#include "D3dB.fh"
4540
4541      integer j,k,q,index,taskid,p
4542
4543      !**** slab mapping ****
4544      if (mapping.eq.1) then
4545!$OMP DO
4546      do q=1,nq(nb)
4547         do j=1,ny(nb)
4548            index = (nx(nb)+1) + (j-1)*(nx(nb)+2)
4549     >                         + (q-1)*(nx(nb)+2)*(ny(nb))
4550
4551            A(index)   = 0.0d0
4552            A(index+1) = 0.0d0
4553         end do
4554      end do
4555!$OMP END DO
4556
4557
4558      !**** hilbert mapping ****
4559      else
4560        call Parallel2d_taskid_i(taskid)
4561!$OMP DO
4562        do k=1,nz(nb)
4563        do j=1,ny(nb)
4564
4565         call D3dB_ijktoindex2p(nb,(nx(nb)+1),j,k,index,p)
4566         if (p.eq.taskid) A(index) = 0.0d0
4567
4568         call D3dB_ijktoindex2p(nb,(nx(nb)+2),j,k,index,p)
4569         if (p.eq.taskid) A(index) = 0.0d0
4570
4571        end do
4572        end do
4573!$OMP end DO
4574      end if
4575
4576!$OMP MASTER
4577      if (n2ft3d_map(nb).lt.n2ft3d(nb)) then
4578         call dcopy((n2ft3d(nb)-n2ft3d_map(nb)),
4579     >              0.0d0,0,A(n2ft3d_map(nb)+1),1)
4580      end if
4581!$OMP END MASTER
4582!$OMP BARRIER
4583
4584      return
4585      end
4586
4587
4588
4589*     ***********************************
4590*     *                                 *
4591*     *          D3dB_r_notZero_Ends    *
4592*     *                                 *
4593*     ***********************************
4594
4595      subroutine D3dB_r_notZero_Ends(nb,A)
4596      integer nb
4597      real*8 A(*)
4598
4599#include "D3dB.fh"
4600
4601      integer j,k,q,index,taskid,p
4602
4603      !**** slab mapping ****
4604      if (mapping.eq.1) then
4605      do q=1,nq(nb)
4606         do j=1,ny(nb)
4607            index = (nx(nb)+1) + (j-1)*(nx(nb)+2)
4608     >                         + (q-1)*(nx(nb)+2)*(ny(nb))
4609            A(index)   = 1.0d0
4610            A(index+1) = 1.0d0
4611         end do
4612      end do
4613
4614
4615      !**** hilbert mapping ****
4616      else
4617        call Parallel2d_taskid_i(taskid)
4618        do k=1,nz(nb)
4619        do j=1,ny(nb)
4620
4621         call D3dB_ijktoindex2p(nb,(nx(nb)+1),j,k,index,p)
4622         if (p.eq.taskid) A(index) = 1.0d0
4623
4624         call D3dB_ijktoindex2p(nb,(nx(nb)+2),j,k,index,p)
4625         if (p.eq.taskid) A(index) = 1.0d0
4626
4627        end do
4628        end do
4629      end if
4630
4631      return
4632      end
4633
4634
4635
4636
4637*     ***********************************
4638*     *					*
4639*     *	         D3dB_r_dsum  	 	*
4640*     *					*
4641*     ***********************************
4642
4643      subroutine D3dB_r_dsum(nb,A,sumall)
4644      implicit none
4645      integer nb
4646      real*8  A(*)
4647      real*8  sumall
4648
4649#include "D3dB.fh"
4650
4651      integer i,np
4652      real*8 sum
4653
4654      call Parallel2d_np_i(np)
4655
4656*     **** sum up dot product on this node ****
4657      sum = 0.0d0
4658      do i=1,n2ft3d_map(nb)
4659         sum = sum + A(i)
4660      end do
4661
4662*     **** add up sums from other nodes ****
4663      if (np.gt.1) then
4664        call D3dB_SumAll(sum)
4665      end if
4666
4667      sumall = sum
4668
4669      return
4670      end
4671
4672*     ***********************************
4673*     *					*
4674*     *	         D3dB_t_dsum  	 	*
4675*     *					*
4676*     ***********************************
4677
4678      subroutine D3dB_t_dsum(nb,A,sumall)
4679      implicit none
4680      integer nb
4681      real*8  A(*)
4682      real*8  sumall
4683
4684#include "D3dB.fh"
4685
4686      integer i,j,k,q,np,nxh,index,taskid,p
4687      real*8 sum
4688
4689      nxh = nx(nb)/2
4690      call Parallel2d_np_i(np)
4691
4692*     **** sum up dot product on this node ****
4693      sum = 0.0d0
4694
4695      !**********************
4696      !**** slab mapping ****
4697      !**********************
4698      if (mapping.eq.1) then
4699*     ***** k!=0 plane so double count *****
4700      do q=1,nq(nb)
4701      do j=1,ny(nb)
4702         do i=2,(nxh+1)
4703            index = (q-1)*(nxh+1)*ny(nb) + (j-1)*(nxh+1) + i
4704             sum = sum + A(index)
4705         end do
4706      end do
4707      end do
4708      sum = sum*2.0d0
4709
4710*     ***** k==0 plane, so single count *****
4711      do q=1,nq(nb)
4712      do j=1,ny(nb)
4713            index = (q-1)*(nxh+1)*ny(nb) + (j-1)*(nxh+1) + 1
4714             sum = sum + A(index)
4715      end do
4716      end do
4717
4718
4719      !*************************
4720      !**** hilbert mapping ****
4721      !*************************
4722      else
4723      call Parallel2d_taskid_i(taskid)
4724*     ***** kx!=0 plane, so double count *****
4725      do index=1,nfft3d_map(nb)
4726            sum = sum + A(index)
4727      end do
4728      sum = sum*2.0d0
4729
4730*     ***** kx==0 plane, so single count *****
4731      do k=1,nz(nb)
4732      do j=1,ny(nb)
4733         i=1
4734         call D3dB_ijktoindexp(nb,i,j,k,index,p)
4735         if (p.eq.taskid) then
4736         sum = sum - A(index)
4737         end if
4738      end do
4739      end do
4740
4741      end if
4742
4743
4744
4745
4746*     **** add up sums from other nodes ****
4747      if (np.gt.1) then
4748        call D3dB_SumAll(sum)
4749      end if
4750
4751      sumall = sum
4752
4753      return
4754      end
4755
4756
4757*     ***********************************
4758*     *					*
4759*     *	     D3dB_cc_Vector_dot 	*
4760*     *					*
4761*     ***********************************
4762
4763      subroutine D3dB_cc_Vector_dot(nb,nnfft3d,nn,ne,A,B,sumall)
4764      implicit none
4765      integer    nb
4766      integer    nnfft3d,nn,ne
4767      complex*16 A(*)
4768      complex*16 B(*)
4769      real*8     sumall(nn,nn)
4770
4771
4772#include "D3dB.fh"
4773
4774      integer i,j,k,q,index,np,taskid,p
4775      integer index1,index2
4776      integer n,m,shift1,shift2
4777      real*8  sum
4778
4779
4780      call nwpw_timing_start(2)
4781
4782      call Parallel2d_np_i(np)
4783
4784      !**********************
4785      !**** slab mapping ****
4786      !**********************
4787      if (mapping.eq.1) then
4788*     **** sum up dot product on this node ****
4789      do n=1,ne
4790      do m=n,ne
4791
4792        shift1 = (n-1)*nnfft3d
4793        shift2 = (m-1)*nnfft3d
4794        sum    = 0.0d0
4795
4796*       ***** kx!=0 plane, so double count *****
4797        do q=1,nq(nb)
4798           do j=1,ny(nb)
4799           do i=2,(nx(nb)/2+1)
4800              index = (q-1)*(nx(nb)/2+1)*ny(nb)
4801     >              + (j-1)*(nx(nb)/2+1) + i
4802              index1 = index+shift1
4803              index2 = index+shift2
4804              sum = sum + dble(A(index1))  * dble(B(index2))
4805     >                  + dimag(A(index1)) * dimag(B(index2))
4806           end do
4807           end do
4808        end do
4809        sum = sum*2.0d0
4810
4811*       ***** kx==0 plane, so single count *****
4812        do q=1,nq(nb)
4813           do j=1,ny(nb)
4814              i=1
4815              index = (q-1)*(nx(nb)/2+1)*ny(nb)
4816     >              + (j-1)*(nx(nb)/2+1) + 1
4817              index1 = index+shift1
4818              index2 = index+shift2
4819              sum = sum + dble(A(index1))  * dble(B(index2))
4820     >                  + dimag(A(index1)) * dimag(B(index2))
4821           end do
4822        end do
4823
4824        sumall(n,m) = sum
4825        sumall(m,n) = sum
4826      end do
4827      end do
4828
4829
4830      !*************************
4831      !**** hilbert mapping ****
4832      !*************************
4833      else
4834      call Parallel2d_taskid_i(taskid)
4835*     **** sum up dot product on this node ****
4836      do n=1,ne
4837      do m=n,ne
4838
4839        shift1 = (n-1)*nnfft3d
4840        shift2 = (m-1)*nnfft3d
4841        sum    = 0.0d0
4842
4843*       ***** kx!=0 plane, so double count *****
4844        do index=1,nfft3d_map(nb)
4845            index1 = index+shift1
4846            index2 = index+shift2
4847            sum = sum + dble(A(index1))  * dble(B(index2))
4848     >                + dimag(A(index1)) * dimag(B(index2))
4849        end do
4850        sum = sum*2.0d0
4851
4852*       ***** kx==0 plane, so single count *****
4853        do k=1,nz(nb)
4854        do j=1,ny(nb)
4855         i=1
4856         call D3dB_ijktoindexp(nb,i,j,k,index,p)
4857         if (p.eq.taskid) then
4858         index1 = index+shift1
4859         index2 = index+shift2
4860         sum = sum - dble(A(index1))  * dble(B(index2))
4861     >             - dimag(A(index1)) * dimag(B(index2))
4862         end if
4863        end do
4864        end do
4865
4866
4867        sumall(n,m) = sum
4868        sumall(m,n) = sum
4869      end do
4870      end do
4871
4872      end if
4873
4874
4875*     **** add up sums from other nodes ****
4876      if (np.gt.1) then
4877         call D3dB_Vector_SumAll(nn*ne,sumall)
4878      end if
4879
4880      call nwpw_timing_end(2)
4881
4882      return
4883      end
4884
4885
4886
4887*     ***********************************
4888*     *					*
4889*     *	     D3dB_cc_Vector_ndot 	*
4890*     *					*
4891*     ***********************************
4892
4893      subroutine D3dB_cc_Vector_ndot(nb,nnfft3d,ne,A,B,sumall)
4894      implicit none
4895      integer    nb
4896      integer    nnfft3d,ne
4897      complex*16 A(*)
4898      complex*16 B(*)
4899      real*8     sumall(ne)
4900
4901
4902#include "D3dB.fh"
4903
4904      integer i,j,k,q,index,np,taskid,p
4905      integer index1,index2
4906      integer n,shift1,shift2
4907      real*8  sum
4908
4909
4910      call nwpw_timing_start(2)
4911
4912      call Parallel2d_np_i(np)
4913
4914      !**********************
4915      !**** slab mapping ****
4916      !**********************
4917      if (mapping.eq.1) then
4918*     **** sum up dot product on this node ****
4919      do n=1,ne
4920
4921        shift1 = (n-1)*nnfft3d
4922        shift2 = 0
4923        sum    = 0.0d0
4924*       ***** kx!=0 plane, so double count *****
4925        do q=1,nq(nb)
4926           do j=1,ny(nb)
4927           do i=2,(nx(nb)/2+1)
4928              index = (q-1)*(nx(nb)/2+1)*ny(nb)
4929     >              + (j-1)*(nx(nb)/2+1) + i
4930              index1 = index+shift1
4931              index2 = index+shift2
4932              sum = sum + dble(A(index1))  * dble(B(index2))
4933     >                  + dimag(A(index1)) * dimag(B(index2))
4934           end do
4935           end do
4936        end do
4937        sum = sum*2.0d0
4938
4939*       ***** kx==0 plane, so single count *****
4940        do q=1,nq(nb)
4941           do j=1,ny(nb)
4942              i=1
4943              index = (q-1)*(nx(nb)/2+1)*ny(nb)
4944     >              + (j-1)*(nx(nb)/2+1) + 1
4945              index1 = index+shift1
4946              index2 = index+shift2
4947              sum = sum + dble(A(index1))  * dble(B(index2))
4948     >                  + dimag(A(index1)) * dimag(B(index2))
4949           end do
4950        end do
4951
4952         sumall(n) = sum
4953      end do
4954
4955
4956      !*************************
4957      !**** hilbert mapping ****
4958      !*************************
4959      else
4960      call Parallel2d_taskid_i(taskid)
4961*     **** sum up dot product on this node ****
4962      do n=1,ne
4963
4964        shift1 = (n-1)*nnfft3d
4965        shift2 = 0
4966        sum    = 0.0d0
4967
4968*       ***** kx!=0 plane, so double count *****
4969        do index=1,nfft3d_map(nb)
4970            index1 = index+shift1
4971            index2 = index+shift2
4972            sum = sum + dble(A(index1))  * dble(B(index2))
4973     >                + dimag(A(index1)) * dimag(B(index2))
4974        end do
4975        sum = sum*2.0d0
4976
4977*       ***** kx==0 plane, so single count *****
4978        do k=1,nz(nb)
4979        do j=1,ny(nb)
4980         i=1
4981         call D3dB_ijktoindexp(nb,i,j,k,index,p)
4982         if (p.eq.taskid) then
4983         index1 = index+shift1
4984         index2 = index+shift2
4985         sum = sum - dble(A(index1))  * dble(B(index2))
4986     >             - dimag(A(index1)) * dimag(B(index2))
4987         end if
4988        end do
4989        end do
4990
4991        sumall(n) = sum
4992      end do
4993
4994      end if
4995
4996
4997*     **** add up sums from other nodes ****
4998      if (np.gt.1) then
4999         call D3dB_Vector_SumAll(ne,sumall)
5000      end if
5001
5002      call nwpw_timing_end(2)
5003
5004      return
5005      end
5006
5007
5008
5009
5010
5011c*     ***********************************
5012c*     *					*
5013c*     *	        D3dB_Vector_SumAll	*
5014c*     *					*
5015c*     ***********************************
5016c
5017c      subroutine D3dB_Vector_SumAll(n,sum)
5018cc     implicit none
5019c      integer n
5020c      real*8  sum(*)
5021c
5022c#include "bafdecls.fh"
5023c
5024c#include "tcgmsg.fh"
5025c#include "msgtypesf.h"
5026c#include "errquit.fh"
5027c
5028c
5029c
5030c      logical value
5031c      integer MASTER
5032c      parameter (MASTER=0)
5033c      integer msglen
5034c
5035c*     **** temporary workspace ****
5036c      integer sumall(2),np
5037c
5038c*     **** external functions ****
5039c      integer  Parallel2d_comm_i
5040c      external Parallel2d_comm_i
5041c
5042c
5043c
5044c      call Parallel_np(np)
5045c      call nwpw_timing_start(2)
5046c      if (np.gt.1) then
5047c
5048c*     ***** allocate temporary space ****
5049c      value = BA_push_get(mt_dbl,n,'sumall',sumall(2),sumall(1))
5050c      if (.not. value) call errquit('out of stack memory',0, MA_ERR)
5051c
5052c      msglen = n
5053c
5054c
5055c      call dcopy(n,sum,1,dbl_mb(sumall(1)),1)
5056c
5057c      call GA_PGROUP_DGOP(Parallel2d_comm_i(),
5058c     >                    9+MSGDBL,dbl_mb(sumall(1)),n,'+')
5059cc     call GA_DGOP(9+MSGDBL,dbl_mb(sumall(1)),n,'+')
5060cc     call DGOP(9+MSGDBL,dbl_mb(sumall(1)),n,'+')
5061c
5062c
5063c      call dcopy(n,dbl_mb(sumall(1)),1,sum,1)
5064c      value = BA_pop_stack(sumall(2))
5065c
5066c      end if
5067c      call nwpw_timing_end(2)
5068c      return
5069c      end
5070c
5071c
5072c*     ***********************************
5073c*     *					*
5074c*     *	        D3dB_Vector_ISumAll	*
5075c*     *					*
5076c*     ***********************************
5077c
5078c      subroutine D3dB_Vector_ISumAll(n,sum)
5079cc     implicit none
5080c      integer n
5081c      integer  sum(*)
5082c
5083c#include "bafdecls.fh"
5084c#include "errquit.fh"
5085c
5086c
5087c
5088c#include "tcgmsg.fh"
5089c#include "msgtypesf.h"
5090c
5091c
5092c
5093c      integer MASTER
5094c      parameter (MASTER=0)
5095c      integer msglen
5096c      logical value
5097c
5098c*     **** temporary workspace ****
5099c      integer sumall(2)
5100c
5101c*     **** external functions ****
5102c      integer  Parallel2d_comm_i
5103c      external Parallel2d_comm_i
5104c
5105c
5106c      call nwpw_timing_start(2)
5107c
5108c*     ***** allocate temporary space ****
5109c      value = BA_push_get(mt_int,n,'sumall',sumall(2),sumall(1))
5110c      if (.not. value) call errquit('out of stack memory',0, MA_ERR)
5111c
5112c      msglen = n
5113c
5114c
5115c      call icopy(n,sum,1,int_mb(sumall(1)),1)
5116c      call GA_PGROUP_IGOP(Parallel2d_comm_i(),
5117c     >                    9+MSGINT,int_mb(sumall(1)),n,'+')
5118cc     call GA_IGOP(9+MSGINT,int_mb(sumall(1)),n,'+')
5119c
5120c
5121c      call icopy(n,int_mb(sumall(1)),1,sum,1)
5122c      value = BA_pop_stack(sumall(2))
5123c
5124c      call nwpw_timing_end(2)
5125c      return
5126c      end
5127
5128
5129c *** icopy define in src/util directory!!!
5130c      subroutine icopy(n,a,inca,b,incb)
5131c      integer n
5132c      integer a(*),inca
5133c      integer b(*),incb
5134c
5135c      integer i,shifta,shiftb
5136c
5137c      shifta = 1
5138c      shiftb = 1
5139c      do i=1,n
5140c        b(shiftb)=a(shifta)
5141c        shifta=shifta+inca
5142c        shiftb=shiftb+incb
5143c      end do
5144c
5145c      return
5146c      end
5147
5148
5149*     ***********************************
5150*     *                                 *
5151*     *          D3dB_ic_Mul            *
5152*     *                                 *
5153*     ***********************************
5154cpgi$r opt=1
5155      subroutine D3dB_ic_Mul(nb,A,B,C)
5156      implicit none
5157      integer    nb
5158      real*8     A(*)
5159      complex*16 B(*)
5160      complex*16 C(*)
5161
5162#include "D3dB.fh"
5163
5164      integer i
5165
5166!$OMP DO
5167      do i=1,nfft3d_map(nb)
5168            C(i) = dcmplx(0.0d0,A(i)) * B(i)
5169      end do
5170!$OMP END DO
5171
5172      return
5173      end
5174
5175*     ***********************************
5176*     *                                 *
5177*     *          D3dB_ic_Mul2           *
5178*     *                                 *
5179*     ***********************************
5180cpgi$r opt=1
5181      subroutine D3dB_ic_Mul2(nb,A,B)
5182      implicit none
5183      integer    nb
5184      real*8     A(*)
5185      complex*16 B(*)
5186
5187#include "D3dB.fh"
5188
5189      integer i
5190
5191!$OMP DO
5192      do i=1,nfft3d_map(nb)
5193            B(i) = dcmplx(0.0d0,A(i)) * B(i)
5194      end do
5195!$OMP END DO
5196
5197      return
5198      end
5199
5200
5201*     ***********************************
5202*     *                                 *
5203*     *          D3dB_r_Expand          *
5204*     *                                 *
5205*     ***********************************
5206
5207      subroutine D3dB_r_Expand(nb1,A,nb2,B)
5208      implicit none
5209      integer nb1
5210      real*8  A(*)
5211      integer nb2
5212      real*8  B(*)
5213
5214#include "D3dB.fh"
5215
5216      integer i,j,q,index1,index2
5217
5218      if (mapping.eq.1) then
5219c      call dcopy(nq(nb2)*ny(nb2)*(nx(nb2)+2),0.0d0,0,B,1)
5220      call Parallel_shared_vector_zero(.true.,
5221     >                                 nq(nb2)*ny(nb2)*(nx(nb2)+2),B)
5222!$OMP DO
5223      do j=1,ny(nb1)
5224      do q=1,nq(nb1)
5225      do i=1,nx(nb1)
5226         index1 = i + (j-1)*(nx(nb1)+2) + (q-1)*(nx(nb1)+2)*ny(nb1)
5227         index2 = i + (j-1)*(nx(nb2)+2) + (q-1)*(nx(nb2)+2)*ny(nb2)
5228         B(index2) = A(index1)
5229      end do
5230      end do
5231      end do
5232!$OMP END DO
5233
5234      else
5235c      call dcopy(n2ft3d(nb2),0.0d0,0,B,1)
5236      call Parallel_shared_vector_zero(.true.,n2ft3d(nb2),B)
5237!$OMP DO
5238      do q=1,nq1(nb1)
5239      do i=1,nx(nb1)
5240         index1 = i + (q-1)*(nx(nb1)+2)
5241         index2 = i + (q-1)*(nx(nb2)+2)
5242         B(index2) = A(index1)
5243      end do
5244      end do
5245!$OMP END DO
5246      end if
5247      return
5248      end
5249
5250*     ***********************************
5251*     *                                 *
5252*     *          D3dB_r_Contract        *
5253*     *                                 *
5254*     ***********************************
5255
5256      subroutine D3dB_r_Contract(nb2,B,nb1,A)
5257      implicit none
5258      integer nb2
5259      real*8  B(*)
5260      integer nb1
5261      real*8  A(*)
5262
5263#include "D3dB.fh"
5264
5265      integer i,j,q,index1,index2
5266
5267      if (mapping.eq.1) then
5268c      call dcopy(nq(nb1)*ny(nb1)*(nx(nb1)+2),0.0d0,0,A,1)
5269      call Parallel_shared_vector_zero(.true.,
5270     >                                 nq(nb1)*ny(nb1)*(nx(nb1)+2),A)
5271!$OMP DO
5272      do j=1,ny(nb1)
5273      do q=1,nq(nb1)
5274      do i=1,nx(nb1)
5275         index1 = i + (j-1)*(nx(nb1)+2) + (q-1)*(nx(nb1)+2)*ny(nb1)
5276         index2 = i + (j-1)*(nx(nb2)+2) + (q-1)*(nx(nb2)+2)*ny(nb2)
5277         A(index1) = B(index2)
5278      end do
5279      end do
5280      end do
5281!$OMP END DO
5282
5283      else
5284c      call dcopy(n2ft3d(nb1),0.0d0,0,A,1)
5285      call Parallel_shared_vector_zero(.true.,n2ft3d(nb1),A)
5286!$OMP DO
5287      do q=1,nq1(nb1)
5288      do i=1,nx(nb1)
5289         index1 = i + (q-1)*(nx(nb1)+2)
5290         index2 = i + (q-1)*(nx(nb2)+2)
5291         A(index1) = B(index2)
5292      end do
5293      end do
5294!$OMP END DO
5295      end if
5296
5297      return
5298      end
5299
5300*     ***********************************
5301*     *					*
5302*     *	   D3dB_timereverse_size	*
5303*     *					*
5304*     ***********************************
5305
5306      integer function D3dB_timereverse_size(nb)
5307      implicit none
5308      integer nb
5309
5310#include "D3dB.fh"
5311
5312*     **** local variables ****
5313      integer proc_to,proc_from
5314      integer pto,np,taskid
5315      integer phere,itmp,itmp2
5316      integer index1,index2
5317      integer it,size
5318      integer i2,j2,k2
5319      integer i3,j3,k3
5320      integer nyh,nzh
5321
5322      call Parallel2d_taskid_i(taskid)
5323      call Parallel2d_np_i(np)
5324
5325      nyh = ny(nb)/2
5326      nzh = nz(nb)/2
5327
5328      index1 = 1
5329      index2 = 1
5330      do it=0,np-1
5331         proc_to   = mod(taskid+it,np)
5332         proc_from = mod(taskid-it+np,np)
5333
5334*        *********************
5335*        **** K=(0,0,k3)  ****
5336*        *********************
5337         do k3=1,(nzh-1)
5338            i3 =  k3
5339            j3 = -k3
5340            if (i3.lt.0) i3 = i3 + nz(nb)
5341            if (j3.lt.0) j3 = j3 + nz(nb)
5342            i2 = 1
5343            i3 = i3 + 1
5344            j2 = 1
5345            j3 = j3 + 1
5346
5347*           **** packing scheme ****
5348            call D3dB_ijktoindexp(nb,1,1,i3,itmp,phere)
5349            call D3dB_ijktoindexp(nb,1,1,j3,itmp2,pto)
5350            if ((phere.eq.taskid).and.(pto.eq.proc_to)) then
5351               index1 = index1 + 1
5352            end if
5353
5354*           **** unpacking scheme ****
5355            if ((pto.eq.taskid).and.(phere.eq.proc_from)) then
5356               index2 = index2 + 1
5357            end if
5358
5359         end do
5360
5361*        *********************
5362*        **** k=(0,k2,k3) ****
5363*        *********************
5364         do k3=(-nzh+1),(nzh-1)
5365         do k2=1,(nyh-1)
5366            i2 =  k2
5367            i3 =  k3
5368            j2 = -k2
5369            j3 = -k3
5370            if (i2.lt.0) i2 = i2 + ny(nb)
5371            if (i3.lt.0) i3 = i3 + nz(nb)
5372            if (j2.lt.0) j2 = j2 + ny(nb)
5373            if (j3.lt.0) j3 = j3 + nz(nb)
5374            i2 = i2 + 1
5375            i3 = i3 + 1
5376            j2 = j2 + 1
5377            j3 = j3 + 1
5378
5379*           **** packing scheme ****
5380            call D3dB_ijktoindexp(nb,1,i2,i3,itmp,phere)
5381            call D3dB_ijktoindexp(nb,1,j2,j3,itmp2,pto)
5382            if ((phere.eq.taskid).and.(pto.eq.proc_to)) then
5383               index1 = index1 + 1
5384            end if
5385
5386*           **** unpacking scheme ****
5387            if ((pto.eq.taskid).and.(phere.eq.proc_from)) then
5388               index2 = index2 + 1
5389            end if
5390         end do
5391         end do
5392
5393      end do
5394      size = index1
5395      if (size.lt.index2) size = index2
5396
5397      D3dB_timereverse_size = size
5398      return
5399      end
5400
5401*     ***********************************
5402*     *					*
5403*     *	   D3dB_c_timereverse_init	*
5404*     *					*
5405*     ***********************************
5406
5407      subroutine D3dB_c_timereverse_init(nb)
5408      implicit none
5409      integer nb
5410
5411#include "bafdecls.fh"
5412#include "D3dB.fh"
5413#include "errquit.fh"
5414
5415c     integer iq_to_i1(2*NFFT2*NSLABS)
5416c     integer iq_to_i2(2*NFFT2*NSLABS)
5417c     integer i1_start(NFFT3+1)
5418c     integer i2_start(NFFT3+1)
5419      integer iq_to_i1(2,NBLOCKS)
5420      integer iq_to_i2(2,NBLOCKS)
5421      integer i1_start(2,NBLOCKS)
5422      integer i2_start(2,NBLOCKS)
5423      common / timereverse_blk / iq_to_i1,iq_to_i2,i1_start,i2_start
5424
5425*     **** local variables ****
5426      integer proc_to,proc_from
5427      integer pto,np,taskid
5428      integer phere
5429      integer index1,index2,itmp,itmp2
5430      integer it
5431      integer i2,j2,k2
5432      integer i3,j3,k3
5433      integer nyh,nzh
5434      logical value
5435
5436      !**** external functions ****
5437      integer  D3dB_timereverse_size
5438      external D3dB_timereverse_size
5439
5440      call Parallel2d_taskid_i(taskid)
5441      call Parallel2d_np_i(np)
5442
5443      nyh = ny(nb)/2
5444      nzh = nz(nb)/2
5445
5446*     **** set zplane_size ****
5447      zplane_size(nb) = D3dB_timereverse_size(nb)
5448
5449*     **** allocate timereverse_blk common block ****
5450      value = BA_alloc_get(mt_int,zplane_size(nb),
5451     >                     'iq_to_i1',iq_to_i1(2,nb),iq_to_i1(1,nb))
5452      value = value.and.
5453     >        BA_alloc_get(mt_int,zplane_size(nb),
5454     >                     'iq_to_i2',iq_to_i2(2,nb),iq_to_i2(1,nb))
5455
5456      value = value.and.
5457     >        BA_alloc_get(mt_int,(np+1),
5458     >                     'i1_start',i1_start(2,nb),i1_start(1,nb))
5459      value = value.and.
5460     >        BA_alloc_get(mt_int,(np+1),
5461     >                     'i2_start',i2_start(2,nb),i2_start(1,nb))
5462      if (.not.value) call errquit('out of heap memory',0, MA_ERR)
5463
5464
5465
5466!MATHIAS
5467      index1 = 1
5468      index2 = 1
5469      do it=0,np-1
5470         proc_to   = mod(taskid+it,np)
5471         proc_from = mod(taskid-it+np,np)
5472c        i1_start(it+1) = index1
5473c        i2_start(it+1) = index2
5474         int_mb(i1_start(1,nb)+it) = index1
5475         int_mb(i2_start(1,nb)+it) = index2
5476
5477*        *********************
5478*        **** K=(0,0,k3)  ****
5479*        *********************
5480         do k3=1,(nzh-1)
5481            i3 =  k3
5482            j3 = -k3
5483            if (i3.lt.0) i3 = i3 + nz(nb)
5484            if (j3.lt.0) j3 = j3 + nz(nb)
5485            i2 = 1
5486            i3 = i3 + 1
5487            j2 = 1
5488            j3 = j3 + 1
5489
5490*           **** packing scheme ****
5491            call D3dB_ijktoindexp(nb,1,1,i3,itmp,phere)
5492            call D3dB_ijktoindexp(nb,1,1,j3,itmp2,pto)
5493            !call D3dB_ktoqp(nb,i3,qhere,phere)
5494            !call D3dB_ktoqp(nb,j3,qto,pto)
5495            if ((phere.eq.taskid).and.(pto.eq.proc_to)) then
5496c               itmp = 1 + (i2-1)*(nx(nb)/2+1)
5497c     >                  + (qhere-1)*(nx(nb)/2+1)*ny(nb)
5498c               iq_to_i1(index1) = itmp
5499               int_mb(iq_to_i1(1,nb)+index1-1)=itmp
5500               index1 = index1 + 1
5501            end if
5502
5503*           **** unpacking scheme ****
5504            !call D3dB_ktoqp(nb,j3,qhere,phere)
5505            !call D3dB_ktoqp(nb,i3,qfrom,pfrom)
5506            if ((pto.eq.taskid).and.(phere.eq.proc_from)) then
5507c               itmp = 1 + (j2-1)*(nx(nb)/2+1)
5508c     >                  + (qhere-1)*(nx(nb)/2+1)*ny(nb)
5509c               iq_to_i2(index2) = itmp
5510               int_mb(iq_to_i2(1,nb)+index2-1) = itmp2
5511               index2 = index2 + 1
5512            end if
5513
5514         end do
5515
5516*        *********************
5517*        **** k=(0,k2,k3) ****
5518*        *********************
5519         do k3=(-nzh+1),(nzh-1)
5520         do k2=1,(nyh-1)
5521            i2 =  k2
5522            i3 =  k3
5523            j2 = -k2
5524            j3 = -k3
5525            if (i2.lt.0) i2 = i2 + ny(nb)
5526            if (i3.lt.0) i3 = i3 + nz(nb)
5527            if (j2.lt.0) j2 = j2 + ny(nb)
5528            if (j3.lt.0) j3 = j3 + nz(nb)
5529            i2 = i2 + 1
5530            i3 = i3 + 1
5531            j2 = j2 + 1
5532            j3 = j3 + 1
5533
5534*           **** packing scheme ****
5535            call D3dB_ijktoindexp(nb,1,i2,i3,itmp,phere)
5536            call D3dB_ijktoindexp(nb,1,j2,j3,itmp2,pto)
5537            !call D3dB_ktoqp(nb,i3,qhere,phere)
5538            !call D3dB_ktoqp(nb,j3,qto,pto)
5539            if ((phere.eq.taskid).and.(pto.eq.proc_to)) then
5540c               itmp = 1 + (i2-1)*(nx(nb)/2+1)
5541c     >                  + (qhere-1)*(nx(nb)/2+1)*ny(nb)
5542c               iq_to_i1(index1) = itmp
5543               int_mb(iq_to_i1(1,nb)+index1-1) = itmp
5544               index1 = index1 + 1
5545            end if
5546
5547*           **** unpacking scheme ****
5548            !call D3dB_ktoqp(nb,j3,qhere,phere)
5549            !call D3dB_ktoqp(nb,i3,qfrom,pfrom)
5550            if ((pto.eq.taskid).and.(phere.eq.proc_from)) then
5551c              itmp = 1 + (j2-1)*(nx(nb)/2+1)
5552c    >                  + (qhere-1)*(nx(nb)/2+1)*ny(nb)
5553c               iq_to_i2(index2) = itmp
5554               int_mb(iq_to_i2(1,nb)+index2-1)=itmp2
5555               index2 = index2 + 1
5556            end if
5557         end do
5558         end do
5559
5560      end do
5561c     i1_start(np+1) = index1
5562c     i2_start(np+1) = index2
5563      int_mb(i1_start(1,nb)+np) = index1
5564      int_mb(i2_start(1,nb)+np) = index2
5565
5566      return
5567      end
5568
5569
5570
5571*     ***********************************
5572*     *					*
5573*     *	     D3dB_timereverse_end 	*
5574*     *					*
5575*     ***********************************
5576      subroutine D3dB_timereverse_end(nb)
5577      implicit none
5578      integer nb
5579
5580#include "bafdecls.fh"
5581#include "errquit.fh"
5582
5583c     integer iq_to_i1((NFFT1/2+1)*NFFT2*NSLABS)
5584c     integer iq_to_i2((NFFT1/2+1)*NFFT2*NSLABS)
5585c     integer i1_start(NFFT3+1)
5586c     integer i2_start(NFFT3+1)
5587      integer iq_to_i1(2,NBLOCKS)
5588      integer iq_to_i2(2,NBLOCKS)
5589      integer i1_start(2,NBLOCKS)
5590      integer i2_start(2,NBLOCKS)
5591      common / timereverse_blk / iq_to_i1,iq_to_i2,i1_start,i2_start
5592
5593      logical value
5594
5595      value = BA_free_heap(i1_start(2,nb))
5596      value = value.and.
5597     >        BA_free_heap(i2_start(2,nb))
5598      value = value.and.
5599     >        BA_free_heap(iq_to_i1(2,nb))
5600      value = value.and.
5601     >        BA_free_heap(iq_to_i2(2,nb))
5602      if (.not.value) call errquit('error freeing heap',0, MA_ERR)
5603      return
5604      end
5605
5606
5607
5608      subroutine D3dB_pfft_index1_copy(n,index,a,b)
5609      implicit none
5610      integer n
5611      integer index(*)
5612      complex*16  a(*),b(*)
5613      integer i
5614
5615#ifndef CRAY
5616!DIR$ ivdep
5617#endif
5618!$OMP DO
5619      do i=1,n
5620        b(i) = a(index(i))
5621      end do
5622!$OMP END DO
5623
5624      return
5625      end
5626
5627      subroutine D3dB_pfft_index2_copy(n,index,a,b)
5628      implicit none
5629      integer n
5630      integer index(*)
5631      complex*16  a(*),b(*)
5632      integer i
5633#ifndef CRAY
5634!DIR$ ivdep
5635#endif
5636!$OMP DO
5637      do i=1,n
5638        b(index(i)) = a(i)
5639      end do
5640!$OMP END DO NOWAIT
5641      return
5642      end
5643
5644
5645      subroutine D3dB_pfft_index2_zero(n,index,a)
5646      implicit none
5647      integer n
5648      integer index(*)
5649      complex*16  a(*)
5650      integer i
5651#ifndef CRAY
5652!DIR$ ivdep
5653#endif
5654!$OMP DO
5655      do i=1,n
5656        a(index(i)) = 0.0d0
5657      end do
5658!$OMP END DO
5659      return
5660      end
5661
5662
5663
5664      subroutine D3dB_pfft_index2_copy_conjg(n,index,a,b)
5665      implicit none
5666      integer n
5667      integer index(*)
5668      complex*16  a(*),b(*)
5669      integer i
5670#ifndef CRAY
5671!DIR$ ivdep
5672#endif
5673!$OMP DO
5674      do i=1,n
5675        b(index(i)) = dconjg(a(i))
5676      end do
5677!$OMP END DO NOWAIT
5678      return
5679      end
5680
5681
5682
5683