1
2#define NBLOCKS 2
3
4*
5* $Id$
6*
7
8*     ***********************************************************
9*     *								*
10*     *   		   C3dB_pfft library	 		*
11*     *			(MPI implemenation)			*
12*     *								*
13*     *   Author - Eric Bylaska					*
14*     *   date   - 3/23/96					*
15*     *								*
16*     ***********************************************************
17
18*     ***********************************
19*     *					*
20*     *	        C3dB_pfft_init		*
21*     *					*
22*     ***********************************
23
24      subroutine C3dB_pfft_init()
25      implicit none
26
27#include "bafdecls.fh"
28#include "errquit.fh"
29#include "C3dB.fh"
30#include "C3dB_pfft.fh"
31
32*     **** local variables ****
33      integer taskid,MASTER
34      parameter (MASTER=0)
35
36      double precision eps
37      parameter (eps=1d-12)
38
39      logical value,yrow,zrow,yzslab
40      integer nxh,nyh,nzh,q,p
41      integer k1,k2,k3,fb,nbb
42      integer i,j,k
43      integer index2
44      double precision ggcut,g1,g2,g3,gg
45      integer zero_arow2(2),zero_arow3(2)
46
47*     **** external functions ***
48      integer  control_pfft3_qsize,brillioun_nbrillq
49      real*8   lattice_ggcut,lattice_wggcut,lattice_unitg
50      real*8   brillioun_k
51      external control_pfft3_qsize,brillioun_nbrillq
52      external lattice_ggcut,lattice_wggcut,lattice_unitg
53      external brillioun_k
54
55      call Parallel3d_taskid_i(taskid)
56      nxh = nx(1)/2
57      nyh = ny(1)/2
58      nzh = nz(1)/2
59
60      if (mapping.eq.1) then
61
62         value =           BA_alloc_get(mt_log,nx(1)*nq(1),
63     >                     'zero_row3',zero_row3(2,0),zero_row3(1,0))
64         value = value.and.BA_alloc_get(mt_log,nx(1)*nq(1),
65     >                     'zero_row3',zero_row3(2,1),zero_row3(1,1))
66
67         value = value.and.BA_alloc_get(mt_log,nx(1)*nq(1),
68     >                     'zero_row2',zero_row2(2,0),zero_row2(1,0))
69         value = value.and.BA_alloc_get(mt_log,nx(1)*nq(1),
70     >                     'zero_row2',zero_row2(2,1),zero_row2(1,1))
71
72         value = value.and.BA_alloc_get(mt_log,nx(1),
73     >               'zero_slab23',zero_slab23(2,0),zero_slab23(1,0))
74         value = value.and.BA_alloc_get(mt_log,nx(1),
75     >               'zero_slab23',zero_slab23(2,1),zero_slab23(1,1))
76         if (.not. value) call errquit('out of heap memory',0, MA_ERR)
77
78         value = value.and.BA_push_get(mt_log,(nx(1)*ny(1)),
79     >                     'zero_arow3',zero_arow3(2),zero_arow3(1))
80         if (.not. value) call errquit('out of stack memory',0,MA_ERR)
81
82
83         !*** fb==0 - density, fb=1 - all wavefunctions/brillioun zones ***
84         do fb=0,1
85
86         if (fb.eq.0) then
87            ggcut = lattice_ggcut()
88         else
89            ggcut = lattice_wggcut()
90         end if
91
92*        **** find zero_row3 - (i,*,k) rows that are zero ****
93*        **** note the crazy notation used below ****
94         do q=1,nx(1)*nq(1)
95             log_mb(zero_row3(1,fb)+q-1) =.true.
96         end do
97         do q=1,nx(1)*ny(1)
98           log_mb(zero_arow3(1)+q-1) =.true.
99         end do
100
101
102         do k2 = -nyh+1, nyh-1
103         do k1 = -nxh+1, nxh-1
104          i=k1
105          j=k2
106          if (i .lt. 0) i = i + nx(1)
107          if (j .lt. 0) j = j + ny(1)
108          i=i+1
109          j=j+1
110          zrow = .true.
111          do k3 = -nzh+1, nzh-1
112            if (fb.eq.0) then
113               g1 = k1*lattice_unitg(1,1)
114     >            + k3*lattice_unitg(1,2)
115     >            + k2*lattice_unitg(1,3)
116               g2 = k1*lattice_unitg(2,1)
117     >            + k3*lattice_unitg(2,2)
118     >            + k2*lattice_unitg(2,3)
119               g3 = k1*lattice_unitg(3,1)
120     >            + k3*lattice_unitg(3,2)
121     >            + k2*lattice_unitg(3,3)
122               gg = g1*g1 + g2*g2 + g3*g3
123               gg= gg-ggcut
124               if (gg.lt.-eps) zrow = .false.
125            else
126               do nbb=1,brillioun_nbrillq()
127                  g1 = k1*lattice_unitg(1,1)
128     >               + k3*lattice_unitg(1,2)
129     >               + k2*lattice_unitg(1,3)
130     >               + brillioun_k(1,nbb)
131                  g2 = k1*lattice_unitg(2,1)
132     >               + k3*lattice_unitg(2,2)
133     >               + k2*lattice_unitg(2,3)
134     >               + brillioun_k(2,nbb)
135                  g3 = k1*lattice_unitg(3,1)
136     >               + k3*lattice_unitg(3,2)
137     >               + k2*lattice_unitg(3,3)
138     >               + brillioun_k(3,nbb)
139                  gg = g1*g1 + g2*g2 + g3*g3
140                  gg= gg-ggcut
141                  if (gg.lt.-eps) zrow = .false.
142               end do
143            endif
144
145          end do
146          if (.not.zrow) then
147            log_mb(zero_arow3(1)+(i-1)+nx(1)*(j-1)) =.false.
148            call C3dB_ktoqp(1,j,q,p)
149            if (p.eq.taskid) then
150              index2 = i + nx(1)*(q-1)
151              log_mb(zero_row3(1,fb)+index2-1) =.false.
152            end if
153          end if
154
155         end do
156         end do
157
158         call C3dB_c_ptranspose_jk_init(fb,log_mb(zero_arow3(1)))
159
160
161*        **** find zero_slab23 - (i,*,*) slabs that are zero ****
162         do k1 = 1,nx(1)
163           log_mb(zero_slab23(1,fb)+k1-1) =.true.
164         end do
165
166         do k1 = -nxh+1, nxh-1
167          i=k1
168          if (i .lt. 0) i = i + nx(1)
169          i=i+1
170          yzslab = .true.
171          do k3 = -nzh+1, nzh-1
172          do k2 = -nyh+1, nyh-1
173            if (fb.eq.0) then
174               g1 = k1*lattice_unitg(1,1)
175     >            + k2*lattice_unitg(1,2)
176     >            + k3*lattice_unitg(1,3)
177               g2 = k1*lattice_unitg(2,1)
178     >            + k2*lattice_unitg(2,2)
179     >            + k3*lattice_unitg(2,3)
180               g3 = k1*lattice_unitg(3,1)
181     >            + k2*lattice_unitg(3,2)
182     >            + k3*lattice_unitg(3,3)
183               gg = g1*g1 + g2*g2 + g3*g3
184               gg= gg-ggcut
185               if (gg.lt.-eps) yzslab = .false.
186            else
187               do nbb=1,brillioun_nbrillq()
188                  g1 = k1*lattice_unitg(1,1)
189     >               + k2*lattice_unitg(1,2)
190     >               + k3*lattice_unitg(1,3)
191     >               + brillioun_k(1,nbb)
192                  g2 = k1*lattice_unitg(2,1)
193     >               + k2*lattice_unitg(2,2)
194     >               + k3*lattice_unitg(2,3)
195     >               + brillioun_k(2,nbb)
196                  g3 = k1*lattice_unitg(3,1)
197     >               + k2*lattice_unitg(3,2)
198     >               + k3*lattice_unitg(3,3)
199     >               + brillioun_k(3,nbb)
200                  gg = g1*g1 + g2*g2 + g3*g3
201                  gg= gg-ggcut
202                  if (gg.lt.-eps) yzslab = .false.
203               end do
204            end if
205          end do
206          end do
207          if (.not.yzslab) then
208            log_mb(zero_slab23(1,fb)+i-1) =.false.
209          end if
210
211         end do
212
213*        **** find zero_row2 - (i,*,k) rows that are zero after fft of (i,j,*) ****
214         do k3 = 1,nz(1)
215         do k1 = 1,nx(1)
216           call C3dB_ktoqp(1,k3,q,p)
217           if (p.eq.taskid) then
218            index2 = k1 + nx(1)*(q-1)
219            log_mb(zero_row2(1,fb)+index2-1)
220     >       = log_mb(zero_slab23(1,fb)+k1-1)
221           end if
222         end do
223         end do
224
225
226         end do !*fb*
227         value = BA_pop_stack(zero_arow3(2))
228         if (.not. value) call errquit('error freeing stack',0,MA_ERR)
229
230      !*** mapping == 2 ***
231      else
232         value =           BA_alloc_get(mt_log,nq3(1),
233     >                     'zero_row3',zero_row3(2,0),zero_row3(1,0))
234         value = value.and.BA_alloc_get(mt_log,nq3(1),
235     >                     'zero_row3',zero_row3(2,1),zero_row3(1,1))
236
237         value = value.and.BA_alloc_get(mt_log,nq2(1),
238     >                     'zero_row2',zero_row2(2,0),zero_row2(1,0))
239         value = value.and.BA_alloc_get(mt_log,nq2(1),
240     >                     'zero_row2',zero_row2(2,1),zero_row2(1,1))
241
242         value = value.and.BA_alloc_get(mt_log,nx(1),
243     >               'zero_slab23',zero_slab23(2,0),zero_slab23(1,0))
244         value = value.and.BA_alloc_get(mt_log,nx(1),
245     >               'zero_slab23',zero_slab23(2,1),zero_slab23(1,1))
246         if (.not. value) call errquit('out of heap memory',0, MA_ERR)
247
248         value = value.and.BA_push_get(mt_log,nx(1)*nz(1),
249     >                     'zero_arow2',zero_arow2(2),zero_arow2(1))
250         value = value.and.BA_push_get(mt_log,nx(1)*ny(1),
251     >                     'zero_arow3',zero_arow3(2),zero_arow3(1))
252         if (.not. value) call errquit('out of stack memory',0,MA_ERR)
253
254         !*** fb==0 - density, fb=1 - all wavefunctions/brillioun zones ***
255         do fb=0,1
256
257         if (fb.eq.0) then
258            ggcut = lattice_ggcut()
259         else
260            ggcut = lattice_wggcut()
261         end if
262
263*        **** find zero_row3 - (i,j,*) rows that are zero ****
264         do q = 1,nq3(1)
265           log_mb(zero_row3(1,fb)+q-1) =.true.
266         end do
267         do q = 1,nx(1)*ny(1)
268           log_mb(zero_arow3(1)+q-1) =.true.
269         end do
270
271         do k2 = -nyh+1, nyh-1
272         do k1 = -nxh+1, nxh-1
273          i=k1
274          j=k2
275          if (i .lt. 0) i = i + nx(1)
276          if (j .lt. 0) j = j + ny(1)
277          i=i+1
278          j=j+1
279          zrow = .true.
280          do k3 = -nzh+1, nzh-1
281            if (fb.eq.0) then
282               g1 = k1*lattice_unitg(1,1)
283     >            + k2*lattice_unitg(1,2)
284     >            + k3*lattice_unitg(1,3)
285               g2 = k1*lattice_unitg(2,1)
286     >            + k2*lattice_unitg(2,2)
287     >            + k3*lattice_unitg(2,3)
288               g3 = k1*lattice_unitg(3,1)
289     >            + k2*lattice_unitg(3,2)
290     >            + k3*lattice_unitg(3,3)
291               gg = g1*g1 + g2*g2 + g3*g3
292               gg= gg-ggcut
293               if (gg.lt.-eps) zrow = .false.
294            else
295               do nbb=1,brillioun_nbrillq()
296                  g1 = k1*lattice_unitg(1,1)
297     >               + k2*lattice_unitg(1,2)
298     >               + k3*lattice_unitg(1,3)
299     >               + brillioun_k(1,nbb)
300                  g2 = k1*lattice_unitg(2,1)
301     >               + k2*lattice_unitg(2,2)
302     >               + k3*lattice_unitg(2,3)
303     >               + brillioun_k(2,nbb)
304                  g3 = k1*lattice_unitg(3,1)
305     >               + k2*lattice_unitg(3,2)
306     >               + k3*lattice_unitg(3,3)
307     >               + brillioun_k(3,nbb)
308                  gg = g1*g1 + g2*g2 + g3*g3
309                  gg= gg-ggcut
310                  if (gg.lt.-eps) zrow = .false.
311               end do
312            end if
313          end do
314          if (.not.zrow) then
315            log_mb(zero_arow3(1)+(i-1)+nx(1)*(j-1)) =.false.
316            q = int_mb(q_map3(1,1)+(i-1)+(j-1)*(nx(1)))
317            p = int_mb(p_map3(1,1)+(i-1)+(j-1)*(nx(1)))
318            if (p.eq.taskid) then
319              log_mb(zero_row3(1,fb)+q-1) =.false.
320            end if
321          end if
322
323         end do
324         end do
325
326*        **** find zero_slab23 - (i,*,*) slabs that are zero ****
327         do k1 = 1,nx(1)
328           log_mb(zero_slab23(1,fb)+k1-1) =.true.
329         end do
330
331         do k1 = -nxh+1, nxh-1
332          i=k1
333          if (i .lt. 0) i = i + nx(1)
334          i=i+1
335          yzslab = .true.
336          do k3 = -nzh+1, nzh-1
337          do k2 = -nyh+1, nyh-1
338            if (fb.eq.0) then
339               g1 = k1*lattice_unitg(1,1)
340     >            + k2*lattice_unitg(1,2)
341     >            + k3*lattice_unitg(1,3)
342               g2 = k1*lattice_unitg(2,1)
343     >            + k2*lattice_unitg(2,2)
344     >            + k3*lattice_unitg(2,3)
345               g3 = k1*lattice_unitg(3,1)
346     >            + k2*lattice_unitg(3,2)
347     >            + k3*lattice_unitg(3,3)
348               gg = g1*g1 + g2*g2 + g3*g3
349               gg= gg-ggcut
350               if (gg.lt.-eps) yzslab = .false.
351            else
352               do nbb=1,brillioun_nbrillq()
353                  g1 = k1*lattice_unitg(1,1)
354     >               + k2*lattice_unitg(1,2)
355     >               + k3*lattice_unitg(1,3)
356     >               + brillioun_k(1,nbb)
357                  g2 = k1*lattice_unitg(2,1)
358     >               + k2*lattice_unitg(2,2)
359     >               + k3*lattice_unitg(2,3)
360     >               + brillioun_k(2,nbb)
361                  g3 = k1*lattice_unitg(3,1)
362     >               + k2*lattice_unitg(3,2)
363     >               + k3*lattice_unitg(3,3)
364     >               + brillioun_k(3,nbb)
365                  gg = g1*g1 + g2*g2 + g3*g3
366                  gg= gg-ggcut
367                  if (gg.lt.-eps) yzslab = .false.
368               end do
369            end if
370          end do
371          end do
372          if (.not.yzslab) then
373            log_mb(zero_slab23(1,fb)+i-1) =.false.
374          end if
375
376         end do
377
378
379*        **** find zero_row2 - (i,*,k) rows that are zero after fft of (i,j,*) ****
380         do k = 1,nz(1)
381         do i = 1,nx(1)
382           q = int_mb(q_map2(1,1)+(k-1)+(i-1)*(nz(1)))
383           p = int_mb(p_map2(1,1)+(k-1)+(i-1)*(nz(1)))
384
385           log_mb(zero_arow2(1)+i-1+nx(1)*(k-1))
386     >       = log_mb(zero_slab23(1,fb)+i-1)
387           if (p.eq.taskid) then
388            log_mb(zero_row2(1,fb)+q-1)
389     >       = log_mb(zero_slab23(1,fb)+i-1)
390           end if
391         end do
392         end do
393
394         call C3dB_c_ptranspose_ijk_init(fb,
395     >                                   log_mb(zero_arow2(1)),
396     >                                   log_mb(zero_arow3(1)))
397
398
399         end do
400
401       value =           BA_pop_stack(zero_arow3(2))
402       value = value.and.BA_pop_stack(zero_arow2(2))
403       if (.not. value) call errquit('error pop stack',0,MA_ERR)
404
405
406      end if
407
408      call C3dB_pfft3_queue_init(control_pfft3_qsize())
409
410      return
411      end
412
413
414
415
416*     ***********************************
417*     *                                 *
418*     *         C3dB_pfft_end           *
419*     *                                 *
420*     ***********************************
421
422      subroutine C3dB_pfft_end()
423      implicit none
424
425#include "bafdecls.fh"
426#include "errquit.fh"
427#include "C3dB.fh"
428#include "C3dB_pfft.fh"
429
430
431
432*     **** indexing variables ****
433      integer iq_to_i1(2,0:1)
434      integer iq_to_i2(2,0:1)
435      integer iz_to_i2(2,0:1)
436      integer i1_start(2,0:1)
437      integer i2_start(2,0:1)
438      common / c_ptrans_blk1 / iq_to_i1,iq_to_i2,iz_to_i2,
439     >                       i1_start,i2_start
440
441*     **** indexing variables ****
442      integer jq_to_i1(2,0:1)
443      integer jq_to_i2(2,0:1)
444      integer jz_to_i2(2,0:1)
445      integer j1_start(2,0:1)
446      integer j2_start(2,0:1)
447      common / c_ptrans_blk2 / jq_to_i1,jq_to_i2,jz_to_i2,
448     >                       j1_start,j2_start
449
450
451*     *** hilbert tranpose data structure ****
452      integer h_iq_to_i1(2,6,0:1)
453      integer h_iq_to_i2(2,6,0:1)
454      integer h_iz_to_i2(2,6,0:1)
455      integer h_iz_to_i2_count(6,0:1)
456      integer h_i1_start(2,6,0:1)
457      integer h_i2_start(2,6,0:1)
458      common / c_ptrans_blk_ijk / h_iq_to_i1,
459     >                         h_iq_to_i2,
460     >                         h_iz_to_i2,
461     >                         h_iz_to_i2_count,
462     >                         h_i1_start,
463     >                         h_i2_start
464
465
466      logical value
467      integer i
468
469      value =           BA_free_heap(zero_row2(2,0))
470      value = value.and.BA_free_heap(zero_row2(2,1))
471      value = value.and.BA_free_heap(zero_row3(2,0))
472      value = value.and.BA_free_heap(zero_row3(2,1))
473      value = value.and.BA_free_heap(zero_slab23(2,0))
474      value = value.and.BA_free_heap(zero_slab23(2,1))
475
476      if (mapping.eq.1) then
477      value = value.and.BA_free_heap(iq_to_i1(2,0))
478      value = value.and.BA_free_heap(iq_to_i1(2,1))
479      value = value.and.BA_free_heap(iq_to_i2(2,0))
480      value = value.and.BA_free_heap(iq_to_i2(2,1))
481      value = value.and.BA_free_heap(iz_to_i2(2,0))
482      value = value.and.BA_free_heap(iz_to_i2(2,1))
483      value = value.and.BA_free_heap(i1_start(2,0))
484      value = value.and.BA_free_heap(i1_start(2,1))
485      value = value.and.BA_free_heap(i2_start(2,0))
486      value = value.and.BA_free_heap(i2_start(2,1))
487
488      value = value.and.BA_free_heap(jq_to_i1(2,0))
489      value = value.and.BA_free_heap(jq_to_i1(2,1))
490      value = value.and.BA_free_heap(jq_to_i2(2,0))
491      value = value.and.BA_free_heap(jq_to_i2(2,1))
492      value = value.and.BA_free_heap(jz_to_i2(2,0))
493      value = value.and.BA_free_heap(jz_to_i2(2,1))
494      value = value.and.BA_free_heap(j1_start(2,0))
495      value = value.and.BA_free_heap(j1_start(2,1))
496      value = value.and.BA_free_heap(j2_start(2,0))
497      value = value.and.BA_free_heap(j2_start(2,1))
498      end if
499
500      if (mapping.eq.2) then
501      do i=1,6
502      value = value.and.BA_free_heap(h_i1_start(2,i,0))
503      value = value.and.BA_free_heap(h_i2_start(2,i,0))
504      value = value.and.BA_free_heap(h_iq_to_i1(2,i,0))
505      value = value.and.BA_free_heap(h_iq_to_i2(2,i,0))
506      value = value.and.BA_free_heap(h_iz_to_i2(2,i,0))
507      value = value.and.BA_free_heap(h_i1_start(2,i,1))
508      value = value.and.BA_free_heap(h_i2_start(2,i,1))
509      value = value.and.BA_free_heap(h_iq_to_i1(2,i,1))
510      value = value.and.BA_free_heap(h_iq_to_i2(2,i,1))
511      value = value.and.BA_free_heap(h_iz_to_i2(2,i,1))
512      end do
513      end if
514
515      if (.not. value) call errquit('error freeing heap',0,MA_ERR)
516
517      call C3dB_pfft3_queue_end()
518      return
519      end
520
521
522
523
524*     ***********************************
525*     *					*
526*     *	        C3dB_cr_pfft3b		*
527*     *					*
528*     ***********************************
529
530      subroutine C3dB_cr_pfft3b(nb,nbb,A)
531
532*****************************************************
533*                                                   *
534*      This routine performs the operation of       *
535*      a three dimensional complex to complex       *
536*      inverse fft                                  *
537*           A(nx,ny(nb),nz(nb)) <- FFT3^(-1)[A(kx,ky,kz)]   *
538*                                                   *
539*      Entry - 					    *
540*              A: a column distribuded 3d block     *
541*              tmp: tempory work space must be at   *
542*                    least the size of (complex)    *
543*                    (nfft*nfft + 1) + 10*nfft      *
544*                                                   *
545*       Exit - A is transformed and the imaginary   *
546*              part of A is set to zero             *
547*       uses - C3dB_c_ptranspose_jk, dcopy           *
548*                                                   *
549*****************************************************
550
551      implicit none
552      integer nb,nbb
553      complex*16  A(*)
554
555#include "bafdecls.fh"
556#include "errquit.fh"
557
558#include "C3dB.fh"
559#include "C3dB_pfft.fh"
560
561      integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS)
562      common    / C3dB_fft / tmpx,tmpy,tmpz
563
564
565*     *** local variables ***
566      integer i,j,k,q,indx,fb
567      integer indx2
568#ifdef MLIB
569      integer ierr
570#endif
571
572
573      !integer tmp1(2),tmp2(2),tmp3(2)
574      integer tmp2(2),tmp3(2)
575      logical value
576
577
578      call nwpw_timing_start(1)
579
580      !*** set fb ***
581      if (nbb.eq.0) then
582         fb = 0
583      else
584         fb = 1
585      end if
586
587
588*     ***** allocate temporary space ****
589      !call C3dB_nfft3d(nb,nfft3d)
590      value = BA_push_get(mt_dcpl,(nfft3d(nb)),'ffttmp2',
591     >                    tmp2(2),tmp2(1))
592      value = value.and.
593     >      BA_push_get(mt_dcpl,(nfft3d(nb)),'ffttmp3',tmp3(2),tmp3(1))
594      if (.not. value) call errquit('out of stack memory',0, MA_ERR)
595
596
597      !**********************
598      !**** slab mapping ****
599      !**********************
600      if (mapping.eq.1) then
601
602*     ****************************************************
603*     ***     do fft along ky dimension                ***
604*     ***  A(kx,ny(nb),kz) <- fft1d^(-1)[A(kx,ky,kz)]  ***
605*     ****************************************************
606#ifdef MLIB
607      !call z1dfft(dbl_mb(tmp3(1)),nz(nb),dcpl_mb(tmpz(1)),-3,ierr)
608      indx2 = 0
609      do q=1,nq(nb)
610      do i=1,nx(nb)
611       indx2 = indx2 + 1
612       if (.not.log_mb(zero_row3(1,fb)+indx2-1)) then
613
614         indx = i + (q-1)*nx(nb)*nz(nb)
615         call zcopy(nz(nb),A(indx),nx(nb),dcpl_mb(tmp2(1)),1)
616         call z1dfft(dcpl_mb(tmp2(1)),nz(nb),
617     >               dcpl_mb(tmpz(1,nb)),-2,ierr)
618         call zcopy(nz(nb),dcpl_mb(tmp2(1)),1,A(indx),nx(nb))
619
620       end if
621      end do
622      end do
623
624#else
625      !call dcffti(nz(nb),dcpl_mb(tmp1(1)))
626      indx2 = 0
627      do q=1,nq(nb)
628      do i=1,nx(nb)
629       indx2 = indx2 + 1
630       if (.not.log_mb(zero_row3(1,fb)+indx2-1)) then
631         indx = i + (q-1)*nx(nb)*nz(nb)
632         call zcopy(nz(nb),A(indx),nx(nb),dcpl_mb(tmp2(1)),1)
633         call dcfftb(nz(nb),dcpl_mb(tmp2(1)),dcpl_mb(tmpz(1,nb)))
634         call zcopy(nz(nb),dcpl_mb(tmp2(1)),1,A(indx),nx(nb))
635       end if
636      end do
637      end do
638#endif
639
640*     ********************************************
641*     ***         Do a ptranspose of A          ***
642*     ***   A(kx,kz,ny(nb)) <- A(kx,ny(nb),kz)  ***
643*     ********************************************
644      call C3dB_c_ptranspose1_jk(fb,A,dcpl_mb(tmp2(1)),dcpl_mb(tmp3(1)))
645
646*     *************************************************
647*     ***     do fft along kz dimension             ***
648*     ***   A(kx,nz(nb),ny(nb)) <- fft1d^(-1)[A(kx,kz,ny(nb))]  ***
649*     *************************************************
650#ifdef MLIB
651
652      indx2 = 0
653      do q=1,nq(nb)
654      do i=1,nx(nb)
655        indx2 = indx2 + 1
656        if (.not.log_mb(zero_row2(1,fb)+indx2-1)) then
657          indx = i + (q-1)*nx(nb)*ny(nb)
658          call zcopy(ny(nb),A(indx),nx(nb),dcpl_mb(tmp2(1)),1)
659          call z1dfft(dcpl_mb(tmp2(1)),ny(nb),
660     >               dcpl_mb(tmpy(1,nb)),-2,ierr)
661          call zcopy(ny(nb),dcpl_mb(tmp2(1)),1,A(indx),nx(nb))
662        end if
663      end do
664      end do
665
666#else
667      indx2 = 0
668      do q=1,nq(nb)
669      do i=1,nx(nb)
670        indx2 = indx2 + 1
671        if (.not.log_mb(zero_row2(1,fb)+indx2-1)) then
672         indx = i + (q-1)*nx(nb)*ny(nb)
673         call zcopy(ny(nb),A(indx),nx(nb),dcpl_mb(tmp2(1)),1)
674         call dcfftb(ny(nb),dcpl_mb(tmp2(1)),dcpl_mb(tmpy(1,nb)))
675         call zcopy(ny(nb),dcpl_mb(tmp2(1)),1,A(indx),nx(nb))
676        end if
677      end do
678      end do
679#endif
680
681*     *********************************************************************
682*     ***     do fft along kx dimension                                 ***
683*     ***   A(nx(nb),nz(nb),ny(nb)) <- fft1d^(-1)[A(kx,nz(nb),ny(nb))]  ***
684*     *********************************************************************
685#ifdef MLIB
686      !call drc1ft (dbl_mb(tmp3(1)),nx(nb),dcpl_mb(tmp1(1)),-3,ierr)
687      indx = 1
688      do q=1,nq(nb)
689      do j=1,ny(nb)
690         call z1dfft(A(indx),nx(nb),
691     >               dcpl_mb(tmpx(1,nb)),-2,ierr)
692         indx = indx + nx(nb)
693      end do
694      end do
695#else
696      indx = 1
697      do q=1,nq(nb)
698      do j=1,ny(nb)
699         call dcfftb(nx(nb),A(indx),dcpl_mb(tmpx(1,nb)))
700         indx = indx + nx(nb)
701      end do
702      end do
703#endif
704
705
706      !*************************
707      !**** hilbert mapping ****
708      !*************************
709      else
710
711*     *************************************************
712*     ***     do fft along kz dimension             ***
713*     ***   A(nz(nb),kx,ky) <- fft1d^(-1)[A(kz,kx,ky)]  ***
714*     *************************************************
715#ifdef MLIB
716      indx = 1
717      do q=1,nq3(nb)
718         if (.not.log_mb(zero_row3(1,fb)+q-1)) then
719         call z1dfft(A(indx),nz(nb),dcpl_mb(tmpz(1,nb)),-2,ierr)
720         end if
721         indx = indx + nz(nb)
722      end do
723#else
724      indx = 1
725      do q=1,nq3(nb)
726         if (.not.log_mb(zero_row3(1,fb)+q-1)) then
727         call dcfftb(nz(nb),A(indx),dcpl_mb(tmpz(1,nb)))
728         end if
729         indx = indx + nz(nb)
730      end do
731#endif
732
733      call C3dB_c_ptranspose_ijk(fb,3,A,
734     >                           dcpl_mb(tmp2(1)),
735     >                           dcpl_mb(tmp3(1)))
736
737*     *************************************************
738*     ***     do fft along ky dimension             ***
739*     ***   A(ny(nb),nz(nb),kx) <- fft1d^(-1)[A(ky,nz(nb),kx)]  ***
740*     *************************************************
741#ifdef MLIB
742      indx = 1
743      do q=1,nq2(nb)
744         if (.not.log_mb(zero_row2(1,fb)+q-1)) then
745         call z1dfft(A(indx),ny(nb),dcpl_mb(tmpy(1,nb)),-2,ierr)
746         end if
747         indx = indx + ny(nb)
748      end do
749#else
750      indx = 1
751      do q=1,nq2(nb)
752         if (.not.log_mb(zero_row2(1,fb)+q-1)) then
753         call dcfftb(ny(nb),A(indx),dcpl_mb(tmpy(1,nb)))
754         end if
755         indx = indx + ny(nb)
756      end do
757#endif
758
759      call C3dB_c_ptranspose_ijk(fb,4,A,
760     >                           dcpl_mb(tmp2(1)),
761     >                           dcpl_mb(tmp3(1)))
762
763*     *************************************************
764*     ***     do fft along kx dimension             ***
765*     ***   A(nx(nb),ny(nb),nz(nb)) <- fft1d^(-1)[A(kx,ny(nb),nz(nb))]  ***
766*     *************************************************
767#ifdef MLIB
768      indx = 1
769      do q=1,nq1(nb)
770         call z1dfft(A(indx),nx(nb),dcpl_mb(tmpx(1,nb)),-2,ierr)
771         indx = indx + nx(nb)
772      end do
773#else
774      indx = 1
775      do q=1,nq1(nb)
776         call dcfftb(nx(nb),A(indx),dcpl_mb(tmpx(1,nb)))
777         indx = indx + nx(nb)
778      end do
779#endif
780
781      end if
782
783*     **** deallocate temporary space  ****
784      value = BA_pop_stack(tmp3(2))
785      value = BA_pop_stack(tmp2(2))
786      !value = BA_pop_stack(tmp1(2))
787
788      call nwpw_timing_end(1)
789      return
790      end
791
792
793
794
795*     ***********************************
796*     *					*
797*     *	        C3dB_rc_pfft3f		*
798*     *					*
799*     ***********************************
800
801      subroutine C3dB_rc_pfft3f(nb,nbb,A)
802
803*****************************************************
804*                                                   *
805*      This routine performs the operation of       *
806*      a three dimensional complex to complex fft   *
807*           A(kx,ky,kz) <- FFT3[A(nx(nb),ny(nb),nz(nb))]        *
808*                                                   *
809*      Entry - 					    *
810*              A: a column distribuded 3d block     *
811*              tmp: tempory work space must be at   *
812*                    least the size of (complex)    *
813*                    (nfft*nfft + 1) + 10*nfft      *
814*                                                   *
815*       Exit - A is transformed                     *
816*                                                   *
817*       uses - ptranspose1 subroutine                *
818*                                                   *
819*****************************************************
820
821      implicit none
822      integer nb,nbb
823      complex*16  A(*)
824
825#include "bafdecls.fh"
826#include "errquit.fh"
827
828#include "C3dB.fh"
829#include "C3dB_pfft.fh"
830
831      integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS)
832      common    / C3dB_fft / tmpx,tmpy,tmpz
833
834*     *** local variables ***
835      integer i,j,k,q,indx,indx1,indx2,fb
836#ifdef MLIB
837      integer ierr
838#endif
839
840      !integer tmp1(2),tmp2(2),tmp3(2)
841      integer tmp2(2),tmp3(2)
842      logical value
843
844
845      call nwpw_timing_start(1)
846
847      !*** set fb ***
848      if (nbb.eq.0) then
849         fb = 0
850      else
851         fb = 1
852      end if
853
854*     ***** allocate temporary space ****
855      !call C3dB_nfft3d(nb,nfft3d)
856      value = BA_push_get(mt_dcpl,(nfft3d(nb)),'tmp2',tmp2(2),tmp2(1))
857      value = value.and.
858     >        BA_push_get(mt_dcpl,(nfft3d(nb)),'tmp3',tmp3(2),tmp3(1))
859      if (.not. value) call errquit('out of stack memory',0, MA_ERR)
860
861      !**********************
862      !**** slab mapping ****
863      !**********************
864      if (mapping.eq.1) then
865*     ********************************************
866*     ***     do fft along nx(nb) dimension        ***
867*     ***   A(kx,nz(nb),ny(nb)) <- fft1d[A(nx(nb),nz(nb),ny(nb))]  ***
868*     ********************************************
869#ifdef MLIB
870      indx = 1
871      do q=1,nq(nb)
872      do j=1,ny(nb)
873         call z1dfft(A(indx),nx(nb),dcpl_mb(tmpx(1,nb)),1,ierr)
874         indx = indx + nx(nb)
875      end do
876      end do
877#else
878      indx = 1
879      do q=1,nq(nb)
880      do j=1,ny(nb)
881         call dcfftf(nx(nb),A(indx),dcpl_mb(tmpx(1,nb)))
882         indx = indx + nx(nb)
883      end do
884      end do
885#endif
886
887
888*     ********************************************
889*     ***     do fft along nz(nb) dimension        ***
890*     ***   A(kx,kz,nz(nb)) <- fft1d[A(kx,nz(nb),ny(nb))]  ***
891*     ********************************************
892
893#ifdef MLIB
894
895      indx2 = 0
896      do q=1,nq(nb)
897      do i=1,nx(nb)
898        indx2 = indx2 + 1
899        if (.not.log_mb(zero_row2(1,fb)+indx2-1)) then
900
901         indx = i + (q-1)*nx(nb)*ny(nb)
902         call zcopy(ny(nb),A(indx),nx(nb),dcpl_mb(tmp2(1)),1)
903         call z1dfft(dcpl_mb(tmp2(1)),ny(nb),dcpl_mb(tmpy(1,nb)),1,ierr)
904         call zcopy(ny(nb),dcpl_mb(tmp2(1)),1,A(indx),nx(nb))
905
906        end if
907      end do
908      end do
909#else
910
911       indx2 = 0
912       do q=1,nq(nb)
913       do i=1,nx(nb)
914        indx2 = indx2 + 1
915        if (.not.log_mb(zero_row2(1,fb)+indx2-1)) then
916
917          indx = i + (q-1)*nx(nb)*ny(nb)
918          call zcopy(ny(nb),A(indx),nx(nb),dcpl_mb(tmp2(1)),1)
919          call dcfftf(ny(nb),dcpl_mb(tmp2(1)),dcpl_mb(tmpy(1,nb)))
920          call zcopy(ny(nb),dcpl_mb(tmp2(1)),1,A(indx),nx(nb))
921        end if
922       end do
923       end do
924
925#endif
926
927*     ********************************************
928*     ***         Do a ptranspose of A          ***
929*     ***      A(ky,ny(nb),kz) <- A(kx,kz,ny(nb))      ***
930*     ********************************************
931      call C3dB_c_ptranspose2_jk(fb,A,dcpl_mb(tmp2(1)),dcpl_mb(tmp3(1)))
932
933
934*     ********************************************
935*     ***     do fft along ny(nb) dimension        ***
936*     ***   A(kx,ky,kz) <- fft1d[A(kx,ny(nb),kz)]  ***
937*     ********************************************
938#ifdef MLIB
939
940      indx2 = 0
941      do q=1,nq(nb)
942      do i=1,nx(nb)
943        indx2 = indx2 + 1
944        if (.not.log_mb(zero_row3(1,fb)+indx2-1)) then
945         indx = i + (q-1)*nx(nb)*ny(nb)
946         call zcopy(nz(nb),A(indx),nx(nb),dcpl_mb(tmp2(1)),1)
947         call z1dfft(dcpl_mb(tmp2(1)),nz(nb),dcpl_mb(tmpz(1,nb)),1,ierr)
948         call zcopy(nz(nb),dcpl_mb(tmp2(1)),1,A(indx),nx(nb))
949
950        end if
951      end do
952      end do
953#else
954
955       indx2 = 0
956       do q=1,nq(nb)
957       do i=1,nx(nb)
958        indx2 = indx2 + 1
959        if (.not.log_mb(zero_row3(1,fb)+indx2-1)) then
960          indx = i + (q-1)*nx(nb)*ny(nb)
961          call zcopy(nz(nb),A(indx),nx(nb),dcpl_mb(tmp2(1)),1)
962          call dcfftf(nz(nb),dcpl_mb(tmp2(1)),dcpl_mb(tmpz(1,nb)))
963          call zcopy(nz(nb),dcpl_mb(tmp2(1)),1,A(indx),nx(nb))
964        end if
965       end do
966       end do
967
968#endif
969
970
971      !*************************
972      !**** hilbert mapping ****
973      !*************************
974      else
975
976*     ********************************************
977*     ***     do fft along nx(nb) dimension        ***
978*     ***   A(kx,ny(nb),nz(nb)) <- fft1d[A(nx(nb),ny(nb),nz(nb))]  ***
979*     ********************************************
980#ifdef MLIB
981      indx = 1
982      do q=1,nq1(nb)
983         call z1dfft(A(indx),nx(nb),dcpl_mb(tmpx(1,nb)),1,ierr)
984         indx = indx + nx(nb)
985      end do
986#else
987      indx = 1
988      do q=1,nq1(nb)
989         call dcfftf(nx(nb),A(indx),dcpl_mb(tmpx(1,nb)))
990         indx = indx + nx(nb)
991      end do
992#endif
993
994      call C3dB_c_ptranspose_ijk(fb,1,A,
995     >                           dcpl_mb(tmp2(1)),
996     >                           dcpl_mb(tmp3(1)))
997
998*     ********************************************
999*     ***     do fft along ny(nb) dimension        ***
1000*     ***   A(ky,nz(nb),kx) <- fft1d[A(ny(nb),nz(nb),kx)]  ***
1001*     ********************************************
1002#ifdef MLIB
1003      indx = 1
1004      do q=1,nq2(nb)
1005         if (.not.log_mb(zero_row2(1,fb)+q-1)) then
1006         call z1dfft(A(indx),ny(nb),dcpl_mb(tmpy(1,nb)),1,ierr)
1007         end if
1008         indx = indx + ny(nb)
1009      end do
1010#else
1011      indx = 1
1012      do q=1,nq2(nb)
1013         if (.not.log_mb(zero_row2(1,fb)+q-1)) then
1014         call dcfftf(ny(nb),A(indx),dcpl_mb(tmpy(1,nb)))
1015         end if
1016         indx = indx + ny(nb)
1017      end do
1018#endif
1019
1020      call C3dB_c_ptranspose_ijk(fb,2,A,
1021     >                          dcpl_mb(tmp2(1)),
1022     >                          dcpl_mb(tmp3(1)))
1023
1024*     ********************************************
1025*     ***     do fft along nz(nb) dimension        ***
1026*     ***   A(kz,kx,ky) <- fft1d[A(nz(nb),kx,ky)]  ***
1027*     ********************************************
1028#ifdef MLIB
1029      indx = 1
1030      do q=1,nq3(nb)
1031         if (.not.log_mb(zero_row3(1,fb)+q-1)) then
1032         call z1dfft(A(indx),nz(nb),dcpl_mb(tmpz(1,nb)),1,ierr)
1033         end if
1034         indx = indx + nz(nb)
1035      end do
1036#else
1037      indx = 1
1038      do q=1,nq3(nb)
1039         if (.not.log_mb(zero_row3(1,fb)+q-1)) then
1040         call dcfftf(nz(nb),A(indx),dcpl_mb(tmpz(1,nb)))
1041         end if
1042         indx = indx + nz(nb)
1043      end do
1044#endif
1045
1046      end if
1047
1048
1049*     **** deallocate temporary space  ****
1050      value = BA_pop_stack(tmp3(2))
1051      value = BA_pop_stack(tmp2(2))
1052      !value = BA_pop_stack(tmp1(2))
1053
1054      call nwpw_timing_end(1)
1055      return
1056      end
1057
1058
1059
1060
1061*     ***********************************
1062*     *					*
1063*     *	   C3dB_c_ptranspose_jk_init	*
1064*     *					*
1065*     ***********************************
1066
1067      subroutine C3dB_c_ptranspose_jk_init(fb,zero_arow3)
1068      implicit none
1069      integer fb
1070      logical zero_arow3(*)
1071
1072#include "bafdecls.fh"
1073#include "errquit.fh"
1074#include "C3dB.fh"
1075
1076
1077*     **** indexing variables ****
1078      integer iq_to_i1(2,0:1)
1079      integer iq_to_i2(2,0:1)
1080      integer iz_to_i2(2,0:1)
1081      integer i1_start(2,0:1)
1082      integer i2_start(2,0:1)
1083      common / c_ptrans_blk1 / iq_to_i1,iq_to_i2,iz_to_i2,
1084     >                       i1_start,i2_start
1085
1086*     **** indexing variables ****
1087      integer jq_to_i1(2,0:1)
1088      integer jq_to_i2(2,0:1)
1089      integer jz_to_i2(2,0:1)
1090      integer j1_start(2,0:1)
1091      integer j2_start(2,0:1)
1092      common / c_ptrans_blk2 / jq_to_i1,jq_to_i2,jz_to_i2,
1093     >                       j1_start,j2_start
1094
1095
1096
1097*     **** local variables ****
1098      integer proc_to,proc_from
1099      integer pto,qto,np,taskid
1100      integer pfrom,qfrom
1101      integer phere,qhere
1102      integer index1,index2,index3
1103      integer jndex1,jndex2,jndex3
1104      integer itmp,ii,jj
1105      integer i,j,k,it
1106      logical value
1107
1108
1109
1110*     **** allocate ptrans_blk1 and ptrans_blk2 common block ****
1111      value = BA_alloc_get(mt_int,(nx(1)*ny(1)*nq(1)),
1112     >                     'piq_to_i1',iq_to_i1(2,fb),iq_to_i1(1,fb))
1113      value=value.and.BA_alloc_get(mt_int,(nx(1)*ny(1)*nq(1)),
1114     >                     'piq_to_i2',iq_to_i2(2,fb),iq_to_i2(1,fb))
1115      value=value.and.BA_alloc_get(mt_int,(nx(1)*ny(1)*nq(1)),
1116     >                     'piz_to_i2',iz_to_i2(2,fb),iz_to_i2(1,fb))
1117
1118      value = value.and.BA_alloc_get(mt_int,(nz(1)+1),
1119     >                     'pi1_start',i1_start(2,fb),i1_start(1,fb))
1120      value = value.and.BA_alloc_get(mt_int,(nz(1)+1),
1121     >                     'pi2_start',i2_start(2,fb),i2_start(1,fb))
1122
1123
1124      value = BA_alloc_get(mt_int,(nx(1)*ny(1)*nq(1)),
1125     >                     'riq_to_i1',jq_to_i1(2,fb),jq_to_i1(1,fb))
1126      value=value.and.BA_alloc_get(mt_int,(nx(1)*ny(1)*nq(1)),
1127     >                     'riq_to_i2',jq_to_i2(2,fb),jq_to_i2(1,fb))
1128      value=value.and.BA_alloc_get(mt_int,(nx(1)*ny(1)*nq(1)),
1129     >                     'riz_to_i2',jz_to_i2(2,fb),jz_to_i2(1,fb))
1130
1131      value = value.and.BA_alloc_get(mt_int,(nz(1)+1),
1132     >                     'ri1_start',j1_start(2,fb),j1_start(1,fb))
1133      value = value.and.BA_alloc_get(mt_int,(nz(1)+1),
1134     >                     'ri2_start',j2_start(2,fb),j2_start(1,fb))
1135
1136      if (.not. value)
1137     > call errquit('C3dB_ptranspose_jk_init:out of heap',0,MA_ERR)
1138
1139      call Parallel3d_taskid_i(taskid)
1140      call Parallel3d_np_i(np)
1141
1142
1143      index1 = 1
1144      index2 = 1
1145      index3 = 1
1146      jndex1 = 1
1147      jndex2 = 1
1148      jndex3 = 1
1149      do it=0,np-1
1150         proc_to   = mod(taskid+it,np)
1151         proc_from = mod(taskid-it+np,np)
1152         int_mb(i1_start(1,fb)+it) = index1
1153         int_mb(i2_start(1,fb)+it) = index2
1154         int_mb(j1_start(1,fb)+it) = jndex1
1155         int_mb(j2_start(1,fb)+it) = jndex2
1156
1157         do k=1,nz(1)
1158         do j=1,ny(1)
1159
1160*           **** packing scheme ****
1161            call C3dB_ktoqp(1,k,qhere,phere)
1162            call C3dB_ktoqp(1,j,qto,pto)
1163            if ((phere.eq.taskid).and.(pto.eq.proc_to)) then
1164               do i=1,nx(1)
1165                  ii = i + nx(1)*(k-1)
1166                  jj = i + nx(1)*(j-1)
1167                  itmp = i + (j-1)*nx(1)
1168     >                     + (qhere-1)*nx(1)*ny(1)
1169
1170                  if (.not.zero_arow3(ii)) then
1171                  int_mb(iq_to_i1(1,fb)+index1-1) = itmp
1172                  index1 = index1 + 1
1173                  end if
1174
1175                  if (.not.zero_arow3(jj)) then
1176                  int_mb(jq_to_i1(1,fb)+jndex1-1) = itmp
1177                  jndex1 = jndex1 + 1
1178                  end if
1179
1180               end do
1181            end if
1182
1183*           **** unpacking scheme ****
1184            call C3dB_ktoqp(1,j,qhere,phere)
1185            call C3dB_ktoqp(1,k,qfrom,pfrom)
1186            if ((phere.eq.taskid).and.(pfrom.eq.proc_from)) then
1187               do i=1,nx(1)
1188                  ii = i + nx(1)*(k-1)
1189                  jj = i + nx(1)*(j-1)
1190                  itmp = i + (k-1)*nx(1)
1191     >                     + (qhere-1)*nx(1)*ny(1)
1192                  if (zero_arow3(ii)) then
1193                  int_mb(iz_to_i2(1,fb)+index3-1) = itmp
1194                  index3 = index3 + 1
1195                  else
1196                  int_mb(iq_to_i2(1,fb)+index2-1) = itmp
1197                  index2 = index2 + 1
1198                  end if
1199
1200                  if (zero_arow3(jj)) then
1201                  int_mb(jz_to_i2(1,fb)+jndex3-1) = itmp
1202                  jndex3 = jndex3 + 1
1203                  else
1204                  int_mb(jq_to_i2(1,fb)+jndex2-1) = itmp
1205                  jndex2 = jndex2 + 1
1206                  end if
1207               end do
1208            end if
1209         end do
1210         end do
1211      end do
1212      int_mb(i1_start(1,fb)+np) = index1
1213      int_mb(i2_start(1,fb)+np) = index2
1214      int_mb(j1_start(1,fb)+np) = jndex1
1215      int_mb(j2_start(1,fb)+np) = jndex2
1216
1217
1218      return
1219      end
1220
1221
1222
1223
1224
1225*     ***********************************
1226*     *					*
1227*     *	   C3dB_c_ptranspose_ijk_init	*
1228*     *					*
1229*     ***********************************
1230
1231      subroutine C3dB_c_ptranspose_ijk_init(fb,zero_arow2,zero_arow3)
1232      implicit none
1233      integer fb
1234      logical zero_arow2(*)
1235      logical zero_arow3(*)
1236
1237#include "bafdecls.fh"
1238#include "errquit.fh"
1239#include "C3dB.fh"
1240
1241
1242*     *** hilbert tranpose data structure ****
1243      integer h_iq_to_i1(2,6,0:1)
1244      integer h_iq_to_i2(2,6,0:1)
1245      integer h_iz_to_i2(2,6,0:1)
1246      integer h_iz_to_i2_count(6,0:1)
1247      integer h_i1_start(2,6,0:1)
1248      integer h_i2_start(2,6,0:1)
1249      common / c_ptrans_blk_ijk / h_iq_to_i1,
1250     >                         h_iq_to_i2,
1251     >                         h_iz_to_i2,
1252     >                         h_iz_to_i2_count,
1253     >                         h_i1_start,
1254     >                         h_i2_start
1255
1256*     **** local variables ****
1257      logical value,iszero
1258      integer proc_to,proc_from
1259      integer pto,qto,np,taskid
1260      integer phere,qhere
1261      integer index1,index2,index3,itmp
1262      integer i,j,k,it
1263
1264
1265      call Parallel3d_taskid_i(taskid)
1266      call Parallel3d_np_i(np)
1267
1268
1269*     ********************************************************
1270*     **** map1to2 mapping - done - tranpose operation #1  ***
1271*     ****   (ny,nz,nx)  <-- (nx,ny,nz)                    ***
1272*     ****   use zero_arow2                                ***
1273*     ********************************************************
1274
1275*     **** allocate trans_blk_ijk common block ****
1276      value = BA_alloc_get(mt_int,(nx(1)*nq1(1)),
1277     >                     'h_iq_to_i1_1',
1278     >                      h_iq_to_i1(2,1,fb),
1279     >                      h_iq_to_i1(1,1,fb))
1280      value = value.and.
1281     >        BA_alloc_get(mt_int,(ny(1)*nq2(1)),
1282     >                     'h_iq_to_i2_1',
1283     >                      h_iq_to_i2(2,1,fb),
1284     >                      h_iq_to_i2(1,1,fb))
1285      value = value.and.
1286     >        BA_alloc_get(mt_int,(ny(1)*nq2(1)),
1287     >                     'h_iz_to_i2_1',
1288     >                      h_iz_to_i2(2,1,fb),
1289     >                      h_iz_to_i2(1,1,fb))
1290      value = value.and.
1291     >        BA_alloc_get(mt_int,(np+1),
1292     >                     'h_i1_start_1',
1293     >                      h_i1_start(2,1,fb),
1294     >                      h_i1_start(1,1,fb))
1295      value = value.and.
1296     >        BA_alloc_get(mt_int,(np+1),
1297     >                     'h_i2_start_1',
1298     >                      h_i2_start(2,1,fb),
1299     >                      h_i2_start(1,1,fb))
1300      if (.not.value)
1301     > call errquit('C3dB_transpose_ijk_initt:out of heap',0,MA_ERR)
1302
1303      index1 = 1
1304      index2 = 1
1305      index3 = 1
1306      do it=0,np-1
1307         proc_to   = mod(taskid+it,np)
1308         proc_from = mod(taskid-it+np,np)
1309         int_mb(h_i1_start(1,1,fb)+it) = index1
1310         int_mb(h_i2_start(1,1,fb)+it) = index2
1311
1312         do k=1,nz(1)
1313         do j=1,ny(1)
1314         do i=1,nx(1)
1315            iszero = zero_arow2(i+(k-1)*nx(1))
1316
1317*           **** packing scheme ****
1318            phere = int_mb(p_map1(1,1)+(j-1)+(k-1)*ny(1))
1319            qhere = int_mb(q_map1(1,1)+(j-1)+(k-1)*ny(1))
1320
1321            pto   = int_mb(p_map2(1,1)+(k-1)+(i-1)*nz(1))
1322            qto   = int_mb(q_map2(1,1)+(k-1)+(i-1)*nz(1))
1323
1324
1325            if ((phere.eq.taskid).and.(pto.eq.proc_to)) then
1326               itmp = i + (qhere-1)*nx(1)
1327               if (.not.iszero) then
1328               int_mb(h_iq_to_i1(1,1,fb)+index1-1) = itmp
1329               index1 = index1 + 1
1330               end if
1331            end if
1332
1333*           **** unpacking scheme ****
1334            if ((pto.eq.taskid).and.(phere.eq.proc_from)) then
1335               itmp = j + (qto-1)*ny(1)
1336               if (.not.iszero) then
1337               int_mb(h_iq_to_i2(1,1,fb)+index2-1) =  itmp
1338               index2 = index2 + 1
1339               else
1340               int_mb(h_iz_to_i2(1,1,fb)+index3-1) =  itmp
1341               index3 = index3 + 1
1342               end if
1343            end if
1344
1345         end do
1346         end do
1347         end do
1348
1349      end do
1350      int_mb(h_i1_start(1,1,fb)+np) = index1
1351      int_mb(h_i2_start(1,1,fb)+np) = index2
1352      h_iz_to_i2_count(1,fb) = index3 - 1
1353
1354
1355
1356
1357
1358*     *********************************************************
1359*     **** map2to3 mapping - done - transpose operation #2 ****
1360*     ****    (nz,nx,ny)  <-- (ny,nz,nx)                   ****
1361*     ****    use zero_arow3                               ****
1362*     *********************************************************
1363
1364*     **** allocate trans_blk_ijk common block ****
1365      value = BA_alloc_get(mt_int,(ny(1)*nq2(1)),
1366     >                     'h_iq_to_i1_2',
1367     >                      h_iq_to_i1(2,2,fb),
1368     >                      h_iq_to_i1(1,2,fb))
1369      value = value.and.
1370     >        BA_alloc_get(mt_int,(nz(1)*nq3(1)),
1371     >                     'h_iq_to_i2_2',
1372     >                      h_iq_to_i2(2,2,fb),
1373     >                      h_iq_to_i2(1,2,fb))
1374      value = value.and.
1375     >        BA_alloc_get(mt_int,(nz(1)*nq3(1)),
1376     >                     'h_iz_to_i2_2',
1377     >                      h_iz_to_i2(2,2,fb),
1378     >                      h_iz_to_i2(1,2,fb))
1379      value = value.and.
1380     >        BA_alloc_get(mt_int,(np+1),
1381     >                     'h_i1_start_2',
1382     >                      h_i1_start(2,2,fb),
1383     >                      h_i1_start(1,2,fb))
1384      value = value.and.
1385     >        BA_alloc_get(mt_int,(np+1),
1386     >                     'h_i2_start_2',
1387     >                      h_i2_start(2,2,fb),
1388     >                      h_i2_start(1,2,fb))
1389      if (.not.value)
1390     > call errquit('C3dB_transpose_ijk_init:out of heap',0,MA_ERR)
1391
1392      index1 = 1
1393      index2 = 1
1394      index3 = 1
1395      do it=0,np-1
1396         proc_to   = mod(taskid+it,np)
1397         proc_from = mod(taskid-it+np,np)
1398         int_mb(h_i1_start(1,2,fb)+it) = index1
1399         int_mb(h_i2_start(1,2,fb)+it) = index2
1400
1401         do k=1,nz(1)
1402         do j=1,ny(1)
1403         do i=1,nx(1)
1404
1405            iszero = zero_arow3(i+(j-1)*nx(1))
1406
1407*           **** packing scheme ****
1408            phere = int_mb(p_map2(1,1)+(k-1)+(i-1)*nz(1))
1409            qhere = int_mb(q_map2(1,1)+(k-1)+(i-1)*nz(1))
1410
1411            pto   = int_mb(p_map3(1,1)+(i-1)+(j-1)*nx(1))
1412            qto   = int_mb(q_map3(1,1)+(i-1)+(j-1)*nx(1))
1413
1414
1415            if ((phere.eq.taskid).and.(pto.eq.proc_to)) then
1416               itmp = j + (qhere-1)*ny(1)
1417               if (.not.iszero) then
1418               int_mb(h_iq_to_i1(1,2,fb)+index1-1) = itmp
1419               index1 = index1 + 1
1420               end if
1421            end if
1422
1423*           **** unpacking scheme ****
1424            if ((pto.eq.taskid).and.(phere.eq.proc_from)) then
1425               itmp = k + (qto-1)*nz(1)
1426               if (.not.iszero) then
1427               int_mb(h_iq_to_i2(1,2,fb)+index2-1) = itmp
1428               index2 = index2 + 1
1429               else
1430               int_mb(h_iz_to_i2(1,2,fb)+index3-1) = itmp
1431               index3 = index3 + 1
1432               end if
1433            end if
1434
1435
1436         end do
1437         end do
1438         end do
1439
1440      end do
1441      int_mb(h_i1_start(1,2,fb)+np) = index1
1442      int_mb(h_i2_start(1,2,fb)+np) = index2
1443      h_iz_to_i2_count(2,fb) = index3 - 1
1444
1445
1446
1447
1448
1449
1450*     ********************************************************
1451*     **** map3to2 mapping - done - tranpose operation #3 ****
1452*     ****  (ny,nz,nx)  <-- (nz,nx,ny)                    ****
1453*     ****   use zero_arow3                               ****
1454*     ********************************************************
1455
1456*     **** allocate trans_blk_ijk common block ****
1457      value = BA_alloc_get(mt_int,(nz(1)*nq3(1)),
1458     >                     'h_iq_to_i1_3',
1459     >                      h_iq_to_i1(2,3,fb),
1460     >                      h_iq_to_i1(1,3,fb))
1461      value = value.and.
1462     >        BA_alloc_get(mt_int,(ny(1)*nq2(1)),
1463     >                     'h_iq_to_i2_3',
1464     >                      h_iq_to_i2(2,3,fb),
1465     >                      h_iq_to_i2(1,3,fb))
1466      value = value.and.
1467     >        BA_alloc_get(mt_int,(ny(1)*nq2(1)),
1468     >                     'h_iz_to_i2_3',
1469     >                      h_iz_to_i2(2,3,fb),
1470     >                      h_iz_to_i2(1,3,fb))
1471      value = value.and.
1472     >        BA_alloc_get(mt_int,(np+1),
1473     >                     'h_i1_start_3',
1474     >                      h_i1_start(2,3,fb),
1475     >                      h_i1_start(1,3,fb))
1476      value = value.and.
1477     >        BA_alloc_get(mt_int,(np+1),
1478     >                     'h_i2_start_3',
1479     >                      h_i2_start(2,3,fb),
1480     >                      h_i2_start(1,3,fb))
1481      if (.not.value)
1482     > call errquit('C3dB_transpose_ijk_init:out of heap',0,MA_ERR)
1483
1484      index1 = 1
1485      index2 = 1
1486      index3 = 1
1487      do it=0,np-1
1488         proc_to   = mod(taskid+it,np)
1489         proc_from = mod(taskid-it+np,np)
1490         int_mb(h_i1_start(1,3,fb)+it) = index1
1491         int_mb(h_i2_start(1,3,fb)+it) = index2
1492
1493         do k=1,nz(1)
1494         do j=1,ny(1)
1495         do i=1,nx(1)
1496
1497            iszero = zero_arow3(i+(j-1)*nx(1))
1498
1499*           **** packing scheme ****
1500            phere = int_mb(p_map3(1,1)+(i-1)+(j-1)*nx(1))
1501            qhere = int_mb(q_map3(1,1)+(i-1)+(j-1)*nx(1))
1502
1503            pto   = int_mb(p_map2(1,1)+(k-1)+(i-1)*nz(1))
1504            qto   = int_mb(q_map2(1,1)+(k-1)+(i-1)*nz(1))
1505
1506
1507            if ((phere.eq.taskid).and.(pto.eq.proc_to)) then
1508               itmp = k + (qhere-1)*nz(1)
1509               if (.not.iszero) then
1510               int_mb(h_iq_to_i1(1,3,fb)+index1-1) = itmp
1511               index1 = index1 + 1
1512               end if
1513            end if
1514
1515*           **** unpacking scheme ****
1516            if ((pto.eq.taskid).and.(phere.eq.proc_from)) then
1517               itmp = j + (qto-1)*ny(1)
1518               if (.not.iszero) then
1519               int_mb(h_iq_to_i2(1,3,fb)+index2-1) = itmp
1520               index2 = index2 + 1
1521               else
1522               int_mb(h_iz_to_i2(1,3,fb)+index3-1) = itmp
1523               index3 = index3 + 1
1524               end if
1525            end if
1526
1527
1528         end do
1529         end do
1530         end do
1531
1532      end do
1533      int_mb(h_i1_start(1,3,fb)+np) = index1
1534      int_mb(h_i2_start(1,3,fb)+np) = index2
1535      h_iz_to_i2_count(3,fb) = index3 - 1
1536
1537
1538
1539
1540*     ********************************************************
1541*     **** map2to1 mapping - done - tranpose operation #4 ****
1542*     ****  (nx,ny,nz)  <-- (ny,nz,nx)                    ****
1543*     ****  use zero_arow2                                ****
1544*     ********************************************************
1545
1546*     **** allocate trans_blk_ijk common block ****
1547      value = BA_alloc_get(mt_int,(ny(1)*nq2(1)),
1548     >                     'h_iq_to_i1_4',
1549     >                      h_iq_to_i1(2,4,fb),
1550     >                      h_iq_to_i1(1,4,fb))
1551      value = value.and.
1552     >        BA_alloc_get(mt_int,(nx(1)*nq1(1)),
1553     >                     'h_iq_to_i2_4',
1554     >                      h_iq_to_i2(2,4,fb),
1555     >                      h_iq_to_i2(1,4,fb))
1556      value = value.and.
1557     >        BA_alloc_get(mt_int,(nx(1)*nq1(1)),
1558     >                     'h_iz_to_i2_4',
1559     >                      h_iz_to_i2(2,4,fb),
1560     >                      h_iz_to_i2(1,4,fb))
1561      value = value.and.
1562     >        BA_alloc_get(mt_int,(np+1),
1563     >                     'h_i1_start_4',
1564     >                      h_i1_start(2,4,fb),
1565     >                      h_i1_start(1,4,fb))
1566      value = value.and.
1567     >        BA_alloc_get(mt_int,(np+1),
1568     >                     'h_i2_start_4',
1569     >                      h_i2_start(2,4,fb),
1570     >                      h_i2_start(1,4,fb))
1571      if (.not.value)
1572     > call errquit('C3dB_transpose_ijk_init:out of heap',0,MA_ERR)
1573
1574      index1 = 1
1575      index2 = 1
1576      index3 = 1
1577      do it=0,np-1
1578         proc_to   = mod(taskid+it,np)
1579         proc_from = mod(taskid-it+np,np)
1580         int_mb(h_i1_start(1,4,fb)+it) = index1
1581         int_mb(h_i2_start(1,4,fb)+it) = index2
1582
1583         do k=1,nz(1)
1584         do j=1,ny(1)
1585         do i=1,nx(1)
1586
1587            iszero = zero_arow2(i+(k-1)*nx(1))
1588
1589*           **** packing scheme ****
1590            phere = int_mb(p_map2(1,1)+(k-1)+(i-1)*nz(1))
1591            qhere = int_mb(q_map2(1,1)+(k-1)+(i-1)*nz(1))
1592
1593            pto   = int_mb(p_map1(1,1)+(j-1)+(k-1)*ny(1))
1594            qto   = int_mb(q_map1(1,1)+(j-1)+(k-1)*ny(1))
1595
1596
1597            if ((phere.eq.taskid).and.(pto.eq.proc_to)) then
1598               itmp = j + (qhere-1)*ny(1)
1599               if (.not.iszero) then
1600               int_mb(h_iq_to_i1(1,4,fb)+index1-1) = itmp
1601               index1 = index1 + 1
1602               end if
1603            end if
1604
1605*           **** unpacking scheme ****
1606            if ((pto.eq.taskid).and.(phere.eq.proc_from)) then
1607               itmp = i + (qto-1)*nx(1)
1608               if (.not.iszero) then
1609               int_mb(h_iq_to_i2(1,4,fb)+index2-1) = itmp
1610               index2 = index2 + 1
1611               else
1612               int_mb(h_iz_to_i2(1,4,fb)+index3-1) = itmp
1613               index3 = index3 + 1
1614               end if
1615            end if
1616
1617
1618         end do
1619         end do
1620         end do
1621
1622      end do
1623      int_mb(h_i1_start(1,4,fb)+np) = index1
1624      int_mb(h_i2_start(1,4,fb)+np) = index2
1625      h_iz_to_i2_count(4,fb) = index3 - 1
1626
1627
1628
1629
1630
1631*     **********************************************************
1632*     **** map1to3 mapping  - done - tranpose operation # 5 ****
1633*     **********************************************************
1634
1635*     **** allocate trans_blk_ijk common block ****
1636      value = BA_alloc_get(mt_int,(nx(1)*nq1(1)),
1637     >                     'h_iq_to_i1_5',
1638     >                      h_iq_to_i1(2,5,fb),
1639     >                      h_iq_to_i1(1,5,fb))
1640      value = value.and.
1641     >        BA_alloc_get(mt_int,(nz(1)*nq3(1)),
1642     >                     'h_iq_to_i2_5',
1643     >                      h_iq_to_i2(2,5,fb),
1644     >                      h_iq_to_i2(1,5,fb))
1645      value = value.and.
1646     >        BA_alloc_get(mt_int,(nz(1)*nq3(1)),
1647     >                     'h_iz_to_i2_5',
1648     >                      h_iz_to_i2(2,5,fb),
1649     >                      h_iz_to_i2(1,5,fb))
1650      value = value.and.
1651     >        BA_alloc_get(mt_int,(np+1),
1652     >                     'h_i1_start_5',
1653     >                      h_i1_start(2,5,fb),
1654     >                      h_i1_start(1,5,fb))
1655      value = value.and.
1656     >        BA_alloc_get(mt_int,(np+1),
1657     >                     'h_i2_start_5',
1658     >                      h_i2_start(2,5,fb),
1659     >                      h_i2_start(1,5,fb))
1660      if (.not.value)
1661     > call errquit('C3dB_transpose_ijk_init:out of heap',0,MA_ERR)
1662
1663      index1 = 1
1664      index2 = 1
1665      index3 = 1
1666      do it=0,np-1
1667         proc_to   = mod(taskid+it,np)
1668         proc_from = mod(taskid-it+np,np)
1669         int_mb(h_i1_start(1,5,fb)+it) = index1
1670         int_mb(h_i2_start(1,5,fb)+it) = index2
1671
1672         do k=1,nz(1)
1673         do j=1,ny(1)
1674         do i=1,nx(1)
1675
1676*           **** packing scheme ****
1677            phere = int_mb(p_map1(1,1)+(j-1)+(k-1)*ny(1))
1678            qhere = int_mb(q_map1(1,1)+(j-1)+(k-1)*ny(1))
1679
1680            pto   = int_mb(p_map3(1,1)+(i-1)+(j-1)*nx(1))
1681            qto   = int_mb(q_map3(1,1)+(i-1)+(j-1)*nx(1))
1682
1683
1684            if ((phere.eq.taskid).and.(pto.eq.proc_to)) then
1685               itmp = i + (qhere-1)*nx(1)
1686               int_mb(h_iq_to_i1(1,5,fb)+index1-1) = itmp
1687               index1 = index1 + 1
1688            end if
1689
1690*           **** unpacking scheme ****
1691            if ((pto.eq.taskid).and.(phere.eq.proc_from)) then
1692               itmp = k + (qto-1)*nz(1)
1693               int_mb(h_iq_to_i2(1,5,fb)+index2-1) = itmp
1694               index2 = index2 + 1
1695            end if
1696
1697         end do
1698         end do
1699         end do
1700
1701      end do
1702      int_mb(h_i1_start(1,5,fb)+np) = index1
1703      int_mb(h_i2_start(1,5,fb)+np) = index2
1704      h_iz_to_i2_count(5,fb) = index3 - 1
1705
1706
1707
1708
1709
1710
1711*     *************************
1712*     **** map3to1 mapping ****
1713*     *************************
1714
1715*     **** allocate trans_blk_ijk common block ****
1716      value = BA_alloc_get(mt_int,(nz(1)*nq3(1)),
1717     >                     'h_iq_to_i1_6',
1718     >                      h_iq_to_i1(2,6,fb),
1719     >                      h_iq_to_i1(1,6,fb))
1720      value = value.and.
1721     >        BA_alloc_get(mt_int,(nx(1)*nq1(1)),
1722     >                     'h_iq_to_i2_6',
1723     >                      h_iq_to_i2(2,6,fb),
1724     >                      h_iq_to_i2(1,6,fb))
1725      value = value.and.
1726     >        BA_alloc_get(mt_int,(nx(1)*nq1(1)),
1727     >                     'h_iz_to_i2_6',
1728     >                      h_iz_to_i2(2,6,fb),
1729     >                      h_iz_to_i2(1,6,fb))
1730      value = value.and.
1731     >        BA_alloc_get(mt_int,(np+1),
1732     >                     'h_i1_start_6',
1733     >                      h_i1_start(2,6,fb),
1734     >                      h_i1_start(1,6,fb))
1735      value = value.and.
1736     >        BA_alloc_get(mt_int,(np+1),
1737     >                     'h_i2_start_6',
1738     >                      h_i2_start(2,6,fb),
1739     >                      h_i2_start(1,6,fb))
1740      if (.not.value)
1741     > call errquit('C3dB_transpose_ijk_init:out of heap',0,MA_ERR)
1742
1743      index1 = 1
1744      index2 = 1
1745      index3 = 1
1746      do it=0,np-1
1747         proc_to   = mod(taskid+it,np)
1748         proc_from = mod(taskid-it+np,np)
1749         int_mb(h_i1_start(1,6,fb)+it) = index1
1750         int_mb(h_i2_start(1,6,fb)+it) = index2
1751
1752         do k=1,nz(1)
1753         do j=1,ny(1)
1754         do i=1,nx(1)
1755
1756*           **** packing scheme ****
1757            phere = int_mb(p_map3(1,1)+(i-1)+(j-1)*nx(1))
1758            qhere = int_mb(q_map3(1,1)+(i-1)+(j-1)*nx(1))
1759
1760            pto   = int_mb(p_map1(1,1)+(j-1)+(k-1)*ny(1))
1761            qto   = int_mb(q_map1(1,1)+(j-1)+(k-1)*ny(1))
1762
1763
1764            if ((phere.eq.taskid).and.(pto.eq.proc_to)) then
1765               itmp = k + (qhere-1)*nz(1)
1766               int_mb(h_iq_to_i1(1,6,fb)+index1-1) = itmp
1767               index1 = index1 + 1
1768            end if
1769
1770*           **** unpacking scheme ****
1771            if ((pto.eq.taskid).and.(phere.eq.proc_from)) then
1772               itmp = i + (qto-1)*nx(1)
1773               int_mb(h_iq_to_i2(1,6,fb)+index2-1) = itmp
1774               index2 = index2 + 1
1775            end if
1776
1777         end do
1778         end do
1779         end do
1780
1781      end do
1782      int_mb(h_i1_start(1,6,fb)+np) = index1
1783      int_mb(h_i2_start(1,6,fb)+np) = index2
1784      h_iz_to_i2_count(6,fb) = index3 - 1
1785
1786
1787      return
1788      end
1789
1790
1791
1792*     ***********************************
1793*     *                                 *
1794*     *         C3dB_pfftfx             *
1795*     *                                 *
1796*     ***********************************
1797*
1798*     do fft along nx(1) dimension
1799*
1800*        A(kx,ny(1),nz(1)) <- fft1d[A(nx(1),ny(1),nz(1))]
1801*
1802
1803      subroutine C3dB_pfftfx(nbb,A,tmp1,tmp2,request,reqcnt)
1804      implicit none
1805      integer nbb
1806      complex*16 A(*)
1807      complex*16 tmp1(*)
1808      complex*16 tmp2(*)
1809      integer    request(*),reqcnt
1810
1811
1812#include "bafdecls.fh"
1813#include "errquit.fh"
1814
1815#include "C3dB.fh"
1816#include "C3dB_pfft.fh"
1817
1818
1819      integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS)
1820      common    / C3dB_fft / tmpx,tmpy,tmpz
1821
1822      !*** local variables ***
1823      integer j,q,indx,fb
1824
1825      if (nbb.eq.0) then
1826         fb = 0
1827      else
1828         fb = 1
1829      end if
1830
1831      !**********************
1832      !**** slab mapping ****
1833      !**********************
1834      if (mapping.eq.1) then
1835      call nwpw_timing_start(31)
1836#ifdef MLIB
1837      indx = 1
1838      do q=1,nq(1)
1839      do j=1,ny(1)
1840         call z1dfft(A(indx),nx(1),dcpl_mb(tmpx(1,1)),1,ierr)
1841         indx = indx + nx(1)
1842      end do
1843      end do
1844
1845#else
1846      indx = 1
1847      do q=1,nq(1)
1848      do j=1,ny(1)
1849         call dcfftf(nx(1),A(indx),dcpl_mb(tmpx(1,1)))
1850         indx = indx + nx(1)
1851      end do
1852      end do
1853#endif
1854      call dcopy(2*nx(1)*ny(1)*nq(1),A,1,tmp1,1)
1855      call nwpw_timing_end(31)
1856
1857
1858      !*************************
1859      !**** hilbert mapping ****
1860      !*************************
1861      else
1862      call nwpw_timing_start(31)
1863
1864#ifdef MLIB
1865      indx = 1
1866      do q=1,nq1(1)
1867         call z1dfft(A(indx),nx(1),dcpl_mb(tmpx(1,1)),1,ierr)
1868         indx = indx + nx(1)
1869      end do
1870#else
1871      indx = 1
1872      do q=1,nq1(1)
1873         call dcfftf(nx(1),A(indx),dcpl_mb(tmpx(1,1)))
1874         indx = indx + nx(1)
1875      end do
1876#endif
1877
1878      call nwpw_timing_end(31)
1879      call nwpw_timing_start(32)
1880      call C3dB_c_ptranspose_ijk_start(fb,1,A,tmp1,tmp2,
1881     >                                 request,reqcnt,40)
1882      call nwpw_timing_end(32)
1883
1884      end if
1885
1886      return
1887      end
1888
1889
1890*     ***********************************
1891*     *                                 *
1892*     *         C3dB_pfftfy             *
1893*     *                                 *
1894*     ***********************************
1895*
1896*     do fft along ny(1) dimension
1897*
1898*        A(kx,ny(1),nz(1)) <- fft1d[A(nx(1),ny(1),nz(1))]
1899*
1900
1901      subroutine C3dB_pfftfy(nbb,tmp1,tmp2,request,reqcnt)
1902      implicit none
1903      integer nbb
1904      complex*16 tmp1(*)
1905      complex*16 tmp2(*)
1906      integer    request(*),reqcnt
1907
1908
1909#include "bafdecls.fh"
1910#include "errquit.fh"
1911
1912#include "C3dB.fh"
1913#include "C3dB_pfft.fh"
1914
1915
1916      integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS)
1917      common    / C3dB_fft / tmpx,tmpy,tmpz
1918
1919
1920      !*** local variables ***
1921      integer i,j,k,indx,indx2,q,fb
1922
1923      if (nbb.eq.0) then
1924         fb = 0
1925      else
1926         fb = 1
1927      end if
1928
1929      !**********************
1930      !**** slab mapping ****
1931      !**********************
1932      if (mapping.eq.1) then
1933      call nwpw_timing_start(31)
1934
1935*     ********************************************
1936*     ***     do fft along ny(1) dimension        ***
1937*     ***   A(kx,ky,nz(1)) <- fft1d[A(kx,ny(1),nz(1))]  ***
1938*     ********************************************
1939
1940#ifdef MLIB
1941
1942      indx2 = 0
1943      do q=1,nq(1)
1944      do i=1,nx(1)
1945        indx2 = indx2 + 1
1946        if (.not.log_mb(zero_row2(1,fb)+indx2-1)) then
1947         indx = i + (q-1)*nx(1)*ny(1)
1948         call zcopy(ny(1),tmp1(indx),nx(1),tmp2,1)
1949         call z1dfft(tmp2,ny(1),dcpl_mb(tmpy(1,1)),1,ierr)
1950         call zcopy(ny(1),tmp2,1,tmp1(indx),nx(1))
1951        end if
1952      end do
1953      end do
1954#else
1955       indx2 = 0
1956       do q=1,nq(1)
1957       do i=1,nx(1)
1958        indx2 = indx2 + 1
1959        if (.not.log_mb(zero_row2(1,fb)+indx2-1)) then
1960          indx = i + (q-1)*nx(1)*ny(1)
1961          call zcopy(ny(1),tmp1(indx),nx(1),tmp2,1)
1962          call dcfftf(ny(1),tmp2,dcpl_mb(tmpy(1,1)))
1963          call zcopy(ny(1),tmp2,1,tmp1(indx),nx(1))
1964        end if
1965       end do
1966       end do
1967#endif
1968
1969
1970*     ********************************************
1971*     ***         Do a ptranspose of A          ***
1972*     ***      A(ky,nz(1),ky) <- A(kx,ky,nz(1))      ***
1973*     ********************************************
1974      call nwpw_timing_end(31)
1975      call nwpw_timing_start(32)
1976      call C3dB_c_ptranspose2_jk_start(fb,tmp1,tmp2,tmp1,
1977     >                                 request,reqcnt,41)
1978      call nwpw_timing_end(32)
1979
1980
1981
1982
1983      !*************************
1984      !**** hilbert mapping ****
1985      !*************************
1986      else
1987      call nwpw_timing_start(32)
1988      call C3dB_c_ptranspose_ijk_end(fb,1,tmp1,tmp2,request,reqcnt)
1989      call nwpw_timing_end(32)
1990      call nwpw_timing_start(31)
1991
1992*     ********************************************
1993*     ***     do fft along ny(1) dimension        ***
1994*     ***   A(ky,nz(1),kx) <- fft1d[A(ny(1),nz(1),kx)]  ***
1995*     ********************************************
1996#ifdef MLIB
1997      indx = 1
1998      do q=1,nq2(1)
1999         if (.not.log_mb(zero_row2(1,fb)+q-1)) then
2000         call z1dfft(tmp1(indx),ny(1),dcpl_mb(tmpy(1,1)),1,ierr)
2001         end if
2002         indx = indx + ny(1)
2003      end do
2004#else
2005      indx = 1
2006      do q=1,nq2(1)
2007         if (.not.log_mb(zero_row2(1,fb)+q-1)) then
2008         call dcfftf(ny(1),tmp1(indx),dcpl_mb(tmpy(1,1)))
2009         end if
2010         indx = indx + ny(1)
2011      end do
2012#endif
2013
2014      call nwpw_timing_end(31)
2015      call nwpw_timing_start(32)
2016      call C3dB_c_ptranspose_ijk_start(fb,2,tmp1,tmp2,tmp1,
2017     >                                 request,reqcnt,42)
2018      call nwpw_timing_end(32)
2019
2020
2021      end if
2022
2023      return
2024      end
2025
2026
2027
2028
2029
2030*     ***********************************
2031*     *                                 *
2032*     *         C3dB_pfftfz             *
2033*     *                                 *
2034*     ***********************************
2035*
2036
2037      subroutine C3dB_pfftfz(nbb,tmp1,tmp2,request,reqcnt)
2038      implicit none
2039      integer nbb
2040      complex*16 tmp1(*)
2041      complex*16 tmp2(*)
2042      integer    request(*),reqcnt
2043
2044
2045#include "bafdecls.fh"
2046#include "errquit.fh"
2047
2048#include "C3dB.fh"
2049#include "C3dB_pfft.fh"
2050
2051
2052      integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS)
2053      common    / C3dB_fft / tmpx,tmpy,tmpz
2054
2055
2056      !*** local variables ***
2057      integer i,k,q,fb
2058      integer indx,indx2
2059
2060#ifdef MLIB
2061      integer ierr
2062#endif
2063
2064      if (nbb.eq.0) then
2065         fb = 0
2066      else
2067         fb = 1
2068      end if
2069
2070      !**********************
2071      !**** slab mapping ****
2072      !**********************
2073      if (mapping.eq.1) then
2074
2075      call nwpw_timing_start(32)
2076      call C3dB_c_ptranspose2_jk_end(fb,tmp2,tmp1,request,reqcnt)
2077      call nwpw_timing_end(32)
2078      call nwpw_timing_start(31)
2079
2080*     ********************************************
2081*     ***     do fft along nz(1) dimension        ***
2082*     ***   A(kx,kz,ky) <- fft1d[A(kx,nz(1),ky)]  ***
2083*     ********************************************
2084
2085#ifdef MLIB
2086      indx2 = 0
2087      do q=1,nq(1)
2088      do i=1,nx(1)
2089        indx2 = indx2 + 1
2090        if (.not.log_mb(zero_row3(1,fb)+indx2-1)) then
2091         indx = i + (q-1)*nx(1)*nz(1)
2092         call zcopy(nz(1),tmp2(indx),nx(1),tmp1,1)
2093         call z1dfft(tmp1,nz(1),dcpl_mb(tmpz(1,1)),1,ierr)
2094         call zcopy(nz(1),tmp1,1,tmp2(indx),nx(1))
2095        end if
2096      end do
2097      end do
2098#else
2099      indx2 = 0
2100      do q=1,nq(1)
2101      do i=1,nx(1)
2102        indx2 = indx2 + 1
2103        if (.not.log_mb(zero_row3(1,fb)+indx2-1)) then
2104          indx = i + (q-1)*nx(1)*nz(1)
2105          call zcopy(nz(1),tmp2(indx),nx(1),tmp1,1)
2106          call dcfftf(nz(1),tmp1,dcpl_mb(tmpz(1,1)))
2107          call zcopy(nz(1),tmp1,1,tmp2(indx),nx(1))
2108        end if
2109      end do
2110      end do
2111      call nwpw_timing_end(31)
2112
2113#endif
2114
2115
2116      !*************************
2117      !**** hilbert mapping ****
2118      !*************************
2119      else
2120      call nwpw_timing_start(32)
2121      call C3dB_c_ptranspose_ijk_end(fb,2,tmp2,tmp1,request,reqcnt)
2122      call nwpw_timing_end(32)
2123      call nwpw_timing_start(31)
2124
2125*     ********************************************
2126*     ***     do fft along nz(1) dimension        ***
2127*     ***   A(kz,kx,ky) <- fft1d[A(nz(1),kx,ky)]  ***
2128*     ********************************************
2129#ifdef MLIB
2130      indx = 1
2131      do q=1,nq3(1)
2132         if (.not.log_mb(zero_row3(1,fb)+q-1)) then
2133         call z1dfft(tmp2(indx),nz(1),dcpl_mb(tmpz(1,1)),1,ierr)
2134         end if
2135         indx = indx + nz(1)
2136      end do
2137#else
2138      indx = 1
2139      do q=1,nq3(1)
2140         if (.not.log_mb(zero_row3(1,fb)+q-1)) then
2141         call dcfftf(nz(1),tmp2(indx),dcpl_mb(tmpz(1,1)))
2142         end if
2143         indx = indx + nz(1)
2144      end do
2145#endif
2146      call nwpw_timing_end(31)
2147
2148      end if
2149
2150
2151
2152      call nwpw_timing_start(32)
2153      call Cram_c_pack_start(nbb,tmp2,tmp1,request,reqcnt,43)
2154      call nwpw_timing_end(32)
2155
2156      return
2157      end
2158
2159
2160
2161*     ***********************************
2162*     *                                 *
2163*     *         C3dB_pfftf_final        *
2164*     *                                 *
2165*     ***********************************
2166*
2167
2168      subroutine C3dB_pfftf_final(nbb,tmp1,tmp2,request,reqcnt)
2169      implicit none
2170      integer nbb
2171      complex*16 tmp1(*)
2172      complex*16 tmp2(*)
2173      integer    request(*),reqcnt
2174
2175#include "bafdecls.fh"
2176#include "errquit.fh"
2177
2178#include "C3dB.fh"
2179#include "C3dB_pfft.fh"
2180
2181      call nwpw_timing_start(32)
2182      call Cram_c_pack_end(nbb,tmp2,request,reqcnt)
2183      call nwpw_timing_end(32)
2184
2185      return
2186      end
2187
2188
2189
2190
2191*     ***********************************
2192*     *                                 *
2193*     *         C3dB_pfftf_step         *
2194*     *                                 *
2195*     ***********************************
2196*
2197
2198      subroutine C3dB_pfftf_step(step,nbb,A,tmp1,tmp2,request,reqcnt)
2199      implicit none
2200      integer step
2201      integer nbb
2202      complex*16 A(*)
2203      complex*16 tmp1(*)
2204      complex*16 tmp2(*)
2205      integer    request(*),reqcnt
2206
2207
2208      if (step.eq.1) then
2209         call C3dB_pfftfx(nbb,A,tmp1,tmp2,request,reqcnt)
2210      else if (step.eq.2) then
2211         call C3dB_pfftfy(nbb,tmp1,tmp2,request,reqcnt)
2212      else if (step.eq.3) then
2213         call C3dB_pfftfz(nbb,tmp1,tmp2,request,reqcnt)
2214      else if (step.eq.4) then
2215         call C3dB_pfftf_final(nbb,tmp1,tmp2,request,reqcnt)
2216      end if
2217
2218      return
2219      end
2220
2221
2222
2223
2224*     ***********************************
2225*     *					*
2226*     *	        C3dB_pfftbz		*
2227*     *					*
2228*     ***********************************
2229
2230*
2231*      This routine performs the operation of
2232*      a three dimensional complex to complex
2233*      inverse fft
2234*           A(nx,ny(1),nz(1)) <- FFT3^(-1)[A(kx,ky,kz)]
2235*
2236*      Entry -
2237*              A: a column distribuded 3d block
2238*              tmp: tempory work space must be at
2239*                    least the size of (complex)
2240*                    (nfft*nfft + 1) + 10*nfft
2241*
2242*       Exit - A is transformed and the imaginary
2243*              part of A is set to zero
2244*       uses - C3dB_c_ptranspose_jk, dcopy
2245*
2246
2247
2248      subroutine C3dB_pfftbz(nbb,tmp1,tmp2,request,reqcnt)
2249      implicit none
2250      integer nbb
2251      complex*16  tmp1(*)
2252      complex*16  tmp2(*)
2253      integer     request(*),reqcnt
2254
2255
2256#include "bafdecls.fh"
2257#include "errquit.fh"
2258#include "C3dB.fh"
2259#include "C3dB_pfft.fh"
2260
2261      integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS)
2262      common    / C3dB_fft / tmpx,tmpy,tmpz
2263
2264
2265
2266*     *** local variables ***
2267      integer i,k,q,indx,ierr
2268      integer indx2,fb
2269
2270      if (nbb.eq.0) then
2271         fb = 0
2272      else
2273         fb = 1
2274      end if
2275
2276      !**********************
2277      !**** slab mapping ****
2278      !**********************
2279      if (mapping.eq.1) then
2280
2281      call nwpw_timing_start(31)
2282*     *************************************************
2283*     ***     do fft along kz dimension             ***
2284*     ***   A(kx,nz(1),ky) <- fft1d^(-1)[A(kx,kz,ky)]  ***
2285*     *************************************************
2286#ifdef MLIB
2287      indx2 = 0
2288      do q=1,nq(1)
2289      do i=1,nx(1)
2290       indx2 = indx2 + 1
2291       if (.not.log_mb(zero_row3(1,fb)+indx2-1)) then
2292         indx = i + (q-1)*nx(1)*nz(1)
2293         call zcopy(nz(1),tmp1(indx),nx(1),tmp2,1)
2294         call z1dfft(tmp2,nz(1),dcpl_mb(tmpz(1,1)),-2,ierr)
2295         call zcopy(nz(1),tmp2,1,tmp1(indx),nx(1))
2296       end if
2297      end do
2298      end do
2299#else
2300      indx2 = 0
2301      do q=1,nq(1)
2302      do i=1,nx(1)
2303       indx2 = indx2 + 1
2304       if (.not.log_mb(zero_row3(1,fb)+indx2-1)) then
2305         indx = i + (q-1)*nx(1)*nz(1)
2306         call zcopy(nz(1),tmp1(indx),nx(1),tmp2,1)
2307         call dcfftb(nz(1),tmp2,dcpl_mb(tmpz(1,1)))
2308         call zcopy(nz(1),tmp2,1,tmp1(indx),nx(1))
2309       end if
2310      end do
2311      end do
2312#endif
2313
2314*     ********************************************
2315*     ***         Do a ptranspose of A          ***
2316*     ***      A(kx,ky,nz(1)) <- A(kx,nz(1),ky)      ***
2317*     ********************************************
2318      call nwpw_timing_end(31)
2319      call nwpw_timing_start(32)
2320      call C3dB_c_ptranspose1_jk_start(fb,tmp1,tmp2,tmp1,
2321     >                                 request,reqcnt,44)
2322      call nwpw_timing_end(32)
2323
2324
2325      !*************************
2326      !**** hilbert mapping ****
2327      !*************************
2328      else
2329
2330
2331      call nwpw_timing_start(31)
2332*     *************************************************
2333*     ***     do fft along kz dimension             ***
2334*     ***   A(nz(1),kx,ky) <- fft1d^(-1)[A(kz,kx,ky)]  ***
2335*     *************************************************
2336#ifdef MLIB
2337      indx = 1
2338      do q=1,nq3(1)
2339         if (.not.log_mb(zero_row3(1,fb)+q-1)) then
2340         call z1dfft(tmp1(indx),nz(1),dcpl_mb(tmpz(1,1)),-2,ierr)
2341         end if
2342         indx = indx + nz(1)
2343      end do
2344#else
2345      indx = 1
2346      do q=1,nq3(1)
2347         if (.not.log_mb(zero_row3(1,fb)+q-1)) then
2348         call dcfftb(nz(1),tmp1(indx),dcpl_mb(tmpz(1,1)))
2349         end if
2350         indx = indx + nz(1)
2351      end do
2352#endif
2353
2354      call nwpw_timing_end(31)
2355      call nwpw_timing_start(32)
2356      call C3dB_c_ptranspose_ijk_start(fb,3,tmp1,tmp2,tmp1,
2357     >                                 request,reqcnt,45)
2358      call nwpw_timing_end(32)
2359
2360
2361      end if
2362
2363      return
2364      end
2365
2366
2367
2368
2369
2370*     ***********************************
2371*     *					*
2372*     *	        C3dB_pfftby		*
2373*     *					*
2374*     ***********************************
2375
2376      subroutine C3dB_pfftby(nbb,tmp1,tmp2,request,reqcnt)
2377      implicit none
2378      integer nbb
2379      complex*16  tmp1(*)
2380      complex*16  tmp2(*)
2381      integer     request(*),reqcnt
2382
2383#include "bafdecls.fh"
2384#include "errquit.fh"
2385#include "C3dB.fh"
2386#include "C3dB_pfft.fh"
2387
2388      integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS)
2389      common    / C3dB_fft / tmpx,tmpy,tmpz
2390
2391
2392*     *** local variables ***
2393      integer i,j,q,indx,ierr
2394      integer indx2,fb
2395
2396      if (nbb.eq.0) then
2397         fb = 0
2398      else
2399         fb = 1
2400      end if
2401
2402      !**********************
2403      !**** slab mapping ****
2404      !**********************
2405      if (mapping.eq.1) then
2406
2407
2408*     ********************************************
2409*     ***         Do a ptranspose of A          ***
2410*     ***      A(kx,ky,nz(1)) <- A(kx,nz(1),ky)      ***
2411*     ********************************************
2412      call nwpw_timing_start(32)
2413      call C3dB_c_ptranspose1_jk_end(fb,tmp2,tmp1,request,reqcnt)
2414      call nwpw_timing_end(32)
2415      call nwpw_timing_start(31)
2416
2417*     *************************************************
2418*     ***     do fft along ky dimension             ***
2419*     ***   A(kx,ny(1),nz(1)) <- fft1d^(-1)[A(kx,ky,nz(1))]  ***
2420*     *************************************************
2421#ifdef MLIB
2422
2423      indx2 = 0
2424      do q=1,nq(1)
2425      do i=1,nx(1)
2426        indx2 = indx2 + 1
2427        if (.not.log_mb(zero_row2(1,fb)+indx2-1)) then
2428          indx = i + (q-1)*nx(1)*nz(1)
2429          call zcopy(ny(1),tmp2(indx),nx(1),tmp1,1)
2430          call z1dfft(tmp1,ny(1),dcpl_mb(tmpy(1,1)),-2,ierr)
2431          call zcopy(ny(1),tmp1,1,tmp2(indx),nx(1))
2432        end if
2433      end do
2434      end do
2435#else
2436      indx2 = 0
2437      do q=1,nq(1)
2438      do i=1,nx(1)
2439        indx2 = indx2 + 1
2440        if (.not.log_mb(zero_row2(1,fb)+indx2-1)) then
2441         indx = i + (q-1)*nx(1)*nz(1)
2442         call zcopy(ny(1),tmp2(indx),nx(1),tmp1,1)
2443         call dcfftb(ny(1),tmp1,dcpl_mb(tmpy(1,1)))
2444         call zcopy(ny(1),tmp1,1,tmp2(indx),nx(1))
2445        end if
2446      end do
2447      end do
2448#endif
2449
2450      call nwpw_timing_end(31)
2451
2452
2453      !*************************
2454      !**** hilbert mapping ****
2455      !*************************
2456      else
2457
2458      call nwpw_timing_start(32)
2459      call C3dB_c_ptranspose_ijk_end(fb,3,tmp2,tmp1,request,reqcnt)
2460
2461      call nwpw_timing_end(32)
2462      call nwpw_timing_start(31)
2463*     *************************************************
2464*     ***     do fft along ky dimension             ***
2465*     ***   A(ny(1),nz(1),kx) <- fft1d^(-1)[A(ky,nz(1),kx)]  ***
2466*     *************************************************
2467#ifdef MLIB
2468      indx = 1
2469      do q=1,nq2(1)
2470         if (.not.log_mb(zero_row2(1,fb)+q-1)) then
2471         call z1dfft(tmp2(indx),ny(1),dcpl_mb(tmpy(1,1)),-2,ierr)
2472         end if
2473         indx = indx + ny(1)
2474      end do
2475#else
2476      indx = 1
2477      do q=1,nq2(1)
2478         if (.not.log_mb(zero_row2(1,fb)+q-1)) then
2479         call dcfftb(ny(1),tmp2(indx),dcpl_mb(tmpy(1,1)))
2480         end if
2481         indx = indx + ny(1)
2482      end do
2483#endif
2484
2485      call nwpw_timing_end(31)
2486      call nwpw_timing_start(32)
2487      call C3dB_c_ptranspose_ijk_start(fb,4,tmp2,tmp1,tmp2,
2488     >                                 request,reqcnt,46)
2489      call nwpw_timing_end(32)
2490
2491
2492      end if
2493
2494      return
2495      end
2496
2497
2498
2499
2500*     ***********************************
2501*     *					*
2502*     *	        C3dB_pfftbx		*
2503*     *					*
2504*     ***********************************
2505
2506      subroutine C3dB_pfftbx(nbb,tmp1,tmp2,request,reqcnt)
2507      implicit none
2508      integer nbb
2509      complex*16  tmp1(*)
2510      complex*16  tmp2(*)
2511      integer     request(*),reqcnt
2512
2513#include "bafdecls.fh"
2514#include "errquit.fh"
2515#include "C3dB.fh"
2516#include "C3dB_pfft.fh"
2517
2518      integer tmpx(2,NBLOCKS),tmpy(2,NBLOCKS),tmpz(2,NBLOCKS)
2519      common    / C3dB_fft / tmpx,tmpy,tmpz
2520
2521*     *** local variables ***
2522      integer j,q,indx,ierr,fb
2523
2524
2525      if (nbb.eq.0) then
2526         fb = 0
2527      else
2528         fb = 1
2529      endif
2530
2531      !**********************
2532      !**** slab mapping ****
2533      !**********************
2534      if (mapping.eq.1) then
2535
2536      call nwpw_timing_start(31)
2537*     *************************************************
2538*     ***     do fft along kx dimension             ***
2539*     ***   A(nx(1),ny(1),nz(1)) <- fft1d^(-1)[A(kx,ny(1),nz(1))]  ***
2540*     *************************************************
2541#ifdef MLIB
2542      indx = 1
2543      do q=1,nq(1)
2544      do j=1,ny(1)
2545         call z1dfft(tmp2(indx),nx(1),dcpl_mb(tmpx(1,1)),-2,ierr)
2546         indx = indx + nx(1)
2547      end do
2548      end do
2549
2550#else
2551      indx = 1
2552      do q=1,nq(1)
2553      do j=1,ny(1)
2554         call dcfftb(nx(1),tmp2(indx),dcpl_mb(tmpx(1,1)))
2555         indx = indx + nx(1)
2556      end do
2557      end do
2558#endif
2559      call dcopy(2*nx(1)*ny(1)*nq(1),tmp2,1,tmp1,1)
2560
2561
2562
2563      call nwpw_timing_end(31)
2564*     *************************************************
2565      !*************************
2566      !**** hilbert mapping ****
2567      !*************************
2568      else
2569
2570      call nwpw_timing_start(32)
2571      call C3dB_c_ptranspose_ijk_end(fb,4,tmp1,tmp2,request,reqcnt)
2572      call nwpw_timing_end(32)
2573      call nwpw_timing_start(31)
2574
2575*     *************************************************
2576*     ***     do fft along kx dimension             ***
2577*     ***   A(nx(1),ny(1),nz(1)) <- fft1d^(-1)[A(kx,ny(1),nz(1))]  ***
2578*     *************************************************
2579#ifdef MLIB
2580      indx = 1
2581      do q=1,nq1(1)
2582         call z1dfft(tmp1(indx),nx(1),dcpl_mb(tmpx(1,1)),-2,ierr)
2583         indx = indx + nx(1)
2584      end do
2585#else
2586      indx = 1
2587      do q=1,nq1(1)
2588         call dcfftb(nx(1),tmp1(indx),dcpl_mb(tmpx(1,1)))
2589         indx = indx + nx(1)
2590      end do
2591#endif
2592      call nwpw_timing_end(31)
2593
2594      end if
2595
2596      return
2597      end
2598
2599
2600*     ***********************************
2601*     *                                 *
2602*     *         C3dB_pfftb_step         *
2603*     *                                 *
2604*     ***********************************
2605*
2606
2607      subroutine C3dB_pfftb_step(step,nbb,A,tmp1,tmp2,request,reqcnt)
2608      implicit none
2609      integer step
2610      integer nbb
2611      complex*16 A(*)
2612      complex*16 tmp1(*)
2613      complex*16 tmp2(*)
2614      integer    request(*),reqcnt
2615
2616
2617#include "bafdecls.fh"
2618
2619
2620      if (step.eq.1) then
2621         call nwpw_timing_start(32)
2622         call Cram_c_unpack_start(nbb,A,tmp1,request,reqcnt,47)
2623         call nwpw_timing_end(32)
2624      else if (step.eq.2) then
2625         call nwpw_timing_start(32)
2626         call Cram_c_unpack_end(nbb,tmp1,tmp2,request,reqcnt)
2627         call nwpw_timing_end(32)
2628      else if (step.eq.3) then
2629         call C3dB_pfftbz(nbb,tmp1,tmp2,request,reqcnt)
2630      else if (step.eq.4) then
2631         call C3dB_pfftby(nbb,tmp1,tmp2,request,reqcnt)
2632      else if (step.eq.5) then
2633         call C3dB_pfftbx(nbb,tmp1,tmp2,request,reqcnt)
2634      end if
2635
2636      return
2637      end
2638
2639
2640*     ***********************************
2641*     *                                 *
2642*     *     C3dB_pfft3_queue_init       *
2643*     *                                 *
2644*     ***********************************
2645
2646      subroutine C3dB_pfft3_queue_init(qmax_in)
2647      implicit none
2648      integer qmax_in
2649
2650
2651#include "bafdecls.fh"
2652#include "errquit.fh"
2653
2654#include "C3dB.fh"
2655#include "C3dB_pfft.fh"
2656
2657
2658      !**** pfft_queues ****
2659      integer aqindx(2),aqstatus(2)
2660      integer atmp(2),arequest(2),areqcnt(2),aqnbb(2)
2661      integer aqmax,aqsize,alast_index
2662      common / c_pfft_aqueue_common / aqindx,aqstatus,atmp,arequest,
2663     >                              areqcnt,aqnbb,
2664     >                              aqmax,aqsize,alast_index
2665      integer bqindx(2),bqstatus(2)
2666      integer btmp(2),brequest(2),breqcnt(2),bqnbb(2)
2667      integer bqmax,bqsize,blast_index
2668      common / c_pfft_bqueue_common / bqindx,bqstatus,btmp,brequest,
2669     >                              breqcnt,bqnbb,
2670     >                              bqmax,bqsize,blast_index
2671
2672
2673      logical value
2674      integer np,size
2675
2676      call Parallel3d_np_i(np)
2677
2678c     **** allocate aqueue ****
2679      aqmax       = qmax_in
2680      aqsize      = 0
2681      alast_index = aqmax
2682      size  = nfft3d(1)*aqmax*2
2683      value = BA_alloc_get(mt_dcpl,size,'atmp',atmp(2),atmp(1))
2684      size  = np*aqmax*2
2685      value = value.and.
2686     >      BA_alloc_get(mt_int,size,'arequest',arequest(2),arequest(1))
2687      size  = aqmax
2688      value = value.and.
2689     >        BA_alloc_get(mt_int,size,'aqindx',aqindx(2),aqindx(1))
2690      value = value.and.
2691     >      BA_alloc_get(mt_int,size,'aqstatus',aqstatus(2),aqstatus(1))
2692      value = value.and.
2693     >        BA_alloc_get(mt_int,size,'areqcnt',areqcnt(2),areqcnt(1))
2694      value = value.and.
2695     >        BA_alloc_get(mt_int,size,'aqnbb',aqnbb(2),aqnbb(1))
2696
2697
2698c     **** allocate bqueue ****
2699      bqmax       = qmax_in
2700      bqsize      = 0
2701      blast_index = bqmax
2702      size  = nfft3d(1)*bqmax*2
2703      value = BA_alloc_get(mt_dcpl,size,'btmp',btmp(2),btmp(1))
2704      size  = np*bqmax*2
2705      value = value.and.
2706     >      BA_alloc_get(mt_int,size,'brequest',brequest(2),brequest(1))
2707      size  = bqmax
2708      value = value.and.
2709     >        BA_alloc_get(mt_int,size,'bqindx',bqindx(2),bqindx(1))
2710      value = value.and.
2711     >      BA_alloc_get(mt_int,size,'bqstatus',bqstatus(2),bqstatus(1))
2712      value = value.and.
2713     >        BA_alloc_get(mt_int,size,'breqcnt',breqcnt(2),breqcnt(1))
2714      value = value.and.
2715     >        BA_alloc_get(mt_int,size,'bqnbb',bqnbb(2),bqnbb(1))
2716
2717      return
2718      end
2719
2720
2721
2722
2723*     ***********************************
2724*     *                                 *
2725*     *     C3dB_pfft3_queue_end        *
2726*     *                                 *
2727*     ***********************************
2728
2729      subroutine C3dB_pfft3_queue_end()
2730      implicit none
2731
2732
2733#include "bafdecls.fh"
2734#include "errquit.fh"
2735
2736#include "C3dB.fh"
2737#include "C3dB_pfft.fh"
2738
2739
2740
2741      !**** pfft_queues ****
2742      integer aqindx(2),aqstatus(2)
2743      integer atmp(2),arequest(2),areqcnt(2),aqnbb(2)
2744      integer aqmax,aqsize,alast_index
2745      common / c_pfft_aqueue_common / aqindx,aqstatus,atmp,arequest,
2746     >                              areqcnt,aqnbb,
2747     >                              aqmax,aqsize,alast_index
2748      integer bqindx(2),bqstatus(2)
2749      integer btmp(2),brequest(2),breqcnt(2),bqnbb(2)
2750      integer bqmax,bqsize,blast_index
2751      common / c_pfft_bqueue_common / bqindx,bqstatus,btmp,brequest,
2752     >                              breqcnt,bqnbb,
2753     >                              bqmax,bqsize,blast_index
2754
2755
2756
2757      logical value
2758
2759      value =           BA_free_heap(atmp(2))
2760      value = value.and.BA_free_heap(arequest(2))
2761      value = value.and.BA_free_heap(aqindx(2))
2762      value = value.and.BA_free_heap(aqstatus(2))
2763      value = value.and.BA_free_heap(areqcnt(2))
2764      value = value.and.BA_free_heap(aqnbb(2))
2765
2766      value = value.and.BA_free_heap(btmp(2))
2767      value = value.and.BA_free_heap(brequest(2))
2768      value = value.and.BA_free_heap(bqindx(2))
2769      value = value.and.BA_free_heap(bqstatus(2))
2770      value = value.and.BA_free_heap(breqcnt(2))
2771      value = value.and.BA_free_heap(bqnbb(2))
2772
2773      if (.not. value) call errquit('error freeing heap',0,MA_ERR)
2774
2775      return
2776      end
2777
2778
2779
2780
2781
2782
2783
2784*     ***********************************
2785*     *                                 *
2786*     *     C3dB_cr_pfft3_queue_filled  *
2787*     *                                 *
2788*     ***********************************
2789
2790      logical function C3dB_cr_pfft3_queue_filled()
2791      implicit none
2792
2793#include "bafdecls.fh"
2794#include "errquit.fh"
2795
2796#include "C3dB.fh"
2797#include "C3dB_pfft.fh"
2798
2799
2800
2801
2802      !**** pfft_queue ****
2803      integer qindx(2),qstatus(2)
2804      integer tmp(2),request(2),reqcnt(2),qnbb(2)
2805      integer qmax,qsize,last_index
2806      common / c_pfft_aqueue_common / qindx,qstatus,tmp,request,reqcnt,
2807     >                             qnbb,qmax,qsize,last_index
2808
2809      C3dB_cr_pfft3_queue_filled = (qsize.ge.qmax)
2810      return
2811      end
2812
2813
2814
2815*     ***********************************
2816*     *                                 *
2817*     *     C3dB_rc_pfft3_queue_filled  *
2818*     *                                 *
2819*     ***********************************
2820
2821      logical function C3dB_rc_pfft3_queue_filled()
2822      implicit none
2823
2824#include "bafdecls.fh"
2825#include "errquit.fh"
2826
2827#include "C3dB.fh"
2828#include "C3dB_pfft.fh"
2829
2830
2831      !**** pfft_queue ****
2832      integer qindx(2),qstatus(2)
2833      integer tmp(2),request(2),reqcnt(2),qnbb(2)
2834      integer qmax,qsize,last_index
2835      common / c_pfft_bqueue_common / qindx,qstatus,tmp,request,reqcnt,
2836     >                             qnbb,qmax,qsize,last_index
2837
2838      C3dB_rc_pfft3_queue_filled = (qsize.ge.qmax)
2839      return
2840      end
2841
2842
2843
2844
2845*     ***********************************
2846*     *                                 *
2847*     *     C3dB_rc_pfft3f_queuein      *
2848*     *                                 *
2849*     ***********************************
2850
2851      subroutine C3dB_rc_pfft3f_queuein(nbb,A)
2852      implicit none
2853      integer nbb
2854      complex*16 A(*)
2855
2856
2857#include "bafdecls.fh"
2858#include "errquit.fh"
2859
2860#include "C3dB.fh"
2861#include "C3dB_pfft.fh"
2862
2863
2864      !**** pfft_queue ****
2865      integer qindx(2),qstatus(2)
2866      integer tmp(2),request(2),reqcnt(2),qnbb(2)
2867      integer qmax,qsize,last_index
2868      common / c_pfft_bqueue_common / qindx,qstatus,tmp,request,reqcnt,
2869     >                             qnbb,qmax,qsize,last_index
2870
2871      !*** local variables ***
2872      integer shift1,shift2,shift3
2873      integer q,indx,status,np
2874
2875      call nwpw_timing_start(30)
2876      call Parallel3d_np_i(np)
2877
2878      do q=1,qsize
2879         indx   = int_mb(qindx(1)+q-1)
2880         int_mb(qstatus(1)+indx-1) = int_mb(qstatus(1)+indx-1) + 1
2881         status = int_mb(qstatus(1)+indx-1)
2882         shift1=nfft3d(1)*(2*(indx-1))
2883         shift2=nfft3d(1)*(2*(indx-1)+1)
2884         shift3=2*np*(indx-1)
2885         call C3dB_pfftf_step(status,
2886     >                        int_mb(qnbb(1)+indx-1),
2887     >                        A,
2888     >                        dcpl_mb(tmp(1)+shift1),
2889     >                        dcpl_mb(tmp(1)+shift2),
2890     >                        int_mb(request(1)+shift3),
2891     >                        int_mb(reqcnt(1)+indx-1))
2892      end do
2893
2894      qsize = qsize + 1
2895      last_index = last_index+1
2896      if (last_index.gt.qmax) last_index = 1
2897      int_mb(qindx(1)+qsize-1)        = last_index
2898      int_mb(qstatus(1)+last_index-1) = 1
2899      status = 1
2900      int_mb(qnbb(1)+last_index-1) = nbb
2901      shift1=nfft3d(1)*(2*(last_index-1))
2902      shift2=nfft3d(1)*(2*(last_index-1)+1)
2903      shift3=2*np*(last_index-1)
2904
2905      call C3dB_pfftf_step(status,nbb,A,
2906     >                     dcpl_mb(tmp(1)+shift1),
2907     >                     dcpl_mb(tmp(1)+shift2),
2908     >                     int_mb(request(1)+shift3),
2909     >                     int_mb(reqcnt(1)+last_index-1))
2910
2911
2912      call nwpw_timing_end(30)
2913      return
2914      end
2915
2916
2917
2918
2919*     ***********************************
2920*     *                                 *
2921*     *     C3dB_rc_pfft3f_queueout     *
2922*     *                                 *
2923*     ***********************************
2924
2925      subroutine C3dB_rc_pfft3f_queueout(nbb,A)
2926      implicit none
2927      integer nbb
2928      complex*16 A(*)
2929
2930
2931#include "bafdecls.fh"
2932#include "errquit.fh"
2933
2934#include "C3dB.fh"
2935#include "C3dB_pfft.fh"
2936
2937
2938      !**** pfft_queue ****
2939      integer qindx(2),qstatus(2)
2940      integer tmp(2),request(2),reqcnt(2),qnbb(2)
2941      integer qmax,qsize,last_index
2942      common / c_pfft_bqueue_common / qindx,qstatus,tmp,request,reqcnt,
2943     >                             qnbb,qmax,qsize,last_index
2944
2945      !*** local variables ***
2946      integer shift1,shift2,shift3
2947      integer q,indx,indx1,status,np
2948
2949      call nwpw_timing_start(30)
2950      call Parallel3d_np_i(np)
2951
2952      indx1   = int_mb(qindx(1))
2953      do while (int_mb(qstatus(1)+indx1-1).lt.4)
2954         do q=1,qsize
2955            indx   = int_mb(qindx(1)+q-1)
2956            int_mb(qstatus(1)+indx-1) = int_mb(qstatus(1)+indx-1) + 1
2957            status = int_mb(qstatus(1)+indx-1)
2958            shift1=nfft3d(1)*(2*(indx-1))
2959            shift2=nfft3d(1)*(2*(indx-1)+1)
2960            shift3=2*np*(indx-1)
2961            call C3dB_pfftf_step(status,
2962     >                        int_mb(qnbb(1)+indx-1),
2963     >                        A,
2964     >                        dcpl_mb(tmp(1)+shift1),
2965     >                        dcpl_mb(tmp(1)+shift2),
2966     >                        int_mb(request(1)+shift3),
2967     >                        int_mb(reqcnt(1)+indx-1))
2968         end do
2969      end do
2970
2971      qsize = qsize -1
2972      do q=1,qsize
2973        int_mb(qindx(1)+q-1) = int_mb(qindx(1)+q)
2974      end do
2975
2976      shift2=nfft3d(1)*(2*(indx1-1)+1)
2977      call Cram_c_Copy(nbb,dcpl_mb(tmp(1)+shift2),A)
2978
2979      call nwpw_timing_end(30)
2980      return
2981      end
2982
2983
2984
2985
2986
2987*     ***********************************
2988*     *                                 *
2989*     *     C3dB_cr_pfft3b_queuein      *
2990*     *                                 *
2991*     ***********************************
2992
2993      subroutine C3dB_cr_pfft3b_queuein(nbb,A)
2994      implicit none
2995      integer nbb
2996      complex*16 A(*)
2997
2998
2999#include "bafdecls.fh"
3000#include "errquit.fh"
3001
3002#include "C3dB.fh"
3003#include "C3dB_pfft.fh"
3004
3005
3006      !**** pfft_queue ****
3007      integer qindx(2),qstatus(2)
3008      integer tmp(2),request(2),reqcnt(2),qnbb(2)
3009      integer qmax,qsize,last_index
3010      common / c_pfft_aqueue_common / qindx,qstatus,tmp,request,reqcnt,
3011     >                             qnbb,qmax,qsize,last_index
3012
3013      !*** local variables ***
3014      integer shift1,shift2,shift3
3015      integer q,indx,status,np
3016
3017      call nwpw_timing_start(30)
3018      call Parallel3d_np_i(np)
3019
3020      do q=1,qsize
3021         indx   = int_mb(qindx(1)+q-1)
3022         int_mb(qstatus(1)+indx-1) = int_mb(qstatus(1)+indx-1) + 1
3023         status = int_mb(qstatus(1)+indx-1)
3024         shift1=nfft3d(1)*(2*(indx-1))
3025         shift2=nfft3d(1)*(2*(indx-1)+1)
3026         shift3=2*np*(indx-1)
3027         call C3dB_pfftb_step(status,
3028     >                        int_mb(qnbb(1)+indx-1),
3029     >                        A,
3030     >                        dcpl_mb(tmp(1)+shift1),
3031     >                        dcpl_mb(tmp(1)+shift2),
3032     >                        int_mb(request(1)+shift3),
3033     >                        int_mb(reqcnt(1)+indx-1))
3034      end do
3035
3036      qsize = qsize + 1
3037      last_index = last_index+1
3038      if (last_index.gt.qmax) last_index = 1
3039      int_mb(qindx(1)+qsize-1)        = last_index
3040      int_mb(qstatus(1)+last_index-1) = 1
3041      status = 1
3042      int_mb(qnbb(1)+last_index-1) = nbb
3043      shift1=nfft3d(1)*(2*(last_index-1))
3044      shift2=nfft3d(1)*(2*(last_index-1)+1)
3045      shift3=2*np*(last_index-1)
3046
3047      call C3dB_pfftb_step(status,nbb,A,
3048     >                     dcpl_mb(tmp(1)+shift1),
3049     >                     dcpl_mb(tmp(1)+shift2),
3050     >                     int_mb(request(1)+shift3),
3051     >                     int_mb(reqcnt(1)+last_index-1))
3052
3053      call nwpw_timing_end(30)
3054      return
3055      end
3056
3057
3058
3059
3060*     ***********************************
3061*     *                                 *
3062*     *     C3dB_cr_pfft3b_queueout     *
3063*     *                                 *
3064*     ***********************************
3065
3066      subroutine C3dB_cr_pfft3b_queueout(nbb,A)
3067      implicit none
3068      integer nbb
3069      complex*16 A(*)
3070
3071
3072#include "bafdecls.fh"
3073#include "errquit.fh"
3074
3075#include "C3dB.fh"
3076#include "C3dB_pfft.fh"
3077
3078
3079      !**** pfft_queue ****
3080      integer qindx(2),qstatus(2)
3081      integer tmp(2),request(2),reqcnt(2),qnbb(2)
3082      integer qmax,qsize,last_index
3083      common / c_pfft_aqueue_common / qindx,qstatus,tmp,request,reqcnt,
3084     >                             qnbb,qmax,qsize,last_index
3085
3086      !*** local variables ***
3087      integer shift1,shift2,shift3
3088      integer q,indx,indx1,status,np
3089
3090      call Parallel3d_np_i(np)
3091
3092      call nwpw_timing_start(30)
3093
3094      indx1   = int_mb(qindx(1))
3095      do while (int_mb(qstatus(1)+indx1-1).lt.5)
3096         do q=1,qsize
3097            indx   = int_mb(qindx(1)+q-1)
3098            int_mb(qstatus(1)+indx-1) = int_mb(qstatus(1)+indx-1) + 1
3099            status = int_mb(qstatus(1)+indx-1)
3100            shift1=nfft3d(1)*(2*(indx-1))
3101            shift2=nfft3d(1)*(2*(indx-1)+1)
3102            shift3=2*np*(indx-1)
3103            call C3dB_pfftb_step(status,
3104     >                        int_mb(qnbb(1)+indx-1),
3105     >                        A,
3106     >                        dcpl_mb(tmp(1)+shift1),
3107     >                        dcpl_mb(tmp(1)+shift2),
3108     >                        int_mb(request(1)+shift3),
3109     >                        int_mb(reqcnt(1)+indx-1))
3110         end do
3111      end do
3112
3113      qsize = qsize -1
3114      do q=1,qsize
3115        int_mb(qindx(1)+q-1) = int_mb(qindx(1)+q)
3116      end do
3117
3118      shift1=nfft3d(1)*(2*(indx1-1))
3119      call dcopy(2*nfft3d(1),dcpl_mb(tmp(1)+shift1),1,A,1)
3120
3121      call nwpw_timing_end(30)
3122      return
3123      end
3124
3125