1
2*
3* $Id$
4*
5
6*     ***********************************************************
7*     *								*
8*     *   		   CMatrix library			*
9*     *								*
10*     *   Author - Eric Bylaska					*
11*     *   date   - 5/19/06					*
12*     *								*
13*     ***********************************************************
14c
15c
16c
17c
18
19*     ***********************************
20*     *                                 *
21*     *         CMatrix_zgemm1_rot2     *
22*     *                                 *
23*     ***********************************
24
25      subroutine CMatrix_zgemm1_rot2(m,n,k,
26     >                  alpha,
27     >                  A,lda,ma,na,
28     >                  B,ldb,mb,nb,
29     >                  beta,
30     >                  C,ldc,mc,nc,
31     >                  taskid_i,taskid_j,
32     >                  np_i,np_j,
33     >                  comm_i, comm_j,
34     >                  Bcol,Bwork,work1,work2)
35      implicit none
36      integer m,n,k
37      complex*16  alpha
38
39      integer lda,ma(*),na(*)
40      complex*16  A(lda,*)
41
42      integer ldb,mb(*),nb(*)
43c      real*8  B(ldb,*)
44      complex*16  B(*)
45
46      complex*16  beta
47
48      integer ldc,mc(*),nc(*)
49      complex*16  C(ldc,*)
50
51      integer taskid_i,taskid_j
52      integer np_i,np_j
53      integer comm_i,comm_j
54
55      complex*16  Bcol(*),Bwork(*)
56      complex*16  work1(*),work2(*)
57
58#include "mpif.h"
59#ifdef MPI4
60#include "stupid_mpi4.fh"
61#endif
62
63
64*     **** local variables ****
65      logical jeven
66      integer i,j,j1,ii,jj,ne0
67      integer iwrk,jcur,ierr,bshift,iwrk2
68      integer request1(4),request2(4)
69      integer bshift2(0:np_j)
70
71      bshift     = 1
72      bshift2(0) = 1
73      do jj=1,np_j
74         bshift      = bshift + na(jj)*nb(taskid_j+1)
75         bshift2(jj) = bshift
76      end do
77
78*     *** collect B into columns ***
79      ne0 = 0
80      do i=1,np_i
81         ne0 = ne0+mb(i)
82      end do
83      j1 = 0
84      do jj=1,taskid_j
85         j1 = j1 + nb(jj)
86      end do
87
88      bshift = 0
89      iwrk2 = nb(taskid_j+1)
90      ii    = 0
91      do jcur=0,np_j-1
92         iwrk  = na(jcur+1)
93         do j=1,iwrk2
94         do i=1,iwrk
95            Bwork(bshift+i+(j-1)*iwrk) = B(ii+i+(j+j1-1)*ne0)
96         end do
97         end do
98         bshift = bshift + iwrk*iwrk2
99         ii     = ii     + iwrk
100      end do
101
102
103*     *** C = beta*C ***
104c      call dscal(ldc*nc(taskid_j+1),beta,C,1)
105      do j=1,nc(taskid_j+1)
106         do i=1,mc(taskid_i+1)
107            C(i,j) = beta*C(i,j)
108         end do
109      end do
110
111      call Cmatrix_start_rot(1,
112     >                       taskid_j,np_j,comm_j,
113     >                       A,work1,lda,na,
114     >                       request1)
115      jcur = taskid_j
116      iwrk = na(jcur+1)
117
118      if ((mc(taskid_i+1).gt.0).and.
119     >    (nc(taskid_j+1).gt.0).and.
120     >    (iwrk.gt.0))
121     >     call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk,
122     >                alpha,
123     >                A, ma(taskid_i+1),
124     >                Bwork(bshift2(jcur)), iwrk,
125     >                dcmplx(1.0d0,0.0d0),
126     >                C, ldc)
127
128      jeven = .true.
129      do j=2,np_j-1
130         if (jeven) then
131            jeven = .false.
132            jcur = mod(jcur-1+np_j,np_j)
133            iwrk = na(jcur+1)
134            call Cmatrix_end_rot(request1)
135            call Cmatrix_start_rot(j,
136     >                             taskid_j,np_j,comm_j,
137     >                             A,work2,lda,na,
138     >                             request2)
139         if ((mc(taskid_i+1).gt.0).and.
140     >       (nc(taskid_j+1).gt.0).and.
141     >       (iwrk.gt.0))
142     >       call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk,
143     >                alpha,
144     >                work1, ma(taskid_i+1),
145     >                Bwork(bshift2(jcur)), iwrk,
146     >                dcmplx(1.0d0,0.0d0),
147     >                C, ldc)
148
149         else
150            jeven = .true.
151            jcur = mod(jcur-1+np_j,np_j)
152            iwrk = na(jcur+1)
153            call Cmatrix_end_rot(request2)
154            call Cmatrix_start_rot(j,
155     >                             taskid_j,np_j,comm_j,
156     >                             A,work1,lda,na,
157     >                             request1)
158            if ((mc(taskid_i+1).gt.0).and.
159     >          (nc(taskid_j+1).gt.0).and.
160     >          (iwrk.gt.0))
161     >          call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk,
162     >                alpha,
163     >                work2, ma(taskid_i+1),
164     >                Bwork(bshift2(jcur)), iwrk,
165     >                dcmplx(1.0d0,0.0d0),
166     >                C, ldc)
167
168         end if
169      end do
170      if (jeven) then
171         jcur = mod(jcur-1+np_j,np_j)
172         iwrk = na(jcur+1)
173         call Cmatrix_end_rot(request1)
174         if ((mc(taskid_i+1).gt.0).and.
175     >       (nc(taskid_j+1).gt.0).and.
176     >       (iwrk.gt.0))
177     >       call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk,
178     >             alpha,
179     >             work1, ma(taskid_i+1),
180     >             Bwork(bshift2(jcur)), iwrk,
181     >             dcmplx(1.0d0,0.0d0),
182     >             C, ldc)
183
184      else
185         jcur = mod(jcur-1+np_j,np_j)
186         iwrk = na(jcur+1)
187         call Cmatrix_end_rot(request2)
188         if ((mc(taskid_i+1).gt.0).and.
189     >       (nc(taskid_j+1).gt.0).and.
190     >       (iwrk.gt.0))
191     >       call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk,
192     >             alpha,
193     >             work2, ma(taskid_i+1),
194     >             Bwork(bshift2(jcur)), iwrk,
195     >             dcmplx(1.0d0,0.0d0),
196     >             C, ldc)
197      end if
198      return
199      end
200
201
202
203
204
205*     ***********************************
206*     *                                 *
207*     *         CMatrix_zgemm1_rot      *
208*     *                                 *
209*     ***********************************
210
211      subroutine CMatrix_zgemm1_rot(m,n,k,
212     >                  alpha,
213     >                  A,lda,ma,na,
214     >                  B,ldb,mb,nb,
215     >                  beta,
216     >                  C,ldc,mc,nc,
217     >                  taskid_i,taskid_j,
218     >                  np_i,np_j,
219     >                  comm_i, comm_j,
220     >                  Bcol,Bwork,work1,work2)
221      implicit none
222      integer m,n,k
223      complex*16  alpha
224
225      integer lda,ma(*),na(*)
226      complex*16  A(lda,*)
227
228      integer ldb,mb(*),nb(*)
229c      real*8  B(ldb,*)
230      complex*16  B(*)
231
232      complex*16  beta
233
234      integer ldc,mc(*),nc(*)
235      complex*16  C(ldc,*)
236
237      integer taskid_i,taskid_j
238      integer np_i,np_j
239      integer comm_i,comm_j
240
241      complex*16  Bcol(*),Bwork(*)
242      complex*16  work1(*),work2(*)
243
244#include "mpif.h"
245#ifdef MPI4
246#include "stupid_mpi4.fh"
247#endif
248
249
250*     **** local variables ****
251      logical jeven
252      integer i,j,ii,jj,mbmax,nbmax,ne0
253      integer iwrk,jcur,ierr,bshift,iwrk2
254      integer request1(4),request2(4)
255      integer bshift2(0:np_j)
256
257      bshift     = 1
258      bshift2(0) = 1
259      do jj=1,np_j
260         bshift      = bshift + na(jj)*nb(taskid_j+1)
261         bshift2(jj) = bshift
262      end do
263
264*     *** collect B into columns ***
265      ne0 = 0
266      mbmax = 0
267      do i=1,np_i
268         if (mb(i).gt.mbmax) mbmax = mb(i)
269         ne0 = ne0+mb(i)
270      end do
271      nbmax = 0
272      do j=1,np_j
273         if (nb(j).gt.nbmax) nbmax = nb(j)
274      end do
275      if (np_i.gt.1) then
276
277
278         do j=1,nb(taskid_j+1)
279
280
281          !*** Allgather is flaky on my mac laptop ***
282          call dcopy(ne0,0.0,0,Bcol(1+(j-1)*ne0),1)
283          bshift = 0
284          do ii=1,taskid_i
285             bshift = bshift + mb(ii)
286          end do
287          do i=1,mb(taskid_i+1)
288            Bcol(bshift+i+(j-1)*ne0) = B(i+(j-1)*mb(taskid_i+1))
289          end do
290          call C3dB_Vector_SumAll(2*ne0,Bcol(1+(j-1)*ne0))
291         end do
292
293         bshift = 0
294         iwrk2 = nb(taskid_j+1)
295         ii    = 0
296         do jcur=0,np_j-1
297            iwrk  = na(jcur+1)
298            do j=1,iwrk2
299            do i=1,iwrk
300               Bwork(bshift+i+(j-1)*iwrk) = Bcol(ii+i+(j-1)*ne0)
301
302            end do
303            end do
304            bshift = bshift + iwrk*iwrk2
305            ii     = ii     + iwrk
306         end do
307      else
308         bshift = 0
309         iwrk2 = nb(taskid_j+1)
310         ii    = 0
311         do jcur=0,np_j-1
312            iwrk  = na(jcur+1)
313            do j=1,iwrk2
314            do i=1,iwrk
315               Bwork(bshift+i+(j-1)*iwrk) = B(ii+i+(j-1)*ne0)
316            end do
317            end do
318            bshift = bshift + iwrk*iwrk2
319            ii     = ii     + iwrk
320         end do
321      end if
322
323
324
325*     *** C = beta*C ***
326c      call dscal(ldc*nc(taskid_j+1),beta,C,1)
327      do j=1,nc(taskid_j+1)
328         do i=1,mc(taskid_i+1)
329            C(i,j) = beta*C(i,j)
330         end do
331      end do
332
333      call Cmatrix_start_rot(1,
334     >                       taskid_j,np_j,comm_j,
335     >                       A,work1,lda,na,
336     >                       request1)
337      jcur = taskid_j
338      iwrk = na(jcur+1)
339
340      if ((mc(taskid_i+1).gt.0).and.
341     >    (nc(taskid_j+1).gt.0).and.
342     >    (iwrk.gt.0))
343     >     call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk,
344     >                alpha,
345     >                A, ma(taskid_i+1),
346     >                Bwork(bshift2(jcur)), iwrk,
347     >                dcmplx(1.0d0,0.0d0),
348     >                C, ldc)
349
350      jeven = .true.
351      do j=2,np_j-1
352         if (jeven) then
353            jeven = .false.
354            jcur = mod(jcur-1+np_j,np_j)
355            iwrk = na(jcur+1)
356            call Cmatrix_end_rot(request1)
357            call Cmatrix_start_rot(j,
358     >                             taskid_j,np_j,comm_j,
359     >                             A,work2,lda,na,
360     >                             request2)
361         if ((mc(taskid_i+1).gt.0).and.
362     >       (nc(taskid_j+1).gt.0).and.
363     >       (iwrk.gt.0))
364     >       call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk,
365     >                alpha,
366     >                work1, ma(taskid_i+1),
367     >                Bwork(bshift2(jcur)), iwrk,
368     >                dcmplx(1.0d0,0.0d0),
369     >                C, ldc)
370
371         else
372            jeven = .true.
373            jcur = mod(jcur-1+np_j,np_j)
374            iwrk = na(jcur+1)
375            call Cmatrix_end_rot(request2)
376            call Cmatrix_start_rot(j,
377     >                             taskid_j,np_j,comm_j,
378     >                             A,work1,lda,na,
379     >                             request1)
380            if ((mc(taskid_i+1).gt.0).and.
381     >          (nc(taskid_j+1).gt.0).and.
382     >          (iwrk.gt.0))
383     >          call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk,
384     >                alpha,
385     >                work2, ma(taskid_i+1),
386     >                Bwork(bshift2(jcur)), iwrk,
387     >                dcmplx(1.0d0,0.0d0),
388     >                C, ldc)
389
390         end if
391      end do
392      if (jeven) then
393         jcur = mod(jcur-1+np_j,np_j)
394         iwrk = na(jcur+1)
395         call Cmatrix_end_rot(request1)
396         if ((mc(taskid_i+1).gt.0).and.
397     >       (nc(taskid_j+1).gt.0).and.
398     >       (iwrk.gt.0))
399     >       call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk,
400     >             alpha,
401     >             work1, ma(taskid_i+1),
402     >             Bwork(bshift2(jcur)), iwrk,
403     >             dcmplx(1.0d0,0.0d0),
404     >             C, ldc)
405
406      else
407         jcur = mod(jcur-1+np_j,np_j)
408         iwrk = na(jcur+1)
409         call Cmatrix_end_rot(request2)
410         if ((mc(taskid_i+1).gt.0).and.
411     >       (nc(taskid_j+1).gt.0).and.
412     >       (iwrk.gt.0))
413     >       call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk,
414     >             alpha,
415     >             work2, ma(taskid_i+1),
416     >             Bwork(bshift2(jcur)), iwrk,
417     >             dcmplx(1.0d0,0.0d0),
418     >             C, ldc)
419      end if
420      return
421      end
422
423
424      subroutine CMatrix_Bwork_copy(m,n,A,lda,ii,B,ldb)
425      implicit none
426      integer n,m
427      integer lda,ii,ldb
428      complex*16 A(LDA,*)
429      complex*16 B(LDB,*)
430
431*     *** local variables ***
432      integer i,j
433
434      do j=1,n
435         do i=1,m
436            B(i,j) = A(ii+i,j)
437         end do
438      end do
439      return
440      end
441
442*     ***********************************
443*     *                                 *
444*     *         CMatrix_start_rot       *
445*     *                                 *
446*     ***********************************
447
448      subroutine CMatrix_start_rot(j,
449     >                             taskid_j,np_j,comm_j,
450     >                             A,W,lda,na,
451     >                             request)
452      implicit none
453      integer j
454      integer taskid_j,np_j,comm_j
455      complex*16 A(*),W(*)
456      integer lda,na(*)
457      integer request(*)
458
459#include "mpif.h"
460#ifdef MPI4
461#include "stupid_mpi4.fh"
462#endif
463
464*     **** local variables ****
465      integer proc_to,proc_from,msgtype,amsglen,wmsglen,mpierr
466
467      proc_to   = mod(taskid_j+j,np_j)
468      proc_from = mod(taskid_j-j+np_j,np_j)
469      msgtype   = j
470      amsglen = lda*na(taskid_j+1)
471      wmsglen = lda*na(proc_from+1)
472
473#ifdef MPI4
474            if (wmsglen.gt.0) then
475               stupid_msglen = wmsglen
476               stupid_type   = msgtype
477               stupid_taskid = proc_from
478               call MPI_IRECV(W,
479     >                    stupid_msglen,stupid_complex,
480     >                    stupid_taskid,
481     >                    stupid_type,stupid_comm_j,
482     >                    stupid_request,stupid_ierr)
483               request(1) = stupid_request
484               request(3) = 1
485            else
486               request(3) = 0
487            end if
488
489            if (amsglen.gt.0) then
490               stupid_msglen = amsglen
491               stupid_type   = msgtype
492               stupid_taskid = proc_to
493               call MPI_ISEND(A,
494     >                     stupid_msglen,stupid_complex,
495     >                     stupid_taskid,
496     >                     stupid_type,stupid_comm_j,
497     >                     stupid_request,stupid_ierr)
498               request(2) = stupid_request
499               request(4) = 1
500            else
501               request(4) = 0
502            end if
503#else
504            if (wmsglen.gt.0) then
505               call MPI_IRECV(W,wmsglen,MPI_DOUBLE_COMPLEX,
506     >                    proc_from,
507     >                    msgtype,comm_j,
508     >                    request(1),mpierr)
509               request(3) = 1
510            else
511               request(3) = 0
512            end if
513            if (amsglen.gt.0) then
514               call MPI_ISEND(A,amsglen,MPI_DOUBLE_COMPLEX,
515     >                     proc_to,
516     >                     msgtype,comm_j,
517     >                     request(2),mpierr)
518               request(4) = 1
519            else
520               request(4) = 0
521            end if
522#endif
523
524      if ((request(3).eq.1).and.(request(4).eq.1)) then
525         request(3) = 1
526      else if (request(3).eq.1) then
527         request(3) = 2
528      else if (request(4).eq.1) then
529         request(3) = 3
530      else
531         request(3) = 4
532      end if
533
534      return
535      end
536
537*     ***********************************
538*     *                                 *
539*     *         CMatrix_end_rot         *
540*     *                                 *
541*     ***********************************
542
543      subroutine CMatrix_end_rot(request)
544      implicit none
545      integer request(*)
546
547*     **** wait for completion of mp_send, also do a sync ****
548      if (request(3).eq.1) then
549         call Parallel_mpiWaitAll(2,request)
550      else if (request(3).eq.2) then
551         call Parallel_mpiWaitAll(1,request)
552      else if (request(3).eq.3) then
553         call Parallel_mpiWaitAll(1,request(2))
554      endif
555
556      return
557      end
558
559
560
561
562
563*     ***********************************
564*     *                                 *
565*     *         CMatrix_zgemm1          *
566*     *                                 *
567*     ***********************************
568
569      subroutine CMatrix_zgemm1(m,n,k,nblock,
570     >                  alpha,
571     >                  A,lda,ma,na,
572     >                  B,ldb,mb,nb,
573     >                  beta,
574     >                  C,ldc,mc,nc,
575     >                  taskid_i,taskid_j,
576     >                  np_i,np_j,
577     >                  comm_i, comm_j,
578     >                  work1,work2)
579      implicit none
580      integer m,n,k,nblock
581      complex*16  alpha
582
583      integer lda,ma(*),na(*)
584      complex*16  A(lda,*)
585
586      integer ldb,mb(*),nb(*)
587      complex*16  B(ldb,*)
588
589      complex*16  beta
590
591      integer ldc,mc(*),nc(*)
592      complex*16  C(ldc,*)
593
594      integer taskid_i,taskid_j
595      integer np_i,np_j
596      integer comm_i,comm_j
597
598      complex*16  work1(*),work2(*)
599
600
601#ifdef MPI4
602#include "stupid_mpi4.fh"
603#else
604#include "mpif.h"
605#endif
606
607
608*     **** local variables ****
609      logical docalc1,docalc2
610      integer i,j,ii,jj
611      integer kk,iwrk,icur,jcur,ierr,shift
612
613
614      do j=1,nc(taskid_j+1)
615         do i=1,mc(taskid_i+1)
616            C(i,j) = beta*C(i,j)
617         end do
618      end do
619
620      ii = 0
621      jj = 0
622      kk = 0
623      icur = 0
624      jcur = 0
625c     **** loop over all row pannels of C ***
626      do while (kk.lt.k)
627         iwrk = min(nblock, mb(icur+1)-ii)
628         iwrk = min(iwrk,   na(jcur+1)-jj)
629
630
631*        **** pack current iwrk columns of A into work1 ***
632         if (taskid_j.eq.jcur) then
633            call zlacpy("G", ma(taskid_i+1),iwrk,
634     >                   A(1,jj+1), lda,
635     >                   work1,     ma(taskid_i+1))
636         end if
637
638*        **** pack current iwrk rows of B into work2 ***
639         if (taskid_i.eq.icur) then
640            call zlacpy("G", iwrk,nb(taskid_j+1),
641     >                   B(ii+1,1), ldb,
642     >                   work2,  iwrk)
643         end if
644
645#ifdef MPI4
646c        **** broadcast work1  within my row ***
647         stupid_msglen = iwrk*ma(taskid_i+1)
648         stupid_taskid = jcur
649         call MPI_Bcast(work1,stupid_msglen,stupid_complex,
650     >                  stupid_taskid,stupid_comm_j,stupid_ierr)
651
652c        **** broadcast work2  within my column ***
653         stupid_msglen = iwrk*nb(taskid_j+1)
654         stupid_taskid = icur
655         call MPI_Bcast(work2,stupid_msglen,stupid_complex,
656     >                  stupid_taskid,stupid_comm_i,stupid_ierr)
657#else
658c        **** broadcast work1  within my row ***
659         call MPI_Bcast(work1,iwrk*ma(taskid_i+1),MPI_DOUBLE_COMPLEX,
660     >                  jcur,comm_j,ierr)
661
662c        **** broadcast work2  within my column ***
663         call MPI_Bcast(work2,iwrk*nb(taskid_j+1),MPI_DOUBLE_COMPLEX,
664     >                  icur,comm_i,ierr)
665#endif
666
667
668         if ((iwrk.gt.0)          .and.
669     >       (mc(taskid_i+1).gt.0).and.
670     >       (nc(taskid_j+1).gt.0))
671     >     call zgemm('N','N',mc(taskid_i+1),nc(taskid_j+1),iwrk,
672     >                alpha,
673     >                work1, ma(taskid_i+1),
674     >                work2, iwrk,
675     >                dcmplx(1.0d0,0.0d0),
676     >                C, ldc)
677
678
679         ii = ii + iwrk
680         jj = jj + iwrk
681         kk = kk + iwrk
682
683         if (jj.ge.na(jcur+1)) then
684           jcur = jcur + 1
685           jj   = 0
686         end if
687         if (ii.ge.mb(icur+1)) then
688           icur = icur + 1
689           ii   = 0
690         end if
691
692      end do
693
694      return
695      end
696
697
698*     ***********************************
699*     *                                 *
700*     *         CMatrix_zgemm2          *
701*     *                                 *
702*     ***********************************
703      subroutine CMatrix_zgemm2(m,n,k,nblock,
704     >                  alpha,
705     >                  A,lda,ma,na,
706     >                  B,ldb,mb,nb,
707     >                  beta,
708     >                  C,ldc,mc,nc,
709     >                  taskid_i,taskid_j,
710     >                  np_i,np_j,
711     >                  comm_i, comm_j,
712     >                  work1,work2)
713      implicit none
714      integer m,n,k,nblock
715      complex*16  alpha
716
717      integer lda,ma(*),na(*)
718      complex*16  A(lda,*)
719
720      integer ldb,mb(*),nb(*)
721      complex*16  B(ldb,*)
722
723      complex*16  beta
724
725      integer ldc,mc(*),nc(*)
726      complex*16  C(ldc,*)
727
728      integer taskid_i,taskid_j
729      integer np_i,np_j
730      integer comm_i,comm_j
731
732      complex*16  work1(*),work2(*)
733
734
735#ifdef MPI4
736#include "stupid_mpi4.fh"
737#else
738#include "mpif.h"
739#endif
740
741
742*     **** local variables ****
743      logical docalc1,docalc2
744      integer i,j,ii,jj
745      integer kk,iwrk,icur,jcur,ierr,shift
746
747      do j=1,nc(taskid_j+1)
748         do i=1,mc(taskid_i+1)
749            C(i,j) = beta*C(i,j)
750         end do
751      end do
752
753      ii = 0
754      jj = 0
755      kk = 0
756      icur = 0
757      jcur = 0
758c     **** loop over all row pannels of C ***
759      do while (kk.lt.m)
760         iwrk = min(nblock, mc(icur+1)-ii)
761         iwrk = min(iwrk,   na(jcur+1)-jj)
762
763
764*        **** iwrk*nc(taskid_j+1) submatrix !=0 ****
765         if (ma(taskid_i+1).gt.0) then
766
767*           **** pack current iwrk columns of A into work1 ***
768            if (taskid_j.eq.jcur) then
769               call zlacpy("G", ma(taskid_i+1),iwrk,
770     >                   A(1,jj+1), lda,
771     >                   work1,     ma(taskid_i+1))
772            end if
773
774c           **** broadcast work1  within my row ***
775#ifdef MPI4
776            stupid_msglen = iwrk*ma(taskid_i+1)
777            stupid_taskid = jcur
778            call MPI_Bcast(work1,stupid_msglen,
779     >                     stupid_complex,stupid_taskid,
780     >                     stupid_comm_j,stupid_ierr)
781#else
782            call MPI_Bcast(work1,iwrk*ma(taskid_i+1),
783     >                     MPI_DOUBLE_COMPLEX,jcur,comm_j,ierr)
784#endif
785
786
787c            if ((iwrk.gt.0)          .and.
788c     >          (nb(taskid_j+1).gt.0).and.
789c     >          (ma(taskid_i+1).gt.0))
790            if ((iwrk.gt.0)          .and.
791     >          (nb(taskid_j+1).gt.0))
792     >        call zgemm('C','N',iwrk,nb(taskid_j+1),ma(taskid_i+1),
793     >                   alpha,
794     >                   work1, ma(taskid_i+1),
795     >                   B, ldb,
796     >                   dcmplx(0.0d0,0.0d0),
797     >                   work2, iwrk)
798
799*        **** iwrk*nc(taskid_j+1) submatrix ==0 ****
800         else
801            call dcopy(2*nc(taskid_j+1)*iwrk,0.0d0,0,work2,1)
802         end if
803
804
805c        **** summ to node that holds current rows of C ****
806#ifdef MPI4
807         stupid_msglen = nc(taskid_j+1)*iwrk
808         stupid_taskid = icur
809         call MPI_Reduce(work2,work1,stupid_msglen,
810     >                   stupid_complex,stupid_sum,
811     >                   stupid_taskid,stupid_comm_i,stupid_ierr)
812#else
813         call MPI_Reduce(work2,work1,nc(taskid_j+1)*iwrk,
814     >                   MPI_DOUBLE_COMPLEX,MPI_SUM,icur,comm_i,ierr)
815#endif
816
817
818c        **** add to current rows of C ****
819         if (taskid_i.eq.icur) then
820            shift = 1
821            do i=ii,(ii+iwrk-1)
822               call daxpy(2*nc(taskid_j+1),1.0d0,work1(shift),iwrk,
823     >                                    C(i+1,1),mc(taskid_i+1))
824               shift = shift + 1
825            end do
826         end if
827
828         ii = ii + iwrk
829         jj = jj + iwrk
830         kk = kk + iwrk
831
832         if (jj.ge.na(jcur+1)) then
833           jcur = jcur + 1
834           jj   = 0
835         end if
836         if (ii.ge.mc(icur+1)) then
837           icur = icur + 1
838           ii   = 0
839         end if
840
841      end do
842
843
844      return
845      end
846
847
848*     ***********************************
849*     *                                 *
850*     *         CMatrix_zgemm3          *
851*     *                                 *
852*     ***********************************
853
854      subroutine CMatrix_zgemm3(m,n,k,nblock,
855     >                  alpha,
856     >                  A,lda,ma,na,
857     >                  B,ldb,mb,nb,
858     >                  beta,
859     >                  C,ldc,mc,nc,
860     >                  taskid_i,taskid_j,
861     >                  np_i,np_j,
862     >                  comm_i, comm_j,
863     >                  work1,work2)
864      implicit none
865      integer m,n,k,nblock
866      complex*16  alpha
867
868      integer lda,ma(*),na(*)
869      complex*16  A(lda,*)
870
871      integer ldb,mb(*),nb(*)
872      complex*16  B(ldb,*)
873
874      complex*16  beta
875
876      integer ldc,mc(*),nc(*)
877      complex*16  C(ldc,*)
878
879      integer taskid_i,taskid_j
880      integer np_i,np_j
881      integer comm_i,comm_j
882
883      complex*16  work1(*),work2(*)
884
885#ifdef MPI4
886#include "stupid_mpi4.fh"
887#else
888#include "mpif.h"
889#endif
890
891
892*     **** local variables ****
893      logical docalc1,docalc2
894      integer i,j,ii,jj
895      integer kk,iwrk,icur,jcur,ierr,shift
896      real*8  dum
897
898
899      do j=1,nc(taskid_j+1)
900         do i=1,mc(taskid_i+1)
901            C(i,j) = beta*C(i,j)
902         end do
903      end do
904
905      ii = 0
906      jj = 0
907      kk = 0
908      icur = 0
909      jcur = 0
910      do while (kk.lt.n)
911         iwrk = min(nblock, mb(icur+1)-ii)
912         iwrk = min(iwrk,   nc(jcur+1)-jj)
913
914
915         if (taskid_i.eq.icur) then
916            call zlacpy("G", iwrk,nb(taskid_j+1),
917     >                   B(ii+1,1), ldb,
918     >                   work2,     iwrk)
919         end if
920
921#ifdef MPI4
922        stupid_msglen = iwrk*nb(taskid_j+1)
923        stupid_taskid = icur
924        call MPI_Bcast(work2,stupid_msglen,stupid_complex,
925     >                  stupid_taskid,stupid_comm_i,stupid_ierr)
926#else
927        call MPI_Bcast(work2,iwrk*nb(taskid_j+1),MPI_DOUBLE_COMPLEX,
928     >                  icur,comm_i,ierr)
929#endif
930
931         if ((iwrk.gt.0)          .and.
932     >       (na(taskid_j+1).gt.0).and.
933     >       (mc(taskid_i+1).gt.0))
934     >      call zgemm('N','C',mc(taskid_i+1),iwrk,na(taskid_j+1),
935     >              alpha,
936     >              A, lda,
937     >              work2, iwrk,
938     >              dcmplx(0.0d0,0.0d0),
939     >              work1, mc(taskid_i+1))
940
941#ifdef MPI4
942        stupid_msglen = mc(taskid_i+1)*iwrk
943        stupid_taskid = jcur
944        call MPI_Reduce(work1,work2,stupid_msglen,stupid_complex,
945     >                  stupid_sum,stupid_taskid,
946     >                  stupid_comm_j,stupid_ierr)
947#else
948        call MPI_Reduce(work1,work2,mc(taskid_i+1)*iwrk,
949     >                   MPI_DOUBLE_COMPLEX,MPI_SUM,jcur,comm_j,ierr)
950#endif
951
952
953         if (taskid_j.eq.jcur) then
954            shift = 1
955            do j=jj,(jj+iwrk-1)
956               call daxpy(2*mc(taskid_i+1),
957     >                    1.0d0,
958     >                    work2(shift),1,
959     >                    C(1,j+1),1)
960               shift = shift + mc(taskid_i+1)
961            end do
962         end if
963
964         ii = ii + iwrk
965         jj = jj + iwrk
966         kk = kk + iwrk
967
968         if (jj.ge.nc(jcur+1)) then
969           jcur = jcur + 1
970           jj   = 0
971         end if
972         if (ii.ge.mb(icur+1)) then
973           icur = icur + 1
974           ii   = 0
975         end if
976
977      end do
978
979
980      return
981      end
982
983
984
985
986
987*     ***********************************
988*     *                                 *
989*     *         CMatrix_tqliq           *
990*     *                                 *
991*     ***********************************
992
993      subroutine CMatrix_tqliq(n,eig,tu,
994     >                  Q,ldq,mq,nq,
995     >                  taskid_i,taskid_j,
996     >                  np_i,np_j,
997     >                  comm_i, comm_j,
998     >                  work1,work2)
999      implicit none
1000      integer n
1001
1002      integer ldq,mq(*),nq(*)
1003      complex*16  Q(ldq,*)
1004      real*8  eig(*),tu(*)
1005
1006      integer taskid_i,taskid_j
1007      integer np_i,np_j
1008      integer comm_i,comm_j
1009      complex*16  work1(*),work2(*)
1010
1011#ifdef MPI4
1012#include "stupid_mpi4.fh"
1013#else
1014#include "mpif.h"
1015#endif
1016
1017*     **** local variables ****
1018      integer MAXITER
1019      parameter (MAXITER = 100)
1020      real*8  tole
1021      parameter (tole=1.0d-15)
1022
1023      logical notdone
1024      integer i,j,l,m,iter
1025      integer ii,jj0,jj1,jcur0,jcur1,ierr,istat
1026      real*8  b,c,f,g,p,r,s
1027
1028
1029      do l=1,n-1
1030         iter = 0
1031
1032         do m=l,n-1
1033         if (dabs(tu(m)).lt.tole) go to 2
1034         end do
1035         m = n
1036  2      continue
1037         if (m.eq.l) then
1038            notdone = .false.
1039         else
1040            notdone = .true.
1041         end if
1042         do while ((iter.lt.MAXITER).and.(notdone))
1043            g = (eig(l+1)-eig(l))/(2.0d0*tu(l))
1044            r = dsqrt(g**2+1.0d0)
1045ccccneed to fixccc            g = eig(m)-eig(l)+tu(l)/(g+dsign(r,g))
1046            s = 1.0d0
1047            c = 1.0d0
1048            p = 0.0d0
1049            do i = m-1,l,-1
1050               f = s*tu(i)
1051               b = c*tu(i)
1052               if (dabs(f).ge.dabs(g)) then
1053                  c = g/f
1054                  r = dsqrt(c**2+1.0d0)
1055                  tu(i+1) = f*r
1056                  s = 1/r
1057                  c = c*s
1058               else
1059                  s = f/g
1060                  r = dsqrt(s**2+1.0d0)
1061                  tu(i+1) = g*r
1062                  c = 1/r
1063                  s = s*c
1064               end if
1065               g = eig(i+1)-p
1066               r = (eig(i)-g)*s + 2.0d0*c*b
1067               p = s*r
1068               eig(i+1) = g+p
1069               g = c*r-b
1070
1071
1072*              **** update eigenvectors ****
1073               jcur0 = 0
1074               jj0   = 1
1075               do j=1,i-1
1076                 jj0 = jj0 + 1
1077                 if (jj0.gt.nq(jcur0+1)) then
1078                    jcur0 = jcur0 + 1
1079                    jj0   = 1
1080                 end if
1081               end do
1082               jcur1 = jcur0
1083               jj1   = jj0 + 1
1084               if (jj1.gt.nq(jcur1+1)) then
1085                  jcur1 = jcur1 + 1
1086                  jj1   = 1
1087               end if
1088
1089               if (jcur0.eq.taskid_j)
1090     >             call zcopy(mq(taskid_i+1),Q(1,jj0),1,work1,1)
1091               if (jcur1.eq.taskid_j)
1092     >             call zcopy(mq(taskid_i+1),Q(1,jj1),1,work2,1)
1093
1094#ifdef MPI4
1095               stupid_msglen = mq(taskid_i+1)
1096               stupid_taskid = jcur0
1097               stupid_type   = jcur1
1098               call MPI_Bcast(work1,stupid_msglen,stupid_complex,
1099     >                  stupid_taskid,stupid_comm_j,stupid_ierr)
1100               call MPI_Bcast(work2,stupid_msglen,stupid_complex,
1101     >                  stupid_type,stupid_comm_j,stupid_ierr)
1102#else
1103               call MPI_Bcast(work1,mq(taskid_i+1),MPI_DOUBLE_COMPLEX,
1104     >                  jcur0,comm_j,ierr)
1105               call MPI_Bcast(work2,mq(taskid_i+1),MPI_DOUBLE_COMPLEX,
1106     >                  jcur1,comm_j,ierr)
1107#endif
1108
1109               if (jcur0.eq.taskid_j) then
1110                  do ii=1,mq(taskid_i+1)
1111                    Q(ii,jj0) = c*Q(ii,jj0) - s*work2(ii)
1112                  end do
1113               end if
1114
1115               if (jcur1.eq.taskid_j) then
1116                  do ii=1,mq(taskid_i+1)
1117                    Q(ii,jj1) = c*Q(ii,jj1) + s*work1(ii)
1118                  end do
1119               end if
1120
1121            end do
1122            eig(l) = eig(l) - p
1123            tu(l)  = g
1124            tu(m)  = 0.0d0
1125
1126
1127            do m=l,n-1
1128            if (dabs(tu(m)).lt.tole) go to 3
1129            end do
1130            m = n
1131  3         continue
1132            if (m.eq.l) then
1133               notdone = .false.
1134            else
1135               notdone = .true.
1136            end if
1137
1138            iter = iter + 1
1139         end do
1140
1141      end do
1142
1143      return
1144      end
1145
1146
1147
1148*     ***********************************
1149*     *                                 *
1150*     *         CMatrix_houseq          *
1151*     *                                 *
1152*     ***********************************
1153      subroutine CMatrix_houseq(jcol,
1154     >                  n,
1155     >                  A,V,Q,lda,ma,na,
1156     >                  taskid_i,taskid_j,
1157     >                  np_i,np_j,
1158     >                  comm_i, comm_j,
1159     >                  work1,work2)
1160      implicit none
1161      integer jcol,n
1162
1163      integer lda,ma(*),na(*)
1164      complex*16  A(lda,*),V(lda,*),Q(lda,*)
1165
1166      integer taskid_i,taskid_j
1167      integer np_i,np_j
1168      integer comm_i,comm_j
1169
1170      complex*16  work1(*),work2(*)
1171
1172#ifdef MPI4
1173#include "stupid_mpi4.fh"
1174#else
1175#include "mpif.h"
1176#endif
1177
1178*     **** local variables ****
1179      integer i,j,ii,jj
1180      integer kk,iwrk,icur,jcur,ierr,shift
1181      integer ii0,icur0,ii1,icur1,ii2,icur2
1182      integer jj0,jcur0,jj1,jcur1
1183      real*8  beta,mu0,mu,v20,v2
1184
1185      call dcopy(2*ma(taskid_i+1)*na(taskid_j+1),0.0d0,0,V,1)
1186
1187      jcur0 = 0
1188      jj0   = 1
1189      do j=1,jcol-1
1190        jj0 = jj0 + 1
1191        if (jj0.gt.na(jcur0+1)) then
1192           jcur0 = jcur0 + 1
1193           jj0 = 1
1194        end if
1195      end do
1196      jcur1 = jcur0
1197      jj1   = jj0 + 1
1198      if (jj1.gt.na(jcur1+1)) then
1199           jcur1 = jcur1 + 1
1200           jj1 = 1
1201      end if
1202
1203      icur0 = 0
1204      ii0   = 1
1205      do i=1,jcol-1
1206        ii0 = ii0 + 1
1207        if (ii0.gt.ma(icur0+1)) then
1208           icur0 = icur0 + 1
1209           ii0 = 1
1210        end if
1211      end do
1212      icur1 = icur0
1213      ii1   = ii0 + 1
1214      if (ii1.gt.ma(icur1+1)) then
1215           icur1 = icur1 + 1
1216           ii1 = 1
1217      end if
1218      icur2 = icur1
1219      ii2   = ii1 + 1
1220      if (ii2.gt.ma(icur2+1)) then
1221           icur2 = icur2 + 1
1222           ii2 = 1
1223      end if
1224
1225      if (jcur0.eq.taskid_j) then
1226
1227         icur = icur1
1228         ii   = ii1
1229         do i=jcol+1,n
1230            if (icur.eq.taskid_i) V(ii,jj0) = A(ii,jj0)
1231            ii = ii + 1
1232            if (ii.gt.ma(icur+1)) then
1233               icur = icur + 1
1234               ii = 1
1235            end if
1236         end do
1237
1238
1239         mu0 = 0.0d0
1240         icur = icur1
1241         ii   = ii1
1242         do i=jcol+1,n
1243            if (icur.eq.taskid_i)
1244     >         mu0 = mu0 + dconjg(V(ii,jj0))*V(ii,jj0)
1245            ii = ii + 1
1246            if (ii.gt.ma(icur+1)) then
1247               icur = icur + 1
1248               ii = 1
1249            end if
1250         end do
1251#ifdef MPI4
1252         stupid_msglen = 1
1253         call MPI_AllReduce(mu0,mu,stupid_msglen,
1254     >                      stupid_double,stupid_sum,
1255     >                      stupid_comm_i,stupid_ierr)
1256#else
1257         call MPI_AllReduce(mu0,mu,1,
1258     >                      MPI_DOUBLE_PRECISION,MPI_SUM,comm_i,ierr)
1259#endif
1260         mu = dsqrt(mu)
1261
1262
1263         if (mu.ne.0.0d0) then
1264cccc need to fix           if (icur1.eq.taskid_i)
1265cccc need to fix   >        beta = V(ii1,jj0) + dsign(mu,V(ii1,jj0))
1266#ifdef MPI4
1267           stupid_msglen = 1
1268           stupid_taskid = icur1
1269           call MPI_Bcast(beta,stupid_msglen,stupid_double,
1270     >                  stupid_taskid,stupid_comm_i,stupid_ierr)
1271#else
1272           call MPI_Bcast(beta,1,MPI_DOUBLE_PRECISION,
1273     >                  icur1,comm_i,ierr)
1274#endif
1275
1276           icur = icur2
1277           ii   = ii2
1278           do i=jcol+2,n
1279              if (icur.eq.taskid_i) V(ii,jj0) = V(ii,jj0)/beta
1280              ii = ii + 1
1281              if (ii.gt.ma(icur+1)) then
1282                 icur = icur + 1
1283                 ii = 1
1284              end if
1285           end do
1286         end if
1287         if (icur1.eq.taskid_i)  V(ii1,jj0) = dcmplx(1.0d0,0.0d0)
1288         if (icur0.eq.taskid_i)  V(ii0,jj0) = dcmplx(0.0d0,0.0d0)
1289
1290         v20 = 0.0d0
1291         icur = icur0
1292         ii   = ii0
1293         do i=jcol,n
1294            if (icur.eq.taskid_i)
1295     >         v20 = v20 + dcmplx(V(ii,jj0))*V(ii,jj0)
1296            ii = ii + 1
1297            if (ii.gt.ma(icur+1)) then
1298               icur = icur + 1
1299               ii = 1
1300            end if
1301         end do
1302#ifdef MPI4
1303         stupid_msglen = 1
1304         call MPI_AllReduce(v20,v2,stupid_msglen,
1305     >                      stupid_double,stupid_sum,
1306     >                      stupid_comm_i,stupid_ierr)
1307#else
1308         call MPI_AllReduce(v20,v2,1,
1309     >                      MPI_DOUBLE_PRECISION,MPI_SUM,comm_i,ierr)
1310#endif
1311         v2 = 2.0d0/v2
1312      end if
1313#ifdef MPI4
1314      stupid_msglen = 1
1315      stupid_taskid = jcur0
1316      call MPI_Bcast(v2,stupid_msglen,stupid_double,
1317     >               stupid_taskid,stupid_comm_j,stupid_ierr)
1318#else
1319      call MPI_Bcast(v2,1,MPI_DOUBLE_PRECISION,
1320     >               jcur0,comm_j,ierr)
1321#endif
1322
1323      call CMatrix_eye(n,n,dcmplx(1.0d0,0.0d0),
1324     >                 Q,lda,ma,na,taskid_i,taskid_j)
1325      call CMatrix_zgemm3(n,n,n,64,
1326     >             dcmplx(-v2,0.0d0),
1327     >             V,ma(taskid_i+1), ma,na,
1328     >             V,ma(taskid_i+1), ma,na,
1329     >             dcmplx(1.0d0,0.0d0),
1330     >             Q,ma(taskid_i+1), ma,na,
1331     >             taskid_i,taskid_j,
1332     >             np_i,np_j,
1333     >             comm_i, comm_j,
1334     >             work1,work2)
1335
1336
1337      return
1338      end
1339
1340
1341*     ***********************************
1342*     *                                 *
1343*     *         CMatrix_eigsrtq         *
1344*     *                                 *
1345*     ***********************************
1346      subroutine CMatrix_eigsrtq(n,eig,
1347     >                  Q,ldq,mq,nq,
1348     >                  taskid_i,taskid_j,
1349     >                  np_i,np_j,
1350     >                  comm_i, comm_j,
1351     >                  work1,work2)
1352      implicit none
1353      integer n
1354
1355      integer ldq,mq(*),nq(*)
1356      real*8     eig(*)
1357      complex*16 Q(ldq,*)
1358
1359      integer taskid_i,taskid_j
1360      integer np_i,np_j
1361      integer comm_i,comm_j
1362      complex*16 work1(*),work2(*)
1363
1364#ifdef MPI4
1365#include "stupid_mpi4.fh"
1366#else
1367#include "mpif.h"
1368#endif
1369
1370
1371*     **** local variables ****
1372      logical notdone
1373      integer i,j,k,l,m,iter
1374      integer ii,jj0,jj1,jcur0,jcur1,ierr,istat
1375      real*8  b,c,f,g,p,r,s
1376
1377
1378      do i=1,n-1
1379         k = i
1380         p = eig(i)
1381         do j=i+1,n
1382            if (eig(j).ge.p) then
1383               k = j
1384               p = eig(j)
1385            end if
1386         end do
1387         if (k.ne.i) then
1388            eig(k) = eig(i)
1389            eig(i) = p
1390
1391            jcur0 = 0
1392            jj0   = 1
1393            do j=1,i-1
1394               jj0 = jj0 + 1
1395               if (jj0.gt.nq(jcur0+1)) then
1396                  jcur0 = jcur0 + 1
1397                  jj0 = 1
1398               end if
1399            end do
1400            jcur1 = 0
1401            jj1   = 1
1402            do j=1,k-1
1403               jj1 = jj1 + 1
1404               if (jj1.gt.nq(jcur1+1)) then
1405                  jcur1 = jcur1 + 1
1406                  jj1 = 1
1407               end if
1408            end do
1409
1410
1411            if (jcur0.eq.taskid_j)
1412     >         call zcopy(mq(taskid_i+1),Q(1,jj0),1,work1,1)
1413            if (jcur1.eq.taskid_j)
1414     >         call zcopy(mq(taskid_i+1),Q(1,jj1),1,work2,1)
1415
1416#ifdef MPI4
1417            stupid_msglen = mq(taskid_i+1)
1418            stupid_taskid = jcur0
1419            stupid_type   = jcur1
1420            call MPI_Bcast(work1,stupid_msglen,stupid_complex,
1421     >                     stupid_taskid,stupid_comm_j,stupid_ierr)
1422            call MPI_Bcast(work2,stupid_msglen,stupid_complex,
1423     >                     stupid_type,stupid_comm_j,stupid_ierr)
1424#else
1425            call MPI_Bcast(work1,mq(taskid_i+1),MPI_DOUBLE_COMPLEX,
1426     >                     jcur0,comm_j,ierr)
1427            call MPI_Bcast(work2,mq(taskid_i+1),MPI_DOUBLE_COMPLEX,
1428     >                     jcur1,comm_j,ierr)
1429#endif
1430
1431            if (jcur0.eq.taskid_j)
1432     >         call zcopy(mq(taskid_i+1),work2,1,Q(1,jj0),1)
1433            if (jcur1.eq.taskid_j)
1434     >         call zcopy(mq(taskid_i+1),work1,1,Q(1,jj1),1)
1435
1436         end if
1437
1438      end do
1439
1440      return
1441      end
1442
1443
1444*     ***********************************
1445*     *                                 *
1446*     *         CMatrix_getdiags        *
1447*     *                                 *
1448*     ***********************************
1449      subroutine CMatrix_getdiags(n,eig,tu,
1450     >                  A,lda,ma,na,
1451     >                  taskid_i,taskid_j,
1452     >                  np_i,np_j,
1453     >                  comm_i,comm_j,
1454     >                  work1)
1455      implicit none
1456      integer n
1457
1458      integer lda,ma(*),na(*)
1459      real*8     eig(*),tu(*)
1460      complex*16 A(lda,*)
1461
1462      integer taskid_i,taskid_j
1463      integer np_i,np_j
1464      integer comm_i,comm_j
1465      real*8 work1(*)
1466
1467#ifdef MPI4
1468#include "stupid_mpi4.fh"
1469#else
1470#include "mpif.h"
1471#endif
1472
1473
1474
1475*     **** local variables ****
1476      integer i,j,ii,jj,is,ie,js,je
1477      integer icur,jcur,ierr
1478
1479*     **************************
1480*     **** gather diagonals ****
1481*     **************************
1482      call dcopy(n,0.0d0,0,work1,1)
1483      call dcopy(n,0.0d0,0,eig,1)
1484      js = 1
1485      do jcur = 0,taskid_j-1
1486        js = js + na(jcur+1)
1487      end do
1488      jcur = taskid_j
1489      je   = js-1 + na(jcur+1)
1490      jj   = 1
1491      do j=js,je
1492
1493         icur=0
1494         ii = 1
1495         do i=1,j-1
1496            ii = ii + 1
1497            if (ii.gt.ma(icur+1)) then
1498               icur = icur + 1
1499               ii = 1
1500            end if
1501         end do
1502         work1(j) = dble(A(ii,jj))
1503
1504#ifdef MPI4
1505         stupid_msglen = 1
1506         stupid_taskid = icur
1507         call MPI_Bcast(work1(j),stupid_msglen,stupid_double,
1508     >                  stupid_taskid,stupid_comm_i,stupid_ierr)
1509#else
1510         call MPI_Bcast(work1(j),1,MPI_DOUBLE_PRECISION,
1511     >                  icur,comm_i,ierr)
1512#endif
1513         jj = jj + 1
1514         if (jj.gt.na(jcur+1)) then
1515            jcur = jcur + 1
1516            jj = 1
1517         end if
1518      end do
1519
1520#ifdef MPI4
1521      stupid_msglen = n
1522      call MPI_AllReduce(work1,eig,stupid_msglen,
1523     >                   stupid_double,stupid_sum,
1524     >                   stupid_comm_j,stupid_ierr)
1525#else
1526      call MPI_AllReduce(work1,eig,n,
1527     >                   MPI_DOUBLE_PRECISION,MPI_SUM,comm_j,ierr)
1528#endif
1529
1530
1531*     ******************************
1532*     **** gather off-diagonals ****
1533*     ******************************
1534      call dcopy(n,0.0d0,0,work1,1)
1535      call dcopy(n,0.0d0,0,tu,1)
1536      is = 1
1537      do icur = 0,taskid_i-1
1538        is = is + ma(icur+1)
1539      end do
1540      icur = taskid_i
1541      ie   = is-1 + ma(icur+1)
1542      if (ie.ge.n) ie=ie-1
1543      ii   = 1
1544      do i=is,ie
1545
1546         jcur=0
1547         jj = 1
1548         do j=1,i
1549            jj = jj + 1
1550            if (jj.gt.na(jcur+1)) then
1551               jcur = jcur + 1
1552               jj = 1
1553            end if
1554         end do
1555         work1(i) = dble(A(ii,jj))
1556#ifdef MPI4
1557         stupid_msglen = 1
1558         stupid_taskid = jcur
1559         call MPI_Bcast(work1(i),stupid_msglen,stupid_double,
1560     >                  stupid_taskid,stupid_comm_j,stupid_ierr)
1561#else
1562         call MPI_Bcast(work1(i),1,MPI_DOUBLE_PRECISION,
1563     >                  jcur,comm_j,ierr)
1564#endif
1565         ii = ii + 1
1566         if (ii.gt.ma(icur+1)) then
1567            icur = icur + 1
1568            ii = 1
1569         end if
1570      end do
1571#ifdef MPI4
1572      stupid_msglen = n-1
1573      call MPI_AllReduce(work1,tu,stupid_msglen,
1574     >                   stupid_double,stupid_sum,
1575     >                   stupid_comm_i,stupid_ierr)
1576#else
1577      call MPI_AllReduce(work1,tu,n-1,
1578     >                   MPI_DOUBLE_PRECISION,MPI_SUM,comm_i,ierr)
1579#endif
1580      return
1581      end
1582
1583
1584
1585
1586*     ***********************************
1587*     *                                 *
1588*     *         CMatrix_MaxAll          *
1589*     *                                 *
1590*     ***********************************
1591      subroutine CMatrix_MaxAll(sum)
1592c     implicit none
1593      complex*16  sum
1594
1595#ifdef MPI4
1596#include "stupid_mpi4.fh"
1597#else
1598#include "mpif.h"
1599#endif
1600
1601
1602      integer msglen,mpierr,np
1603      complex*16  sumall
1604
1605*     **** external functions ****
1606
1607      call Parallel_np(np)
1608      if (np.gt.1) then
1609        msglen = 1
1610#ifdef MPI4
1611        stupid_msglen = 1
1612        call MPI_Allreduce(sum,sumall,stupid_msglen,stupid_complex,
1613     >                      stupid_max,stupid_world,stupid_ierr)
1614#else
1615        call MPI_Allreduce(sum,sumall,msglen,MPI_DOUBLE_COMPLEX,
1616     >                      MPI_MAX,MPI_COMM_WORLD,mpierr)
1617#endif
1618        sum = sumall
1619      end if
1620
1621      return
1622      end
1623
1624
1625
1626
1627*     ***********************************
1628*     *                                 *
1629*     *         CMatrix_SumAll          *
1630*     *                                 *
1631*     ***********************************
1632      subroutine CMatrix_SumAll(sum)
1633c     implicit none
1634      complex*16  sum
1635
1636#ifdef MPI4
1637#include "stupid_mpi4.fh"
1638#else
1639#include "mpif.h"
1640#endif
1641
1642
1643      integer msglen,mpierr,np
1644      complex*16 sumall
1645
1646*     **** external functions ****
1647
1648      call Parallel_np(np)
1649      if (np.gt.1) then
1650        msglen = 1
1651#ifdef MPI4
1652        stupid_msglen = msglen
1653        call MPI_Allreduce(sum,sumall,stupid_msglen,stupid_complex,
1654     >                      stupid_sum,stupid_world,stupid_ierr)
1655#else
1656        call MPI_Allreduce(sum,sumall,msglen,MPI_DOUBLE_COMPLEX,
1657     >                      MPI_SUM,MPI_COMM_WORLD,mpierr)
1658#endif
1659        sum = sumall
1660      end if
1661
1662      return
1663      end
1664
1665
1666
1667
1668
1669*     ***********************************
1670*     *                                 *
1671*     *         CMatrix_mm_transpose     *
1672*     *                                 *
1673*     ***********************************
1674
1675      subroutine CMatrix_mm_transpose(n,A,B,ldq,mq,nq)
1676      implicit none
1677      integer n
1678      integer ldq,mq(*),nq(*)
1679      complex*16  A(ldq,*)
1680      complex*16  B(ldq,*)
1681
1682#include "bafdecls.fh"
1683#include "errquit.fh"
1684#include "mpif.h"
1685
1686#ifdef MPI4
1687#include "stupid_mpi4.fh"
1688#endif
1689
1690*     **** local variables ****
1691      logical value
1692      integer taskid
1693      integer i,j
1694      integer ii,jj,rr,ss
1695      integer icur,jcur,rcur,scur
1696      integer psend,precv,msglen,msgtype,mpierr,status(2)
1697
1698*     **** external functions ****
1699      integer  Parallel2d_convert_taskid_ij
1700      external Parallel2d_convert_taskid_ij
1701
1702
1703      call Parallel_taskid(taskid)
1704      msglen  = 1
1705      msgtype = 1
1706
1707*    **** allocate memory ****
1708      value = BA_push_get(mt_int,MPI_STATUS_SIZE,
1709     >                     'status',status(2),status(1))
1710      if (.not. value)
1711     > call errquit(' CMatrix_m_transpose:out of stack',0,MA_ERR)
1712
1713
1714      jj   = 1
1715      jcur = 0
1716      rr   = 1
1717      rcur = 0
1718      do j=1,n
1719         ii   = 1
1720         icur = 0
1721         ss   = 1
1722         scur = 0
1723         do i=1,n
1724
1725
1726            psend = Parallel2d_convert_taskid_ij(icur,jcur)
1727            precv = Parallel2d_convert_taskid_ij(rcur,scur)
1728
1729            if (psend.eq.precv) then
1730               if (psend.eq.taskid) B(rr,ss) = A(ii,jj)
1731            else
1732#ifdef MPI4
1733               !**** send ****
1734               if (psend.eq.taskid) then
1735                  stupid_msglen = 1
1736                  stupid_type   = msgtype
1737                  stupid_taskid = precv
1738                  call MPI_SEND(A(ii,jj),
1739     >                          stupid_msglen,stupid_complex,
1740     >                          stupid_taskid,
1741     >                          stupid_type,stupid_world,stupid_ierr)
1742               end if
1743
1744               !**** recv ****
1745               if (precv.eq.taskid) then
1746                  stupid_msglen = msglen
1747                  stupid_type   = msgtype
1748                  stupid_taskid = psend
1749                  call MPI_RECV(B(rr,ss),
1750     >                       stupid_msglen,stupid_complex,
1751     >                       stupid_taskid,
1752     >                       stupid_type,stupid_world,
1753     >                       int_mb(status(1)),stupid_ierr)
1754               end if
1755
1756#else
1757               if (psend.eq.taskid)
1758     >         call MPI_SEND(A(ii,jj),1,
1759     >                       MPI_DOUBLE_COMPLEX,precv,
1760     >                       msgtype,MPI_COMM_WORLD,mpierr)
1761
1762               !**** recv ****
1763               if (precv.eq.taskid)
1764     >         call MPI_RECV(B(rr,ss),msglen,
1765     >                       MPI_DOUBLE_COMPLEX,psend,
1766     >                       msgtype,MPI_COMM_WORLD,
1767     >                       int_mb(status(1)),mpierr)
1768#endif
1769            end if
1770
1771
1772            ii = ii + 1
1773            if (ii.gt.mq(icur+1)) then
1774              icur = icur + 1
1775              ii   = 1
1776            end if
1777
1778            ss = ss + 1
1779            if (ss.gt.nq(scur+1)) then
1780              scur = scur + 1
1781              ss   = 1
1782            end if
1783
1784         end do
1785
1786         jj = jj + 1
1787         if (jj.gt.nq(jcur+1)) then
1788           jcur = jcur + 1
1789           jj   = 1
1790         end if
1791
1792         rr = rr + 1
1793         if (rr.gt.mq(rcur+1)) then
1794           rcur = rcur + 1
1795           rr   = 1
1796         end if
1797
1798      end do
1799
1800      !*** deallocate memory ***
1801      value = BA_pop_stack(status(2))
1802      if (.not. value)
1803     > call errquit(' CMatrix_m_transpose:popping stack',0,MA_ERR)
1804
1805      return
1806      end
1807
1808
1809
1810
1811
1812
1813*     ***********************************
1814*     *                                 *
1815*     *       CMatrix_combo_zgemm2      *
1816*     *                                 *
1817*     ***********************************
1818      subroutine CMatrix_combo_zgemm2(m,n,k,nblock,
1819     >                  alpha,
1820     >                  A,B,lda,ma,na,
1821     >                  beta,
1822     >                  C,ldc,ldc2,mc,nc,
1823     >                  taskid_i,taskid_j,
1824     >                  np_i,np_j,
1825     >                  comm_i, comm_j,
1826     >                  work1,work2)
1827      implicit none
1828      integer m,n,k,nblock
1829      complex*16  alpha
1830
1831      integer lda,ma(*),na(*)
1832      complex*16  A(lda,*)
1833      complex*16  B(lda,*)
1834
1835      complex*16  beta
1836
1837      integer ldc,ldc2,mc(*),nc(*)
1838      complex*16  C(ldc,ldc2,3)
1839
1840      integer taskid_i,taskid_j
1841      integer np_i,np_j
1842      integer comm_i,comm_j
1843
1844      complex*16  work1(*),work2(*)
1845
1846
1847#ifdef MPI4
1848#include "stupid_mpi4.fh"
1849#else
1850#include "mpif.h"
1851#endif
1852
1853
1854*     **** local variables ****
1855      logical docalc1,docalc2
1856      integer i,j,ii,jj
1857      integer kk,iwrk,icur,jcur,ierr,shift,shft,shft2,shft3
1858
1859      do kk=1,3
1860      do j=1,nc(taskid_j+1)
1861         do i=1,mc(taskid_i+1)
1862            C(i,j,kk) = beta*C(i,j,kk)
1863         end do
1864      end do
1865      end do
1866
1867      ii = 0
1868      jj = 0
1869      kk = 0
1870      icur = 0
1871      jcur = 0
1872c     **** loop over all row pannels of C ***
1873      do while (kk.lt.m)
1874         iwrk = min(nblock, mc(icur+1)-ii)
1875         iwrk = min(iwrk,   na(jcur+1)-jj)
1876
1877
1878*        **** iwrk*nc(taskid_j+1) submatrix !=0 ****
1879         if (ma(taskid_i+1).gt.0) then
1880
1881            shft  = iwrk*ma(taskid_i+1)
1882            shft2 = iwrk*nc(taskid_j+1)
1883            shft3 = shft2+shft2
1884
1885*           **** pack current iwrk columns of A into work1 ***
1886            if (taskid_j.eq.jcur) then
1887               call zlacpy("G", ma(taskid_i+1),iwrk,
1888     >                   A(1,jj+1), lda,
1889     >                   work1,     ma(taskid_i+1))
1890
1891               call zlacpy("G", ma(taskid_i+1),iwrk,
1892     >                   B(1,jj+1), lda,
1893     >                   work1(1+shft),  ma(taskid_i+1))
1894            end if
1895
1896c           **** broadcast work1  within my row ***
1897#ifdef MPI4
1898            stupid_msglen = 2*iwrk*ma(taskid_i+1)
1899            stupid_taskid = jcur
1900            call MPI_Bcast(work1,stupid_msglen,
1901     >                     stupid_complex,stupid_taskid,
1902     >                     stupid_comm_j,stupid_ierr)
1903#else
1904            call MPI_Bcast(work1,2*iwrk*ma(taskid_i+1),
1905     >                     MPI_DOUBLE_COMPLEX,jcur,comm_j,ierr)
1906#endif
1907
1908
1909            if ((iwrk.gt.0)          .and.
1910     >          (na(taskid_j+1).gt.0)) then
1911
1912              call zgemm('C','N',iwrk,na(taskid_j+1),ma(taskid_i+1),
1913     >                   alpha,
1914     >                   work1, ma(taskid_i+1),
1915     >                   A, lda,
1916     >                   dcmplx(0.0d0,0.0d0),
1917     >                   work2, iwrk)
1918              call zgemm('C','N',iwrk,na(taskid_j+1),ma(taskid_i+1),
1919     >                   alpha,
1920     >                   work1, ma(taskid_i+1),
1921     >                   B, lda,
1922     >                   dcmplx(0.0d0,0.0d0),
1923     >                   work2(1+shft2), iwrk)
1924              call zgemm('C','N',iwrk,na(taskid_j+1),ma(taskid_i+1),
1925     >                   alpha,
1926     >                   work1(1+shft), ma(taskid_i+1),
1927     >                   B, lda,
1928     >                   dcmplx(0.0d0,0.0d0),
1929     >                   work2(1+shft3), iwrk)
1930            end if
1931
1932*        **** iwrk*nc(taskid_j+1) submatrix ==0 ****
1933         else
1934            call dcopy(3*nc(taskid_j+1)*iwrk,0.0d0,0,work2,1)
1935         end if
1936
1937
1938c        **** summ to node that holds current rows of C ****
1939#ifdef MPI4
1940         stupid_msglen = 3*nc(taskid_j+1)*iwrk
1941         stupid_taskid = icur
1942         call MPI_Reduce(work2,work1,stupid_msglen,
1943     >                   stupid_complex,stupid_sum,
1944     >                   stupid_taskid,stupid_comm_i,stupid_ierr)
1945#else
1946         call MPI_Reduce(work2,work1,3*nc(taskid_j+1)*iwrk,
1947     >                   MPI_DOUBLE_COMPLEX,MPI_SUM,icur,comm_i,ierr)
1948#endif
1949
1950
1951c        **** add to current rows of C ****
1952         if (taskid_i.eq.icur) then
1953            shift = 1
1954            do i=ii,(ii+iwrk-1)
1955              call daxpy(2*nc(taskid_j+1),1.0d0,work1(shift),iwrk,
1956     >                                    C(i+1,1,1),mc(taskid_i+1))
1957              call daxpy(2*nc(taskid_j+1),1.0d0,work1(shift+shft2),iwrk,
1958     >                                    C(i+1,1,2),mc(taskid_i+1))
1959              call daxpy(2*nc(taskid_j+1),1.0d0,work1(shift+shft3),iwrk,
1960     >                                    C(i+1,1,3),mc(taskid_i+1))
1961              shift = shift + 1
1962            end do
1963         end if
1964
1965         ii = ii + iwrk
1966         jj = jj + iwrk
1967         kk = kk + iwrk
1968
1969         if (jj.ge.na(jcur+1)) then
1970           jcur = jcur + 1
1971           jj   = 0
1972         end if
1973         if (ii.ge.mc(icur+1)) then
1974           icur = icur + 1
1975           ii   = 0
1976         end if
1977
1978      end do
1979
1980      return
1981      end
1982
1983