1#if HAVE_CONFIG_H
2#   include "config.fh"
3#endif
4#define xx_dgemm dgemm
5      subroutine ga_dgemm(transa, transb, m, n, k, alpha, g_a,
6     $     g_b, beta, g_c)
7C$Id: ga_dgemm.F,v 1.29 2000/11/04 01:46:31 d3h325 Exp $
8      implicit none
9      Character*1        transa, transb
10      Integer            m, n, k
11      Double precision   alpha, beta
12      Integer            g_a, g_b, g_c
13#include "mafdecls.fh"
14#include "global.fh"
15c
16c     GA_DGEMM  performs one of the matrix-matrix operations:
17c           C := alpha*op( A )*op( B ) + beta*C,
18c     where  op( X ) is one of
19c           op( X ) = X   or   op( X ) = X`,
20c
21c     alpha and beta are scalars, and A, B and C are matrices, with op( A )
22c     an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
23c
24c     On entry, TRANSA specifies the form of op( A ) to be used in
25c     the matrix multiplication as follows:
26c           transa = 'N' or 'n',  op( A ) = A.
27c           transa = 'T' or 't',  op( A ) = A`.
28c
29c     M      - On entry,  M  specifies  the number  of rows  of the  matrix
30c              op( A )  and of the  matrix  C.  M  must  be at least  zero.
31c     N      - On entry,  N  specifies the number  of columns of the matrix
32c              op( B ) and the number of columns of the matrix C. N must be
33c              at least zero.
34c     K      - On entry,  K  specifies  the number of columns of the matrix
35c              op( A ) and the number of rows of the matrix op( B ). K must
36c              be at least  zero.
37c
38      integer ilo, ihi, jlo, jhi, klo, khi, ichunk, jchunk, kchunk
39      integer idim, jdim, kdim, adim, bdim, cdim, ijk, me, nproc
40      integer l_a, k_a, l_b, k_b
41      logical status
42C
43      Logical Get_New_B ! Allow reuse of B patch when possible
44C
45      Double Precision Chunk_cube
46      Integer Min_Tasks, Max_Chunk, Mem_Avail
47      integer l_mxn,k_mxn,i0,i1,j0,j1,ldc,adrc
48      integer an1, an2, bn1, bn2, cn1, cn2
49      integer ilor, ihir,jlor,jhir,klor,khir,itipo,ijmax
50      double precision t0,t1,gflop
51      external MPI_Wtime
52      double precision MPI_Wtime
53      Parameter ( Min_Tasks = 10) ! Minimum acceptable tasks per node
54c
55C     Set defaults -- platform dependent
56#ifdef GATIME
57      t0=MPI_Wtime()
58#endif
59      ichunk = 512
60      jchunk = 512
61      kchunk = 512
62C
63      me = ga_nodeid()
64      nproc = ga_nnodes()
65c      if(me.eq.0)
66c     W     write(6,*) ' transa, transb ', transa, transb
67C
68C     Make an estimate of how large patches can be and still insure
69C     enough tasks per processor that loads will be reasonably balanced.
70C
71C     Patches per dimension are M/chunk, N/chunk, K/chunk so total tasks
72C     is roughly (K*M*N)/(chunk**3).  Assume all chunk sizes are the
73C     same and solve for the one that provides the minimum acceptable
74C     number of tasks.
75C
76C        Find out how much memory we can grab.  It will be used in
77C        three chunks, and the result includes only the first one.
78C
79         Mem_Avail = MA_Inquire_Avail( MT_DBL )
80     $      - 2 * MA_SizeOf_Overhead( MT_DBL )
81	 Mem_Avail = 0.9 * Mem_Avail ! Do not use every last drop!
82c         Call GA_IGOp(42, Mem_Avail, 1, 'min')
83C
84
85c
86      if (beta .eq. 0.0d0) then
87         call ga_zero(g_c)
88      else
89         call ga_scale(g_c, beta)
90      endif
91c
92      call ga_distribution(g_c,
93     .     ga_nodeid(), i0, i1, j0, j1)
94      call ga_inquire(g_a,
95     .     itipo, an1, an2)
96      call ga_inquire(g_b,
97     .     itipo, bn1, bn2)
98      call ga_inquire(g_c,
99     .     itipo, cn1, cn2)
100      if (i0.gt.0 .and. i0.le.i1) then
101         ilo=i0
102         ihi=i1
103         idim = ihi - ilo + 1
104         jlo=j0
105         jhi=j1
106         jdim = jhi - jlo + 1
107#if 0
108         write(6,'(I4,A,4I6)') ga_nodeid(),' IJ ',i0,i1,j0,j1
109      if(ga_nodeid().eq.0) call ffflush(6)
110      if(ga_nodeid().eq.0) call ffflush(0)
111#endif
112         ijmax=max(idim,jdim)
113         KChunk =  Int((DBLE(Mem_Avail/(2*ijmax))))
114         kchunk=min(kchunk,ijmax)
115      status = .true.
116      status = ma_push_get(MT_DBL, idim*kchunk, 'ga_dgemm:a', l_a,k_a)
117     $     .and. status
118      status = ma_push_get(MT_DBL, kchunk*jdim, 'ga_dgemm:b', l_b,k_b)
119     $     .and. status
120      if (.not. status) call ga_error('ga_dgemm: insufficent memory?',
121     A idim*kchunk+kchunk*jdim)
122         call ga_access(g_c, i0, i1, j0, j1, adrc, ldc)
123      ijk = 0
124         do klo = 1, k, kchunk
125            khi = min(k, klo+kchunk-1)
126            kdim = khi - klo + 1
127C
128C           Each pass through the outer two loops means we need a
129C           different patch of B.
130C
131            Get_New_B = .TRUE.
132C
133                  cdim = idim
134                  if (transa.eq.'n' .or. transa.eq.'N') then
135                     ilor=min(an1,ilo)
136                     ihir=min(an1,ihi)
137                     klor=min(an2,klo)
138                     khir=min(an2,khi)
139                     kdim=khir-klor+1
140                     idim=ihir-ilor+1
141                     adim = idim
142                     cdim = idim
143                     call ga_get(g_a, ilor, ihir, klor, khir,
144     $                  dbl_mb(k_a), adim)
145                  else
146                     klor=min(an1,klo)
147                     khir=min(an1,khi)
148                     ilor=min(an2,ilo)
149                     ihir=min(an2,ihi)
150                     kdim=khir-klor+1
151                     idim=ihir-ilor+1
152                     adim = kdim
153                     cdim=idim
154                     call ga_get(g_a, klor, khir, ilor, ihir,
155     $                  dbl_mb(k_a), adim)
156                  endif
157C
158C                 Avoid rereading B if it is the same patch as last time.
159C
160                  If ( Get_New_B ) then
161                     if (transb.eq.'n' .or. transb.eq.'N') then
162                        klor=min(bn1,klo)
163                        khir=min(bn1,khi)
164                        jlor=min(bn2,jlo)
165                        jhir=min(bn2,jhi)
166                        kdim=khir-klor+1
167                        idim=ihir-ilor+1
168                        bdim = kdim
169                        call ga_get(g_b, klor, khir, jlor, jhir,
170     $                     dbl_mb(k_b), bdim)
171                     else
172                        jlor=min(bn1,jlo)
173                        jhir=min(bn1,jhi)
174                        klor=min(bn2,klo)
175                        khir=min(bn2,khi)
176                        kdim=khir-klor+1
177                        jdim=jhir-jlor+1
178                        bdim = jdim
179                        call ga_get(g_b, jlor, jhir, klor, khir,
180     $                     dbl_mb(k_b), bdim)
181                     endif
182                     Get_New_B = .FALSE. ! Until J or K change again
183                  EndIf
184C
185                  call xx_dgemm(transa, transb, idim, jdim, kdim,
186     $                 alpha, dbl_mb(k_a), adim, dbl_mb(k_b), bdim,
187     $                 1.0d0, dbl_mb(adrc), cdim)
188         enddo
189      status = ma_chop_stack(l_a)
190      if (.not. status)call ga_error('ga_dgemm: pop of stack failed', 1)
191      endif
192      call ga_release_update(g_c, i0, i1, j0, j1)
193      call ga_sync()
194#ifdef GATIME
195      if(ga_nodeid().eq.0) then
196        call ffflush(6)
197          t1=MPI_Wtime()-t0
198          gflop=2d0*n*m*k/(t1*1d9)
199      write(6,'(I4,A,3F14.6)')
200     G     ga_nodeid(),' dgemm done in ',t1,
201     ,     gflop,gflop/ga_nnodes()
202      endif
203#endif
204c
205      end
206
207      subroutine lga_acc(out, dim1,dim2,
208     I     ilo, ihi, jlo, jhi, buf,
209     $     ld, alpha)
210      implicit none
211      integer dim1,dim2
212      integer ilo, ihi, jlo, jhi,ld
213      integer i,j
214      double precision alpha
215      double precision out(1:dim1,1:dim2)
216      double precision buf(1:ld, 1:*)
217
218      do j=jlo,jhi
219         do i=ilo,ihi
220            out(i, j) = out(i, j) +
221     +           alpha*buf(i-ilo+1, j-jlo+1)
222         enddo
223      enddo
224      return
225      end
226      subroutine lga_accbrd(g_c,m,n,out)
227      implicit none
228#include "global.fh"
229#include "mafdecls.fh"
230      integer m,n
231      double precision out(1:m,1:n)
232      integer g_c
233c
234      integer ilo,ihi,jlo,jhi,numi,numj,k_in,l_in,i,j
235      logical status
236c
237      call ga_distribution(g_c,
238     .     ga_nodeid(), ilo, ihi, jlo, jhi)
239      if (ilo.gt.0 .and. ilo.le.ihi) then
240         numi =  ihi-ilo+1
241         numj =  jhi-jlo+1
242         if (numi.gt.0 .and. numj.gt.0) then
243            if (.not.MA_Push_Get(MT_Dbl,numi*numj,'MxN',l_in,k_in))
244     &           call ga_error('dft_scf: cannot allocate eval',0)
245            call ga_get(g_c,ilo,ihi,jlo,jhi,
246     .           dbl_mb(k_in),numi)
247            do j=jlo,jhi
248               do i=ilo,ihi
249                  call daxpy(numi,1d0,dbl_mb(k_in+(j-jlo)*numi),1,
250     1                 out,1)
251               enddo
252            enddo
253            call ga_put(g_c,ilo,ihi,jlo,jhi,
254     .           out(ilo,jlo),m)
255            status = ma_pop_stack(l_in)
256            if (.not. status)call ga_error('gad: pop of stack failed',
257     3           numi*numj)
258         endif
259      endif
260      return
261      end
262