1#if HAVE_CONFIG_H 2# include "config.fh" 3#endif 4#define xx_dgemm dgemm 5 subroutine ga_dgemm(transa, transb, m, n, k, alpha, g_a, 6 $ g_b, beta, g_c) 7C$Id: ga_dgemm.F,v 1.29 2000/11/04 01:46:31 d3h325 Exp $ 8 implicit none 9 Character*1 transa, transb 10 Integer m, n, k 11 Double precision alpha, beta 12 Integer g_a, g_b, g_c 13#include "mafdecls.fh" 14#include "global.fh" 15c 16c GA_DGEMM performs one of the matrix-matrix operations: 17c C := alpha*op( A )*op( B ) + beta*C, 18c where op( X ) is one of 19c op( X ) = X or op( X ) = X`, 20c 21c alpha and beta are scalars, and A, B and C are matrices, with op( A ) 22c an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. 23c 24c On entry, TRANSA specifies the form of op( A ) to be used in 25c the matrix multiplication as follows: 26c transa = 'N' or 'n', op( A ) = A. 27c transa = 'T' or 't', op( A ) = A`. 28c 29c M - On entry, M specifies the number of rows of the matrix 30c op( A ) and of the matrix C. M must be at least zero. 31c N - On entry, N specifies the number of columns of the matrix 32c op( B ) and the number of columns of the matrix C. N must be 33c at least zero. 34c K - On entry, K specifies the number of columns of the matrix 35c op( A ) and the number of rows of the matrix op( B ). K must 36c be at least zero. 37c 38 integer ilo, ihi, jlo, jhi, klo, khi, ichunk, jchunk, kchunk 39 integer idim, jdim, kdim, adim, bdim, cdim, ijk, me, nproc 40 integer l_a, k_a, l_b, k_b 41 logical status 42C 43 Logical Get_New_B ! Allow reuse of B patch when possible 44C 45 Double Precision Chunk_cube 46 Integer Min_Tasks, Max_Chunk, Mem_Avail 47 integer l_mxn,k_mxn,i0,i1,j0,j1,ldc,adrc 48 integer an1, an2, bn1, bn2, cn1, cn2 49 integer ilor, ihir,jlor,jhir,klor,khir,itipo,ijmax 50 double precision t0,t1,gflop 51 external MPI_Wtime 52 double precision MPI_Wtime 53 Parameter ( Min_Tasks = 10) ! Minimum acceptable tasks per node 54c 55C Set defaults -- platform dependent 56#ifdef GATIME 57 t0=MPI_Wtime() 58#endif 59 ichunk = 512 60 jchunk = 512 61 kchunk = 512 62C 63 me = ga_nodeid() 64 nproc = ga_nnodes() 65c if(me.eq.0) 66c W write(6,*) ' transa, transb ', transa, transb 67C 68C Make an estimate of how large patches can be and still insure 69C enough tasks per processor that loads will be reasonably balanced. 70C 71C Patches per dimension are M/chunk, N/chunk, K/chunk so total tasks 72C is roughly (K*M*N)/(chunk**3). Assume all chunk sizes are the 73C same and solve for the one that provides the minimum acceptable 74C number of tasks. 75C 76C Find out how much memory we can grab. It will be used in 77C three chunks, and the result includes only the first one. 78C 79 Mem_Avail = MA_Inquire_Avail( MT_DBL ) 80 $ - 2 * MA_SizeOf_Overhead( MT_DBL ) 81 Mem_Avail = 0.9 * Mem_Avail ! Do not use every last drop! 82c Call GA_IGOp(42, Mem_Avail, 1, 'min') 83C 84 85c 86 if (beta .eq. 0.0d0) then 87 call ga_zero(g_c) 88 else 89 call ga_scale(g_c, beta) 90 endif 91c 92 call ga_distribution(g_c, 93 . ga_nodeid(), i0, i1, j0, j1) 94 call ga_inquire(g_a, 95 . itipo, an1, an2) 96 call ga_inquire(g_b, 97 . itipo, bn1, bn2) 98 call ga_inquire(g_c, 99 . itipo, cn1, cn2) 100 if (i0.gt.0 .and. i0.le.i1) then 101 ilo=i0 102 ihi=i1 103 idim = ihi - ilo + 1 104 jlo=j0 105 jhi=j1 106 jdim = jhi - jlo + 1 107#if 0 108 write(6,'(I4,A,4I6)') ga_nodeid(),' IJ ',i0,i1,j0,j1 109 if(ga_nodeid().eq.0) call ffflush(6) 110 if(ga_nodeid().eq.0) call ffflush(0) 111#endif 112 ijmax=max(idim,jdim) 113 KChunk = Int((DBLE(Mem_Avail/(2*ijmax)))) 114 kchunk=min(kchunk,ijmax) 115 status = .true. 116 status = ma_push_get(MT_DBL, idim*kchunk, 'ga_dgemm:a', l_a,k_a) 117 $ .and. status 118 status = ma_push_get(MT_DBL, kchunk*jdim, 'ga_dgemm:b', l_b,k_b) 119 $ .and. status 120 if (.not. status) call ga_error('ga_dgemm: insufficent memory?', 121 A idim*kchunk+kchunk*jdim) 122 call ga_access(g_c, i0, i1, j0, j1, adrc, ldc) 123 ijk = 0 124 do klo = 1, k, kchunk 125 khi = min(k, klo+kchunk-1) 126 kdim = khi - klo + 1 127C 128C Each pass through the outer two loops means we need a 129C different patch of B. 130C 131 Get_New_B = .TRUE. 132C 133 cdim = idim 134 if (transa.eq.'n' .or. transa.eq.'N') then 135 ilor=min(an1,ilo) 136 ihir=min(an1,ihi) 137 klor=min(an2,klo) 138 khir=min(an2,khi) 139 kdim=khir-klor+1 140 idim=ihir-ilor+1 141 adim = idim 142 cdim = idim 143 call ga_get(g_a, ilor, ihir, klor, khir, 144 $ dbl_mb(k_a), adim) 145 else 146 klor=min(an1,klo) 147 khir=min(an1,khi) 148 ilor=min(an2,ilo) 149 ihir=min(an2,ihi) 150 kdim=khir-klor+1 151 idim=ihir-ilor+1 152 adim = kdim 153 cdim=idim 154 call ga_get(g_a, klor, khir, ilor, ihir, 155 $ dbl_mb(k_a), adim) 156 endif 157C 158C Avoid rereading B if it is the same patch as last time. 159C 160 If ( Get_New_B ) then 161 if (transb.eq.'n' .or. transb.eq.'N') then 162 klor=min(bn1,klo) 163 khir=min(bn1,khi) 164 jlor=min(bn2,jlo) 165 jhir=min(bn2,jhi) 166 kdim=khir-klor+1 167 idim=ihir-ilor+1 168 bdim = kdim 169 call ga_get(g_b, klor, khir, jlor, jhir, 170 $ dbl_mb(k_b), bdim) 171 else 172 jlor=min(bn1,jlo) 173 jhir=min(bn1,jhi) 174 klor=min(bn2,klo) 175 khir=min(bn2,khi) 176 kdim=khir-klor+1 177 jdim=jhir-jlor+1 178 bdim = jdim 179 call ga_get(g_b, jlor, jhir, klor, khir, 180 $ dbl_mb(k_b), bdim) 181 endif 182 Get_New_B = .FALSE. ! Until J or K change again 183 EndIf 184C 185 call xx_dgemm(transa, transb, idim, jdim, kdim, 186 $ alpha, dbl_mb(k_a), adim, dbl_mb(k_b), bdim, 187 $ 1.0d0, dbl_mb(adrc), cdim) 188 enddo 189 status = ma_chop_stack(l_a) 190 if (.not. status)call ga_error('ga_dgemm: pop of stack failed', 1) 191 endif 192 call ga_release_update(g_c, i0, i1, j0, j1) 193 call ga_sync() 194#ifdef GATIME 195 if(ga_nodeid().eq.0) then 196 call ffflush(6) 197 t1=MPI_Wtime()-t0 198 gflop=2d0*n*m*k/(t1*1d9) 199 write(6,'(I4,A,3F14.6)') 200 G ga_nodeid(),' dgemm done in ',t1, 201 , gflop,gflop/ga_nnodes() 202 endif 203#endif 204c 205 end 206 207 subroutine lga_acc(out, dim1,dim2, 208 I ilo, ihi, jlo, jhi, buf, 209 $ ld, alpha) 210 implicit none 211 integer dim1,dim2 212 integer ilo, ihi, jlo, jhi,ld 213 integer i,j 214 double precision alpha 215 double precision out(1:dim1,1:dim2) 216 double precision buf(1:ld, 1:*) 217 218 do j=jlo,jhi 219 do i=ilo,ihi 220 out(i, j) = out(i, j) + 221 + alpha*buf(i-ilo+1, j-jlo+1) 222 enddo 223 enddo 224 return 225 end 226 subroutine lga_accbrd(g_c,m,n,out) 227 implicit none 228#include "global.fh" 229#include "mafdecls.fh" 230 integer m,n 231 double precision out(1:m,1:n) 232 integer g_c 233c 234 integer ilo,ihi,jlo,jhi,numi,numj,k_in,l_in,i,j 235 logical status 236c 237 call ga_distribution(g_c, 238 . ga_nodeid(), ilo, ihi, jlo, jhi) 239 if (ilo.gt.0 .and. ilo.le.ihi) then 240 numi = ihi-ilo+1 241 numj = jhi-jlo+1 242 if (numi.gt.0 .and. numj.gt.0) then 243 if (.not.MA_Push_Get(MT_Dbl,numi*numj,'MxN',l_in,k_in)) 244 & call ga_error('dft_scf: cannot allocate eval',0) 245 call ga_get(g_c,ilo,ihi,jlo,jhi, 246 . dbl_mb(k_in),numi) 247 do j=jlo,jhi 248 do i=ilo,ihi 249 call daxpy(numi,1d0,dbl_mb(k_in+(j-jlo)*numi),1, 250 1 out,1) 251 enddo 252 enddo 253 call ga_put(g_c,ilo,ihi,jlo,jhi, 254 . out(ilo,jlo),m) 255 status = ma_pop_stack(l_in) 256 if (.not. status)call ga_error('gad: pop of stack failed', 257 3 numi*numj) 258 endif 259 endif 260 return 261 end 262