1 #if HAVE_CONFIG_H
2 #   include "config.h"
3 #endif
4 
5 /* $Id: matmul.c,v 1.60.4.1 2006-12-22 13:05:22 manoj Exp $ */
6 /*===========================================================
7  *
8  *         GA_Dgemm(): Parallel Matrix Multiplication
9  *              (i.e.  C = alpha*A*B + beta*C)
10  *
11  *===========================================================*/
12 
13 #if HAVE_STDLIB_H
14 #   include <stdlib.h>
15 #endif
16 #if HAVE_STRING_H
17 #   include <string.h>
18 #endif
19 #include "matmul.h"
20 #include "ga_iterator.h"
21 #include "ga-papi.h"
22 #include "ga-wapi.h"
23 
24 #define DEBUG_ 0 /*set 1, to verify the correctness of parallel matrix mult.*/
25 
26 /* some optimization macros */
27 #define KCHUNK_OPTIMIZATION 0 /* This Opt performing well only for m=1000;n=1000'k=2000 kinda cases and not for the opposite*/
28 
29 /* Optimization flags: Initialized everytime in pnga_matmul() */
30 static short int CYCLIC_DISTR_OPT_FLAG  = SET;
31 static short int CONTIG_CHUNKS_OPT_FLAG = SET;
32 static short int DIRECT_ACCESS_OPT_FLAG = SET;
33 
34 Integer gNbhdlA[2], gNbhdlB[2], gNbhdlC[2];/* for A and B matrix */
35 
36 static int _gai_matmul_patch_flag = 0;
gai_matmul_patch_flag(int flag)37 void gai_matmul_patch_flag(int flag)
38 {
39     _gai_matmul_patch_flag = flag;
40 }
41 
max3(int ichunk,int jchunk,int kchunk)42 static inline int max3(int ichunk, int jchunk, int kchunk) {
43   if(ichunk>jchunk) return GA_MAX(ichunk,kchunk);
44   else return GA_MAX(jchunk, kchunk);
45 }
46 
init_task_list(task_list_t * thing)47 static inline void init_task_list(task_list_t *thing)
48 {
49     thing->lo[0] = 0;
50     thing->lo[1] = 0;
51     thing->hi[0] = 0;
52     thing->hi[1] = 0;
53     thing->dim[0] = 0;
54     thing->dim[1] = 0;
55     thing->chunkBId = 0;
56     thing->do_put = 0;
57 }
58 
GET_BLOCK(Integer g_x,task_list_t * chunk,void * buf,char * trans,Integer xilo,Integer xjlo,Integer * dim_next,Integer * nbhdl)59 static void GET_BLOCK(Integer g_x, task_list_t *chunk, void *buf,
60 		      char *trans, Integer xilo, Integer xjlo,
61 		      Integer *dim_next, Integer *nbhdl) {
62 
63     Integer i0, i1, j0, j1;
64     Integer lo[2], hi[2];
65 
66     if(*trans == 'n' || *trans == 'N') {
67        *dim_next = chunk->dim[0];
68        i0= xilo+chunk->lo[0]; i1= xilo+chunk->hi[0];
69        j0= xjlo+chunk->lo[1]; j1= xjlo+chunk->hi[1];
70     }
71     else {
72        *dim_next = chunk->dim[1];
73        i0= xjlo+chunk->lo[1]; i1= xjlo+chunk->hi[1];
74        j0= xilo+chunk->lo[0]; j1= xilo+chunk->hi[0];
75     }
76 
77     lo[0] = i0;
78     lo[1] = j0;
79     hi[0] = i1;
80     hi[1] = j1;
81     pnga_nbget(g_x, lo, hi, buf, dim_next, nbhdl);
82 }
83 
84 static short int
gai_get_task_list(task_list_t * taskListA,task_list_t * taskListB,task_list_t * state,Integer istart,Integer jstart,Integer kstart,Integer iend,Integer jend,Integer kend,Integer Ichunk,Integer Jchunk,Integer Kchunk,int * max_tasks,Integer g_a)85 gai_get_task_list(task_list_t *taskListA, task_list_t *taskListB,
86 		  task_list_t *state, Integer istart, Integer jstart,
87 		  Integer kstart, Integer iend, Integer jend, Integer kend,
88 		  Integer Ichunk, Integer Jchunk, Integer Kchunk,
89 		  int *max_tasks, Integer g_a) {
90 
91     int ii, jj, nloops=0;
92     short int do_put, more_chunks_left=0, recovery=0;
93     Integer ilo, ihi, jlo, jhi, klo, khi, get_new_B;
94     Integer jstart_=jstart, kstart_=kstart;
95 
96     if(state->lo[0] != -1) recovery = 1;
97 
98     nloops = (iend-istart+1)/Ichunk + ( ((iend-istart+1)%Ichunk)?1:0 );
99     if(nloops>MAX_CHUNKS) pnga_error("Increase MAX_CHUNKS value in matmul.h",0L);
100 
101     if(recovery) jstart_ = state->lo[0]; /* recovering the previous state */
102     for(ii=jj=0, jlo = jstart_; jlo <= jend; jlo += Jchunk) {
103        jhi = GA_MIN(jend, jlo+Jchunk-1);
104 
105        if(recovery) {
106 	  do_put = state->do_put;
107 	  kstart_ =  state->lo[1];
108        }
109        else do_put = SET; /* for 1st shot we can put, instead of accumulate */
110 
111        for(klo = kstart_; klo <= kend; klo += Kchunk) {
112 	  khi = GA_MIN(kend, klo+Kchunk-1);
113 	  get_new_B = TRUE;
114 
115 	  /* set it back after the first loop */
116 	  recovery = 0;
117 	  jstart_ = jstart;
118 	  kstart_ = kstart;
119 
120 	  /* save CURRENT STATE. Saving state before "i" loop helps to avoid
121 	     tracking get_new_B, which is hassle in ga_matmul_regular() */
122 	  if(ii+nloops >= MAX_CHUNKS) {
123 	     more_chunks_left = 1;
124 	     state->lo[0]  = jlo;
125 	     state->lo[1]  = klo;
126 	     state->do_put   = do_put;
127 	     break;
128 	  }
129 
130 	  for(ilo = istart; ilo <= iend; ilo += Ichunk){
131 	     ihi = GA_MIN(iend, ilo+Ichunk-1);
132 	     taskListA[ii].dim[0] = ihi - ilo + 1;
133 	     taskListA[ii].dim[1] = khi - klo + 1;
134 	     taskListA[ii].lo[0]  = ilo; taskListA[ii].hi[0] = ihi;
135 	     taskListA[ii].lo[1]  = klo; taskListA[ii].hi[1] = khi;
136 	     taskListA[ii].do_put = do_put;
137 	     if(get_new_B) { /* B matrix */
138 		ihi = GA_MIN(iend, ilo+Ichunk-1);
139 		taskListB[jj].dim[0] = khi - klo + 1;
140 		taskListB[jj].dim[1] = jhi - jlo + 1;
141 		taskListB[jj].lo[0]  = klo; taskListB[jj].hi[0] = khi;
142 		taskListB[jj].lo[1]  = jlo; taskListB[jj].hi[1] = jhi;
143 		get_new_B = FALSE; /* Until J or K change again */
144 		taskListA[ii].chunkBId = jj;
145 		++jj;
146 	     }
147 	     else taskListA[ii].chunkBId = taskListA[ii-1].chunkBId;
148 	     ++ii;
149 	  }
150 	  if (more_chunks_left) break;
151 	  do_put = UNSET;
152        }
153        if (more_chunks_left) break;
154     }
155 
156     *max_tasks = ii;
157 
158     /* Optimization disabled if chunks exceeds buffer space */
159     if(more_chunks_left) CYCLIC_DISTR_OPT_FLAG = UNSET;
160 
161     if(CYCLIC_DISTR_OPT_FLAG) { /* should not be called for irregular matmul */
162        int prow, pcol, offset, grp_me;
163        Integer a_grp = pnga_get_pgroup(g_a);
164        grp_me = (int)pnga_pgroup_nodeid(a_grp);
165        prow = GA[GA_OFFSET + g_a].nblock[0];
166        pcol = GA[GA_OFFSET + g_a].nblock[1];
167        offset = (grp_me/prow + grp_me%prow) % pcol;
168        for(jj=0, ilo = istart; ilo <= iend; jj++, ilo += Ichunk)
169 	  taskListA[jj].do_put = UNSET;
170        for(jj=0, ilo = istart; ilo <= iend; jj++, ilo += Ichunk)
171 	  taskListA[jj+offset].do_put = SET;
172     }
173 
174     return more_chunks_left;
175 }
176 
gai_get_chunk_size(int irregular,Integer * Ichunk,Integer * Jchunk,Integer * Kchunk,Integer * elems,Integer atype,Integer m,Integer n,Integer k,short int nbuf,short int use_armci_memory,Integer a_grp)177 static void gai_get_chunk_size(int irregular,Integer *Ichunk,Integer *Jchunk,
178 			       Integer *Kchunk,Integer *elems,Integer atype,
179 			       Integer m,Integer n,Integer k, short int nbuf,
180 			       short int use_armci_memory, Integer a_grp) {
181     double temp;
182     Integer min_tasks = MINTASKS; /* Increase tasks if there is load imbalance.
183 				     This controls the granularity of chunks */
184     Integer  max_chunk, nproc=pnga_nnodes(), tmpa, tmpb;
185     Integer avail = pnga_memory_avail_type(atype);
186 
187     tmpa = *Ichunk;
188     tmpb = *Jchunk;
189 
190     if(irregular) {
191        temp = (k*(double)(m*(double)n)) / (min_tasks * nproc);
192        max_chunk = (Integer)pow(temp, (1.0/3.0) );
193        if (max_chunk < MIN_CHUNK_SIZE) max_chunk = MIN_CHUNK_SIZE;
194     }
195     else
196        max_chunk = (Integer) max3(*Ichunk, *Jchunk, *Kchunk);
197 
198     pnga_pgroup_gop(a_grp, pnga_type_f2c(MT_F_INT), &avail, (Integer)1, "min");
199 
200     if ( max_chunk > CHUNK_SIZE/nbuf) {
201        /*if memory if very limited, performance degrades for large matrices
202 	 as chunk size is very small, which leads to communication overhead)*/
203       if(avail<MINMEM && pnga_pgroup_nodeid(a_grp)==0) pnga_error("NotEnough memory",avail);
204       *elems = (Integer)(avail*0.9); /* Donot use every last drop */
205 
206       /* MAX: get the maximum chunk (or, block) size i.e  */
207       max_chunk=GA_MIN(max_chunk, (Integer)(sqrt( (double)((*elems-nbuf*NUM_MATS)/(nbuf*NUM_MATS)))));
208 
209       if(!irregular && use_armci_memory==SET)
210 	 max_chunk = *Ichunk = *Jchunk = *Kchunk = BLOCK_SIZE;
211 
212       if(irregular) {
213 	 /* NOTE:enable this part for regular cases, if later
214 	    part of the code is buggy or inefficient */
215 	 *Ichunk = GA_MIN(m,max_chunk);
216 	 *Jchunk = GA_MIN(n,max_chunk);
217 	 *Kchunk = GA_MIN(k,max_chunk);
218       }
219       else { /* This part of the code takes care of rectangular chunks and
220 		most probably gives optimum rectangular chunk size */
221 	 temp = max_chunk*max_chunk;
222 	 if(*Ichunk < max_chunk && *Kchunk > max_chunk) {
223 	    *Kchunk = GA_MIN(*Kchunk,(Integer)(temp/(*Ichunk)));
224 	    *Jchunk = GA_MIN(*Jchunk,(Integer)(temp/(*Kchunk)));
225 	 }
226 	 else if(*Kchunk < max_chunk && *Ichunk > max_chunk) {
227 	    temp *= 1.0/(*Kchunk);
228 	    *Ichunk = GA_MIN(*Ichunk,(Integer)temp);
229 	    *Jchunk = GA_MIN(*Jchunk,(Integer)temp);
230 	 }
231 	 else *Ichunk = *Jchunk = *Kchunk = max_chunk;
232       }
233     }
234     else
235        *Ichunk = *Jchunk = *Kchunk = CHUNK_SIZE/nbuf;
236 
237     /* Try to use 1-d data transfer & take advantage of zero-copy protocol */
238     if(CONTIG_CHUNKS_OPT_FLAG) { /* select a contiguous piece */
239        if(!irregular) {
240 	  if(*Ichunk > tmpa && *Jchunk > tmpb) {
241 	     *Ichunk = tmpa;
242 	     *Jchunk = tmpb;
243 	     *Kchunk = GA_MIN(*Ichunk,*Jchunk);
244 	  }
245 	  else {
246 	     int i=1;/* i should be >=1 , to avoid divide by zero error */
247 	     temp = max_chunk*max_chunk;
248 	     if(temp > tmpa) {
249 		*Ichunk = tmpa;
250 		*Jchunk = (Integer)(temp/(*Ichunk));
251 		if(*Jchunk < tmpb) {
252 		   while(tmpb/i > *Jchunk) ++i;
253 		   *Jchunk = tmpb/i;
254 		}
255 		else *Jchunk = tmpb;
256 		*Kchunk = GA_MIN(*Ichunk, *Jchunk);
257 	     }
258 	  }
259        }
260     }
261 
262     if(*Ichunk<=0) *Ichunk = 1; /* should be atleast 1 */
263     if(*Jchunk<=0) *Jchunk = 1;
264     if(*Kchunk<=0) *Kchunk = 1;
265 
266     /* Total elements "NUM_MAT" extra elems for safety - just in case */
267     *elems = ( nbuf*(*Ichunk)*(*Kchunk) + nbuf*(*Kchunk)*(*Jchunk) +
268 	       (*Ichunk)*(*Jchunk) );
269     *elems += nbuf*NUM_MATS*sizeof(DoubleComplex)/GAsizeofM(atype);
270 }
271 
272 static DoubleComplex*
gai_get_armci_memory(Integer Ichunk,Integer Jchunk,Integer Kchunk,short int nbuf,Integer atype)273 gai_get_armci_memory(Integer Ichunk, Integer Jchunk, Integer Kchunk,
274 		     short int nbuf, Integer atype) {
275 
276     DoubleComplex *tmp = NULL;
277     Integer elems;
278 
279     elems = (Integer) pow((double)BLOCK_SIZE,(double)2);
280     elems = nbuf*elems + nbuf*elems + elems; /* A,B,C temporary buffers */
281 
282     /* add extra elements for safety */
283     elems += nbuf*NUM_MATS*sizeof(DoubleComplex)/GAsizeofM(atype);
284 
285     /* allocate temporary storage using ARMCI_Malloc */
286     if( (Integer) (((double)nbuf)*(Ichunk* Kchunk) +
287 		   ((double)nbuf)*(Kchunk* Jchunk) +
288 		   Ichunk* Jchunk ) < elems) {
289        tmp=(DoubleComplex*)ARMCI_Malloc_local(elems*GAsizeofM(atype));
290     }
291     return tmp;
292 }
293 
294 /************************************
295  * Sequential DGEMM
296  *      i.e. BLAS dgemm Routines
297  ************************************/
298 
GAI_DGEMM(Integer atype,char * transa,char * transb,Integer idim,Integer jdim,Integer kdim,void * alpha,DoubleComplex * a,Integer adim,DoubleComplex * b,Integer bdim,DoubleComplex * c,Integer cdim)299 static void GAI_DGEMM(Integer atype, char *transa, char *transb,
300         Integer idim, Integer jdim, Integer kdim, void *alpha,
301         DoubleComplex *a, Integer adim, DoubleComplex *b,
302         Integer bdim, DoubleComplex *c, Integer cdim) {
303 
304     BlasInt idim_t, jdim_t, kdim_t, adim_t, bdim_t, cdim_t;
305     DoubleComplex ZERO;
306     SingleComplex ZERO_CF;
307 
308     idim_t=idim; jdim_t=jdim; kdim_t=kdim;
309     adim_t=adim; bdim_t=bdim; cdim_t=cdim;
310     ZERO.real = 0.; ZERO.imag = 0.;
311     ZERO_CF.real = 0.; ZERO_CF.imag = 0.;
312 
313     switch(atype) {
314         case C_FLOAT:
315             BLAS_SGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
316                     (Real *)alpha, (Real *)a, &adim_t,
317                     (Real *)b, &bdim_t, (Real *)&ZERO_CF,
318                     (Real *)c, &cdim_t);
319             break;
320         case C_DBL:
321             BLAS_DGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
322                     (DoublePrecision *)alpha, (DoublePrecision *)a, &adim_t,
323                     (DoublePrecision *)b, &bdim_t, (DoublePrecision *)&ZERO,
324                     (DoublePrecision *)c, &cdim_t);
325             break;
326         case C_DCPL:
327             BLAS_ZGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
328                     (DoubleComplex *)alpha, (DoubleComplex *)a, &adim_t,
329                     (DoubleComplex *)b, &bdim_t, (DoubleComplex *)&ZERO,
330                     (DoubleComplex *)c, &cdim_t);
331             break;
332         case C_SCPL:
333             BLAS_CGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
334                     (SingleComplex *)alpha, (SingleComplex *)a, &adim_t,
335                     (SingleComplex *)b, &bdim_t, (SingleComplex *)&ZERO_CF,
336                     (SingleComplex *)c, &cdim_t);
337             break;
338         default:
339             pnga_error("ga_matmul_patch: wrong data type", atype);
340     }
341 }
342 
343 
344 
gai_matmul_shmem(transa,transb,alpha,beta,atype,g_a,ailo,aihi,ajlo,ajhi,g_b,bilo,bihi,bjlo,bjhi,g_c,cilo,cihi,cjlo,cjhi,Ichunk,Kchunk,Jchunk,a,b,c,need_scaling)345 static void gai_matmul_shmem(transa, transb, alpha, beta, atype,
346 			     g_a, ailo, aihi, ajlo, ajhi,
347 			     g_b, bilo, bihi, bjlo, bjhi,
348 			     g_c, cilo, cihi, cjlo, cjhi,
349 			     Ichunk, Kchunk, Jchunk, a,b,c, need_scaling)
350 
351      Integer g_a, ailo, aihi, ajlo, ajhi;    /* patch of g_a */
352      Integer g_b, bilo, bihi, bjlo, bjhi;    /* patch of g_b */
353      Integer g_c, cilo, cihi, cjlo, cjhi;    /* patch of g_c */
354      Integer Ichunk, Kchunk, Jchunk, atype;
355      void    *alpha, *beta;
356      char    *transa, *transb;
357      DoubleComplex *a, *b, *c;
358      short int need_scaling;
359 {
360 
361   Integer me = pnga_nodeid();
362   Integer get_new_B, loC[2]={0,0}, hiC[2]={0,0}, ld[2];
363   Integer i0, i1, j0, j1;
364   Integer ilo, ihi, idim, jlo, jhi, jdim, klo, khi, kdim, adim, bdim=0, cdim;
365   int istart, jstart, kstart, iend, jend, kend;
366   short int do_put=UNSET, single_task_flag=UNSET;
367   DoubleComplex ONE = {1.,0.};
368   SingleComplex ONE_CF = {1.,0.};
369   Integer clo[2], chi[2];
370 
371 
372   /* to skip accumulate and exploit data locality:
373      get chunks according to "C" matrix distribution*/
374   pnga_distribution(g_c, me, loC, hiC);
375   istart = loC[0]-1; iend = hiC[0]-1;
376   jstart = loC[1]-1; jend = hiC[1]-1;
377   kstart = 0       ; kend = ajhi-ajlo;
378 
379   if(DIRECT_ACCESS_OPT_FLAG) {
380     /* check if there is only one task. If so, then it is contiguous */
381     if( (iend-istart+1 <= Ichunk) && (jend-jstart+1 <= Jchunk) &&
382         (kend-kstart+1 <= Kchunk) ) {
383       single_task_flag = SET;
384       pnga_access_ptr(g_c, loC, hiC, &c, ld);
385     }
386   }
387 
388   /* loop through columns of g_c patch */
389   for(jlo = jstart; jlo <= jend; jlo += Jchunk) {
390     jhi  = GA_MIN(jend, jlo+Jchunk-1);
391     jdim = jhi - jlo +1;
392 
393     /* if beta=0,then for first shot we can do put,instead of accumulate */
394     if(need_scaling == UNSET) do_put = SET;
395 
396     /* loop cols of g_a patch : loop rows of g_b patch*/
397     for(klo = kstart; klo <= kend; klo += Kchunk) {
398       khi = GA_MIN(kend, klo+Kchunk-1);
399       kdim= khi - klo +1;
400       get_new_B = TRUE; /* Each pass thru' outer 2 loops means we
401                            need a different patch of B.*/
402       /*loop through rows of g_c patch */
403       for(ilo = istart; ilo <= iend; ilo += Ichunk){
404         ihi = GA_MIN(iend, ilo+Ichunk-1);
405         idim= cdim = ihi - ilo +1;
406 
407         /* STEP1(a): get matrix "A" chunk */
408         i0= ailo+ilo; i1= ailo+ihi;
409         j0= ajlo+klo; j1= ajlo+khi;
410         if (*transa == 'n' || *transa == 'N'){
411           clo[0] = i0;
412           clo[1] = j0;
413           chi[0] = i1;
414           chi[1] = j1;
415           adim=idim; pnga_get(g_a, clo, chi, a, &idim);
416         }else{
417           clo[0] = j0;
418           clo[1] = i0;
419           chi[0] = j1;
420           chi[1] = i1;
421           adim=kdim; pnga_get(g_a, clo, chi, a, &kdim);
422         }
423 
424         /* STEP1(b): get matrix "B" chunk*/
425         if(get_new_B) {/*Avoid rereading B if same patch as last time*/
426           i0= bilo+klo; i1= bilo+khi;
427           j0= bjlo+jlo; j1= bjlo+jhi;
428           if (*transb == 'n' || *transb == 'N'){
429             clo[0] = i0;
430             clo[1] = j0;
431             chi[0] = i1;
432             chi[1] = j1;
433             bdim=kdim; pnga_get(g_b, clo, chi, b, &kdim);
434           }else {
435             clo[0] = j0;
436             clo[1] = i0;
437             chi[0] = j1;
438             chi[1] = i1;
439             bdim=jdim; pnga_get(g_b, clo, chi, b, &jdim);
440           }
441           get_new_B = FALSE; /* Until J or K change again */
442         }
443 
444         /* STEP2: Do the sequential matrix multiply - i.e.BLAS dgemm */
445         GAI_DGEMM(atype, transa, transb, idim, jdim, kdim, alpha,
446             a, adim, b, bdim, c, cdim);
447 
448         /* STEP3: put/accumulate into "C" matrix */
449         i0= cilo+ilo; i1= cilo+ihi;
450         j0= cjlo+jlo; j1= cjlo+jhi;
451         /* if single_task_flag is SET (i.e =1), then there is no need to
452            update "C" matrix, as we use pointer directly in GAI_DGEMM */
453         if(single_task_flag != SET) {
454           switch(atype) {
455             case C_FLOAT:
456             case C_SCPL:
457               clo[0] = i0;
458               clo[1] = j0;
459               chi[0] = i1;
460               chi[1] = j1;
461               if(do_put==SET) /* i.e.beta == 0.0 */
462                 pnga_put(g_c, clo, chi, (float *)c, &cdim);
463               else {
464                 pnga_acc(g_c, clo, chi, (float*)c, &cdim, &ONE_CF);
465               }
466               break;
467             default:
468               clo[0] = i0;
469               clo[1] = j0;
470               chi[0] = i1;
471               chi[1] = j1;
472               if(do_put==SET) /* i.e.beta == 0.0 */
473                 pnga_put(g_c, clo, chi, (DoublePrecision*)c, &cdim);
474               else {
475                 pnga_acc(g_c, clo, chi, (DoublePrecision*)c,
476                     &cdim, (DoublePrecision*)&ONE);
477               }
478               break;
479           }
480         }
481       }
482       do_put = UNSET; /* In the second loop, accumulate should be done */
483     }
484   }
485 }
486 
487 
488 static
init_block_info(Integer g_c,Integer * proc_index,Integer * index,Integer * blocks,Integer * block_dims,Integer * topology,Integer * iblock)489 void init_block_info(Integer g_c, Integer *proc_index, Integer *index,
490                      Integer *blocks, Integer *block_dims, Integer *topology,
491                      Integer *iblock)
492 {
493     Integer me= pnga_nodeid();
494 
495     /* Uses simple block-cyclic data distribution */
496     if(!pnga_uses_proc_grid(g_c))
497     {
498        *iblock = me;
499     }
500     else /* Uses scalapack block-cyclic data distribution */
501     {
502        *iblock = 0;
503        pnga_get_proc_index(g_c, me, proc_index);
504        pnga_get_proc_index(g_c, me, index);
505        pnga_get_block_info(g_c, blocks, block_dims);
506        pnga_get_proc_grid(g_c, topology);
507     }
508 }
509 
510 /**
511  * get the lo/hi distribution info of the next block
512  *   return 0 indicates no more blocks available
513  *   return 1 indicates there is a block available
514  */
515 static
get_next_block_info(Integer g_c,Integer * proc_index,Integer * index,Integer * blocks,Integer * block_dims,Integer * topology,Integer * iblock,Integer * blo,Integer * bhi)516 int get_next_block_info(Integer g_c, Integer *proc_index, Integer *index,
517                         Integer *blocks, Integer *block_dims, Integer*topology,
518                         Integer *iblock, Integer *blo, Integer *bhi)
519 {
520     Integer dims[MAXDIM], ndim, type;
521     int i;
522 
523     /* works only upto 2 dims - i.e vectors/matrices*/
524     pnga_inquire(g_c,  &type, &ndim, dims);
525     if(ndim>2) pnga_error("get_next_block_info() supports upto 2-d only ", 0L);
526 
527     /* Uses simple block-cyclic data distribution */
528     if (!pnga_uses_proc_grid(g_c))
529     {
530        if(*iblock < pnga_total_blocks(g_c))
531        {
532           pnga_distribution(g_c, *iblock, blo, bhi);
533           *iblock += pnga_nnodes();
534           return 1;
535        }
536     }
537     else /* Uses scalapack block-cyclic data distribution */
538     {
539        if (index[ndim-1] < blocks[ndim-1])
540        {
541           /* find bounding coordinates of block */
542           for (i = 0; i < ndim; i++)
543           {
544              blo[i] = index[i]*block_dims[i]+1;
545              bhi[i] = (index[i] + 1)*block_dims[i];
546              if (bhi[i] > dims[i]) bhi[i] = dims[i];
547           }
548 
549           /* increment index to get next block on processor */
550           index[0] += topology[0];
551           for (i = 0; i < ndim; i++)
552           {
553              if (index[i] >= blocks[i] && i<ndim-1)
554              {
555                 index[i]    = proc_index[i];
556                 index[i+1] += topology[i+1];
557              }
558           }
559 
560           return 1;
561        } /* end of while */
562     }
563 
564     return 0;
565 }
566 
567 
568 
gai_matmul_regular(transa,transb,alpha,beta,atype,g_a,ailo,aihi,ajlo,ajhi,g_b,bilo,bihi,bjlo,bjhi,g_c,cilo,cihi,cjlo,cjhi,Ichunk,Kchunk,Jchunk,a_ar,b_ar,c_ar,need_scaling,irregular)569 static void gai_matmul_regular(transa, transb, alpha, beta, atype,
570 			       g_a, ailo, aihi, ajlo, ajhi,
571 			       g_b, bilo, bihi, bjlo, bjhi,
572 			       g_c, cilo, cihi, cjlo, cjhi,
573 			       Ichunk, Kchunk, Jchunk, a_ar,b_ar,c_ar,
574 			       need_scaling, irregular)
575 
576      Integer g_a, ailo, aihi, ajlo, ajhi;    /* patch of g_a */
577      Integer g_b, bilo, bihi, bjlo, bjhi;    /* patch of g_b */
578      Integer g_c, cilo, cihi, cjlo, cjhi;    /* patch of g_c */
579      Integer Ichunk, Kchunk, Jchunk, atype;
580      void    *alpha, *beta;
581      char    *transa, *transb;
582      DoubleComplex **a_ar, **b_ar, **c_ar;
583      short int need_scaling, irregular;
584 {
585 
586   Integer me= pnga_nodeid();
587   Integer get_new_B=TRUE, i0, i1, j0, j1;
588   Integer idim, jdim, kdim;
589   Integer k, adim=0, bdim, cdim, adim_next, bdim_next;
590   Integer clo[2], chi[2], loC[2]={1,1}, hiC[2]={1,1}, ld[2];
591   int max_tasks=0, shiftA=0, shiftB=0;
592   int currA, nextA, currB, nextB=0; /* "current" and "next" task Ids */
593   task_list_t taskListA[MAX_CHUNKS], taskListB[MAX_CHUNKS], state;
594   short int do_put=UNSET, single_task_flag=UNSET, chunks_left=0;
595   DoubleComplex ONE, *a, *b, *c;
596   SingleComplex ONE_CF;
597   int offset=0, gTaskId=0;
598   int numblocks=0, has_more_blocks=1;
599   Integer ctype, cndim, cdims[2];
600   Integer iblock=0, proc_index[2], index[2];
601   Integer blocks[2], block_dims[2], topology[2];
602   _iterator_hdl hdl;
603   char *ptr_c;
604 
605   if(irregular) pnga_error("irregular flag set", 0L);
606 
607   init_task_list(&state);
608 
609 #if DEBUG_
610   if(me==0) { printf("@@ga_matmul_regular:m,n,k=%ld %ld %ld\n",aihi-ailo+1,
611       bjhi-bjlo+1,ajhi-ajlo+1);  fflush(stdout); }
612 #endif
613 
614   ONE.real =1.;    ONE.imag =0.;
615   ONE_CF.real =1.; ONE_CF.imag =0.;
616   clo[0] = cilo; chi[0] = cihi;
617   clo[1] = cjlo; chi[1] = cjhi;
618   k = ajhi - ajlo +1;
619 
620 #if 0
621   numblocks = pnga_total_blocks(g_c);
622   if(numblocks>=0) init_block_info(g_c, proc_index, index, blocks,
623       block_dims, topology, &iblock);
624 
625   has_more_blocks = 1;
626   while(has_more_blocks)
627   {
628     /* if block cyclic distribution and loop accordigly. In case of simple
629        block distribution we process this loop only once */
630     if(numblocks<0)
631     { /* simple block distribution */
632       has_more_blocks = 0;
633       pnga_distribution(g_c, me, loC, hiC);
634     }
635     else
636     { /* block cyclic */
637 
638       if(!get_next_block_info(g_c, proc_index, index, blocks, block_dims,
639             topology, &iblock, loC, hiC))
640         break;
641     }
642 /*  } */
643 #else
644   pnga_local_iterator_init(g_c, &hdl);
645   while (pnga_local_iterator_next(&hdl,loC,hiC,&ptr_c,ld)) {
646 #endif
647 
648     /* If loC and hiC intersects with current patch region, then they will
649      * be updated accordingly. Else it returns FALSE */
650     pnga_inquire(g_c, &ctype, &cndim, cdims);
651     if(!pnga_patch_intersect(clo,chi,loC,hiC,cndim)) continue;
652 
653 #if DEBUG_
654     printf("%d: Processing block #%d [%d,%d] - [%d,%d]\n", GAme, iblock,
655         loC[0], loC[1], hiC[0], hiC[1]);
656 #endif
657 
658     state.lo[0] = -1; /* just for first do-while loop */
659     do {
660 
661       /* Inital Settings */
662       a = a_ar[0];
663       b = b_ar[0];
664       c = c_ar[0];
665       do_put = single_task_flag = UNSET;
666       offset = 0;
667 
668       /*****************************************************************
669        * Task list: Collect information of all chunks. Matmul using
670        * Non-blocking call needs this list
671        *****************************************************************/
672       gTaskId=0;
673 
674       /* to skip accumulate and exploit data locality:
675          get chunks according to "C" matrix distribution*/
676       /* pnga_distribution(g_c, me, loC, hiC); */
677       chunks_left=gai_get_task_list(taskListA, taskListB, &state,loC[0]-1,
678           loC[1]-1, 0, hiC[0]-1, hiC[1]-1, k-1,
679           Ichunk,Jchunk,Kchunk, &max_tasks,g_a);
680       currA = nextA = 0;
681 
682       if(chunks_left) { /* then turn OFF this optimization */
683         if(DIRECT_ACCESS_OPT_FLAG) {
684           /* check if there is only one task.If so,then it is contiguous */
685           if(max_tasks == 1) {
686             if( !((hiC[0]-loC[0]+1 <= Ichunk) &&(hiC[1]-loC[1]+1 <=Jchunk)
687                   && (k <= Kchunk)))
688               pnga_error("Invalid task list", 0L);
689             single_task_flag = SET;
690             pnga_access_ptr(g_c, loC, hiC, &c, ld);
691           }
692         }
693       }
694 
695       if(CYCLIC_DISTR_OPT_FLAG) {
696         int prow,pcol,grp_me;
697         Integer a_grp=pnga_get_pgroup(g_a);
698         grp_me = (int)pnga_pgroup_nodeid(a_grp);
699         prow = GA[GA_OFFSET + g_a].nblock[0];
700         pcol = GA[GA_OFFSET + g_a].nblock[1];
701         offset = (grp_me/prow + grp_me%prow) % pcol;
702         currA = nextA = nextA + offset;
703       }
704 
705       /*************************************************
706        * Do the setup & issue non-blocking calls to get
707        * the first block/chunk I'm gonna work
708        *************************************************/
709       shiftA=0; shiftB=0;
710       if(nextA < max_tasks) {
711         currB = nextB = taskListA[currA].chunkBId;
712 
713         GET_BLOCK(g_a, &taskListA[nextA], a_ar[shiftA], transa,
714             ailo, ajlo, &adim_next, &gNbhdlA[shiftA]);
715 
716         GET_BLOCK(g_b, &taskListB[nextB], b_ar[shiftB], transb,
717             bilo, bjlo, &bdim_next, &gNbhdlB[shiftB]);
718 
719         adim=adim_next; bdim=bdim_next;
720         get_new_B = TRUE;
721       }
722 
723       /*************************************************************
724        * Main Parallel DGEMM Loop.
725        *************************************************************/
726       while(nextA < max_tasks) {
727         currA = nextA;
728         currB = taskListA[currA].chunkBId;
729 
730         idim = cdim = taskListA[currA].dim[0];
731         jdim = taskListB[currB].dim[1];
732         kdim = taskListA[currA].dim[1];
733         bdim=bdim_next;
734 
735         /* if beta=0.0 (i.e.if need_scaling=UNSET), then for first shot,
736            we can do put, instead of accumulate */
737         if(need_scaling == UNSET) do_put = taskListA[currA].do_put;
738 
739         nextA = ++gTaskId; /* get the next task id */
740 
741         if(CYCLIC_DISTR_OPT_FLAG && nextA < max_tasks)
742           nextA = (offset+nextA) % max_tasks;
743 
744 
745         /* ---- WAIT till we get the current A & B block ---- */
746         a = a_ar[shiftA];
747         WAIT_GET_BLOCK(&gNbhdlA[shiftA]);
748         if(get_new_B){/*Avoid rereading B if it is same patch as last time*/
749           get_new_B = FALSE;
750           b = b_ar[shiftB];
751           WAIT_GET_BLOCK(&gNbhdlB[shiftB]);
752         }
753 
754         /* ---- GET the next A & B block ---- */
755         if(nextA < max_tasks) {
756           GET_BLOCK(g_a, &taskListA[nextA], a_ar[(shiftA+1)%2], transa,
757               ailo, ajlo, &adim_next, &gNbhdlA[(shiftA+1)%2]);
758 
759           nextB = taskListA[nextA].chunkBId;
760           if(currB != nextB) {
761             shiftB=((shiftB+1)%2);
762 
763             GET_BLOCK(g_b, &taskListB[nextB], b_ar[shiftB], transb,
764                 bilo, bjlo, &bdim_next, &gNbhdlB[shiftB]);
765           }
766         }
767         if(currB != nextB) get_new_B = TRUE;
768 
769         /* Do the sequential matrix multiply - i.e.BLAS dgemm */
770         GAI_DGEMM(atype, transa, transb, idim, jdim, kdim, alpha,
771             a, adim, b, bdim, c, cdim);
772 
773         /* Non-blocking Accumulate Operation. Note: skip wait in 1st loop*/
774         i0 = cilo + taskListA[currA].lo[0];
775         i1 = cilo + taskListA[currA].hi[0];
776         j0 = cjlo + taskListB[currB].lo[1];
777         j1 = cjlo + taskListB[currB].hi[1];
778 
779         if(currA < max_tasks) {
780           if (single_task_flag != SET) {
781             switch(atype) {
782               case C_FLOAT:
783               case C_SCPL:
784                 clo[0] = i0;
785                 clo[1] = j0;
786                 chi[0] = i1;
787                 chi[1] = j1;
788                 if(do_put==SET) /* Note:do_put is UNSET, if beta!=0.0*/
789                   pnga_put(g_c, clo, chi, (float *)c, &cdim);
790                 else {
791                   pnga_acc(g_c, clo, chi, (float *)c, &cdim, &ONE_CF);
792                 }
793                 break;
794               default:
795                 clo[0] = i0;
796                 clo[1] = j0;
797                 chi[0] = i1;
798                 chi[1] = j1;
799                 if(do_put==SET) /* i.e.beta ==0.0 */
800                   pnga_put(g_c, clo, chi, (DoublePrecision*)c, &cdim);
801                 else {
802                   pnga_acc(g_c, clo, chi, (DoublePrecision*)c, &cdim,(DoublePrecision*)&ONE);
803                 }
804                 break;
805             }
806           }
807         }
808 
809         /* shift next buffer..toggles between 0 and 1: as we use 2 buffers,
810            one for computation and the other for communication (overlap) */
811         shiftA = ((shiftA+1)%2);
812         adim = adim_next;
813       }
814     } while(chunks_left);
815   } /* while(has_more_blocks) */
816 
817 }
818 
819 
820 
gai_matmul_irreg(transa,transb,alpha,beta,atype,g_a,ailo,aihi,ajlo,ajhi,g_b,bilo,bihi,bjlo,bjhi,g_c,cilo,cihi,cjlo,cjhi,Ichunk,Kchunk,Jchunk,a_ar,b_ar,c_ar,need_scaling,irregular)821 static void gai_matmul_irreg(transa, transb, alpha, beta, atype,
822 			     g_a, ailo, aihi, ajlo, ajhi,
823 			     g_b, bilo, bihi, bjlo, bjhi,
824 			     g_c, cilo, cihi, cjlo, cjhi,
825 			     Ichunk, Kchunk, Jchunk, a_ar,b_ar,c_ar,
826 			     need_scaling, irregular)
827 
828      Integer g_a, ailo, aihi, ajlo, ajhi;    /* patch of g_a */
829      Integer g_b, bilo, bihi, bjlo, bjhi;    /* patch of g_b */
830      Integer g_c, cilo, cihi, cjlo, cjhi;    /* patch of g_c */
831      Integer Ichunk, Kchunk, Jchunk, atype;
832      void    *alpha, *beta;
833      char    *transa, *transb;
834      DoubleComplex **a_ar, **b_ar, **c_ar;
835      short int need_scaling, irregular;
836 {
837 
838 #if DEBUG_
839   Integer me= pnga_nodeid();
840 #endif
841   Integer nproc=pnga_nnodes();
842   Integer get_new_B, i, i0, i1, j0, j1;
843   Integer ilo, ihi, idim, jlo, jhi, jdim, klo, khi, kdim, ijk=0;
844   Integer n, m, k, adim, bdim=0, cdim;
845   Integer idim_prev=0, jdim_prev=0, kdim_prev=0;
846   Integer adim_prev=0, bdim_prev=0, cdim_prev=0;
847   task_list_t taskListC;
848   short int compute_flag=0, shiftA=0, shiftB=0;
849   DoubleComplex ONE, *a, *b, *c;
850   SingleComplex ONE_CF;
851   Integer grp_me, a_grp = pnga_get_pgroup(g_a);
852   Integer clo[2], chi[2];
853 
854   init_task_list(&taskListC);
855   ONE.real =1.; ONE.imag =0.;
856   ONE_CF.real =1.; ONE_CF.imag =0.;
857 #if DEBUG_
858   if(me==0) { printf("@@ga_matmul_irreg:m,n,k=%ld %ld %ld\n", aihi-ailo+1,
859       bjhi-bjlo+1,ajhi-ajlo+1); fflush(stdout); }
860 #endif
861 
862   m = aihi - ailo +1;
863   n = bjhi - bjlo +1;
864   k = ajhi - ajlo +1;
865   a = a_ar[0];
866   b = b_ar[0];
867   c = c_ar[0];
868 
869   grp_me = pnga_pgroup_nodeid(a_grp);
870   clo[0] = cilo; clo[1] = cjlo;
871   chi[0] = cihi; chi[1] = cjhi;
872   if(!need_scaling) pnga_fill_patch(g_c, clo, chi, beta);
873 
874   compute_flag=0;     /* take care of the last chunk */
875 
876   for(jlo = 0; jlo < n; jlo += Jchunk){ /* loop thru columns of g_c patch */
877     jhi = GA_MIN(n-1, jlo+Jchunk-1);
878     jdim= jhi - jlo +1;
879 
880     for(klo = 0; klo < k; klo += Kchunk){    /* loop cols of g_a patch */
881       khi = GA_MIN(k-1, klo+Kchunk-1);          /* loop rows of g_b patch */
882       kdim= khi - klo +1;
883 
884       /** Each pass through the outer two loops means we need a
885         different patch of B.*/
886       get_new_B = TRUE;
887 
888       for(ilo = 0; ilo < m; ilo+=Ichunk){ /* loop thru rows of g_c patch */
889 
890         if(ijk%nproc == grp_me){
891 
892           ihi = GA_MIN(m-1, ilo+Ichunk-1);
893           idim= cdim = ihi - ilo +1;
894 
895 
896           if (*transa == 'n' || *transa == 'N'){
897             adim = idim;
898             i0= ailo+ilo; i1= ailo+ihi;
899             j0= ajlo+klo; j1= ajlo+khi;
900             clo[0] = i0;
901             clo[1] = j0;
902             chi[0] = i1;
903             chi[1] = j1;
904             pnga_nbget(g_a, clo, chi, a_ar[shiftA],
905                 &idim, &gNbhdlA[shiftA]);
906           }else{
907             adim = kdim;
908             i0= ajlo+klo; i1= ajlo+khi;
909             j0= ailo+ilo; j1= ailo+ihi;
910             clo[0] = i0;
911             clo[1] = j0;
912             chi[0] = i1;
913             chi[1] = j1;
914             pnga_nbget(g_a, clo, chi, a_ar[shiftA],
915                 &kdim, &gNbhdlA[shiftA]);
916           }
917 
918           /* Avoid rereading B if it is same patch as last time. */
919           if(get_new_B) {
920             if (*transb == 'n' || *transb == 'N'){
921               bdim = kdim;
922               i0= bilo+klo; i1= bilo+khi;
923               j0= bjlo+jlo; j1= bjlo+jhi;
924               clo[0] = i0;
925               clo[1] = j0;
926               chi[0] = i1;
927               chi[1] = j1;
928               pnga_nbget(g_b, clo, chi, b_ar[shiftB],
929                   &kdim, &gNbhdlB[shiftB]);
930             }else{
931               bdim = jdim;
932               i0= bjlo+jlo; i1= bjlo+jhi;
933               j0= bilo+klo; j1= bilo+khi;
934               clo[0] = i0;
935               clo[1] = j0;
936               chi[0] = i1;
937               chi[1] = j1;
938               pnga_nbget(g_b, clo, chi, b_ar[shiftB],
939                   &jdim, &gNbhdlB[shiftB]);
940             }
941           }
942 
943           if(compute_flag) { /* compute loop */
944 
945             if(atype == C_FLOAT)
946               for(i=0;i<idim_prev*jdim_prev;i++) *(((float*)c)+i)=0;
947             else if(atype ==  C_DBL)
948               for(i=0;i<idim_prev*jdim_prev;i++) *(((double*)c)+i)=0;
949             else if(atype ==  C_SCPL)
950               for(i=0;i<idim_prev*jdim_prev;i++) {
951                 ((SingleComplex*)c)[i].real=0;
952                 ((SingleComplex*)c)[i].imag=0;
953               }
954             else for(i=0;i<idim_prev*jdim_prev;i++) {
955               c[i].real=0;c[i].imag=0; }
956 
957             /* wait till we get the previous block */
958             a = a_ar[shiftA^1];
959             WAIT_GET_BLOCK(&gNbhdlA[shiftA^1]);
960             if(taskListC.chunkBId) {
961               b = b_ar[shiftB^1];
962               WAIT_GET_BLOCK(&gNbhdlB[shiftB^1]);
963             }
964 
965             /* Do the sequential matrix multiply - i.e.BLAS dgemm */
966             GAI_DGEMM(atype, transa, transb, idim_prev, jdim_prev,
967                 kdim_prev, alpha, a, adim_prev, b, bdim_prev,
968                 c, cdim_prev);
969 
970             i0= cilo + taskListC.lo[0];
971             i1= cilo + taskListC.hi[0];
972             j0= cjlo + taskListC.lo[1];
973             j1= cjlo + taskListC.hi[1];
974 
975             if(atype == C_FLOAT || atype == C_SCPL) {
976               clo[0] = i0;
977               clo[1] = j0;
978               chi[0] = i1;
979               chi[1] = j1;
980               pnga_acc(g_c, clo, chi, (float *)c, &cdim_prev, &ONE_CF);
981             } else {
982               clo[0] = i0;
983               clo[1] = j0;
984               chi[0] = i1;
985               chi[1] = j1;
986               pnga_acc(g_c, clo, chi, (DoublePrecision*)c, &cdim_prev, (DoublePrecision*)&ONE);
987             }
988           }
989           compute_flag=1;
990 
991           /* meta-data of current block for next compute loop */
992           taskListC.lo[0] = ilo; taskListC.hi[0] = ihi;
993           taskListC.lo[1] = jlo; taskListC.hi[1] = jhi;
994           taskListC.chunkBId = get_new_B;
995           idim_prev = idim;   adim_prev = adim;
996           jdim_prev = jdim;   bdim_prev = bdim;
997           kdim_prev = kdim;   cdim_prev = cdim;
998 
999           /* shift bext buffer */
1000           shiftA ^= 1;
1001           if(get_new_B) shiftB ^= 1;
1002 
1003           get_new_B = FALSE; /* Until J or K change again */
1004         }
1005         ++ijk;
1006       }
1007     }
1008   }
1009 
1010   /* -------- compute the last chunk --------- */
1011   if(compute_flag) {
1012     if(atype == C_FLOAT)
1013       for(i=0;i<idim_prev*jdim_prev;i++) *(((float*)c)+i)=0;
1014     else if(atype ==  C_DBL)
1015       for(i=0;i<idim_prev*jdim_prev;i++) *(((double*)c)+i)=0;
1016     else if(atype ==  C_SCPL)
1017       for(i=0;i<idim_prev*jdim_prev;i++) {
1018         ((SingleComplex*)c)[i].real=0;
1019         ((SingleComplex*)c)[i].imag=0;
1020       }
1021     else for(i=0;i<idim_prev*jdim_prev;i++) {
1022       c[i].real=0;c[i].imag=0; }
1023 
1024     /* wait till we get the previous block */
1025     a = a_ar[shiftA^1];
1026     WAIT_GET_BLOCK(&gNbhdlA[shiftA^1]);
1027     if(taskListC.chunkBId) {
1028       b = b_ar[shiftB^1];
1029       WAIT_GET_BLOCK(&gNbhdlB[shiftB^1]);
1030     }
1031 
1032     /* Do the sequential matrix multiply - i.e.BLAS dgemm */
1033     GAI_DGEMM(atype, transa, transb, idim_prev, jdim_prev,
1034         kdim_prev, alpha, a, adim_prev, b, bdim_prev,
1035         c, cdim_prev);
1036 
1037     i0= cilo + taskListC.lo[0];
1038     i1= cilo + taskListC.hi[0];
1039     j0= cjlo + taskListC.lo[1];
1040     j1= cjlo + taskListC.hi[1];
1041 
1042     if(atype == C_FLOAT || atype == C_SCPL) {
1043       clo[0] = i0;
1044       clo[1] = j0;
1045       chi[0] = i1;
1046       chi[1] = j1;
1047       pnga_acc(g_c, clo, chi, (float *)c, &cdim_prev, &ONE_CF);
1048     } else {
1049       clo[0] = i0;
1050       clo[1] = j0;
1051       chi[0] = i1;
1052       chi[1] = j1;
1053       pnga_acc(g_c, clo, chi, (DoublePrecision*)c, &cdim_prev, (DoublePrecision*)&ONE);
1054     }
1055   }
1056   /* ----------------------------------------- */
1057 }
1058 
1059 #if DEBUG_
1060 
check_result(cond,transa,transb,alpha,beta,atype,g_a,ailo,aihi,ajlo,ajhi,g_b,bilo,bihi,bjlo,bjhi,g_c,cilo,cihi,cjlo,cjhi)1061 static void check_result(cond, transa, transb, alpha, beta, atype,
1062                          g_a, ailo, aihi, ajlo, ajhi,
1063                          g_b, bilo, bihi, bjlo, bjhi,
1064                          g_c, cilo, cihi, cjlo, cjhi)
1065 
1066      Integer g_a, ailo, aihi, ajlo, ajhi;    /* patch of g_a */
1067      Integer g_b, bilo, bihi, bjlo, bjhi;    /* patch of g_b */
1068      Integer g_c, cilo, cihi, cjlo, cjhi;    /* patch of g_c */
1069      Integer atype, cond;
1070      void    *alpha, *beta;
1071      char    *transa, *transb;
1072 {
1073 
1074     DoubleComplex *tmpa=NULL, *tmpb=NULL, *tmpc2=NULL;
1075     static DoubleComplex *tmpc_orig = NULL;
1076     Integer i,j,m,n,k,adim,bdim,cdim;
1077     Integer factor=sizeof(DoubleComplex)/GAsizeofM(atype);
1078     BlasInt m_t, n_t, k_t, adim_t, bdim_t, cdim_t;
1079     Integer alo[2], ahi[2], blo[2], bhi[2], clo[2], chi[2];
1080 
1081     m = aihi - ailo +1;
1082     n = bjhi - bjlo +1;
1083     k = ajhi - ajlo +1;
1084     cdim = m;
1085 
1086     if(cond==0) { /* store the original matrix C before matmul starts, as
1087 		     matrix C is subject to change during pnga_matmul */
1088        tmpc_orig= (DoubleComplex*)malloc(sizeof(DoubleComplex)*(m*n/factor+1));
1089        if(tmpc_orig==NULL) pnga_error("check_result: malloc failed", 0);
1090 
1091        /* get matrix C */
1092 
1093        clo[0] = cilo;
1094        clo[1] = cjlo;
1095        chi[0] = cihi;
1096        chi[1] = cjhi;
1097        pnga_get(g_c, clo, chi, tmpc_orig, &m);
1098     }
1099     else { /* check for CORRECTNESS */
1100 
1101        /* Memory Allocation */
1102        tmpa = (DoubleComplex*)malloc( sizeof(DoubleComplex)* (m*k/factor+1));
1103        tmpb = (DoubleComplex*)malloc( sizeof(DoubleComplex)* (k*n/factor+1));
1104        if(tmpa==NULL || tmpb==NULL) pnga_error("check_result: malloc failed", 0);
1105 
1106        switch(atype) {
1107 	  case C_FLOAT:
1108 	     for(i=0; i<m*k; i++) ((float*)tmpa)[i] = -1.0;
1109 	     for(i=0; i<k*n; i++) ((float*)tmpb)[i] = -1.0;
1110 	     break;
1111 	  case C_DBL:
1112 	     for(i=0; i<m*k; i++) ((double*)tmpa)[i] = -1.0;
1113 	     for(i=0; i<k*n; i++) ((double*)tmpb)[i] = -1.0;
1114 	     break;
1115 	  case C_DCPL:
1116 	     for(i=0; i<m*k; i++) {tmpa[i].real=-1.0; tmpa[i].imag=0.0;}
1117 	     for(i=0; i<k*n; i++) {tmpb[i].real=-1.0; tmpb[i].imag=0.0;}
1118 	     break;
1119 	  case C_SCPL:
1120 	     for(i=0; i<m*k; i++) {((SingleComplex*)tmpa)[i].real=-1.0; ((SingleComplex*)tmpa)[i].imag=0.0;}
1121 	     for(i=0; i<k*n; i++) {((SingleComplex*)tmpb)[i].real=-1.0; ((SingleComplex*)tmpb)[i].imag=0.0;}
1122 	     break;
1123           default:
1124             pnga_error("ga_matmul_patch: wrong data type", atype);
1125        }
1126 
1127        /* get matrix A */
1128        if (*transa == 'n' || *transa == 'N'){
1129          alo[0] = ailo;
1130          alo[1] = ajlo;
1131          ahi[0] = aihi;
1132          ahi[1] = ajhi;
1133          adim=m;
1134          pnga_get(g_a, alo, ahi, tmpa, &m);
1135        } else {
1136          alo[0] = ajlo;
1137          alo[1] = ailo;
1138          ahi[0] = ajhi;
1139          ahi[1] = aihi;
1140          adim=k;
1141          pnga_get(g_a, alo, ahi, tmpa, &k);
1142        }
1143 
1144        /* get matrix B */
1145        if (*transb == 'n' || *transb == 'N'){
1146          blo[0] = bilo;
1147          blo[1] = bjlo;
1148          bhi[0] = bihi;
1149          bhi[1] = bjhi;
1150          bdim=k;
1151          pnga_get(g_b, blo, bhi, tmpb, &k);
1152        } else {
1153          blo[0] = bjlo;
1154          blo[1] = bilo;
1155          bhi[0] = bjhi;
1156          bhi[1] = bihi;
1157          bdim=n;
1158          pnga_get(g_b, blo, bhi, tmpb, &n);
1159        }
1160 
1161 
1162        m_t=m; n_t=n; k_t=k;
1163        adim_t=adim; bdim_t=bdim; cdim_t=cdim;
1164 #if (defined(CRAY) || defined(WIN32)) && !NOFORT
1165        pnga_error("check_result: Serial dgemms not defined", 0L);
1166 #else
1167        switch(atype) {
1168 	  case C_DBL:
1169 #   if F2C_HIDDEN_STRING_LENGTH_AFTER_ARGS
1170 	     dgemm_(transa, transb, &m_t, &n_t, &k_t, alpha, tmpa, &adim_t,
1171 		    tmpb, &bdim_t, beta, tmpc_orig, &cdim_t, 1, 1);
1172 #   else
1173 	     dgemm_(transa, 1, transb, 1, &m_t, &n_t, &k_t, alpha, tmpa, &adim_t,
1174 		    tmpb, &bdim_t, beta, tmpc_orig, &cdim_t);
1175 #   endif
1176 	     break;
1177 	  case C_DCPL:
1178 #   if F2C_HIDDEN_STRING_LENGTH_AFTER_ARGS
1179 	     zgemm_(transa, transb, &m_t, &n_t, &k_t, (DoubleComplex*)alpha,
1180 		    tmpa, &adim_t, tmpb, &bdim_t, beta, tmpc_orig, &cdim_t, 1, 1);
1181 #   else
1182 	     zgemm_(transa, 1, transb, 1, &m_t, &n_t, &k_t, (DoubleComplex*)alpha,
1183 		    tmpa, &adim_t, tmpb, &bdim_t, beta, tmpc_orig, &cdim_t);
1184 #   endif
1185 	     break;
1186 	  default:
1187 	     pnga_error("check_result: data type not supported here", atype);
1188        }
1189 #endif
1190 
1191        printf("CHK:%c%c : %ld %ld %ld %ld: %ld %ld %ld %ld: %ld %ld %ld %ld\n",
1192 	      *transa, *transb, ailo, aihi, ajlo, ajhi,
1193 	      bilo, bihi, bjlo, bjhi, cilo, cihi, cjlo, cjhi);
1194 
1195        free(tmpa); free(tmpb);
1196        /* after computing c locally, verify it with the values in g_c */
1197        tmpc2 = (DoubleComplex*)malloc( sizeof(DoubleComplex)* (m*n/factor+1));
1198        if(tmpc2==NULL) pnga_error("check_result: malloc failed for tmpc2", 0);
1199        clo[0] = cilo;
1200        clo[1] = cjlo;
1201        chi[0] = cihi;
1202        chi[1] = cjhi;
1203        pnga_get(g_c, clo, chi, tmpc2, &m);
1204 
1205 #define _GA_TOL_ 0.1 /* error tolerance */
1206 
1207        switch(atype) {
1208 
1209 	  case C_FLOAT:
1210 	    {
1211 	       float abs_value=0.0;
1212 	       for(i=0; i<m*n; i++) {
1213 		  abs_value = ((float*)tmpc_orig)[i] - ((float*)tmpc2)[i];
1214 		  if(abs_value > _GA_TOL_ || abs_value < -(_GA_TOL_)) {
1215 		     printf("Values are = %f : %f\n Alpha=%f Beta=%f\n",
1216 			    ((float*)tmpc_orig)[i], ((float*)tmpc2)[i],
1217 			    *((float*)alpha), *((float*)beta));
1218 		     pnga_error("Matmul (type:float) check failed", 0);
1219 		  }
1220 	       }
1221 	    }
1222 	    break;
1223 
1224 	  case C_DBL:
1225 	    {
1226 	       double abs_value=0.0;
1227 	       for(i=0; i<m*n; i++) {
1228 		  abs_value = ((double*)tmpc_orig)[i] - ((double*)tmpc2)[i];
1229 		  if(abs_value > _GA_TOL_ || abs_value < -(_GA_TOL_)) {
1230 		     printf("Values are = %lf : %lf\n Alpha=%lf Beta=%lf\n",
1231 			    ((double*)tmpc_orig)[i+j],	((double*)tmpc2)[i+j],
1232 			    *((double*)alpha),*((double*)beta));
1233 		     pnga_error("Matmul (type:double) check failed", 0);
1234 		  }
1235 	       }
1236 	    }
1237 	    break;
1238 
1239 	  case C_DCPL:
1240 	    {
1241 	       DoubleComplex abs_value;
1242 	       for(i=0; i<m*n; i++) {
1243 		  abs_value.real = tmpc_orig[i].real - tmpc2[i].real;
1244 		  abs_value.imag = tmpc_orig[i].imag - tmpc2[i].imag;
1245 		  if(abs_value.real>_GA_TOL_ || abs_value.real<-(_GA_TOL_) ||
1246 		     abs_value.imag>_GA_TOL_ || abs_value.imag<-(_GA_TOL_)) {
1247 		     printf("Values= %lf, %lf : %lf, %lf\n", tmpc_orig[i].real,
1248 			    tmpc_orig[i].imag,tmpc2[i].real,tmpc2[i].imag);
1249 		     pnga_error("Matmul (DoubleComplex) check failed", 0);
1250 		  }
1251 	       }
1252 	    }
1253 	    break;
1254 
1255 	  case C_SCPL:
1256 	    {
1257 	       SingleComplex abs_value;
1258 	       for(i=0; i<m*n; i++) {
1259 		  abs_value.real = ((SingleComplex*)tmpc_orig)[i].real - ((SingleComplex*)tmpc2)[i].real;
1260 		  abs_value.imag = ((SingleComplex*)tmpc_orig)[i].imag - ((SingleComplex*)tmpc2)[i].imag;
1261 		  if(abs_value.real>_GA_TOL_ || abs_value.real<-(_GA_TOL_) ||
1262 		     abs_value.imag>_GA_TOL_ || abs_value.imag<-(_GA_TOL_)) {
1263 		     printf("Values= %lf, %lf : %lf, %lf\n", ((SingleComplex*)tmpc_orig)[i].real,
1264 			    ((SingleComplex*)tmpc_orig)[i].imag,((SingleComplex*)tmpc2)[i].real,((SingleComplex*)tmpc2)[i].imag);
1265 		     pnga_error("Matmul (SingleComplex) check failed", 0);
1266 		  }
1267 	       }
1268 	    }
1269 	    break;
1270 
1271 	  default:
1272 	     pnga_error("ga_matmul_patch: wrong data type", atype);
1273        }
1274        printf("Matrix Multiplication check (m,n,k=%ld %ld %ld)...O.K\n",m,n,k);
1275        fflush(stdout);
1276        free(tmpc_orig); free(tmpc2);
1277        tmpc_orig = NULL;
1278     }
1279 }
1280 
1281 #endif
1282 
1283 /******************************************
1284  * PARALLEL DGEMM
1285  *     i.e.  C = alpha*A*B + beta*C
1286  ******************************************/
1287 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1288 #   pragma weak wnga_matmul = pnga_matmul
1289 #endif
pnga_matmul(transa,transb,alpha,beta,g_a,ailo,aihi,ajlo,ajhi,g_b,bilo,bihi,bjlo,bjhi,g_c,cilo,cihi,cjlo,cjhi)1290 void pnga_matmul(transa, transb, alpha, beta,
1291 	       g_a, ailo, aihi, ajlo, ajhi,
1292 	       g_b, bilo, bihi, bjlo, bjhi,
1293 	       g_c, cilo, cihi, cjlo, cjhi)
1294 
1295      Integer g_a, ailo, aihi, ajlo, ajhi;    /* patch of g_a */
1296      Integer g_b, bilo, bihi, bjlo, bjhi;    /* patch of g_b */
1297      Integer g_c, cilo, cihi, cjlo, cjhi;    /* patch of g_c */
1298      void    *alpha, *beta;
1299      char    *transa, *transb;
1300 {
1301     DoubleComplex *a=NULL, *b, *c, *a_ar[2], *b_ar[2], *c_ar[2];
1302     Integer adim1=0, adim2=0, bdim1=0, bdim2=0, cdim1=0, cdim2=0, dims[2];
1303     Integer atype, btype, ctype, rank, me= pnga_nodeid();
1304     Integer n, m, k, Ichunk, Kchunk, Jchunk;
1305     Integer loA[2]={0,0}, hiA[2]={0,0};
1306     Integer loB[2]={0,0}, hiB[2]={0,0};
1307     Integer loC[2]={0,0}, hiC[2]={0,0};
1308     int local_sync_begin,local_sync_end;
1309     short int need_scaling=SET,use_NB_matmul=SET;
1310     short int irregular=UNSET, use_armci_memory=UNSET;
1311     Integer a_grp=pnga_get_pgroup(g_a), b_grp=pnga_get_pgroup(g_b);
1312     Integer c_grp=pnga_get_pgroup(g_c);
1313     Integer numblocks;
1314     Integer clo[2], chi[2];
1315 
1316     /* OPTIMIZATIONS FLAGS. To unset an optimization, replace SET by UNSET) */
1317     CYCLIC_DISTR_OPT_FLAG  = UNSET;
1318     CONTIG_CHUNKS_OPT_FLAG = SET;
1319     DIRECT_ACCESS_OPT_FLAG = SET;
1320 
1321     local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
1322     _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
1323     if(local_sync_begin)pnga_pgroup_sync(a_grp);
1324 
1325 
1326     if (a_grp != b_grp || a_grp != c_grp)
1327        pnga_error("Arrays must be defined on same group",0L);
1328 # if 0 /* disabled. should not fail if there are non-overlapping patches*/
1329     /* check if C is different from A and B */
1330     if (g_c == g_a || g_c == g_b)
1331        pnga_error("Global Array C should be different from A and B", 0);
1332 #endif
1333 
1334     /**************************************************
1335      * Do All Sanity Checks
1336      **************************************************/
1337 
1338     /* Check to make sure all global arrays are of the same type */
1339     if (!(pnga_is_mirrored(g_a) == pnga_is_mirrored(g_b) &&
1340 	  pnga_is_mirrored(g_a) == pnga_is_mirrored(g_c))) {
1341        pnga_error("Processors do not match for all arrays",pnga_nnodes());
1342     }
1343 
1344     /* check if ranks are O.K. */
1345     pnga_inquire(g_a, &atype, &rank, dims);
1346     VECTORCHECK(rank, dims, adim1, adim2, ailo, aihi, ajlo, ajhi);
1347     pnga_inquire(g_b, &btype, &rank, dims);
1348     VECTORCHECK(rank, dims, bdim1, bdim2, bilo, bihi, bjlo, bjhi);
1349     pnga_inquire(g_c, &ctype, &rank, dims);
1350     VECTORCHECK(rank, dims, cdim1, cdim2, cilo, cihi, cjlo, cjhi);
1351 
1352     /* check for data-types mismatch */
1353     if(atype != btype || atype != ctype ) pnga_error(" types mismatch ", 0L);
1354     if(atype != C_DCPL && atype != C_DBL && atype != C_FLOAT && atype!=C_SCPL)
1355        pnga_error(" type error",atype);
1356 
1357     /* check if patch indices and dims match */
1358     if (*transa == 'n' || *transa == 'N'){
1359        if (ailo <= 0 || aihi > adim1 || ajlo <= 0 || ajhi > adim2)
1360 	  pnga_error("  g_a indices out of range ", g_a);
1361     }else
1362        if (ailo <= 0 || aihi > adim2 || ajlo <= 0 || ajhi > adim1)
1363 	  pnga_error("  g_a indices out of range ", g_a);
1364 
1365     if (*transb == 'n' || *transb == 'N'){
1366        if (bilo <= 0 || bihi > bdim1 || bjlo <= 0 || bjhi > bdim2)
1367 	  pnga_error("  g_b indices out of range ", g_b);
1368     }else
1369        if (bilo <= 0 || bihi > bdim2 || bjlo <= 0 || bjhi > bdim1)
1370 	  pnga_error("  g_b indices out of range ", g_b);
1371 
1372     if (cilo <= 0 || cihi > cdim1 || cjlo <= 0 || cjhi > cdim2)
1373        pnga_error("  g_c indices out of range ", g_c);
1374 
1375     /* verify if patch dimensions are consistent */
1376     m = aihi - ailo +1;
1377     n = bjhi - bjlo +1;
1378     k = ajhi - ajlo +1;
1379     if( (cihi - cilo +1) != m) pnga_error(" a & c dims error",m);
1380     if( (cjhi - cjlo +1) != n) pnga_error(" b & c dims error",n);
1381     if( (bihi - bilo +1) != k) pnga_error(" a & b dims error",k);
1382 
1383 #if DEBUG_
1384     if(me==0) check_result(0, transa, transb, alpha, beta, atype,
1385 			   g_a, ailo, aihi, ajlo, ajhi,
1386 			   g_b, bilo, bihi, bjlo, bjhi,
1387 			   g_c, cilo, cihi, cjlo, cjhi);
1388     pnga_sync();
1389 #endif
1390 
1391     /* switch to various matmul algorithms here. more to come */
1392     if( GA[GA_OFFSET + g_c].irreg == 1 ||
1393 	GA[GA_OFFSET + g_b].irreg == 1 ||
1394 	GA[GA_OFFSET + g_a].irreg == 1 ||
1395 	_gai_matmul_patch_flag == SET) irregular = SET;
1396 
1397     /* even ga_dgemm is called, m,n & k might not match GA dimensions */
1398     pnga_inquire(g_c, &ctype, &rank, dims);
1399     if(dims[0] != m || dims[1] != n) irregular = SET; /* C matrix dims */
1400 
1401     if(!irregular) {
1402        if((adim1=GA_Cluster_nnodes()) > 1) use_NB_matmul = SET;
1403        else {
1404 	  use_NB_matmul = UNSET;
1405 	  CONTIG_CHUNKS_OPT_FLAG = UNSET;
1406 	  DIRECT_ACCESS_OPT_FLAG = UNSET;
1407        }
1408 #    if defined(__crayx1) || defined(NEC)
1409        use_NB_matmul = UNSET;
1410 #    endif
1411     }
1412 
1413     /* if block cyclic, then use regular algorithm. This is turned on for now
1414      * to test block cyclic */
1415     numblocks = pnga_total_blocks(g_c);
1416     if(numblocks>=0) {
1417 #if 0
1418        irregular     = UNSET;
1419        use_NB_matmul = SET;
1420 #else
1421        loA[0] = ailo;
1422        loA[1] = ajlo;
1423        hiA[0] = aihi;
1424        hiA[1] = ajhi;
1425        loB[0] = bilo;
1426        loB[1] = bjlo;
1427        hiB[0] = bihi;
1428        hiB[1] = bjhi;
1429        loC[0] = cilo;
1430        loC[1] = cjlo;
1431        hiC[0] = cihi;
1432        hiC[1] = cjhi;
1433        pnga_matmul_basic(transa, transb, alpha, beta, g_a, loA, hiA,
1434          g_b, loB, hiB, g_c, loC, hiC);
1435        return;
1436 #endif
1437     }
1438 
1439     /****************************************************************
1440      * Get the memory (i.e.static or dynamic) for temporary buffers
1441      ****************************************************************/
1442 
1443     /* to skip accumulate and exploit data locality:
1444        get chunks according to "C" matrix distribution*/
1445     pnga_distribution(g_a, me, loA, hiA);
1446     pnga_distribution(g_b, me, loB, hiB);
1447     pnga_distribution(g_c, me, loC, hiC);
1448 
1449        {
1450 	  Integer elems, factor=sizeof(DoubleComplex)/GAsizeofM(atype);
1451 	  short int nbuf=1;
1452 	  DoubleComplex *tmp = NULL;
1453 
1454 	  Ichunk = GA_MIN( (hiC[0]-loC[0]+1), (hiA[0]-loA[0]+1) );
1455 	  Jchunk = GA_MIN( (hiC[1]-loC[1]+1), (hiB[1]-loB[1]+1) );
1456 	  Kchunk = GA_MIN( (hiA[1]-loA[1]+1), (hiB[0]-loB[0]+1) );
1457 
1458 #if KCHUNK_OPTIMIZATION /*works great for m=1000,n=1000,k=4000 kinda cases*/
1459 	  pnga_distribution(g_a, me, loC, hiC);
1460 	  Kchunk = hiC[1]-loC[1]+1;
1461 	  pnga_distribution(g_b, me, loC, hiC);
1462 	  Kchunk = GA_MIN(Kchunk, (hiC[0]-loC[0]+1));
1463 #endif
1464 
1465 	  /* Just to avoid divide by zero error */
1466           if(Ichunk<=0) Ichunk = 1;
1467           if(Jchunk<=0) Jchunk = 1;
1468           if(Kchunk<=0) Kchunk = 1;
1469 
1470 	  {
1471 	     Integer irreg=0;
1472 	     if(Ichunk/Kchunk > GA_ASPECT_RATIO || Kchunk/Ichunk > GA_ASPECT_RATIO ||
1473 		Jchunk/Kchunk > GA_ASPECT_RATIO || Kchunk/Jchunk > GA_ASPECT_RATIO) {
1474                 irreg = SET;
1475              }
1476 	     pnga_pgroup_gop(a_grp, pnga_type_f2c(MT_F_INT), &irreg, (Integer)1, "max");
1477 	     if(irreg==SET) irregular = SET;
1478 	  }
1479 
1480 	  /* If non-blocking, we need 2 temporary buffers for A and B matrix */
1481 	  if(use_NB_matmul) nbuf = 2;
1482 
1483 	  if(!irregular) {
1484 	     tmp = a_ar[0] =a=gai_get_armci_memory(Ichunk,Jchunk,Kchunk,
1485 						   nbuf, atype);
1486 	     if(tmp != NULL) use_armci_memory = SET;
1487 	  }
1488 
1489 	  /* get ChunkSize (i.e.BlockSize), that fits in temporary buffer */
1490 	  gai_get_chunk_size(irregular, &Ichunk, &Jchunk, &Kchunk, &elems,
1491 			     atype, m, n, k, nbuf, use_armci_memory, a_grp);
1492 
1493 	  if(tmp == NULL) { /* try once again from armci for new chunk sizes */
1494 	     tmp = a_ar[0] =a=gai_get_armci_memory(Ichunk,Jchunk,Kchunk,
1495 						   nbuf, atype);
1496 	     if(tmp != NULL) use_armci_memory = SET;
1497 	  }
1498 
1499 	  if(tmp == NULL) { /*if armci malloc fails again, then get from MA */
1500 	     tmp = a_ar[0] = a =(DoubleComplex*) ga_malloc(elems,atype,
1501 							   "GA mulmat bufs");
1502 	  }
1503 
1504 	  if(use_NB_matmul) tmp = a_ar[1] = a_ar[0] + (Ichunk*Kchunk)/factor+1;
1505 
1506 	  tmp = b_ar[0] = b = tmp + (Ichunk*Kchunk)/factor + 1;
1507 	  if(use_NB_matmul) tmp = b_ar[1] = b_ar[0] + (Kchunk*Jchunk)/factor+1;
1508 
1509 	  c_ar[0] = c = tmp + (Kchunk*Jchunk)/factor + 1;
1510        }
1511 
1512        /** check if there is a need for scaling the data.
1513 	   Note: if beta=0, then need_scaling=0  */
1514        if(atype==C_DCPL){
1515 	  if((((DoubleComplex*)beta)->real == 0) &&
1516 	     (((DoubleComplex*)beta)->imag ==0)) need_scaling =0;}
1517        else if(atype==C_SCPL){
1518 	  if((((SingleComplex*)beta)->real == 0) &&
1519 	     (((SingleComplex*)beta)->imag ==0)) need_scaling =0;}
1520        else if(atype==C_DBL){
1521 	  if(*(DoublePrecision *)beta == 0) need_scaling =0;}
1522        else if( *(float*)beta ==0) need_scaling =0;
1523 
1524        clo[0] = cilo; clo[1] = cjlo;
1525        chi[0] = cihi; chi[1] = cjhi;
1526        if(need_scaling) pnga_scale_patch(g_c, clo, chi, beta);
1527 
1528        /********************************************************************
1529 	* Parallel Matrix Multiplication Starts Here.
1530 	* 3 Steps:
1531 	*    1. Get a chunk of A and B matrix, and store it in local buffer.
1532 	*    2. Do sequential dgemm.
1533 	*    3. Put/accumulate the result into C matrix.
1534 	*********************************************************************/
1535 
1536        /* if only one node, then enable the optimized shmem code */
1537        if(use_NB_matmul==UNSET) {
1538 	  gai_matmul_shmem(transa, transb, alpha, beta, atype,
1539 			   g_a, ailo, aihi, ajlo, ajhi,
1540 			   g_b, bilo, bihi, bjlo, bjhi,
1541 			   g_c, cilo, cihi, cjlo, cjhi,
1542 			   Ichunk, Kchunk, Jchunk, a,b,c, need_scaling);
1543        }
1544        else {
1545 	  if(irregular)
1546 	     gai_matmul_irreg(transa, transb, alpha, beta, atype,
1547 			      g_a, ailo, aihi, ajlo, ajhi,
1548 			      g_b, bilo, bihi, bjlo, bjhi,
1549 			      g_c, cilo, cihi, cjlo, cjhi,
1550 			      Ichunk, Kchunk, Jchunk, a_ar, b_ar, c_ar,
1551 			      need_scaling, irregular);
1552 	  else {
1553 	     gai_matmul_regular(transa, transb, alpha, beta, atype,
1554 				g_a, ailo, aihi, ajlo, ajhi,
1555 				g_b, bilo, bihi, bjlo, bjhi,
1556 				g_c, cilo, cihi, cjlo, cjhi,
1557 				Ichunk, Kchunk, Jchunk, a_ar, b_ar, c_ar,
1558 				need_scaling, irregular);
1559      }
1560        }
1561 
1562        a = a_ar[0];
1563        if(use_armci_memory == SET) ARMCI_Free_local(a);
1564        else ga_free(a);
1565 
1566 #if DEBUG_
1567        Integer grp_me;
1568        grp_me = pnga_pgroup_nodeid(a_grp);
1569        pnga_pgroup_sync(a_grp);
1570        if(me==0) check_result(1, transa, transb, alpha, beta, atype,
1571 			      g_a, ailo, aihi, ajlo, ajhi,
1572 			      g_b, bilo, bihi, bjlo, bjhi,
1573 			      g_c, cilo, cihi, cjlo, cjhi);
1574        pnga_pgroup_sync(a_grp);
1575 #endif
1576 
1577        if(local_sync_end)pnga_pgroup_sync(a_grp);
1578 }
1579 
1580 /* This is the old matmul code. It is enabled now for mirrored matrix multiply.
1581    It also work for normal matrix/vector multiply with no changes */
1582 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1583 #   pragma weak wnga_matmul_mirrored = pnga_matmul_mirrored
1584 #endif
pnga_matmul_mirrored(transa,transb,alpha,beta,g_a,ailo,aihi,ajlo,ajhi,g_b,bilo,bihi,bjlo,bjhi,g_c,cilo,cihi,cjlo,cjhi)1585 void pnga_matmul_mirrored(transa, transb, alpha, beta,
1586 			g_a, ailo, aihi, ajlo, ajhi,
1587 			g_b, bilo, bihi, bjlo, bjhi,
1588 			g_c, cilo, cihi, cjlo, cjhi)
1589 
1590      Integer g_a, ailo, aihi, ajlo, ajhi;    /* patch of g_a */
1591      Integer g_b, bilo, bihi, bjlo, bjhi;    /* patch of g_b */
1592      Integer g_c, cilo, cihi, cjlo, cjhi;    /* patch of g_c */
1593      void    *alpha, *beta;
1594      char    *transa, *transb;
1595 {
1596 
1597     #ifdef STATBUF
1598   /* approx. sqrt(2) ratio in chunk size to use the same buffer space */
1599    DoubleComplex a[ICHUNK*KCHUNK], b[KCHUNK*JCHUNK], c[ICHUNK*JCHUNK];
1600 #else
1601    DoubleComplex *a, *b, *c;
1602 #endif
1603 Integer atype, btype, ctype, adim1=0, adim2=0, bdim1=0, bdim2=0, cdim1=0, cdim2=0, dims[2], rank;
1604 Integer me= pnga_nodeid(), nproc;
1605 Integer i, ijk = 0, i0, i1, j0, j1;
1606 Integer ilo, ihi, idim, jlo, jhi, jdim, klo, khi, kdim;
1607 Integer n, m, k, adim, bdim=0, cdim;
1608 Integer Ichunk, Kchunk, Jchunk;
1609 DoubleComplex ONE;
1610 SingleComplex ONE_CF;
1611 
1612 DoublePrecision chunk_cube;
1613 Integer min_tasks = 10, max_chunk;
1614 int need_scaling=1;
1615 Integer ZERO_I = 0, inode, iproc;
1616 Integer get_new_B;
1617 int local_sync_begin,local_sync_end;
1618 BlasInt idim_t, jdim_t, kdim_t, adim_t, bdim_t, cdim_t;
1619 Integer clo[2], chi[2];
1620 
1621    ONE.real =1.; ONE.imag =0.;
1622    ONE_CF.real =1.; ONE_CF.imag =0.;
1623 
1624    local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
1625    _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
1626    if(local_sync_begin)pnga_sync();
1627 
1628 
1629    /* Check to make sure all global arrays are of the same type */
1630    if (!(pnga_is_mirrored(g_a) == pnga_is_mirrored(g_b) &&
1631         pnga_is_mirrored(g_a) == pnga_is_mirrored(g_c))) {
1632      pnga_error("Processors do not match for all arrays",pnga_nnodes());
1633    }
1634    if (pnga_is_mirrored(g_a)) {
1635      inode = pnga_cluster_nodeid();
1636      nproc = pnga_cluster_nprocs(inode);
1637      iproc = me - pnga_cluster_procid(inode, ZERO_I);
1638    } else {
1639      nproc = pnga_nnodes();
1640      iproc = me;
1641    }
1642 
1643    pnga_inquire(g_a, &atype, &rank, dims);
1644    VECTORCHECK(rank, dims, adim1, adim2, ailo, aihi, ajlo, ajhi);
1645    pnga_inquire(g_b, &btype, &rank, dims);
1646    VECTORCHECK(rank, dims, bdim1, bdim2, bilo, bihi, bjlo, bjhi);
1647    pnga_inquire(g_c, &ctype, &rank, dims);
1648    VECTORCHECK(rank, dims, cdim1, cdim2, cilo, cihi, cjlo, cjhi);
1649 
1650    if(atype != btype || atype != ctype ) pnga_error(" types mismatch ", 0L);
1651    if(atype != C_DCPL && atype != C_DBL && atype != C_FLOAT && atype != C_SCPL)
1652      pnga_error(" type error",atype);
1653 
1654 
1655 
1656    /* check if patch indices and dims match */
1657    if (*transa == 'n' || *transa == 'N'){
1658      if (ailo <= 0 || aihi > adim1 || ajlo <= 0 || ajhi > adim2)
1659        pnga_error("  g_a indices out of range ", g_a);
1660    }else
1661      if (ailo <= 0 || aihi > adim2 || ajlo <= 0 || ajhi > adim1)
1662        pnga_error("  g_a indices out of range ", g_a);
1663 
1664    if (*transb == 'n' || *transb == 'N'){
1665      if (bilo <= 0 || bihi > bdim1 || bjlo <= 0 || bjhi > bdim2)
1666        pnga_error("  g_b indices out of range ", g_b);
1667    }else
1668      if (bilo <= 0 || bihi > bdim2 || bjlo <= 0 || bjhi > bdim1)
1669        pnga_error("  g_b indices out of range ", g_b);
1670 
1671    if (cilo <= 0 || cihi > cdim1 || cjlo <= 0 || cjhi > cdim2)
1672      pnga_error("  g_c indices out of range ", g_c);
1673 
1674    /* verify if patch dimensions are consistent */
1675    m = aihi - ailo +1;
1676    n = bjhi - bjlo +1;
1677    k = ajhi - ajlo +1;
1678    if( (cihi - cilo +1) != m) pnga_error(" a & c dims error",m);
1679    if( (cjhi - cjlo +1) != n) pnga_error(" b & c dims error",n);
1680    if( (bihi - bilo +1) != k) pnga_error(" a & b dims error",k);
1681 
1682 
1683    /* In 32-bit platforms, k*m*n might exceed the "long" range(2^31),
1684       eg:k=m=n=1600. So casting the temporary value to "double" helps */
1685    chunk_cube = (k*(double)(m*n)) / (min_tasks * nproc);
1686    max_chunk = (Integer)pow(chunk_cube, (DoublePrecision)(1.0/3.0) );
1687    if (max_chunk < 32) max_chunk = 32;
1688 
1689 #ifdef STATBUF
1690    if(atype ==  C_DBL || atype == C_FLOAT){
1691       Ichunk=D_CHUNK, Kchunk=D_CHUNK, Jchunk=D_CHUNK;
1692    }else{
1693       Ichunk=ICHUNK; Kchunk=KCHUNK; Jchunk=JCHUNK;
1694    }
1695 #else
1696    {
1697      /**
1698       * Find out how much memory we can grab.  It will be used in
1699       * three chunks, and the result includes only the first one.
1700       */
1701 
1702      Integer elems, factor = sizeof(DoubleComplex)/GAsizeofM(atype);
1703      Ichunk = Jchunk = Kchunk = CHUNK_SIZE;
1704 
1705      if ( max_chunk > Ichunk) {
1706        /*if memory if very limited, performance degrades for large matrices
1707 	 as chunk size is very small, which leads to communication overhead)*/
1708        Integer avail = pnga_memory_avail_type(atype);
1709        if (pnga_is_mirrored(g_a)) {
1710          fflush(stdout);
1711          if (sizeof(Integer)/sizeof(int) > 1)
1712            armci_msg_gop_scope(SCOPE_NODE, &avail, 1, "min", ARMCI_LONG);
1713          else
1714            armci_msg_gop_scope(SCOPE_NODE, &avail, 1, "min", ARMCI_INT);
1715          fflush(stdout);
1716        } else {
1717          fflush(stdout);
1718          pnga_gop(pnga_type_f2c(MT_F_INT), &avail, (Integer)1, "min");
1719        }
1720        if(avail<MINMEM && pnga_nodeid()==0) pnga_error("NotEnough memory",avail);
1721        elems = (Integer)(avail*0.9); /* Donot use every last drop */
1722 
1723        max_chunk=GA_MIN(max_chunk, (Integer)(sqrt( (double)((elems-EXTRA)/3))));
1724        Ichunk = GA_MIN(m,max_chunk);
1725        Jchunk = GA_MIN(n,max_chunk);
1726        Kchunk = GA_MIN(k,max_chunk);
1727      }
1728      else /* "EXTRA" elems for safety - just in case */
1729        elems = 3*Ichunk*Jchunk + EXTRA*factor;
1730 
1731      a = (DoubleComplex*) ga_malloc(elems, atype, "GA mulmat bufs");
1732      b = a + (Ichunk*Kchunk)/factor + 1;
1733      c = b + (Kchunk*Jchunk)/factor + 1;
1734    }
1735 #endif
1736 
1737    if(atype==C_DCPL){if((((DoubleComplex*)beta)->real == 0) &&
1738 	       (((DoubleComplex*)beta)->imag ==0)) need_scaling =0;}
1739    else if(atype==C_SCPL){if((((SingleComplex*)beta)->real == 0) &&
1740 	       (((SingleComplex*)beta)->imag ==0)) need_scaling =0;}
1741    else if(atype==C_DBL){if(*(DoublePrecision *)beta == 0) need_scaling =0;}
1742    else if( *(float*)beta ==0) need_scaling =0;
1743 
1744    pnga_mask_sync(ZERO_I, ZERO_I);
1745    clo[0] = cilo; clo[1] = cjlo;
1746    chi[0] = cihi; chi[1] = cjhi;
1747    if(need_scaling) pnga_scale_patch(g_c, clo, chi, beta);
1748    else  pnga_fill_patch(g_c, clo, chi, beta);
1749 
1750    for(jlo = 0; jlo < n; jlo += Jchunk){ /* loop through columns of g_c patch */
1751        jhi = GA_MIN(n-1, jlo+Jchunk-1);
1752        jdim= jhi - jlo +1;
1753 
1754        for(klo = 0; klo < k; klo += Kchunk){    /* loop cols of g_a patch */
1755 	 khi = GA_MIN(k-1, klo+Kchunk-1);          /* loop rows of g_b patch */
1756 	 kdim= khi - klo +1;
1757 
1758 	 /** Each pass through the outer two loops means we need a
1759 	     different patch of B.*/
1760 	 get_new_B = TRUE;
1761 
1762 	 for(ilo = 0; ilo < m; ilo += Ichunk){ /*loop through rows of g_c patch */
1763 
1764 	   if(ijk%nproc == iproc){
1765 
1766 	     ihi = GA_MIN(m-1, ilo+Ichunk-1);
1767 	     idim= cdim = ihi - ilo +1;
1768 
1769 	     if(atype == C_FLOAT)
1770 	       for (i = 0; i < idim*jdim; i++) *(((float*)c)+i)=0;
1771 	     else if(atype ==  C_DBL)
1772 	       for (i = 0; i < idim*jdim; i++) *(((double*)c)+i)=0;
1773 	     else if(atype ==  C_SCPL)
1774 	       for (i = 0; i < idim*jdim; i++){
1775                  ((SingleComplex*)c)[i].real=0;
1776                  ((SingleComplex*)c)[i].imag=0;
1777                }
1778 	     else
1779 	       for (i = 0; i < idim*jdim; i++){ c[i].real=0;c[i].imag=0;}
1780 
1781 	     if (*transa == 'n' || *transa == 'N'){
1782 	       adim = idim;
1783 	       i0= ailo+ilo; i1= ailo+ihi;
1784 	       j0= ajlo+klo; j1= ajlo+khi;
1785           clo[0] = i0;
1786           clo[1] = j0;
1787           chi[0] = i1;
1788           chi[1] = j1;
1789 	       pnga_get(g_a, clo, chi, a, &idim);
1790 	     }else{
1791 	       adim = kdim;
1792 	       i0= ajlo+klo; i1= ajlo+khi;
1793 	       j0= ailo+ilo; j1= ailo+ihi;
1794           clo[0] = i0;
1795           clo[1] = j0;
1796           chi[0] = i1;
1797           chi[1] = j1;
1798 	       pnga_get(g_a, clo, chi, a, &kdim);
1799 	     }
1800 
1801 
1802 	     /* Avoid rereading B if it is the same patch as last time. */
1803         if(get_new_B) {
1804           if (*transb == 'n' || *transb == 'N'){
1805             bdim = kdim;
1806             i0= bilo+klo; i1= bilo+khi;
1807             j0= bjlo+jlo; j1= bjlo+jhi;
1808             clo[0] = i0;
1809             clo[1] = j0;
1810             chi[0] = i1;
1811             chi[1] = j1;
1812             pnga_get(g_b, clo, chi, b, &kdim);
1813           }else{
1814             bdim = jdim;
1815             i0= bjlo+jlo; i1= bjlo+jhi;
1816             j0= bilo+klo; j1= bilo+khi;
1817             clo[0] = i0;
1818             clo[1] = j0;
1819             chi[0] = i1;
1820             chi[1] = j1;
1821             pnga_get(g_b, clo, chi, b, &jdim);
1822           }
1823           get_new_B = FALSE; /* Until J or K change again */
1824         }
1825 
1826 
1827 	     idim_t=idim; jdim_t=jdim; kdim_t=kdim;
1828 	     adim_t=adim; bdim_t=bdim; cdim_t=cdim;
1829 
1830 	     switch(atype) {
1831 	     case C_FLOAT:
1832 	       BLAS_SGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
1833 		      (Real *)alpha, (Real *)a, &adim_t,
1834               (Real *)b, &bdim_t, (Real *)&ONE_CF,
1835               (Real *)c, &cdim_t);
1836 	       break;
1837 	     case C_DBL:
1838 	       BLAS_DGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
1839 		      (DoublePrecision *)alpha, (DoublePrecision *)a, &adim_t,
1840               (DoublePrecision *)b, &bdim_t, (DoublePrecision *)&ONE,
1841               (DoublePrecision *)c, &cdim_t);
1842 	       break;
1843 	     case C_DCPL:
1844 	       BLAS_ZGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
1845 		      (DoubleComplex *)alpha, (DoubleComplex *)a, &adim_t,
1846               (DoubleComplex *)b, &bdim_t, (DoubleComplex *)&ONE,
1847               (DoubleComplex *)c, &cdim_t);
1848 	       break;
1849 	     case C_SCPL:
1850 	       BLAS_CGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
1851 		      (SingleComplex *)alpha, (SingleComplex *)a, &adim_t,
1852               (SingleComplex *)b, &bdim_t, (SingleComplex *)&ONE_CF,
1853               (SingleComplex *)c, &cdim_t);
1854 	       break;
1855 	     default:
1856 	       pnga_error("ga_matmul_patch: wrong data type", atype);
1857 	     }
1858 
1859 	     i0= cilo+ilo; i1= cilo+ihi;   j0= cjlo+jlo; j1= cjlo+jhi;
1860 	     if(atype == C_FLOAT || atype == C_SCPL)  {
1861           clo[0] = i0;
1862           clo[1] = j0;
1863           chi[0] = i1;
1864           chi[1] = j1;
1865 	       pnga_acc(g_c, clo, chi, (float *)c, &cdim, &ONE_CF);
1866         } else {
1867           clo[0] = i0;
1868           clo[1] = j0;
1869           chi[0] = i1;
1870           chi[1] = j1;
1871 	       pnga_acc(g_c, clo, chi, (DoublePrecision*)c, &cdim, (DoublePrecision*)&ONE);
1872         }
1873 	   }
1874 	   ++ijk;
1875 	 }
1876        }
1877    }
1878 
1879 #ifndef STATBUF
1880    ga_free(a);
1881 #endif
1882 
1883    if(local_sync_end)pnga_sync();
1884 
1885 }
1886 
1887 
1888 #if 0
1889 void gai_matmul_patch(char *transa, char *transb, void *alpha, void *beta,
1890         Integer g_a,Integer ailo,Integer aihi,Integer ajlo,Integer ajhi,
1891         Integer g_b,Integer bilo,Integer bihi,Integer bjlo,Integer bjhi,
1892         Integer g_c,Integer cilo,Integer cihi,Integer cjlo,Integer cjhi)
1893 {
1894     if(pnga_is_mirrored(g_a))
1895        pnga_matmul_mirrored(transa, transb, alpha, beta,
1896 			  g_a, ailo, aihi, ajlo, ajhi,
1897 			  g_b, bilo, bihi, bjlo, bjhi,
1898 			  g_c, cilo, cihi, cjlo, cjhi);
1899     else {
1900        _gai_matmul_patch_flag = SET;
1901        pnga_matmul(transa, transb, alpha, beta,
1902 		 g_a, ailo, aihi, ajlo, ajhi,
1903 		 g_b, bilo, bihi, bjlo, bjhi,
1904 		 g_c, cilo, cihi, cjlo, cjhi);
1905        _gai_matmul_patch_flag = UNSET;
1906     }
1907 }
1908 #endif
1909 
1910 
1911 /*\ select the 2d plane to be used in matrix multiplication
1912  *  rank: dimension of original array
1913  *  trans: character signifying whether to use transpose
1914  *  dims: dimensions of original array
1915  *  lo, hi: bounds of patch
1916  *  ipos: position of first patch dimension greater than 1
1917  *  jpos: position of second patch dimension greater than 1
1918  *  vec_idx: indicator of which dimension data is originally located for vectors
1919  *  (only applies to transpose flag)
1920 \*/
gai_setup_2d_patch(Integer rank,char * trans,Integer dims[],Integer lo[],Integer hi[],Integer * ilo,Integer * ihi,Integer * jlo,Integer * jhi,Integer * dim1,Integer * dim2,int * ipos,int * jpos,int * vpos,int vec_idx)1921   static void  gai_setup_2d_patch(Integer rank, char *trans, Integer dims[],
1922                                 Integer lo[], Integer hi[],
1923                                 Integer* ilo, Integer* ihi,
1924                                 Integer* jlo, Integer* jhi,
1925                                 Integer* dim1, Integer* dim2,
1926                                 int* ipos, int* jpos, int *vpos, int vec_idx)
1927 {
1928     int d,e=0,f=0;
1929     char t='n';
1930     int reset_t = 0;
1931     int tmp;
1932 
1933     /* check to see if more than two dimensions of patch have
1934      * size greater than 1*/
1935     for(d=0; d<rank; d++)
1936        if( (hi[d]-lo[d])>0 && ++e>2 ) pnga_error("3-D Patch Detected", 0L);
1937     /* determine two locations where patch dimensions are greater than 1*/
1938     *ipos = *jpos = *vpos = -1;
1939     for(d=0; d<rank; d++){
1940        if( (*ipos <0) && (hi[d]>lo[d]) ) { *ipos =d; continue; }
1941        if( (*ipos >=0) && (hi[d]>lo[d])) { *jpos =d; break; }
1942     }
1943     /* we have an ambiguous vector; mark its location */
1944     if (1 == e && *ipos != 0 && *ipos != rank-1) {
1945         *vpos = *ipos;
1946     }
1947 
1948     /*    if(*ipos >*jpos){Integer t=*ipos; *ipos=*jpos; *jpos=t;}
1949      */
1950 
1951     /* single element case (trivial) */
1952     if((*ipos <0) && (*jpos <0)){ *ipos =0; *jpos=1; }
1953     else{
1954 
1955        if(trans == NULL) {
1956          trans = &t;
1957          reset_t = 1;
1958        }
1959        /* handle almost trivial case of only one dimension with >1 elements.
1960         * None of the conditions in this block are true for a 2D block with more
1961         * than 1 element in each of the 2 dimensions so no need to specify
1962         * condition that e == 1 */
1963        if(*trans == 'n' || *trans == 'N') {
1964           if(*ipos == rank-1) (*ipos)--; /* i cannot be the last dimension */
1965           if(*ipos <0) *ipos = *jpos-1; /* select i dimension based on j */
1966           if(*jpos <0) *jpos = *ipos+1; /* select j dimenison based on i */
1967        }
1968        else {
1969           if(*ipos <0) *ipos = *jpos-1; /* this condition is probably never reached */
1970           if(*jpos <0) {
1971             if (vec_idx < 0) {
1972               if(*ipos==0) *jpos = *ipos + 1;
1973               else         *jpos = (*ipos)--;
1974             } else {
1975               if (*ipos < vec_idx) {
1976                 *jpos = vec_idx;
1977               } else if (*ipos > vec_idx) {
1978                 *jpos = *ipos;
1979                 *ipos = vec_idx;
1980               } else {
1981                 /* this is probably an error if you get this far */
1982                 if (*ipos > 0) {
1983                   *jpos = *ipos;
1984                   *ipos--;
1985                 } else {
1986                   *jpos = 1;
1987                   *ipos = 0;
1988                 }
1989               }
1990             }
1991           }
1992        }
1993        if (reset_t) trans = NULL;
1994     }
1995 
1996     *ilo = lo[*ipos]; *ihi = hi[*ipos];
1997     *jlo = lo[*jpos]; *jhi = hi[*jpos];
1998     *dim1 = dims[*ipos];
1999     *dim2 = dims[*jpos];
2000 #if 0
2001     printf("lo/hi=[%ld:%ld", lo[0], hi[0]);
2002     for (d=1; d<rank; ++d) {
2003         printf(",%ld:%ld", lo[d], hi[d]);
2004     }
2005     printf("]\n");
2006     printf("size=[%ld", hi[0]-lo[0]+1);
2007     for (d=1; d<rank; ++d) {
2008         printf(",%ld", hi[d]-lo[d]+1);
2009     }
2010     printf("]\n");
2011     printf("gai_setup_2d_patch(%ld, T, X, X, X, %ld, %ld, %ld, %ld, %ld, %ld, %d, %d, %d)\n",
2012             rank, *ilo, *ihi, *jlo, *jhi, *dim1, *dim2, *ipos, *jpos, *vpos);
2013 #endif
2014 }
2015 
2016 #define  SETINT(tmp,val,n) {int _i; for(_i=0;_i<n; _i++)tmp[_i]=val;}
2017 
2018 /**
2019  *  The indices avec_loc, and bvec_loc art designed to handle the special case
2020  *  that the patch requested is a 1D vector and the original array contains 3 or
2021  *  more dimensions. The original interface cannot handle this case. The
2022  *  avec_pos parameter specifies which dimension the original patch (before
2023  *  transpose) lies on.
2024  */
gai_matmul_patch(char * transa,char * transb,void * alpha,void * beta,Integer g_a,Integer alo[],Integer ahi[],int avec_pos,Integer g_b,Integer blo[],Integer bhi[],int bvec_pos,Integer g_c,Integer clo[],Integer chi[])2025 void gai_matmul_patch(char *transa, char *transb, void *alpha, void *beta,
2026     Integer g_a, Integer alo[], Integer ahi[], int avec_pos,
2027     Integer g_b, Integer blo[], Integer bhi[], int bvec_pos,
2028     Integer g_c, Integer clo[], Integer chi[])
2029 {
2030 #ifdef STATBUF
2031    DoubleComplex a[ICHUNK*KCHUNK], b[KCHUNK*JCHUNK], c[ICHUNK*JCHUNK];
2032 #else
2033    DoubleComplex *a, *b, *c;
2034 #endif
2035 Integer atype, btype, ctype, adim1, adim2, bdim1, bdim2, cdim1, cdim2;
2036 Integer me= pnga_nodeid(), nproc, inode, iproc;
2037 Integer i, ijk = 0, i0, i1, j0, j1;
2038 Integer ilo, ihi, idim, jlo, jhi, jdim, klo, khi, kdim;
2039 Integer n, m, k, k2, cm, cn, adim, bdim, cdim, arank, brank, crank;
2040 int aipos, ajpos, bipos, bjpos,cipos, cjpos, need_scaling=1;
2041 int avpos, bvpos, cvpos;
2042 Integer Ichunk, Kchunk, Jchunk;
2043 Integer ailo, aihi, ajlo, ajhi;    /* 2d plane of g_a */
2044 Integer bilo, bihi, bjlo, bjhi;    /* 2d plane of g_b */
2045 Integer cilo, cihi, cjlo, cjhi;    /* 2d plane of g_c */
2046 Integer adims[GA_MAX_DIM],bdims[GA_MAX_DIM],cdims[GA_MAX_DIM],tmpld[GA_MAX_DIM];
2047 Integer *tmplo = adims, *tmphi =bdims;
2048 DoubleComplex ONE;
2049 SingleComplex ONE_CF;
2050 Integer ZERO_I = 0;
2051 Integer get_new_B;
2052 DoublePrecision chunk_cube;
2053 Integer min_tasks = 10, max_chunk;
2054 int local_sync_begin,local_sync_end;
2055 BlasInt idim_t, jdim_t, kdim_t, adim_t, bdim_t, cdim_t;
2056 
2057    ONE.real =1.; ONE.imag =0.;
2058    ONE_CF.real =1.; ONE_CF.imag =0.;
2059 
2060    local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2061    _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2062    if(local_sync_begin)pnga_sync();
2063 
2064    if (pnga_total_blocks(g_a) > 0 || pnga_total_blocks(g_b) > 0 ||
2065        pnga_total_blocks(g_c) > 0) {
2066      pnga_matmul_basic(transa, transb, alpha, beta, g_a, alo, ahi,
2067          g_b, blo, bhi, g_c, clo, chi);
2068      return;
2069    }
2070 
2071    /* Check to make sure all global arrays are of the same type */
2072    if (!(pnga_is_mirrored(g_a) == pnga_is_mirrored(g_b) &&
2073         pnga_is_mirrored(g_a) == pnga_is_mirrored(g_c))) {
2074      pnga_error("Processors do not match for all arrays",pnga_nnodes());
2075    }
2076    if (pnga_is_mirrored(g_a)) {
2077      inode = pnga_cluster_nodeid();
2078      nproc = pnga_cluster_nprocs(inode);
2079      iproc = me - pnga_cluster_procid(inode, ZERO_I);
2080    } else {
2081      nproc = pnga_nnodes();
2082      iproc = me;
2083    }
2084 
2085    pnga_inquire(g_a, &atype, &arank, adims);
2086    pnga_inquire(g_b, &btype, &brank, bdims);
2087    pnga_inquire(g_c, &ctype, &crank, cdims);
2088 
2089    if(arank<2)  pnga_error("rank of A must be at least 2",arank);
2090    if(brank<2)  pnga_error("rank of B must be at least 2",brank);
2091    if(crank<2)  pnga_error("rank of C must be at least 2",crank);
2092 
2093    if(atype != btype || atype != ctype ) pnga_error(" types mismatch ", 0L);
2094    if(atype != C_DCPL && atype != C_DBL && atype != C_FLOAT && atype != C_SCPL)
2095      pnga_error(" type error",atype);
2096 
2097    gai_setup_2d_patch(arank, transa, adims, alo, ahi, &ailo, &aihi,
2098                       &ajlo, &ajhi, &adim1, &adim2, &aipos, &ajpos, &avpos, avec_pos);
2099    gai_setup_2d_patch(brank, transb, bdims, blo, bhi, &bilo, &bihi,
2100                       &bjlo, &bjhi, &bdim1, &bdim2, &bipos, &bjpos, &bvpos, bvec_pos);
2101    gai_setup_2d_patch(crank, NULL, cdims, clo, chi, &cilo, &cihi,
2102                       &cjlo, &cjhi, &cdim1, &cdim2, &cipos, &cjpos, &cvpos, -1);
2103 
2104 
2105    /* check if patch indices and dims match */
2106    if (*transa == 'n' || *transa == 'N'){
2107       if (ailo <= 0 || aihi > adim1 || ajlo <= 0 || ajhi > adim2)
2108          pnga_error("  g_a indices out of range ", g_a);
2109    }else
2110       if (ailo <= 0 || aihi > adim2 || ajlo <= 0 || ajhi > adim1)
2111          pnga_error("  g_a indices out of range ", g_a);
2112 
2113    if (*transb == 'n' || *transb == 'N'){
2114       if (bilo <= 0 || bihi > bdim1 || bjlo <= 0 || bjhi > bdim2)
2115           pnga_error("  g_b indices out of range ", g_b);
2116    }else
2117       if (bilo <= 0 || bihi > bdim2 || bjlo <= 0 || bjhi > bdim1)
2118           pnga_error("  g_b indices out of range ", g_b);
2119 
2120 
2121    if (cilo <= 0 || cihi > cdim1 || cjlo <= 0 || cjhi > cdim2)
2122        pnga_error("  g_c indices out of range ", g_c);
2123 
2124    /* verify if patch dimensions are consistent */
2125 #define RESET() do {   \
2126    m = aihi - ailo +1; \
2127    k = ajhi - ajlo +1; \
2128    k2= bihi - bilo +1; \
2129    n = bjhi - bjlo +1; \
2130    cm= cihi - cilo +1; \
2131    cn= cjhi - cjlo +1; \
2132 } while (0)
2133    RESET();
2134 #define SHIFT(L,INC) do {      \
2135    L##ipos+=INC;               \
2136    L##jpos+=INC;               \
2137    L##ilo = L##lo[L##ipos];    \
2138    L##ihi = L##hi[L##ipos];    \
2139    L##jlo = L##lo[L##jpos];    \
2140    L##jhi = L##hi[L##jpos];    \
2141    L##dim1 = L##dims[L##ipos]; \
2142    L##dim2 = L##dims[L##jpos]; \
2143    RESET();                    \
2144 } while (0)
2145    /* gai_setup_2d_patch may produce ambiguous vectors */
2146    if (!(m==cm && k==k2 && n==cn)) {
2147        /* patches don't agree */
2148        if (avpos>=0 && bvpos<0 && cvpos<0) {
2149            /* only A is an ambiguous vector */
2150            SHIFT(a,-1);
2151        }
2152        else if (avpos<0 && bvpos>=0 && cvpos<0) {
2153            /* only B is an ambiguous vector */
2154            SHIFT(b,-1);
2155        }
2156        else if (avpos<0 && bvpos<0 && cvpos>=0) {
2157            /* only C is an ambiguous vector */
2158            SHIFT(c,-1);
2159        }
2160        else if (avpos>=0 && bvpos>=0 && cvpos<0) {
2161            /* A and B are ambiguous vectors */
2162            if (m != cm) {
2163                SHIFT(a,-1);
2164            }
2165            if (n != cn) {
2166                SHIFT(b,-1);
2167            }
2168        }
2169        else if (avpos>=0 && bvpos<0 && cvpos>=0) {
2170            /* A and C are ambiguous vectors */
2171            if (k != k2) {
2172                SHIFT(a,-1);
2173            }
2174            if (n != cn) {
2175                SHIFT(c,-1);
2176            }
2177        }
2178        else if (avpos<0 && bvpos>=0 && cvpos>=0) {
2179            /* B and C are ambiguous vectors */
2180            if (k != k2) {
2181                SHIFT(b,-1);
2182            }
2183            if (m != cm) {
2184                SHIFT(c,-1);
2185            }
2186        }
2187        else if (avpos>=0 && bvpos>=0 && cvpos>=0) {
2188            /* A and B and C are ambiguous vectors */
2189            pnga_error("a and b and c ambiguous", 1);
2190        }
2191    }
2192    if( (cihi - cilo +1) != m) pnga_error(" a & c dims error",m);
2193    if( (cjhi - cjlo +1) != n) pnga_error(" b & c dims error",n);
2194    if( (bihi - bilo +1) != k) pnga_error(" a & b dims error",k);
2195 
2196    chunk_cube = (k*(double)(m*n)) / (min_tasks * nproc);
2197    max_chunk = (Integer)pow(chunk_cube, (DoublePrecision)(1.0/3.0) );
2198    if (max_chunk < 32) max_chunk = 32;
2199 
2200 #ifdef STATBUF
2201    if(atype ==  C_DBL || atype == C_FLOAT){
2202       Ichunk=D_CHUNK, Kchunk=D_CHUNK, Jchunk=D_CHUNK;
2203    }else{
2204       Ichunk=ICHUNK; Kchunk=KCHUNK; Jchunk=JCHUNK;
2205    }
2206 #else
2207    {
2208      Integer elems, factor = sizeof(DoubleComplex)/GAsizeofM(atype);
2209      Ichunk = Jchunk = Kchunk = CHUNK_SIZE;
2210 
2211      if ( max_chunk > Ichunk) {
2212        /*if memory if very limited, performance degrades for large matrices
2213 	 as chunk size is very small, which leads to communication overhead)*/
2214        Integer avail = pnga_memory_avail_type(atype);
2215        pnga_gop(pnga_type_f2c(MT_F_INT), &avail, (Integer)1, "min");
2216        if(avail<MINMEM && pnga_nodeid()==0) pnga_error("Not enough memory",avail);
2217        elems = (Integer)(avail*0.9);/* Donot use every last drop */
2218 
2219        max_chunk=GA_MIN(max_chunk, (Integer)(sqrt( (double)((elems-EXTRA)/3))));
2220        Ichunk = GA_MIN(m,max_chunk);
2221        Jchunk = GA_MIN(n,max_chunk);
2222        Kchunk = GA_MIN(k,max_chunk);
2223      }
2224      else /* "EXTRA" elems for safety - just in case */
2225        elems = 3*Ichunk*Jchunk + EXTRA*factor;
2226 
2227      a = (DoubleComplex*) ga_malloc(elems, atype, "GA mulmat bufs");
2228      b = a + (Ichunk*Kchunk)/factor + 1;
2229      c = b + (Kchunk*Jchunk)/factor + 1;
2230    }
2231 #endif
2232 
2233    if(atype==C_DCPL){if((((DoubleComplex*)beta)->real == 0) &&
2234 	       (((DoubleComplex*)beta)->imag ==0)) need_scaling =0;}
2235    else if(atype==C_SCPL){if((((SingleComplex*)beta)->real == 0) &&
2236 	       (((SingleComplex*)beta)->imag ==0)) need_scaling =0;}
2237    else if(atype==C_DBL){if(*(DoublePrecision *)beta == 0)need_scaling =0;}
2238    else if( *(float*)beta ==0) need_scaling =0;
2239 
2240    if(need_scaling) pnga_scale_patch(g_c, clo, chi, beta);
2241    else      pnga_fill_patch(g_c, clo, chi, beta);
2242 
2243    for(jlo = 0; jlo < n; jlo += Jchunk){ /* loop through columns of g_c patch */
2244        jhi = GA_MIN(n-1, jlo+Jchunk-1);
2245        jdim= jhi - jlo +1;
2246 
2247        for(klo = 0; klo < k; klo += Kchunk){    /* loop cols of g_a patch */
2248 	 khi = GA_MIN(k-1, klo+Kchunk-1);        /* loop rows of g_b patch */
2249 	 kdim= khi - klo +1;
2250 
2251 	 get_new_B = TRUE;
2252 
2253 	 for(ilo = 0; ilo < m; ilo += Ichunk){ /*loop through rows of g_c patch */
2254 
2255 	   if(ijk%nproc == iproc){
2256 	     ihi = GA_MIN(m-1, ilo+Ichunk-1);
2257 	     idim= cdim = ihi - ilo +1;
2258 
2259 	     if(atype == C_FLOAT)
2260 	       for (i = 0; i < idim*jdim; i++) *(((float*)c)+i)=0;
2261 	     else if(atype ==  C_DBL)
2262 	       for (i = 0; i < idim*jdim; i++) *(((double*)c)+i)=0;
2263 	     else if(atype == C_SCPL)
2264 	       for (i = 0; i < idim*jdim; i++){
2265                  ((SingleComplex*)c)[i].real=0;
2266                  ((SingleComplex*)c)[i].imag=0;
2267                }
2268              else
2269 	       for (i = 0; i < idim*jdim; i++){ c[i].real=0;c[i].imag=0;}
2270 
2271 	     if (*transa == 'n' || *transa == 'N'){
2272 	       adim = idim;
2273 	       i0= ailo+ilo; i1= ailo+ihi;
2274 	       j0= ajlo+klo; j1= ajlo+khi;
2275 	     }else{
2276 	       adim = kdim;
2277 	       i0= ajlo+klo; i1= ajlo+khi;
2278 	       j0= ailo+ilo; j1= ailo+ihi;
2279 	     }
2280 
2281 	     /* ga_get_(g_a, &i0, &i1, &j0, &j1, a, &adim); */
2282 	     memcpy(tmplo,alo,arank*sizeof(Integer));
2283 	     memcpy(tmphi,ahi,arank*sizeof(Integer));
2284 	     SETINT(tmpld,1,arank-1);
2285 	     tmplo[aipos]=i0; tmphi[aipos]=i1;
2286 	     tmplo[ajpos]=j0; tmphi[ajpos]=j1;
2287 	     tmpld[aipos]=i1-i0+1;
2288 	     tmpld[ajpos]=j1-j0+1;
2289 	     pnga_get(g_a,tmplo,tmphi,a,tmpld);
2290 
2291 	     if(get_new_B) {
2292 	       if (*transb == 'n' || *transb == 'N'){
2293 		 bdim = kdim;
2294 		 i0= bilo+klo; i1= bilo+khi;
2295 		 j0= bjlo+jlo; j1= bjlo+jhi;
2296 	       }else{
2297 		 bdim = jdim;
2298 		 i0= bjlo+jlo; i1= bjlo+jhi;
2299 		 j0= bilo+klo; j1= bilo+khi;
2300 	       }
2301 	       /* ga_get_(g_b, &i0, &i1, &j0, &j1, b, &bdim); */
2302 	       memcpy(tmplo,blo,brank*sizeof(Integer));
2303 	       memcpy(tmphi,bhi,brank*sizeof(Integer));
2304 	       SETINT(tmpld,1,brank-1);
2305 	       tmplo[bipos]=i0; tmphi[bipos]=i1;
2306 	       tmplo[bjpos]=j0; tmphi[bjpos]=j1;
2307 	       tmpld[bipos]=i1-i0+1;
2308 	       tmpld[bjpos]=j1-j0+1;
2309 	       pnga_get(g_b,tmplo,tmphi,b,tmpld);
2310 	       get_new_B = FALSE;
2311 	     }
2312 
2313 	     idim_t=idim; jdim_t=jdim; kdim_t=kdim;
2314 	     adim_t=adim; bdim_t=bdim; cdim_t=cdim;
2315 
2316 		  switch(atype) {
2317 		  case C_FLOAT:
2318 		    BLAS_SGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
2319 			   (Real *)alpha, (Real *)a, &adim_t,
2320                (Real *)b, &bdim_t, (Real *)&ONE_CF,
2321                (Real *)c, &cdim_t);
2322 		    break;
2323 		  case C_DBL:
2324 		    BLAS_DGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
2325 			   (DoublePrecision *)alpha, (DoublePrecision *)a, &adim_t,
2326                (DoublePrecision *)b, &bdim_t, (DoublePrecision *)&ONE,
2327                (DoublePrecision *)c, &cdim_t);
2328 		    break;
2329 		  case C_DCPL:
2330 		    BLAS_ZGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
2331 			   (DoubleComplex *)alpha, (DoubleComplex *)a, &adim_t,
2332                (DoubleComplex *)b, &bdim_t, (DoubleComplex *)&ONE,
2333                (DoubleComplex *)c, &cdim_t);
2334 		    break;
2335 		  case C_SCPL:
2336 		    BLAS_CGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
2337 			   (SingleComplex *)alpha, (SingleComplex *)a, &adim_t,
2338                (SingleComplex *)b, &bdim_t, (SingleComplex *)&ONE_CF,
2339                (SingleComplex *)c, &cdim_t);
2340 		    break;
2341 		  default:
2342 		    pnga_error("ga_matmul_patch: wrong data type", atype);
2343 		  }
2344 
2345                   i0= cilo+ilo; i1= cilo+ihi;   j0= cjlo+jlo; j1= cjlo+jhi;
2346                   /* ga_acc_(g_c, &i0, &i1, &j0, &j1, (DoublePrecision*)c,
2347                                             &cdim, (DoublePrecision*)&ONE); */
2348 		  memcpy(tmplo,clo,crank*sizeof(Integer));
2349 		  memcpy(tmphi,chi,crank*sizeof(Integer));
2350 		  SETINT(tmpld,1,crank-1);
2351 		  tmplo[cipos]=i0; tmphi[cipos]=i1;
2352 		  tmplo[cjpos]=j0; tmphi[cjpos]=j1;
2353 		  tmpld[cipos]=i1-i0+1;
2354 		  tmpld[cjpos]=j1-j0+1;
2355 		  if(atype == C_FLOAT || atype == C_SCPL)
2356 		    pnga_acc(g_c,tmplo,tmphi,(float *)c,tmpld, &ONE_CF);
2357 		  else
2358 		    pnga_acc(g_c,tmplo,tmphi,c,tmpld,(DoublePrecision*)&ONE);
2359                }
2360 	   ++ijk;
2361 	 }
2362        }
2363    }
2364 
2365 #ifndef STATBUF
2366    ga_free(a);
2367 #endif
2368 
2369    if(local_sync_end)pnga_sync();
2370 }
2371 
2372 /*\ MATRIX MULTIPLICATION for 2d patches of multi-dimensional arrays
2373  *
2374  *  C[lo:hi,lo:hi] = alpha*op(A)[lo:hi,lo:hi] * op(B)[lo:hi,lo:hi]
2375  *                 + beta *C[lo:hi,lo:hi]
2376  *
2377  *  where:
2378  *          op(A) = A or A' depending on the transpose flag
2379  *  [lo:hi,lo:hi] - patch indices _after_ op() operator was applied
2380  *
2381  *  In this interface all patch indices refer to the post-transpose block. For
2382  *  example, if the transpose of the block [1:4,2:5] is request, the indices
2383  *  [2:5,1:4] should be used for lo and hi. Note that this interface can not
2384  *  handle a 1D slice from an array containing more than 2 dimensions.
2385  *
2386  *
2387 \*/
2388 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2389 #   pragma weak wnga_matmul_patch = pnga_matmul_patch
2390 #endif
pnga_matmul_patch(char * transa,char * transb,void * alpha,void * beta,Integer g_a,Integer alo[],Integer ahi[],Integer g_b,Integer blo[],Integer bhi[],Integer g_c,Integer clo[],Integer chi[])2391 void pnga_matmul_patch(char *transa, char *transb, void *alpha, void *beta,
2392     Integer g_a, Integer alo[], Integer ahi[],
2393     Integer g_b, Integer blo[], Integer bhi[],
2394     Integer g_c, Integer clo[], Integer chi[])
2395 {
2396   gai_matmul_patch(transa,transb,alpha,beta,g_a,alo,ahi,-1,
2397     g_b,blo,bhi,-1,g_c,clo,chi);
2398 }
2399 
2400 /*\ MATRIX MULTIPLICATION for 2d patches of multi-dimensional arrays
2401  *
2402  *  C[lo:hi,lo:hi] = alpha*op(A)[lo:hi,lo:hi] * op(B)[lo:hi,lo:hi]
2403  *                 + beta *C[lo:hi,lo:hi]
2404  *
2405  *  where:
2406  *          op(A) = A or A' depending on the transpose flag
2407  *  [lo:hi,lo:hi] - patch indices _after_ op() operator was applied
2408  *
2409  *  This interface fixes a bug in the original interface that cannot handle a 1D
2410  *  slice from an array that is over 3D if the transpose option is used. For
2411  *  this interface, all patch indices refer to the pre-transpose block, if the
2412  *  transpose option is used.
2413  *
2414 \*/
2415 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2416 #   pragma weak wnga_matmul_patch_alt = pnga_matmul_patch_alt
2417 #endif
pnga_matmul_patch_alt(char * transa,char * transb,void * alpha,void * beta,Integer g_a,Integer alo[],Integer ahi[],Integer g_b,Integer blo[],Integer bhi[],Integer g_c,Integer clo[],Integer chi[])2418 void pnga_matmul_patch_alt(char *transa, char *transb, void *alpha, void *beta,
2419     Integer g_a, Integer alo[], Integer ahi[],
2420     Integer g_b, Integer blo[], Integer bhi[],
2421     Integer g_c, Integer clo[], Integer chi[])
2422 {
2423   int aivec, bivec;
2424   int e, d, ipos, jpos;
2425   Integer atype, btype, arank, brank, lo, hi;
2426   Integer adims[GA_MAX_DIM],bdims[GA_MAX_DIM];
2427   /* if the patch is transposed, find the two indices that correspond to the
2428    * non-unit dimensions */
2429   if (*transa == 't' || *transa == 'T') {
2430     pnga_inquire(g_a, &atype, &arank, adims);
2431     aivec = -1;
2432     if (arank > 2) {
2433       ipos = -1;
2434       jpos = -1;
2435       e = 0;
2436       /* find out how many patch dimensions are greater than 1*/
2437       for (d=0; d<arank; d++) {
2438         if (ahi[d]-alo[d] > 0 && ipos == -1) {
2439           ipos = d;
2440           e++;
2441         } else if (ahi[d]-alo[d] > 0 && jpos == -1) {
2442           jpos = d;
2443           e++;
2444         } else if (ahi[d]-alo[d] > 0) {
2445           pnga_error("Patch A has more than 2 dimensions",0);
2446         }
2447       }
2448       if (e == 0) {
2449         aivec = -1;
2450       } else if (e == 1) {
2451         /* array is a vector */
2452         aivec = ipos;
2453         if (ipos < arank-1) {
2454           jpos = ipos + 1;
2455         } else {
2456           jpos = ipos;
2457           ipos--;
2458         }
2459       } else {
2460         aivec = -1;
2461       }
2462     }
2463     lo = alo[0];
2464     hi = ahi[0];
2465     alo[0] = alo[1];
2466     ahi[0] = ahi[1];
2467     alo[1] = lo;
2468     ahi[1] = hi;
2469   }
2470   if (*transb == 't' || *transb == 'T') {
2471     pnga_inquire(g_b, &btype, &brank, bdims);
2472     bivec = -1;
2473     if (brank > 2) {
2474       ipos = -1;
2475       jpos = -1;
2476       e = 0;
2477       /* find out how many patch dimensions are greater than 1*/
2478       for (d=0; d<brank; d++) {
2479         if (bhi[d]-blo[d] > 0 && ipos == -1) {
2480           ipos = d;
2481           e++;
2482         } else if (bhi[d]-blo[d] > 0 && jpos == -1) {
2483           jpos = d;
2484           e++;
2485         } else if (bhi[d]-blo[d] > 0) {
2486           pnga_error("Patch A has more than 2 dimensions",0);
2487         }
2488       }
2489       if (e == 0) {
2490         bivec = -1;
2491       } else if (e == 1) {
2492         /* array is a vector */
2493         bivec = ipos;
2494         if (ipos < arank-1) {
2495           jpos = ipos + 1;
2496         } else {
2497           jpos = ipos;
2498           ipos--;
2499         }
2500       } else {
2501         bivec = -1;
2502       }
2503       lo = blo[ipos];
2504       hi = bhi[ipos];
2505       blo[ipos] = blo[jpos];
2506       bhi[ipos] = bhi[jpos];
2507       blo[jpos] = lo;
2508       bhi[jpos] = hi;
2509     } else {
2510       lo = blo[0];
2511       hi = bhi[0];
2512       blo[0] = blo[1];
2513       bhi[0] = bhi[1];
2514       blo[1] = lo;
2515       bhi[1] = hi;
2516     }
2517   }
2518 
2519 
2520   gai_matmul_patch(transa,transb,alpha,beta,g_a,alo,ahi,aivec,
2521     g_b,blo,bhi,bivec,g_c,clo,chi);
2522 }
2523 
2524 
2525 /**
2526  * 1. remove STATBUF
2527  * 2.
2528  */
printBlock(char * banner,Integer type,void * ptr,Integer lo[],Integer hi[],Integer ld[])2529 void printBlock(char * banner, Integer type, void *ptr, Integer lo[],
2530     Integer hi[], Integer ld[])
2531 {
2532   Integer i,j;
2533   Integer offset;
2534   printf("p[%d] %s lo[0]: %d hi[0]: %d lo[1]: %d hi[1]: %d\n",
2535       (int)pnga_nodeid(),banner,(int)lo[0],(int)hi[0],(int)lo[1],(int)hi[1]);
2536   printf("    ");
2537   for (i=lo[0]; i<=hi[0]; i++) printf(" %12d",(int)i);
2538   printf("\n");
2539   for (j=lo[1]; j<=hi[1]; j++) {
2540     printf("J: %d",(int)j);
2541     for (i=lo[0]; i<=hi[0]; i++) {
2542       offset = (j-lo[1])*ld[0] + i-lo[0];
2543       switch (type) {
2544         case C_FLOAT:
2545           printf(" %12.4f",*((float*)ptr+offset));
2546           break;
2547         case C_DBL:
2548           printf(" %12.4f",*((double*)ptr+offset));
2549           break;
2550         case C_DCPL:
2551           printf(" [%12.4f:%12.4f]",*((double*)ptr+2*offset),
2552               *((double*)ptr+2*offset+1));
2553           break;
2554         case C_SCPL:
2555           printf(" [%12.4f:%12.4f]",*((float*)ptr+2*offset),
2556               *((float*)ptr+2*offset+1));
2557           break;
2558         default:
2559           pnga_error("ga_matmul_basic: wrong data type", type);
2560       }
2561     }
2562     printf("\n");
2563   }
2564   printf("\n\n");
2565 }
2566 /**
2567  * This is a routine that is designed to work for all layouts but may not be
2568  * high performing.
2569  */
2570 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2571 #   pragma weak wnga_matmul_basic = pnga_matmul_basic
2572 #endif
pnga_matmul_basic(char * transa,char * transb,void * alpha,void * beta,Integer g_a,Integer alo[],Integer ahi[],Integer g_b,Integer blo[],Integer bhi[],Integer g_c,Integer clo[],Integer chi[])2573 void pnga_matmul_basic(char *transa, char *transb, void *alpha, void *beta,
2574 		                 Integer g_a, Integer alo[], Integer ahi[],
2575                        Integer g_b, Integer blo[], Integer bhi[],
2576 		                 Integer g_c, Integer clo[], Integer chi[])
2577 {
2578   Integer atype, btype, ctype;
2579   Integer adims[GA_MAX_DIM],bdims[GA_MAX_DIM],cdims[GA_MAX_DIM],tmpld[GA_MAX_DIM];
2580   Integer n, m, k, nb, adim, bdim, cdim, arank, brank, crank;
2581   Integer loC[MAXDIM], hiC[MAXDIM], lot[MAXDIM], hit[MAXDIM], lC[MAXDIM];
2582   Integer nlo, loA[MAXDIM], hiA[MAXDIM], loB[MAXDIM], hiB[MAXDIM];
2583   DoubleComplex ONE_Z;
2584   SingleComplex ONE_F;
2585   BlasInt idim_t, jdim_t, kdim_t, adim_t, bdim_t, cdim_t;
2586   char *src_ptr;
2587   _iterator_hdl hdl_c;
2588   int local_sync_begin,local_sync_end;
2589 
2590   ONE_Z.real = 1.0;
2591   ONE_Z.imag = 0.0;
2592   ONE_F.real = 1.0;
2593   ONE_F.imag = 0.0;
2594 
2595   local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2596   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2597   if(local_sync_begin)pnga_sync();
2598 
2599   /* For the time being, punt on transposes*/
2600   if (*transa == 't' || *transa == 'T' || *transb == 't' || *transb == 'T') {
2601     pnga_error("Cannot do basic multiply with tranpose ",0);
2602   }
2603   /* For the time being, punt on mirrored arrays */
2604   if (pnga_is_mirrored(g_a) || pnga_is_mirrored(g_b) || pnga_is_mirrored(g_c)) {
2605     pnga_error("Cannot do basic multiply with mirrored arrays ",0);
2606   }
2607 
2608 
2609   pnga_inquire(g_a, &atype, &arank, adims);
2610   pnga_inquire(g_b, &btype, &brank, bdims);
2611   pnga_inquire(g_c, &ctype, &crank, cdims);
2612 
2613   /* Can't handle dimensions other than 2 */
2614   if(arank != 2)  pnga_error("rank of A must be 2 ",arank);
2615   if(brank != 2)  pnga_error("rank of B must be 2 ",brank);
2616   if(crank != 2)  pnga_error("rank of C must be 2 ",crank);
2617 
2618   if(atype != btype || atype != ctype ) pnga_error(" types mismatch ", 0L);
2619   if(atype != C_DCPL && atype != C_DBL && atype != C_FLOAT && atype != C_SCPL)
2620     pnga_error(" type error: type not supported ",atype);
2621 
2622   /* Check that patch dims are reasonable */
2623   for (n=0; n<arank; n++) {
2624     if (alo[n] <= 0 || alo[n] > adims[n] || ahi[n] <= 0 || ahi[n] > adims[n]) {
2625       pnga_error("Patch is out of bounds for A ",0);
2626     }
2627   }
2628   for (n=0; n<brank; n++) {
2629     if (blo[n] <= 0 || blo[n] > bdims[n] || bhi[n] <= 0 || bhi[n] > bdims[n]) {
2630       pnga_error("Patch is out of bounds for B ",0);
2631     }
2632   }
2633   for (n=0; n<crank; n++) {
2634     if (clo[n] <= 0 || clo[n] > cdims[n] || chi[n] <= 0 || chi[n] > cdims[n]) {
2635       pnga_error("Patch is out of bounds for C ",0);
2636     }
2637   }
2638 
2639   /* Check that multiplication is feasible */
2640   for (n=0; n<arank; n++) {
2641     if (ahi[n]<alo[n]) pnga_error("No data in patch on A ",0);
2642   }
2643 
2644   for (n=0; n<brank; n++) {
2645     if (bhi[n]<blo[n]) pnga_error("No data in patch on B ",0);
2646   }
2647   for (n=0; n<crank; n++) {
2648     if (chi[n]<clo[n]) pnga_error("No data in patch on C ",0);
2649   }
2650   m = ahi[0]-alo[0]+1;
2651   k = ahi[1]-alo[1]+1;
2652   n = bhi[1]-blo[1]+1;
2653   if (k != bhi[0]-blo[0]+1) {
2654     pnga_error("Inner dimensions of A and B do not match ",0);
2655   }
2656   if (m != chi[0]-clo[0]+1 || n != chi[1]-clo[1]+1) {
2657     pnga_error("Outer dimensions of A and B do not match dimensions of C ",0);
2658   }
2659 
2660   /* Scale patch of C by beta */
2661   pnga_scale_patch(g_c,clo,chi,beta);
2662 
2663   /* Multiplication is feasible. Start iterating over patches of C */
2664   pnga_local_iterator_init(g_c, &hdl_c);
2665   while (pnga_local_iterator_next(&hdl_c, loC, hiC, &src_ptr, lC)) {
2666     Integer num_blocks;
2667     Integer offset, elemsize, ld;
2668     void *a_buf, *b_buf, *c_buf;
2669     Integer size_a, size_b, size_c;
2670     /*
2671     printf("p[%d] loC[0]: %d hiC[0]: %d loC[1]: %d hiC[1]: %d lC[0]: %d\n",
2672         pnga_nodeid(),loC[0],hiC[0],loC[1],hiC[1],lC[0]);
2673         */
2674     /* Copy limits since patch intersect modifies loC array */
2675     for (n=0; n<crank; n++) {
2676       lot[n] = loC[n];
2677       hit[n] = hiC[n];
2678     }
2679 
2680     /* check to see if this block overlaps with patch on C */
2681     if (pnga_patch_intersect(loC,hiC,lot,hit,crank)) {
2682       int istart, in;
2683 
2684       /* Calculating number of blocks for inner dimension. Assume that
2685        * number of rows in block returned by iterator is a good size */
2686       num_blocks = (ahi[1]-alo[1]+1)/(hit[0]-lot[0]+1);
2687       if (num_blocks*(hit[0]-lot[0]+1) < ahi[1]-alo[1]+1) num_blocks++;
2688 
2689       /* Calculate offset from  src_ptr to beginning of lot */
2690       offset = lC[0]*(lot[1]-loC[1])+lot[0]-loC[0];
2691       elemsize = GAsizeofM(atype);
2692       c_buf = (void*)((char*)src_ptr + elemsize*offset);
2693       a_buf = (void*)malloc((hit[0]-lot[0]+1)*(hit[1]-lot[1]+1)*elemsize);
2694       b_buf = (void*)malloc((hit[0]-lot[0]+1)*(hit[1]-lot[1]+1)*elemsize);
2695       size_c = (hit[0]-lot[0]+1)*(hit[1]-lot[1]+1);
2696 
2697       /* calculate starting block index */
2698       istart = (lot[0]-clo[0])/(hit[0]-lot[0]+1);
2699 
2700       /* loop over block pairs */
2701       for (nb=0; nb<num_blocks; nb++) {
2702         /*
2703     printf("p[%d] loC[0]: %d hiC[0]: %d loC[1]: %d hiC[1]: %d\n",pnga_nodeid(),
2704         lot[0],hit[0],lot[1],hit[1]);
2705         */
2706         in = istart + nb;
2707         in = in%num_blocks;
2708         nlo = alo[1]+in*(hit[0]-lot[0]+1);
2709         loA[0] = lot[0];
2710         hiA[0] = hit[0];
2711         loA[1] = nlo;
2712         hiA[1] = loA[1]+(hit[0]-lot[0]);
2713         if (hiA[1] > ahi[1]) hiA[1] = ahi[1];
2714         ld = hiA[0]-loA[0]+1;
2715         size_a = (hiA[0]-loA[0]+1)*(hiA[1]-loA[1]+1);
2716         if (size_a > size_c) {
2717           printf("p[%d] size_a: %d size_c: %d\n",(int)pnga_nodeid(),(int)size_a,(int)size_c);
2718         }
2719         /*
2720     printf("p[%d] loA[0]: %d hiA[0]: %d loA[1]: %d hiA[1]: %d\n",pnga_nodeid(),
2721         loA[0],hiA[0],loA[1],hiA[1]);
2722         */
2723         pnga_get(g_a,loA,hiA,a_buf,&ld);
2724         /*
2725         printBlock("Matrix A",atype,a_buf,loA,hiA,&ld);
2726         */
2727         loB[1] = lot[1];
2728         hiB[1] = hit[1];
2729         nlo = blo[0]+in*(hit[0]-lot[0]+1);
2730         loB[0] = nlo;
2731         hiB[0] = loB[0]+(hit[0]-lot[0]);
2732         if (hiB[0] > bhi[0]) hiB[0] = bhi[0];
2733         ld = hiB[0]-loB[0]+1;
2734         size_b = (hiB[0]-loB[0]+1)*(hiB[1]-loB[1]+1);
2735         if (size_b > size_c) {
2736           printf("p[%d] size_b: %d size_c: %d\n",(int)pnga_nodeid(),
2737               (int)size_b,(int)size_c);
2738         }
2739         /*
2740     printf("p[%d] loB[0]: %d hiB[0]: %d loB[1]: %d hiB[1]: %d\n",pnga_nodeid(),
2741         loB[0],hiB[0],loB[1],hiB[1]);
2742         */
2743         pnga_get(g_b,loB,hiB,b_buf,&ld);
2744         /*
2745         printBlock("Matrix B",btype,b_buf,loB,hiB,&ld);
2746         */
2747 
2748         idim_t=(BlasInt)(hiA[0]-loA[0]+1);
2749         kdim_t=(BlasInt)(hiA[1]-loA[1]+1);
2750         jdim_t=(BlasInt)(hiB[1]-loB[1]+1);
2751         adim_t=hiA[0]-loA[0]+1;
2752         bdim_t=hiB[0]-loB[0]+1;
2753         cdim_t=lC[0];
2754 
2755         /*
2756         printf("p[%d] idim: %d jdim: %d kdim: %d adim: %d bdim: %d cdim: %d\n",
2757             pnga_nodeid(),idim_t,jdim_t,kdim_t,adim_t,bdim_t,cdim_t);
2758             */
2759         switch(atype) {
2760           case C_FLOAT:
2761             BLAS_SGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
2762                 (Real *)alpha, (Real *)a_buf, &adim_t,
2763                 (Real *)b_buf, &bdim_t, (Real *)&ONE_F,
2764                 (Real *)c_buf, &cdim_t);
2765             break;
2766           case C_DBL:
2767             BLAS_DGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
2768                 (DoublePrecision *)alpha, (DoublePrecision *)a_buf, &adim_t,
2769                 (DoublePrecision *)b_buf, &bdim_t, (DoublePrecision *)&ONE_Z,
2770                 (DoublePrecision *)c_buf, &cdim_t);
2771             break;
2772           case C_DCPL:
2773             BLAS_ZGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
2774                 (DoubleComplex *)alpha, (DoubleComplex *)a_buf, &adim_t,
2775                 (DoubleComplex *)b_buf, &bdim_t, (DoubleComplex *)&ONE_Z,
2776                 (DoubleComplex *)c_buf, &cdim_t);
2777             break;
2778           case C_SCPL:
2779             BLAS_CGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
2780                 (SingleComplex *)alpha, (SingleComplex *)a_buf, &adim_t,
2781                 (SingleComplex *)b_buf, &bdim_t, (SingleComplex *)&ONE_F,
2782                 (SingleComplex *)c_buf, &cdim_t);
2783             break;
2784 		  default:
2785 		    pnga_error("ga_matmul_basic: wrong data type", atype);
2786 		  }
2787       }
2788 
2789       /* multiplication is done, free buffers */
2790       free(a_buf);
2791       free(b_buf);
2792     }
2793   }
2794   if(local_sync_end)pnga_sync();
2795 }
2796