1 #if HAVE_CONFIG_H
2 # include "config.h"
3 #endif
4
5 /* $Id: matmul.c,v 1.60.4.1 2006-12-22 13:05:22 manoj Exp $ */
6 /*===========================================================
7 *
8 * GA_Dgemm(): Parallel Matrix Multiplication
9 * (i.e. C = alpha*A*B + beta*C)
10 *
11 *===========================================================*/
12
13 #if HAVE_STDLIB_H
14 # include <stdlib.h>
15 #endif
16 #if HAVE_STRING_H
17 # include <string.h>
18 #endif
19 #include "matmul.h"
20 #include "ga_iterator.h"
21 #include "ga-papi.h"
22 #include "ga-wapi.h"
23
24 #define DEBUG_ 0 /*set 1, to verify the correctness of parallel matrix mult.*/
25
26 /* some optimization macros */
27 #define KCHUNK_OPTIMIZATION 0 /* This Opt performing well only for m=1000;n=1000'k=2000 kinda cases and not for the opposite*/
28
29 /* Optimization flags: Initialized everytime in pnga_matmul() */
30 static short int CYCLIC_DISTR_OPT_FLAG = SET;
31 static short int CONTIG_CHUNKS_OPT_FLAG = SET;
32 static short int DIRECT_ACCESS_OPT_FLAG = SET;
33
34 Integer gNbhdlA[2], gNbhdlB[2], gNbhdlC[2];/* for A and B matrix */
35
36 static int _gai_matmul_patch_flag = 0;
gai_matmul_patch_flag(int flag)37 void gai_matmul_patch_flag(int flag)
38 {
39 _gai_matmul_patch_flag = flag;
40 }
41
max3(int ichunk,int jchunk,int kchunk)42 static inline int max3(int ichunk, int jchunk, int kchunk) {
43 if(ichunk>jchunk) return GA_MAX(ichunk,kchunk);
44 else return GA_MAX(jchunk, kchunk);
45 }
46
init_task_list(task_list_t * thing)47 static inline void init_task_list(task_list_t *thing)
48 {
49 thing->lo[0] = 0;
50 thing->lo[1] = 0;
51 thing->hi[0] = 0;
52 thing->hi[1] = 0;
53 thing->dim[0] = 0;
54 thing->dim[1] = 0;
55 thing->chunkBId = 0;
56 thing->do_put = 0;
57 }
58
GET_BLOCK(Integer g_x,task_list_t * chunk,void * buf,char * trans,Integer xilo,Integer xjlo,Integer * dim_next,Integer * nbhdl)59 static void GET_BLOCK(Integer g_x, task_list_t *chunk, void *buf,
60 char *trans, Integer xilo, Integer xjlo,
61 Integer *dim_next, Integer *nbhdl) {
62
63 Integer i0, i1, j0, j1;
64 Integer lo[2], hi[2];
65
66 if(*trans == 'n' || *trans == 'N') {
67 *dim_next = chunk->dim[0];
68 i0= xilo+chunk->lo[0]; i1= xilo+chunk->hi[0];
69 j0= xjlo+chunk->lo[1]; j1= xjlo+chunk->hi[1];
70 }
71 else {
72 *dim_next = chunk->dim[1];
73 i0= xjlo+chunk->lo[1]; i1= xjlo+chunk->hi[1];
74 j0= xilo+chunk->lo[0]; j1= xilo+chunk->hi[0];
75 }
76
77 lo[0] = i0;
78 lo[1] = j0;
79 hi[0] = i1;
80 hi[1] = j1;
81 pnga_nbget(g_x, lo, hi, buf, dim_next, nbhdl);
82 }
83
84 static short int
gai_get_task_list(task_list_t * taskListA,task_list_t * taskListB,task_list_t * state,Integer istart,Integer jstart,Integer kstart,Integer iend,Integer jend,Integer kend,Integer Ichunk,Integer Jchunk,Integer Kchunk,int * max_tasks,Integer g_a)85 gai_get_task_list(task_list_t *taskListA, task_list_t *taskListB,
86 task_list_t *state, Integer istart, Integer jstart,
87 Integer kstart, Integer iend, Integer jend, Integer kend,
88 Integer Ichunk, Integer Jchunk, Integer Kchunk,
89 int *max_tasks, Integer g_a) {
90
91 int ii, jj, nloops=0;
92 short int do_put, more_chunks_left=0, recovery=0;
93 Integer ilo, ihi, jlo, jhi, klo, khi, get_new_B;
94 Integer jstart_=jstart, kstart_=kstart;
95
96 if(state->lo[0] != -1) recovery = 1;
97
98 nloops = (iend-istart+1)/Ichunk + ( ((iend-istart+1)%Ichunk)?1:0 );
99 if(nloops>MAX_CHUNKS) pnga_error("Increase MAX_CHUNKS value in matmul.h",0L);
100
101 if(recovery) jstart_ = state->lo[0]; /* recovering the previous state */
102 for(ii=jj=0, jlo = jstart_; jlo <= jend; jlo += Jchunk) {
103 jhi = GA_MIN(jend, jlo+Jchunk-1);
104
105 if(recovery) {
106 do_put = state->do_put;
107 kstart_ = state->lo[1];
108 }
109 else do_put = SET; /* for 1st shot we can put, instead of accumulate */
110
111 for(klo = kstart_; klo <= kend; klo += Kchunk) {
112 khi = GA_MIN(kend, klo+Kchunk-1);
113 get_new_B = TRUE;
114
115 /* set it back after the first loop */
116 recovery = 0;
117 jstart_ = jstart;
118 kstart_ = kstart;
119
120 /* save CURRENT STATE. Saving state before "i" loop helps to avoid
121 tracking get_new_B, which is hassle in ga_matmul_regular() */
122 if(ii+nloops >= MAX_CHUNKS) {
123 more_chunks_left = 1;
124 state->lo[0] = jlo;
125 state->lo[1] = klo;
126 state->do_put = do_put;
127 break;
128 }
129
130 for(ilo = istart; ilo <= iend; ilo += Ichunk){
131 ihi = GA_MIN(iend, ilo+Ichunk-1);
132 taskListA[ii].dim[0] = ihi - ilo + 1;
133 taskListA[ii].dim[1] = khi - klo + 1;
134 taskListA[ii].lo[0] = ilo; taskListA[ii].hi[0] = ihi;
135 taskListA[ii].lo[1] = klo; taskListA[ii].hi[1] = khi;
136 taskListA[ii].do_put = do_put;
137 if(get_new_B) { /* B matrix */
138 ihi = GA_MIN(iend, ilo+Ichunk-1);
139 taskListB[jj].dim[0] = khi - klo + 1;
140 taskListB[jj].dim[1] = jhi - jlo + 1;
141 taskListB[jj].lo[0] = klo; taskListB[jj].hi[0] = khi;
142 taskListB[jj].lo[1] = jlo; taskListB[jj].hi[1] = jhi;
143 get_new_B = FALSE; /* Until J or K change again */
144 taskListA[ii].chunkBId = jj;
145 ++jj;
146 }
147 else taskListA[ii].chunkBId = taskListA[ii-1].chunkBId;
148 ++ii;
149 }
150 if (more_chunks_left) break;
151 do_put = UNSET;
152 }
153 if (more_chunks_left) break;
154 }
155
156 *max_tasks = ii;
157
158 /* Optimization disabled if chunks exceeds buffer space */
159 if(more_chunks_left) CYCLIC_DISTR_OPT_FLAG = UNSET;
160
161 if(CYCLIC_DISTR_OPT_FLAG) { /* should not be called for irregular matmul */
162 int prow, pcol, offset, grp_me;
163 Integer a_grp = pnga_get_pgroup(g_a);
164 grp_me = (int)pnga_pgroup_nodeid(a_grp);
165 prow = GA[GA_OFFSET + g_a].nblock[0];
166 pcol = GA[GA_OFFSET + g_a].nblock[1];
167 offset = (grp_me/prow + grp_me%prow) % pcol;
168 for(jj=0, ilo = istart; ilo <= iend; jj++, ilo += Ichunk)
169 taskListA[jj].do_put = UNSET;
170 for(jj=0, ilo = istart; ilo <= iend; jj++, ilo += Ichunk)
171 taskListA[jj+offset].do_put = SET;
172 }
173
174 return more_chunks_left;
175 }
176
gai_get_chunk_size(int irregular,Integer * Ichunk,Integer * Jchunk,Integer * Kchunk,Integer * elems,Integer atype,Integer m,Integer n,Integer k,short int nbuf,short int use_armci_memory,Integer a_grp)177 static void gai_get_chunk_size(int irregular,Integer *Ichunk,Integer *Jchunk,
178 Integer *Kchunk,Integer *elems,Integer atype,
179 Integer m,Integer n,Integer k, short int nbuf,
180 short int use_armci_memory, Integer a_grp) {
181 double temp;
182 Integer min_tasks = MINTASKS; /* Increase tasks if there is load imbalance.
183 This controls the granularity of chunks */
184 Integer max_chunk, nproc=pnga_nnodes(), tmpa, tmpb;
185 Integer avail = pnga_memory_avail_type(atype);
186
187 tmpa = *Ichunk;
188 tmpb = *Jchunk;
189
190 if(irregular) {
191 temp = (k*(double)(m*(double)n)) / (min_tasks * nproc);
192 max_chunk = (Integer)pow(temp, (1.0/3.0) );
193 if (max_chunk < MIN_CHUNK_SIZE) max_chunk = MIN_CHUNK_SIZE;
194 }
195 else
196 max_chunk = (Integer) max3(*Ichunk, *Jchunk, *Kchunk);
197
198 pnga_pgroup_gop(a_grp, pnga_type_f2c(MT_F_INT), &avail, (Integer)1, "min");
199
200 if ( max_chunk > CHUNK_SIZE/nbuf) {
201 /*if memory if very limited, performance degrades for large matrices
202 as chunk size is very small, which leads to communication overhead)*/
203 if(avail<MINMEM && pnga_pgroup_nodeid(a_grp)==0) pnga_error("NotEnough memory",avail);
204 *elems = (Integer)(avail*0.9); /* Donot use every last drop */
205
206 /* MAX: get the maximum chunk (or, block) size i.e */
207 max_chunk=GA_MIN(max_chunk, (Integer)(sqrt( (double)((*elems-nbuf*NUM_MATS)/(nbuf*NUM_MATS)))));
208
209 if(!irregular && use_armci_memory==SET)
210 max_chunk = *Ichunk = *Jchunk = *Kchunk = BLOCK_SIZE;
211
212 if(irregular) {
213 /* NOTE:enable this part for regular cases, if later
214 part of the code is buggy or inefficient */
215 *Ichunk = GA_MIN(m,max_chunk);
216 *Jchunk = GA_MIN(n,max_chunk);
217 *Kchunk = GA_MIN(k,max_chunk);
218 }
219 else { /* This part of the code takes care of rectangular chunks and
220 most probably gives optimum rectangular chunk size */
221 temp = max_chunk*max_chunk;
222 if(*Ichunk < max_chunk && *Kchunk > max_chunk) {
223 *Kchunk = GA_MIN(*Kchunk,(Integer)(temp/(*Ichunk)));
224 *Jchunk = GA_MIN(*Jchunk,(Integer)(temp/(*Kchunk)));
225 }
226 else if(*Kchunk < max_chunk && *Ichunk > max_chunk) {
227 temp *= 1.0/(*Kchunk);
228 *Ichunk = GA_MIN(*Ichunk,(Integer)temp);
229 *Jchunk = GA_MIN(*Jchunk,(Integer)temp);
230 }
231 else *Ichunk = *Jchunk = *Kchunk = max_chunk;
232 }
233 }
234 else
235 *Ichunk = *Jchunk = *Kchunk = CHUNK_SIZE/nbuf;
236
237 /* Try to use 1-d data transfer & take advantage of zero-copy protocol */
238 if(CONTIG_CHUNKS_OPT_FLAG) { /* select a contiguous piece */
239 if(!irregular) {
240 if(*Ichunk > tmpa && *Jchunk > tmpb) {
241 *Ichunk = tmpa;
242 *Jchunk = tmpb;
243 *Kchunk = GA_MIN(*Ichunk,*Jchunk);
244 }
245 else {
246 int i=1;/* i should be >=1 , to avoid divide by zero error */
247 temp = max_chunk*max_chunk;
248 if(temp > tmpa) {
249 *Ichunk = tmpa;
250 *Jchunk = (Integer)(temp/(*Ichunk));
251 if(*Jchunk < tmpb) {
252 while(tmpb/i > *Jchunk) ++i;
253 *Jchunk = tmpb/i;
254 }
255 else *Jchunk = tmpb;
256 *Kchunk = GA_MIN(*Ichunk, *Jchunk);
257 }
258 }
259 }
260 }
261
262 if(*Ichunk<=0) *Ichunk = 1; /* should be atleast 1 */
263 if(*Jchunk<=0) *Jchunk = 1;
264 if(*Kchunk<=0) *Kchunk = 1;
265
266 /* Total elements "NUM_MAT" extra elems for safety - just in case */
267 *elems = ( nbuf*(*Ichunk)*(*Kchunk) + nbuf*(*Kchunk)*(*Jchunk) +
268 (*Ichunk)*(*Jchunk) );
269 *elems += nbuf*NUM_MATS*sizeof(DoubleComplex)/GAsizeofM(atype);
270 }
271
272 static DoubleComplex*
gai_get_armci_memory(Integer Ichunk,Integer Jchunk,Integer Kchunk,short int nbuf,Integer atype)273 gai_get_armci_memory(Integer Ichunk, Integer Jchunk, Integer Kchunk,
274 short int nbuf, Integer atype) {
275
276 DoubleComplex *tmp = NULL;
277 Integer elems;
278
279 elems = (Integer) pow((double)BLOCK_SIZE,(double)2);
280 elems = nbuf*elems + nbuf*elems + elems; /* A,B,C temporary buffers */
281
282 /* add extra elements for safety */
283 elems += nbuf*NUM_MATS*sizeof(DoubleComplex)/GAsizeofM(atype);
284
285 /* allocate temporary storage using ARMCI_Malloc */
286 if( (Integer) (((double)nbuf)*(Ichunk* Kchunk) +
287 ((double)nbuf)*(Kchunk* Jchunk) +
288 Ichunk* Jchunk ) < elems) {
289 tmp=(DoubleComplex*)ARMCI_Malloc_local(elems*GAsizeofM(atype));
290 }
291 return tmp;
292 }
293
294 /************************************
295 * Sequential DGEMM
296 * i.e. BLAS dgemm Routines
297 ************************************/
298
GAI_DGEMM(Integer atype,char * transa,char * transb,Integer idim,Integer jdim,Integer kdim,void * alpha,DoubleComplex * a,Integer adim,DoubleComplex * b,Integer bdim,DoubleComplex * c,Integer cdim)299 static void GAI_DGEMM(Integer atype, char *transa, char *transb,
300 Integer idim, Integer jdim, Integer kdim, void *alpha,
301 DoubleComplex *a, Integer adim, DoubleComplex *b,
302 Integer bdim, DoubleComplex *c, Integer cdim) {
303
304 BlasInt idim_t, jdim_t, kdim_t, adim_t, bdim_t, cdim_t;
305 DoubleComplex ZERO;
306 SingleComplex ZERO_CF;
307
308 idim_t=idim; jdim_t=jdim; kdim_t=kdim;
309 adim_t=adim; bdim_t=bdim; cdim_t=cdim;
310 ZERO.real = 0.; ZERO.imag = 0.;
311 ZERO_CF.real = 0.; ZERO_CF.imag = 0.;
312
313 switch(atype) {
314 case C_FLOAT:
315 BLAS_SGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
316 (Real *)alpha, (Real *)a, &adim_t,
317 (Real *)b, &bdim_t, (Real *)&ZERO_CF,
318 (Real *)c, &cdim_t);
319 break;
320 case C_DBL:
321 BLAS_DGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
322 (DoublePrecision *)alpha, (DoublePrecision *)a, &adim_t,
323 (DoublePrecision *)b, &bdim_t, (DoublePrecision *)&ZERO,
324 (DoublePrecision *)c, &cdim_t);
325 break;
326 case C_DCPL:
327 BLAS_ZGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
328 (DoubleComplex *)alpha, (DoubleComplex *)a, &adim_t,
329 (DoubleComplex *)b, &bdim_t, (DoubleComplex *)&ZERO,
330 (DoubleComplex *)c, &cdim_t);
331 break;
332 case C_SCPL:
333 BLAS_CGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
334 (SingleComplex *)alpha, (SingleComplex *)a, &adim_t,
335 (SingleComplex *)b, &bdim_t, (SingleComplex *)&ZERO_CF,
336 (SingleComplex *)c, &cdim_t);
337 break;
338 default:
339 pnga_error("ga_matmul_patch: wrong data type", atype);
340 }
341 }
342
343
344
gai_matmul_shmem(transa,transb,alpha,beta,atype,g_a,ailo,aihi,ajlo,ajhi,g_b,bilo,bihi,bjlo,bjhi,g_c,cilo,cihi,cjlo,cjhi,Ichunk,Kchunk,Jchunk,a,b,c,need_scaling)345 static void gai_matmul_shmem(transa, transb, alpha, beta, atype,
346 g_a, ailo, aihi, ajlo, ajhi,
347 g_b, bilo, bihi, bjlo, bjhi,
348 g_c, cilo, cihi, cjlo, cjhi,
349 Ichunk, Kchunk, Jchunk, a,b,c, need_scaling)
350
351 Integer g_a, ailo, aihi, ajlo, ajhi; /* patch of g_a */
352 Integer g_b, bilo, bihi, bjlo, bjhi; /* patch of g_b */
353 Integer g_c, cilo, cihi, cjlo, cjhi; /* patch of g_c */
354 Integer Ichunk, Kchunk, Jchunk, atype;
355 void *alpha, *beta;
356 char *transa, *transb;
357 DoubleComplex *a, *b, *c;
358 short int need_scaling;
359 {
360
361 Integer me = pnga_nodeid();
362 Integer get_new_B, loC[2]={0,0}, hiC[2]={0,0}, ld[2];
363 Integer i0, i1, j0, j1;
364 Integer ilo, ihi, idim, jlo, jhi, jdim, klo, khi, kdim, adim, bdim=0, cdim;
365 int istart, jstart, kstart, iend, jend, kend;
366 short int do_put=UNSET, single_task_flag=UNSET;
367 DoubleComplex ONE = {1.,0.};
368 SingleComplex ONE_CF = {1.,0.};
369 Integer clo[2], chi[2];
370
371
372 /* to skip accumulate and exploit data locality:
373 get chunks according to "C" matrix distribution*/
374 pnga_distribution(g_c, me, loC, hiC);
375 istart = loC[0]-1; iend = hiC[0]-1;
376 jstart = loC[1]-1; jend = hiC[1]-1;
377 kstart = 0 ; kend = ajhi-ajlo;
378
379 if(DIRECT_ACCESS_OPT_FLAG) {
380 /* check if there is only one task. If so, then it is contiguous */
381 if( (iend-istart+1 <= Ichunk) && (jend-jstart+1 <= Jchunk) &&
382 (kend-kstart+1 <= Kchunk) ) {
383 single_task_flag = SET;
384 pnga_access_ptr(g_c, loC, hiC, &c, ld);
385 }
386 }
387
388 /* loop through columns of g_c patch */
389 for(jlo = jstart; jlo <= jend; jlo += Jchunk) {
390 jhi = GA_MIN(jend, jlo+Jchunk-1);
391 jdim = jhi - jlo +1;
392
393 /* if beta=0,then for first shot we can do put,instead of accumulate */
394 if(need_scaling == UNSET) do_put = SET;
395
396 /* loop cols of g_a patch : loop rows of g_b patch*/
397 for(klo = kstart; klo <= kend; klo += Kchunk) {
398 khi = GA_MIN(kend, klo+Kchunk-1);
399 kdim= khi - klo +1;
400 get_new_B = TRUE; /* Each pass thru' outer 2 loops means we
401 need a different patch of B.*/
402 /*loop through rows of g_c patch */
403 for(ilo = istart; ilo <= iend; ilo += Ichunk){
404 ihi = GA_MIN(iend, ilo+Ichunk-1);
405 idim= cdim = ihi - ilo +1;
406
407 /* STEP1(a): get matrix "A" chunk */
408 i0= ailo+ilo; i1= ailo+ihi;
409 j0= ajlo+klo; j1= ajlo+khi;
410 if (*transa == 'n' || *transa == 'N'){
411 clo[0] = i0;
412 clo[1] = j0;
413 chi[0] = i1;
414 chi[1] = j1;
415 adim=idim; pnga_get(g_a, clo, chi, a, &idim);
416 }else{
417 clo[0] = j0;
418 clo[1] = i0;
419 chi[0] = j1;
420 chi[1] = i1;
421 adim=kdim; pnga_get(g_a, clo, chi, a, &kdim);
422 }
423
424 /* STEP1(b): get matrix "B" chunk*/
425 if(get_new_B) {/*Avoid rereading B if same patch as last time*/
426 i0= bilo+klo; i1= bilo+khi;
427 j0= bjlo+jlo; j1= bjlo+jhi;
428 if (*transb == 'n' || *transb == 'N'){
429 clo[0] = i0;
430 clo[1] = j0;
431 chi[0] = i1;
432 chi[1] = j1;
433 bdim=kdim; pnga_get(g_b, clo, chi, b, &kdim);
434 }else {
435 clo[0] = j0;
436 clo[1] = i0;
437 chi[0] = j1;
438 chi[1] = i1;
439 bdim=jdim; pnga_get(g_b, clo, chi, b, &jdim);
440 }
441 get_new_B = FALSE; /* Until J or K change again */
442 }
443
444 /* STEP2: Do the sequential matrix multiply - i.e.BLAS dgemm */
445 GAI_DGEMM(atype, transa, transb, idim, jdim, kdim, alpha,
446 a, adim, b, bdim, c, cdim);
447
448 /* STEP3: put/accumulate into "C" matrix */
449 i0= cilo+ilo; i1= cilo+ihi;
450 j0= cjlo+jlo; j1= cjlo+jhi;
451 /* if single_task_flag is SET (i.e =1), then there is no need to
452 update "C" matrix, as we use pointer directly in GAI_DGEMM */
453 if(single_task_flag != SET) {
454 switch(atype) {
455 case C_FLOAT:
456 case C_SCPL:
457 clo[0] = i0;
458 clo[1] = j0;
459 chi[0] = i1;
460 chi[1] = j1;
461 if(do_put==SET) /* i.e.beta == 0.0 */
462 pnga_put(g_c, clo, chi, (float *)c, &cdim);
463 else {
464 pnga_acc(g_c, clo, chi, (float*)c, &cdim, &ONE_CF);
465 }
466 break;
467 default:
468 clo[0] = i0;
469 clo[1] = j0;
470 chi[0] = i1;
471 chi[1] = j1;
472 if(do_put==SET) /* i.e.beta == 0.0 */
473 pnga_put(g_c, clo, chi, (DoublePrecision*)c, &cdim);
474 else {
475 pnga_acc(g_c, clo, chi, (DoublePrecision*)c,
476 &cdim, (DoublePrecision*)&ONE);
477 }
478 break;
479 }
480 }
481 }
482 do_put = UNSET; /* In the second loop, accumulate should be done */
483 }
484 }
485 }
486
487
488 static
init_block_info(Integer g_c,Integer * proc_index,Integer * index,Integer * blocks,Integer * block_dims,Integer * topology,Integer * iblock)489 void init_block_info(Integer g_c, Integer *proc_index, Integer *index,
490 Integer *blocks, Integer *block_dims, Integer *topology,
491 Integer *iblock)
492 {
493 Integer me= pnga_nodeid();
494
495 /* Uses simple block-cyclic data distribution */
496 if(!pnga_uses_proc_grid(g_c))
497 {
498 *iblock = me;
499 }
500 else /* Uses scalapack block-cyclic data distribution */
501 {
502 *iblock = 0;
503 pnga_get_proc_index(g_c, me, proc_index);
504 pnga_get_proc_index(g_c, me, index);
505 pnga_get_block_info(g_c, blocks, block_dims);
506 pnga_get_proc_grid(g_c, topology);
507 }
508 }
509
510 /**
511 * get the lo/hi distribution info of the next block
512 * return 0 indicates no more blocks available
513 * return 1 indicates there is a block available
514 */
515 static
get_next_block_info(Integer g_c,Integer * proc_index,Integer * index,Integer * blocks,Integer * block_dims,Integer * topology,Integer * iblock,Integer * blo,Integer * bhi)516 int get_next_block_info(Integer g_c, Integer *proc_index, Integer *index,
517 Integer *blocks, Integer *block_dims, Integer*topology,
518 Integer *iblock, Integer *blo, Integer *bhi)
519 {
520 Integer dims[MAXDIM], ndim, type;
521 int i;
522
523 /* works only upto 2 dims - i.e vectors/matrices*/
524 pnga_inquire(g_c, &type, &ndim, dims);
525 if(ndim>2) pnga_error("get_next_block_info() supports upto 2-d only ", 0L);
526
527 /* Uses simple block-cyclic data distribution */
528 if (!pnga_uses_proc_grid(g_c))
529 {
530 if(*iblock < pnga_total_blocks(g_c))
531 {
532 pnga_distribution(g_c, *iblock, blo, bhi);
533 *iblock += pnga_nnodes();
534 return 1;
535 }
536 }
537 else /* Uses scalapack block-cyclic data distribution */
538 {
539 if (index[ndim-1] < blocks[ndim-1])
540 {
541 /* find bounding coordinates of block */
542 for (i = 0; i < ndim; i++)
543 {
544 blo[i] = index[i]*block_dims[i]+1;
545 bhi[i] = (index[i] + 1)*block_dims[i];
546 if (bhi[i] > dims[i]) bhi[i] = dims[i];
547 }
548
549 /* increment index to get next block on processor */
550 index[0] += topology[0];
551 for (i = 0; i < ndim; i++)
552 {
553 if (index[i] >= blocks[i] && i<ndim-1)
554 {
555 index[i] = proc_index[i];
556 index[i+1] += topology[i+1];
557 }
558 }
559
560 return 1;
561 } /* end of while */
562 }
563
564 return 0;
565 }
566
567
568
gai_matmul_regular(transa,transb,alpha,beta,atype,g_a,ailo,aihi,ajlo,ajhi,g_b,bilo,bihi,bjlo,bjhi,g_c,cilo,cihi,cjlo,cjhi,Ichunk,Kchunk,Jchunk,a_ar,b_ar,c_ar,need_scaling,irregular)569 static void gai_matmul_regular(transa, transb, alpha, beta, atype,
570 g_a, ailo, aihi, ajlo, ajhi,
571 g_b, bilo, bihi, bjlo, bjhi,
572 g_c, cilo, cihi, cjlo, cjhi,
573 Ichunk, Kchunk, Jchunk, a_ar,b_ar,c_ar,
574 need_scaling, irregular)
575
576 Integer g_a, ailo, aihi, ajlo, ajhi; /* patch of g_a */
577 Integer g_b, bilo, bihi, bjlo, bjhi; /* patch of g_b */
578 Integer g_c, cilo, cihi, cjlo, cjhi; /* patch of g_c */
579 Integer Ichunk, Kchunk, Jchunk, atype;
580 void *alpha, *beta;
581 char *transa, *transb;
582 DoubleComplex **a_ar, **b_ar, **c_ar;
583 short int need_scaling, irregular;
584 {
585
586 Integer me= pnga_nodeid();
587 Integer get_new_B=TRUE, i0, i1, j0, j1;
588 Integer idim, jdim, kdim;
589 Integer k, adim=0, bdim, cdim, adim_next, bdim_next;
590 Integer clo[2], chi[2], loC[2]={1,1}, hiC[2]={1,1}, ld[2];
591 int max_tasks=0, shiftA=0, shiftB=0;
592 int currA, nextA, currB, nextB=0; /* "current" and "next" task Ids */
593 task_list_t taskListA[MAX_CHUNKS], taskListB[MAX_CHUNKS], state;
594 short int do_put=UNSET, single_task_flag=UNSET, chunks_left=0;
595 DoubleComplex ONE, *a, *b, *c;
596 SingleComplex ONE_CF;
597 int offset=0, gTaskId=0;
598 int numblocks=0, has_more_blocks=1;
599 Integer ctype, cndim, cdims[2];
600 Integer iblock=0, proc_index[2], index[2];
601 Integer blocks[2], block_dims[2], topology[2];
602 _iterator_hdl hdl;
603 char *ptr_c;
604
605 if(irregular) pnga_error("irregular flag set", 0L);
606
607 init_task_list(&state);
608
609 #if DEBUG_
610 if(me==0) { printf("@@ga_matmul_regular:m,n,k=%ld %ld %ld\n",aihi-ailo+1,
611 bjhi-bjlo+1,ajhi-ajlo+1); fflush(stdout); }
612 #endif
613
614 ONE.real =1.; ONE.imag =0.;
615 ONE_CF.real =1.; ONE_CF.imag =0.;
616 clo[0] = cilo; chi[0] = cihi;
617 clo[1] = cjlo; chi[1] = cjhi;
618 k = ajhi - ajlo +1;
619
620 #if 0
621 numblocks = pnga_total_blocks(g_c);
622 if(numblocks>=0) init_block_info(g_c, proc_index, index, blocks,
623 block_dims, topology, &iblock);
624
625 has_more_blocks = 1;
626 while(has_more_blocks)
627 {
628 /* if block cyclic distribution and loop accordigly. In case of simple
629 block distribution we process this loop only once */
630 if(numblocks<0)
631 { /* simple block distribution */
632 has_more_blocks = 0;
633 pnga_distribution(g_c, me, loC, hiC);
634 }
635 else
636 { /* block cyclic */
637
638 if(!get_next_block_info(g_c, proc_index, index, blocks, block_dims,
639 topology, &iblock, loC, hiC))
640 break;
641 }
642 /* } */
643 #else
644 pnga_local_iterator_init(g_c, &hdl);
645 while (pnga_local_iterator_next(&hdl,loC,hiC,&ptr_c,ld)) {
646 #endif
647
648 /* If loC and hiC intersects with current patch region, then they will
649 * be updated accordingly. Else it returns FALSE */
650 pnga_inquire(g_c, &ctype, &cndim, cdims);
651 if(!pnga_patch_intersect(clo,chi,loC,hiC,cndim)) continue;
652
653 #if DEBUG_
654 printf("%d: Processing block #%d [%d,%d] - [%d,%d]\n", GAme, iblock,
655 loC[0], loC[1], hiC[0], hiC[1]);
656 #endif
657
658 state.lo[0] = -1; /* just for first do-while loop */
659 do {
660
661 /* Inital Settings */
662 a = a_ar[0];
663 b = b_ar[0];
664 c = c_ar[0];
665 do_put = single_task_flag = UNSET;
666 offset = 0;
667
668 /*****************************************************************
669 * Task list: Collect information of all chunks. Matmul using
670 * Non-blocking call needs this list
671 *****************************************************************/
672 gTaskId=0;
673
674 /* to skip accumulate and exploit data locality:
675 get chunks according to "C" matrix distribution*/
676 /* pnga_distribution(g_c, me, loC, hiC); */
677 chunks_left=gai_get_task_list(taskListA, taskListB, &state,loC[0]-1,
678 loC[1]-1, 0, hiC[0]-1, hiC[1]-1, k-1,
679 Ichunk,Jchunk,Kchunk, &max_tasks,g_a);
680 currA = nextA = 0;
681
682 if(chunks_left) { /* then turn OFF this optimization */
683 if(DIRECT_ACCESS_OPT_FLAG) {
684 /* check if there is only one task.If so,then it is contiguous */
685 if(max_tasks == 1) {
686 if( !((hiC[0]-loC[0]+1 <= Ichunk) &&(hiC[1]-loC[1]+1 <=Jchunk)
687 && (k <= Kchunk)))
688 pnga_error("Invalid task list", 0L);
689 single_task_flag = SET;
690 pnga_access_ptr(g_c, loC, hiC, &c, ld);
691 }
692 }
693 }
694
695 if(CYCLIC_DISTR_OPT_FLAG) {
696 int prow,pcol,grp_me;
697 Integer a_grp=pnga_get_pgroup(g_a);
698 grp_me = (int)pnga_pgroup_nodeid(a_grp);
699 prow = GA[GA_OFFSET + g_a].nblock[0];
700 pcol = GA[GA_OFFSET + g_a].nblock[1];
701 offset = (grp_me/prow + grp_me%prow) % pcol;
702 currA = nextA = nextA + offset;
703 }
704
705 /*************************************************
706 * Do the setup & issue non-blocking calls to get
707 * the first block/chunk I'm gonna work
708 *************************************************/
709 shiftA=0; shiftB=0;
710 if(nextA < max_tasks) {
711 currB = nextB = taskListA[currA].chunkBId;
712
713 GET_BLOCK(g_a, &taskListA[nextA], a_ar[shiftA], transa,
714 ailo, ajlo, &adim_next, &gNbhdlA[shiftA]);
715
716 GET_BLOCK(g_b, &taskListB[nextB], b_ar[shiftB], transb,
717 bilo, bjlo, &bdim_next, &gNbhdlB[shiftB]);
718
719 adim=adim_next; bdim=bdim_next;
720 get_new_B = TRUE;
721 }
722
723 /*************************************************************
724 * Main Parallel DGEMM Loop.
725 *************************************************************/
726 while(nextA < max_tasks) {
727 currA = nextA;
728 currB = taskListA[currA].chunkBId;
729
730 idim = cdim = taskListA[currA].dim[0];
731 jdim = taskListB[currB].dim[1];
732 kdim = taskListA[currA].dim[1];
733 bdim=bdim_next;
734
735 /* if beta=0.0 (i.e.if need_scaling=UNSET), then for first shot,
736 we can do put, instead of accumulate */
737 if(need_scaling == UNSET) do_put = taskListA[currA].do_put;
738
739 nextA = ++gTaskId; /* get the next task id */
740
741 if(CYCLIC_DISTR_OPT_FLAG && nextA < max_tasks)
742 nextA = (offset+nextA) % max_tasks;
743
744
745 /* ---- WAIT till we get the current A & B block ---- */
746 a = a_ar[shiftA];
747 WAIT_GET_BLOCK(&gNbhdlA[shiftA]);
748 if(get_new_B){/*Avoid rereading B if it is same patch as last time*/
749 get_new_B = FALSE;
750 b = b_ar[shiftB];
751 WAIT_GET_BLOCK(&gNbhdlB[shiftB]);
752 }
753
754 /* ---- GET the next A & B block ---- */
755 if(nextA < max_tasks) {
756 GET_BLOCK(g_a, &taskListA[nextA], a_ar[(shiftA+1)%2], transa,
757 ailo, ajlo, &adim_next, &gNbhdlA[(shiftA+1)%2]);
758
759 nextB = taskListA[nextA].chunkBId;
760 if(currB != nextB) {
761 shiftB=((shiftB+1)%2);
762
763 GET_BLOCK(g_b, &taskListB[nextB], b_ar[shiftB], transb,
764 bilo, bjlo, &bdim_next, &gNbhdlB[shiftB]);
765 }
766 }
767 if(currB != nextB) get_new_B = TRUE;
768
769 /* Do the sequential matrix multiply - i.e.BLAS dgemm */
770 GAI_DGEMM(atype, transa, transb, idim, jdim, kdim, alpha,
771 a, adim, b, bdim, c, cdim);
772
773 /* Non-blocking Accumulate Operation. Note: skip wait in 1st loop*/
774 i0 = cilo + taskListA[currA].lo[0];
775 i1 = cilo + taskListA[currA].hi[0];
776 j0 = cjlo + taskListB[currB].lo[1];
777 j1 = cjlo + taskListB[currB].hi[1];
778
779 if(currA < max_tasks) {
780 if (single_task_flag != SET) {
781 switch(atype) {
782 case C_FLOAT:
783 case C_SCPL:
784 clo[0] = i0;
785 clo[1] = j0;
786 chi[0] = i1;
787 chi[1] = j1;
788 if(do_put==SET) /* Note:do_put is UNSET, if beta!=0.0*/
789 pnga_put(g_c, clo, chi, (float *)c, &cdim);
790 else {
791 pnga_acc(g_c, clo, chi, (float *)c, &cdim, &ONE_CF);
792 }
793 break;
794 default:
795 clo[0] = i0;
796 clo[1] = j0;
797 chi[0] = i1;
798 chi[1] = j1;
799 if(do_put==SET) /* i.e.beta ==0.0 */
800 pnga_put(g_c, clo, chi, (DoublePrecision*)c, &cdim);
801 else {
802 pnga_acc(g_c, clo, chi, (DoublePrecision*)c, &cdim,(DoublePrecision*)&ONE);
803 }
804 break;
805 }
806 }
807 }
808
809 /* shift next buffer..toggles between 0 and 1: as we use 2 buffers,
810 one for computation and the other for communication (overlap) */
811 shiftA = ((shiftA+1)%2);
812 adim = adim_next;
813 }
814 } while(chunks_left);
815 } /* while(has_more_blocks) */
816
817 }
818
819
820
gai_matmul_irreg(transa,transb,alpha,beta,atype,g_a,ailo,aihi,ajlo,ajhi,g_b,bilo,bihi,bjlo,bjhi,g_c,cilo,cihi,cjlo,cjhi,Ichunk,Kchunk,Jchunk,a_ar,b_ar,c_ar,need_scaling,irregular)821 static void gai_matmul_irreg(transa, transb, alpha, beta, atype,
822 g_a, ailo, aihi, ajlo, ajhi,
823 g_b, bilo, bihi, bjlo, bjhi,
824 g_c, cilo, cihi, cjlo, cjhi,
825 Ichunk, Kchunk, Jchunk, a_ar,b_ar,c_ar,
826 need_scaling, irregular)
827
828 Integer g_a, ailo, aihi, ajlo, ajhi; /* patch of g_a */
829 Integer g_b, bilo, bihi, bjlo, bjhi; /* patch of g_b */
830 Integer g_c, cilo, cihi, cjlo, cjhi; /* patch of g_c */
831 Integer Ichunk, Kchunk, Jchunk, atype;
832 void *alpha, *beta;
833 char *transa, *transb;
834 DoubleComplex **a_ar, **b_ar, **c_ar;
835 short int need_scaling, irregular;
836 {
837
838 #if DEBUG_
839 Integer me= pnga_nodeid();
840 #endif
841 Integer nproc=pnga_nnodes();
842 Integer get_new_B, i, i0, i1, j0, j1;
843 Integer ilo, ihi, idim, jlo, jhi, jdim, klo, khi, kdim, ijk=0;
844 Integer n, m, k, adim, bdim=0, cdim;
845 Integer idim_prev=0, jdim_prev=0, kdim_prev=0;
846 Integer adim_prev=0, bdim_prev=0, cdim_prev=0;
847 task_list_t taskListC;
848 short int compute_flag=0, shiftA=0, shiftB=0;
849 DoubleComplex ONE, *a, *b, *c;
850 SingleComplex ONE_CF;
851 Integer grp_me, a_grp = pnga_get_pgroup(g_a);
852 Integer clo[2], chi[2];
853
854 init_task_list(&taskListC);
855 ONE.real =1.; ONE.imag =0.;
856 ONE_CF.real =1.; ONE_CF.imag =0.;
857 #if DEBUG_
858 if(me==0) { printf("@@ga_matmul_irreg:m,n,k=%ld %ld %ld\n", aihi-ailo+1,
859 bjhi-bjlo+1,ajhi-ajlo+1); fflush(stdout); }
860 #endif
861
862 m = aihi - ailo +1;
863 n = bjhi - bjlo +1;
864 k = ajhi - ajlo +1;
865 a = a_ar[0];
866 b = b_ar[0];
867 c = c_ar[0];
868
869 grp_me = pnga_pgroup_nodeid(a_grp);
870 clo[0] = cilo; clo[1] = cjlo;
871 chi[0] = cihi; chi[1] = cjhi;
872 if(!need_scaling) pnga_fill_patch(g_c, clo, chi, beta);
873
874 compute_flag=0; /* take care of the last chunk */
875
876 for(jlo = 0; jlo < n; jlo += Jchunk){ /* loop thru columns of g_c patch */
877 jhi = GA_MIN(n-1, jlo+Jchunk-1);
878 jdim= jhi - jlo +1;
879
880 for(klo = 0; klo < k; klo += Kchunk){ /* loop cols of g_a patch */
881 khi = GA_MIN(k-1, klo+Kchunk-1); /* loop rows of g_b patch */
882 kdim= khi - klo +1;
883
884 /** Each pass through the outer two loops means we need a
885 different patch of B.*/
886 get_new_B = TRUE;
887
888 for(ilo = 0; ilo < m; ilo+=Ichunk){ /* loop thru rows of g_c patch */
889
890 if(ijk%nproc == grp_me){
891
892 ihi = GA_MIN(m-1, ilo+Ichunk-1);
893 idim= cdim = ihi - ilo +1;
894
895
896 if (*transa == 'n' || *transa == 'N'){
897 adim = idim;
898 i0= ailo+ilo; i1= ailo+ihi;
899 j0= ajlo+klo; j1= ajlo+khi;
900 clo[0] = i0;
901 clo[1] = j0;
902 chi[0] = i1;
903 chi[1] = j1;
904 pnga_nbget(g_a, clo, chi, a_ar[shiftA],
905 &idim, &gNbhdlA[shiftA]);
906 }else{
907 adim = kdim;
908 i0= ajlo+klo; i1= ajlo+khi;
909 j0= ailo+ilo; j1= ailo+ihi;
910 clo[0] = i0;
911 clo[1] = j0;
912 chi[0] = i1;
913 chi[1] = j1;
914 pnga_nbget(g_a, clo, chi, a_ar[shiftA],
915 &kdim, &gNbhdlA[shiftA]);
916 }
917
918 /* Avoid rereading B if it is same patch as last time. */
919 if(get_new_B) {
920 if (*transb == 'n' || *transb == 'N'){
921 bdim = kdim;
922 i0= bilo+klo; i1= bilo+khi;
923 j0= bjlo+jlo; j1= bjlo+jhi;
924 clo[0] = i0;
925 clo[1] = j0;
926 chi[0] = i1;
927 chi[1] = j1;
928 pnga_nbget(g_b, clo, chi, b_ar[shiftB],
929 &kdim, &gNbhdlB[shiftB]);
930 }else{
931 bdim = jdim;
932 i0= bjlo+jlo; i1= bjlo+jhi;
933 j0= bilo+klo; j1= bilo+khi;
934 clo[0] = i0;
935 clo[1] = j0;
936 chi[0] = i1;
937 chi[1] = j1;
938 pnga_nbget(g_b, clo, chi, b_ar[shiftB],
939 &jdim, &gNbhdlB[shiftB]);
940 }
941 }
942
943 if(compute_flag) { /* compute loop */
944
945 if(atype == C_FLOAT)
946 for(i=0;i<idim_prev*jdim_prev;i++) *(((float*)c)+i)=0;
947 else if(atype == C_DBL)
948 for(i=0;i<idim_prev*jdim_prev;i++) *(((double*)c)+i)=0;
949 else if(atype == C_SCPL)
950 for(i=0;i<idim_prev*jdim_prev;i++) {
951 ((SingleComplex*)c)[i].real=0;
952 ((SingleComplex*)c)[i].imag=0;
953 }
954 else for(i=0;i<idim_prev*jdim_prev;i++) {
955 c[i].real=0;c[i].imag=0; }
956
957 /* wait till we get the previous block */
958 a = a_ar[shiftA^1];
959 WAIT_GET_BLOCK(&gNbhdlA[shiftA^1]);
960 if(taskListC.chunkBId) {
961 b = b_ar[shiftB^1];
962 WAIT_GET_BLOCK(&gNbhdlB[shiftB^1]);
963 }
964
965 /* Do the sequential matrix multiply - i.e.BLAS dgemm */
966 GAI_DGEMM(atype, transa, transb, idim_prev, jdim_prev,
967 kdim_prev, alpha, a, adim_prev, b, bdim_prev,
968 c, cdim_prev);
969
970 i0= cilo + taskListC.lo[0];
971 i1= cilo + taskListC.hi[0];
972 j0= cjlo + taskListC.lo[1];
973 j1= cjlo + taskListC.hi[1];
974
975 if(atype == C_FLOAT || atype == C_SCPL) {
976 clo[0] = i0;
977 clo[1] = j0;
978 chi[0] = i1;
979 chi[1] = j1;
980 pnga_acc(g_c, clo, chi, (float *)c, &cdim_prev, &ONE_CF);
981 } else {
982 clo[0] = i0;
983 clo[1] = j0;
984 chi[0] = i1;
985 chi[1] = j1;
986 pnga_acc(g_c, clo, chi, (DoublePrecision*)c, &cdim_prev, (DoublePrecision*)&ONE);
987 }
988 }
989 compute_flag=1;
990
991 /* meta-data of current block for next compute loop */
992 taskListC.lo[0] = ilo; taskListC.hi[0] = ihi;
993 taskListC.lo[1] = jlo; taskListC.hi[1] = jhi;
994 taskListC.chunkBId = get_new_B;
995 idim_prev = idim; adim_prev = adim;
996 jdim_prev = jdim; bdim_prev = bdim;
997 kdim_prev = kdim; cdim_prev = cdim;
998
999 /* shift bext buffer */
1000 shiftA ^= 1;
1001 if(get_new_B) shiftB ^= 1;
1002
1003 get_new_B = FALSE; /* Until J or K change again */
1004 }
1005 ++ijk;
1006 }
1007 }
1008 }
1009
1010 /* -------- compute the last chunk --------- */
1011 if(compute_flag) {
1012 if(atype == C_FLOAT)
1013 for(i=0;i<idim_prev*jdim_prev;i++) *(((float*)c)+i)=0;
1014 else if(atype == C_DBL)
1015 for(i=0;i<idim_prev*jdim_prev;i++) *(((double*)c)+i)=0;
1016 else if(atype == C_SCPL)
1017 for(i=0;i<idim_prev*jdim_prev;i++) {
1018 ((SingleComplex*)c)[i].real=0;
1019 ((SingleComplex*)c)[i].imag=0;
1020 }
1021 else for(i=0;i<idim_prev*jdim_prev;i++) {
1022 c[i].real=0;c[i].imag=0; }
1023
1024 /* wait till we get the previous block */
1025 a = a_ar[shiftA^1];
1026 WAIT_GET_BLOCK(&gNbhdlA[shiftA^1]);
1027 if(taskListC.chunkBId) {
1028 b = b_ar[shiftB^1];
1029 WAIT_GET_BLOCK(&gNbhdlB[shiftB^1]);
1030 }
1031
1032 /* Do the sequential matrix multiply - i.e.BLAS dgemm */
1033 GAI_DGEMM(atype, transa, transb, idim_prev, jdim_prev,
1034 kdim_prev, alpha, a, adim_prev, b, bdim_prev,
1035 c, cdim_prev);
1036
1037 i0= cilo + taskListC.lo[0];
1038 i1= cilo + taskListC.hi[0];
1039 j0= cjlo + taskListC.lo[1];
1040 j1= cjlo + taskListC.hi[1];
1041
1042 if(atype == C_FLOAT || atype == C_SCPL) {
1043 clo[0] = i0;
1044 clo[1] = j0;
1045 chi[0] = i1;
1046 chi[1] = j1;
1047 pnga_acc(g_c, clo, chi, (float *)c, &cdim_prev, &ONE_CF);
1048 } else {
1049 clo[0] = i0;
1050 clo[1] = j0;
1051 chi[0] = i1;
1052 chi[1] = j1;
1053 pnga_acc(g_c, clo, chi, (DoublePrecision*)c, &cdim_prev, (DoublePrecision*)&ONE);
1054 }
1055 }
1056 /* ----------------------------------------- */
1057 }
1058
1059 #if DEBUG_
1060
check_result(cond,transa,transb,alpha,beta,atype,g_a,ailo,aihi,ajlo,ajhi,g_b,bilo,bihi,bjlo,bjhi,g_c,cilo,cihi,cjlo,cjhi)1061 static void check_result(cond, transa, transb, alpha, beta, atype,
1062 g_a, ailo, aihi, ajlo, ajhi,
1063 g_b, bilo, bihi, bjlo, bjhi,
1064 g_c, cilo, cihi, cjlo, cjhi)
1065
1066 Integer g_a, ailo, aihi, ajlo, ajhi; /* patch of g_a */
1067 Integer g_b, bilo, bihi, bjlo, bjhi; /* patch of g_b */
1068 Integer g_c, cilo, cihi, cjlo, cjhi; /* patch of g_c */
1069 Integer atype, cond;
1070 void *alpha, *beta;
1071 char *transa, *transb;
1072 {
1073
1074 DoubleComplex *tmpa=NULL, *tmpb=NULL, *tmpc2=NULL;
1075 static DoubleComplex *tmpc_orig = NULL;
1076 Integer i,j,m,n,k,adim,bdim,cdim;
1077 Integer factor=sizeof(DoubleComplex)/GAsizeofM(atype);
1078 BlasInt m_t, n_t, k_t, adim_t, bdim_t, cdim_t;
1079 Integer alo[2], ahi[2], blo[2], bhi[2], clo[2], chi[2];
1080
1081 m = aihi - ailo +1;
1082 n = bjhi - bjlo +1;
1083 k = ajhi - ajlo +1;
1084 cdim = m;
1085
1086 if(cond==0) { /* store the original matrix C before matmul starts, as
1087 matrix C is subject to change during pnga_matmul */
1088 tmpc_orig= (DoubleComplex*)malloc(sizeof(DoubleComplex)*(m*n/factor+1));
1089 if(tmpc_orig==NULL) pnga_error("check_result: malloc failed", 0);
1090
1091 /* get matrix C */
1092
1093 clo[0] = cilo;
1094 clo[1] = cjlo;
1095 chi[0] = cihi;
1096 chi[1] = cjhi;
1097 pnga_get(g_c, clo, chi, tmpc_orig, &m);
1098 }
1099 else { /* check for CORRECTNESS */
1100
1101 /* Memory Allocation */
1102 tmpa = (DoubleComplex*)malloc( sizeof(DoubleComplex)* (m*k/factor+1));
1103 tmpb = (DoubleComplex*)malloc( sizeof(DoubleComplex)* (k*n/factor+1));
1104 if(tmpa==NULL || tmpb==NULL) pnga_error("check_result: malloc failed", 0);
1105
1106 switch(atype) {
1107 case C_FLOAT:
1108 for(i=0; i<m*k; i++) ((float*)tmpa)[i] = -1.0;
1109 for(i=0; i<k*n; i++) ((float*)tmpb)[i] = -1.0;
1110 break;
1111 case C_DBL:
1112 for(i=0; i<m*k; i++) ((double*)tmpa)[i] = -1.0;
1113 for(i=0; i<k*n; i++) ((double*)tmpb)[i] = -1.0;
1114 break;
1115 case C_DCPL:
1116 for(i=0; i<m*k; i++) {tmpa[i].real=-1.0; tmpa[i].imag=0.0;}
1117 for(i=0; i<k*n; i++) {tmpb[i].real=-1.0; tmpb[i].imag=0.0;}
1118 break;
1119 case C_SCPL:
1120 for(i=0; i<m*k; i++) {((SingleComplex*)tmpa)[i].real=-1.0; ((SingleComplex*)tmpa)[i].imag=0.0;}
1121 for(i=0; i<k*n; i++) {((SingleComplex*)tmpb)[i].real=-1.0; ((SingleComplex*)tmpb)[i].imag=0.0;}
1122 break;
1123 default:
1124 pnga_error("ga_matmul_patch: wrong data type", atype);
1125 }
1126
1127 /* get matrix A */
1128 if (*transa == 'n' || *transa == 'N'){
1129 alo[0] = ailo;
1130 alo[1] = ajlo;
1131 ahi[0] = aihi;
1132 ahi[1] = ajhi;
1133 adim=m;
1134 pnga_get(g_a, alo, ahi, tmpa, &m);
1135 } else {
1136 alo[0] = ajlo;
1137 alo[1] = ailo;
1138 ahi[0] = ajhi;
1139 ahi[1] = aihi;
1140 adim=k;
1141 pnga_get(g_a, alo, ahi, tmpa, &k);
1142 }
1143
1144 /* get matrix B */
1145 if (*transb == 'n' || *transb == 'N'){
1146 blo[0] = bilo;
1147 blo[1] = bjlo;
1148 bhi[0] = bihi;
1149 bhi[1] = bjhi;
1150 bdim=k;
1151 pnga_get(g_b, blo, bhi, tmpb, &k);
1152 } else {
1153 blo[0] = bjlo;
1154 blo[1] = bilo;
1155 bhi[0] = bjhi;
1156 bhi[1] = bihi;
1157 bdim=n;
1158 pnga_get(g_b, blo, bhi, tmpb, &n);
1159 }
1160
1161
1162 m_t=m; n_t=n; k_t=k;
1163 adim_t=adim; bdim_t=bdim; cdim_t=cdim;
1164 #if (defined(CRAY) || defined(WIN32)) && !NOFORT
1165 pnga_error("check_result: Serial dgemms not defined", 0L);
1166 #else
1167 switch(atype) {
1168 case C_DBL:
1169 # if F2C_HIDDEN_STRING_LENGTH_AFTER_ARGS
1170 dgemm_(transa, transb, &m_t, &n_t, &k_t, alpha, tmpa, &adim_t,
1171 tmpb, &bdim_t, beta, tmpc_orig, &cdim_t, 1, 1);
1172 # else
1173 dgemm_(transa, 1, transb, 1, &m_t, &n_t, &k_t, alpha, tmpa, &adim_t,
1174 tmpb, &bdim_t, beta, tmpc_orig, &cdim_t);
1175 # endif
1176 break;
1177 case C_DCPL:
1178 # if F2C_HIDDEN_STRING_LENGTH_AFTER_ARGS
1179 zgemm_(transa, transb, &m_t, &n_t, &k_t, (DoubleComplex*)alpha,
1180 tmpa, &adim_t, tmpb, &bdim_t, beta, tmpc_orig, &cdim_t, 1, 1);
1181 # else
1182 zgemm_(transa, 1, transb, 1, &m_t, &n_t, &k_t, (DoubleComplex*)alpha,
1183 tmpa, &adim_t, tmpb, &bdim_t, beta, tmpc_orig, &cdim_t);
1184 # endif
1185 break;
1186 default:
1187 pnga_error("check_result: data type not supported here", atype);
1188 }
1189 #endif
1190
1191 printf("CHK:%c%c : %ld %ld %ld %ld: %ld %ld %ld %ld: %ld %ld %ld %ld\n",
1192 *transa, *transb, ailo, aihi, ajlo, ajhi,
1193 bilo, bihi, bjlo, bjhi, cilo, cihi, cjlo, cjhi);
1194
1195 free(tmpa); free(tmpb);
1196 /* after computing c locally, verify it with the values in g_c */
1197 tmpc2 = (DoubleComplex*)malloc( sizeof(DoubleComplex)* (m*n/factor+1));
1198 if(tmpc2==NULL) pnga_error("check_result: malloc failed for tmpc2", 0);
1199 clo[0] = cilo;
1200 clo[1] = cjlo;
1201 chi[0] = cihi;
1202 chi[1] = cjhi;
1203 pnga_get(g_c, clo, chi, tmpc2, &m);
1204
1205 #define _GA_TOL_ 0.1 /* error tolerance */
1206
1207 switch(atype) {
1208
1209 case C_FLOAT:
1210 {
1211 float abs_value=0.0;
1212 for(i=0; i<m*n; i++) {
1213 abs_value = ((float*)tmpc_orig)[i] - ((float*)tmpc2)[i];
1214 if(abs_value > _GA_TOL_ || abs_value < -(_GA_TOL_)) {
1215 printf("Values are = %f : %f\n Alpha=%f Beta=%f\n",
1216 ((float*)tmpc_orig)[i], ((float*)tmpc2)[i],
1217 *((float*)alpha), *((float*)beta));
1218 pnga_error("Matmul (type:float) check failed", 0);
1219 }
1220 }
1221 }
1222 break;
1223
1224 case C_DBL:
1225 {
1226 double abs_value=0.0;
1227 for(i=0; i<m*n; i++) {
1228 abs_value = ((double*)tmpc_orig)[i] - ((double*)tmpc2)[i];
1229 if(abs_value > _GA_TOL_ || abs_value < -(_GA_TOL_)) {
1230 printf("Values are = %lf : %lf\n Alpha=%lf Beta=%lf\n",
1231 ((double*)tmpc_orig)[i+j], ((double*)tmpc2)[i+j],
1232 *((double*)alpha),*((double*)beta));
1233 pnga_error("Matmul (type:double) check failed", 0);
1234 }
1235 }
1236 }
1237 break;
1238
1239 case C_DCPL:
1240 {
1241 DoubleComplex abs_value;
1242 for(i=0; i<m*n; i++) {
1243 abs_value.real = tmpc_orig[i].real - tmpc2[i].real;
1244 abs_value.imag = tmpc_orig[i].imag - tmpc2[i].imag;
1245 if(abs_value.real>_GA_TOL_ || abs_value.real<-(_GA_TOL_) ||
1246 abs_value.imag>_GA_TOL_ || abs_value.imag<-(_GA_TOL_)) {
1247 printf("Values= %lf, %lf : %lf, %lf\n", tmpc_orig[i].real,
1248 tmpc_orig[i].imag,tmpc2[i].real,tmpc2[i].imag);
1249 pnga_error("Matmul (DoubleComplex) check failed", 0);
1250 }
1251 }
1252 }
1253 break;
1254
1255 case C_SCPL:
1256 {
1257 SingleComplex abs_value;
1258 for(i=0; i<m*n; i++) {
1259 abs_value.real = ((SingleComplex*)tmpc_orig)[i].real - ((SingleComplex*)tmpc2)[i].real;
1260 abs_value.imag = ((SingleComplex*)tmpc_orig)[i].imag - ((SingleComplex*)tmpc2)[i].imag;
1261 if(abs_value.real>_GA_TOL_ || abs_value.real<-(_GA_TOL_) ||
1262 abs_value.imag>_GA_TOL_ || abs_value.imag<-(_GA_TOL_)) {
1263 printf("Values= %lf, %lf : %lf, %lf\n", ((SingleComplex*)tmpc_orig)[i].real,
1264 ((SingleComplex*)tmpc_orig)[i].imag,((SingleComplex*)tmpc2)[i].real,((SingleComplex*)tmpc2)[i].imag);
1265 pnga_error("Matmul (SingleComplex) check failed", 0);
1266 }
1267 }
1268 }
1269 break;
1270
1271 default:
1272 pnga_error("ga_matmul_patch: wrong data type", atype);
1273 }
1274 printf("Matrix Multiplication check (m,n,k=%ld %ld %ld)...O.K\n",m,n,k);
1275 fflush(stdout);
1276 free(tmpc_orig); free(tmpc2);
1277 tmpc_orig = NULL;
1278 }
1279 }
1280
1281 #endif
1282
1283 /******************************************
1284 * PARALLEL DGEMM
1285 * i.e. C = alpha*A*B + beta*C
1286 ******************************************/
1287 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1288 # pragma weak wnga_matmul = pnga_matmul
1289 #endif
pnga_matmul(transa,transb,alpha,beta,g_a,ailo,aihi,ajlo,ajhi,g_b,bilo,bihi,bjlo,bjhi,g_c,cilo,cihi,cjlo,cjhi)1290 void pnga_matmul(transa, transb, alpha, beta,
1291 g_a, ailo, aihi, ajlo, ajhi,
1292 g_b, bilo, bihi, bjlo, bjhi,
1293 g_c, cilo, cihi, cjlo, cjhi)
1294
1295 Integer g_a, ailo, aihi, ajlo, ajhi; /* patch of g_a */
1296 Integer g_b, bilo, bihi, bjlo, bjhi; /* patch of g_b */
1297 Integer g_c, cilo, cihi, cjlo, cjhi; /* patch of g_c */
1298 void *alpha, *beta;
1299 char *transa, *transb;
1300 {
1301 DoubleComplex *a=NULL, *b, *c, *a_ar[2], *b_ar[2], *c_ar[2];
1302 Integer adim1=0, adim2=0, bdim1=0, bdim2=0, cdim1=0, cdim2=0, dims[2];
1303 Integer atype, btype, ctype, rank, me= pnga_nodeid();
1304 Integer n, m, k, Ichunk, Kchunk, Jchunk;
1305 Integer loA[2]={0,0}, hiA[2]={0,0};
1306 Integer loB[2]={0,0}, hiB[2]={0,0};
1307 Integer loC[2]={0,0}, hiC[2]={0,0};
1308 int local_sync_begin,local_sync_end;
1309 short int need_scaling=SET,use_NB_matmul=SET;
1310 short int irregular=UNSET, use_armci_memory=UNSET;
1311 Integer a_grp=pnga_get_pgroup(g_a), b_grp=pnga_get_pgroup(g_b);
1312 Integer c_grp=pnga_get_pgroup(g_c);
1313 Integer numblocks;
1314 Integer clo[2], chi[2];
1315
1316 /* OPTIMIZATIONS FLAGS. To unset an optimization, replace SET by UNSET) */
1317 CYCLIC_DISTR_OPT_FLAG = UNSET;
1318 CONTIG_CHUNKS_OPT_FLAG = SET;
1319 DIRECT_ACCESS_OPT_FLAG = SET;
1320
1321 local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
1322 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
1323 if(local_sync_begin)pnga_pgroup_sync(a_grp);
1324
1325
1326 if (a_grp != b_grp || a_grp != c_grp)
1327 pnga_error("Arrays must be defined on same group",0L);
1328 # if 0 /* disabled. should not fail if there are non-overlapping patches*/
1329 /* check if C is different from A and B */
1330 if (g_c == g_a || g_c == g_b)
1331 pnga_error("Global Array C should be different from A and B", 0);
1332 #endif
1333
1334 /**************************************************
1335 * Do All Sanity Checks
1336 **************************************************/
1337
1338 /* Check to make sure all global arrays are of the same type */
1339 if (!(pnga_is_mirrored(g_a) == pnga_is_mirrored(g_b) &&
1340 pnga_is_mirrored(g_a) == pnga_is_mirrored(g_c))) {
1341 pnga_error("Processors do not match for all arrays",pnga_nnodes());
1342 }
1343
1344 /* check if ranks are O.K. */
1345 pnga_inquire(g_a, &atype, &rank, dims);
1346 VECTORCHECK(rank, dims, adim1, adim2, ailo, aihi, ajlo, ajhi);
1347 pnga_inquire(g_b, &btype, &rank, dims);
1348 VECTORCHECK(rank, dims, bdim1, bdim2, bilo, bihi, bjlo, bjhi);
1349 pnga_inquire(g_c, &ctype, &rank, dims);
1350 VECTORCHECK(rank, dims, cdim1, cdim2, cilo, cihi, cjlo, cjhi);
1351
1352 /* check for data-types mismatch */
1353 if(atype != btype || atype != ctype ) pnga_error(" types mismatch ", 0L);
1354 if(atype != C_DCPL && atype != C_DBL && atype != C_FLOAT && atype!=C_SCPL)
1355 pnga_error(" type error",atype);
1356
1357 /* check if patch indices and dims match */
1358 if (*transa == 'n' || *transa == 'N'){
1359 if (ailo <= 0 || aihi > adim1 || ajlo <= 0 || ajhi > adim2)
1360 pnga_error(" g_a indices out of range ", g_a);
1361 }else
1362 if (ailo <= 0 || aihi > adim2 || ajlo <= 0 || ajhi > adim1)
1363 pnga_error(" g_a indices out of range ", g_a);
1364
1365 if (*transb == 'n' || *transb == 'N'){
1366 if (bilo <= 0 || bihi > bdim1 || bjlo <= 0 || bjhi > bdim2)
1367 pnga_error(" g_b indices out of range ", g_b);
1368 }else
1369 if (bilo <= 0 || bihi > bdim2 || bjlo <= 0 || bjhi > bdim1)
1370 pnga_error(" g_b indices out of range ", g_b);
1371
1372 if (cilo <= 0 || cihi > cdim1 || cjlo <= 0 || cjhi > cdim2)
1373 pnga_error(" g_c indices out of range ", g_c);
1374
1375 /* verify if patch dimensions are consistent */
1376 m = aihi - ailo +1;
1377 n = bjhi - bjlo +1;
1378 k = ajhi - ajlo +1;
1379 if( (cihi - cilo +1) != m) pnga_error(" a & c dims error",m);
1380 if( (cjhi - cjlo +1) != n) pnga_error(" b & c dims error",n);
1381 if( (bihi - bilo +1) != k) pnga_error(" a & b dims error",k);
1382
1383 #if DEBUG_
1384 if(me==0) check_result(0, transa, transb, alpha, beta, atype,
1385 g_a, ailo, aihi, ajlo, ajhi,
1386 g_b, bilo, bihi, bjlo, bjhi,
1387 g_c, cilo, cihi, cjlo, cjhi);
1388 pnga_sync();
1389 #endif
1390
1391 /* switch to various matmul algorithms here. more to come */
1392 if( GA[GA_OFFSET + g_c].irreg == 1 ||
1393 GA[GA_OFFSET + g_b].irreg == 1 ||
1394 GA[GA_OFFSET + g_a].irreg == 1 ||
1395 _gai_matmul_patch_flag == SET) irregular = SET;
1396
1397 /* even ga_dgemm is called, m,n & k might not match GA dimensions */
1398 pnga_inquire(g_c, &ctype, &rank, dims);
1399 if(dims[0] != m || dims[1] != n) irregular = SET; /* C matrix dims */
1400
1401 if(!irregular) {
1402 if((adim1=GA_Cluster_nnodes()) > 1) use_NB_matmul = SET;
1403 else {
1404 use_NB_matmul = UNSET;
1405 CONTIG_CHUNKS_OPT_FLAG = UNSET;
1406 DIRECT_ACCESS_OPT_FLAG = UNSET;
1407 }
1408 # if defined(__crayx1) || defined(NEC)
1409 use_NB_matmul = UNSET;
1410 # endif
1411 }
1412
1413 /* if block cyclic, then use regular algorithm. This is turned on for now
1414 * to test block cyclic */
1415 numblocks = pnga_total_blocks(g_c);
1416 if(numblocks>=0) {
1417 #if 0
1418 irregular = UNSET;
1419 use_NB_matmul = SET;
1420 #else
1421 loA[0] = ailo;
1422 loA[1] = ajlo;
1423 hiA[0] = aihi;
1424 hiA[1] = ajhi;
1425 loB[0] = bilo;
1426 loB[1] = bjlo;
1427 hiB[0] = bihi;
1428 hiB[1] = bjhi;
1429 loC[0] = cilo;
1430 loC[1] = cjlo;
1431 hiC[0] = cihi;
1432 hiC[1] = cjhi;
1433 pnga_matmul_basic(transa, transb, alpha, beta, g_a, loA, hiA,
1434 g_b, loB, hiB, g_c, loC, hiC);
1435 return;
1436 #endif
1437 }
1438
1439 /****************************************************************
1440 * Get the memory (i.e.static or dynamic) for temporary buffers
1441 ****************************************************************/
1442
1443 /* to skip accumulate and exploit data locality:
1444 get chunks according to "C" matrix distribution*/
1445 pnga_distribution(g_a, me, loA, hiA);
1446 pnga_distribution(g_b, me, loB, hiB);
1447 pnga_distribution(g_c, me, loC, hiC);
1448
1449 {
1450 Integer elems, factor=sizeof(DoubleComplex)/GAsizeofM(atype);
1451 short int nbuf=1;
1452 DoubleComplex *tmp = NULL;
1453
1454 Ichunk = GA_MIN( (hiC[0]-loC[0]+1), (hiA[0]-loA[0]+1) );
1455 Jchunk = GA_MIN( (hiC[1]-loC[1]+1), (hiB[1]-loB[1]+1) );
1456 Kchunk = GA_MIN( (hiA[1]-loA[1]+1), (hiB[0]-loB[0]+1) );
1457
1458 #if KCHUNK_OPTIMIZATION /*works great for m=1000,n=1000,k=4000 kinda cases*/
1459 pnga_distribution(g_a, me, loC, hiC);
1460 Kchunk = hiC[1]-loC[1]+1;
1461 pnga_distribution(g_b, me, loC, hiC);
1462 Kchunk = GA_MIN(Kchunk, (hiC[0]-loC[0]+1));
1463 #endif
1464
1465 /* Just to avoid divide by zero error */
1466 if(Ichunk<=0) Ichunk = 1;
1467 if(Jchunk<=0) Jchunk = 1;
1468 if(Kchunk<=0) Kchunk = 1;
1469
1470 {
1471 Integer irreg=0;
1472 if(Ichunk/Kchunk > GA_ASPECT_RATIO || Kchunk/Ichunk > GA_ASPECT_RATIO ||
1473 Jchunk/Kchunk > GA_ASPECT_RATIO || Kchunk/Jchunk > GA_ASPECT_RATIO) {
1474 irreg = SET;
1475 }
1476 pnga_pgroup_gop(a_grp, pnga_type_f2c(MT_F_INT), &irreg, (Integer)1, "max");
1477 if(irreg==SET) irregular = SET;
1478 }
1479
1480 /* If non-blocking, we need 2 temporary buffers for A and B matrix */
1481 if(use_NB_matmul) nbuf = 2;
1482
1483 if(!irregular) {
1484 tmp = a_ar[0] =a=gai_get_armci_memory(Ichunk,Jchunk,Kchunk,
1485 nbuf, atype);
1486 if(tmp != NULL) use_armci_memory = SET;
1487 }
1488
1489 /* get ChunkSize (i.e.BlockSize), that fits in temporary buffer */
1490 gai_get_chunk_size(irregular, &Ichunk, &Jchunk, &Kchunk, &elems,
1491 atype, m, n, k, nbuf, use_armci_memory, a_grp);
1492
1493 if(tmp == NULL) { /* try once again from armci for new chunk sizes */
1494 tmp = a_ar[0] =a=gai_get_armci_memory(Ichunk,Jchunk,Kchunk,
1495 nbuf, atype);
1496 if(tmp != NULL) use_armci_memory = SET;
1497 }
1498
1499 if(tmp == NULL) { /*if armci malloc fails again, then get from MA */
1500 tmp = a_ar[0] = a =(DoubleComplex*) ga_malloc(elems,atype,
1501 "GA mulmat bufs");
1502 }
1503
1504 if(use_NB_matmul) tmp = a_ar[1] = a_ar[0] + (Ichunk*Kchunk)/factor+1;
1505
1506 tmp = b_ar[0] = b = tmp + (Ichunk*Kchunk)/factor + 1;
1507 if(use_NB_matmul) tmp = b_ar[1] = b_ar[0] + (Kchunk*Jchunk)/factor+1;
1508
1509 c_ar[0] = c = tmp + (Kchunk*Jchunk)/factor + 1;
1510 }
1511
1512 /** check if there is a need for scaling the data.
1513 Note: if beta=0, then need_scaling=0 */
1514 if(atype==C_DCPL){
1515 if((((DoubleComplex*)beta)->real == 0) &&
1516 (((DoubleComplex*)beta)->imag ==0)) need_scaling =0;}
1517 else if(atype==C_SCPL){
1518 if((((SingleComplex*)beta)->real == 0) &&
1519 (((SingleComplex*)beta)->imag ==0)) need_scaling =0;}
1520 else if(atype==C_DBL){
1521 if(*(DoublePrecision *)beta == 0) need_scaling =0;}
1522 else if( *(float*)beta ==0) need_scaling =0;
1523
1524 clo[0] = cilo; clo[1] = cjlo;
1525 chi[0] = cihi; chi[1] = cjhi;
1526 if(need_scaling) pnga_scale_patch(g_c, clo, chi, beta);
1527
1528 /********************************************************************
1529 * Parallel Matrix Multiplication Starts Here.
1530 * 3 Steps:
1531 * 1. Get a chunk of A and B matrix, and store it in local buffer.
1532 * 2. Do sequential dgemm.
1533 * 3. Put/accumulate the result into C matrix.
1534 *********************************************************************/
1535
1536 /* if only one node, then enable the optimized shmem code */
1537 if(use_NB_matmul==UNSET) {
1538 gai_matmul_shmem(transa, transb, alpha, beta, atype,
1539 g_a, ailo, aihi, ajlo, ajhi,
1540 g_b, bilo, bihi, bjlo, bjhi,
1541 g_c, cilo, cihi, cjlo, cjhi,
1542 Ichunk, Kchunk, Jchunk, a,b,c, need_scaling);
1543 }
1544 else {
1545 if(irregular)
1546 gai_matmul_irreg(transa, transb, alpha, beta, atype,
1547 g_a, ailo, aihi, ajlo, ajhi,
1548 g_b, bilo, bihi, bjlo, bjhi,
1549 g_c, cilo, cihi, cjlo, cjhi,
1550 Ichunk, Kchunk, Jchunk, a_ar, b_ar, c_ar,
1551 need_scaling, irregular);
1552 else {
1553 gai_matmul_regular(transa, transb, alpha, beta, atype,
1554 g_a, ailo, aihi, ajlo, ajhi,
1555 g_b, bilo, bihi, bjlo, bjhi,
1556 g_c, cilo, cihi, cjlo, cjhi,
1557 Ichunk, Kchunk, Jchunk, a_ar, b_ar, c_ar,
1558 need_scaling, irregular);
1559 }
1560 }
1561
1562 a = a_ar[0];
1563 if(use_armci_memory == SET) ARMCI_Free_local(a);
1564 else ga_free(a);
1565
1566 #if DEBUG_
1567 Integer grp_me;
1568 grp_me = pnga_pgroup_nodeid(a_grp);
1569 pnga_pgroup_sync(a_grp);
1570 if(me==0) check_result(1, transa, transb, alpha, beta, atype,
1571 g_a, ailo, aihi, ajlo, ajhi,
1572 g_b, bilo, bihi, bjlo, bjhi,
1573 g_c, cilo, cihi, cjlo, cjhi);
1574 pnga_pgroup_sync(a_grp);
1575 #endif
1576
1577 if(local_sync_end)pnga_pgroup_sync(a_grp);
1578 }
1579
1580 /* This is the old matmul code. It is enabled now for mirrored matrix multiply.
1581 It also work for normal matrix/vector multiply with no changes */
1582 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1583 # pragma weak wnga_matmul_mirrored = pnga_matmul_mirrored
1584 #endif
pnga_matmul_mirrored(transa,transb,alpha,beta,g_a,ailo,aihi,ajlo,ajhi,g_b,bilo,bihi,bjlo,bjhi,g_c,cilo,cihi,cjlo,cjhi)1585 void pnga_matmul_mirrored(transa, transb, alpha, beta,
1586 g_a, ailo, aihi, ajlo, ajhi,
1587 g_b, bilo, bihi, bjlo, bjhi,
1588 g_c, cilo, cihi, cjlo, cjhi)
1589
1590 Integer g_a, ailo, aihi, ajlo, ajhi; /* patch of g_a */
1591 Integer g_b, bilo, bihi, bjlo, bjhi; /* patch of g_b */
1592 Integer g_c, cilo, cihi, cjlo, cjhi; /* patch of g_c */
1593 void *alpha, *beta;
1594 char *transa, *transb;
1595 {
1596
1597 #ifdef STATBUF
1598 /* approx. sqrt(2) ratio in chunk size to use the same buffer space */
1599 DoubleComplex a[ICHUNK*KCHUNK], b[KCHUNK*JCHUNK], c[ICHUNK*JCHUNK];
1600 #else
1601 DoubleComplex *a, *b, *c;
1602 #endif
1603 Integer atype, btype, ctype, adim1=0, adim2=0, bdim1=0, bdim2=0, cdim1=0, cdim2=0, dims[2], rank;
1604 Integer me= pnga_nodeid(), nproc;
1605 Integer i, ijk = 0, i0, i1, j0, j1;
1606 Integer ilo, ihi, idim, jlo, jhi, jdim, klo, khi, kdim;
1607 Integer n, m, k, adim, bdim=0, cdim;
1608 Integer Ichunk, Kchunk, Jchunk;
1609 DoubleComplex ONE;
1610 SingleComplex ONE_CF;
1611
1612 DoublePrecision chunk_cube;
1613 Integer min_tasks = 10, max_chunk;
1614 int need_scaling=1;
1615 Integer ZERO_I = 0, inode, iproc;
1616 Integer get_new_B;
1617 int local_sync_begin,local_sync_end;
1618 BlasInt idim_t, jdim_t, kdim_t, adim_t, bdim_t, cdim_t;
1619 Integer clo[2], chi[2];
1620
1621 ONE.real =1.; ONE.imag =0.;
1622 ONE_CF.real =1.; ONE_CF.imag =0.;
1623
1624 local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
1625 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
1626 if(local_sync_begin)pnga_sync();
1627
1628
1629 /* Check to make sure all global arrays are of the same type */
1630 if (!(pnga_is_mirrored(g_a) == pnga_is_mirrored(g_b) &&
1631 pnga_is_mirrored(g_a) == pnga_is_mirrored(g_c))) {
1632 pnga_error("Processors do not match for all arrays",pnga_nnodes());
1633 }
1634 if (pnga_is_mirrored(g_a)) {
1635 inode = pnga_cluster_nodeid();
1636 nproc = pnga_cluster_nprocs(inode);
1637 iproc = me - pnga_cluster_procid(inode, ZERO_I);
1638 } else {
1639 nproc = pnga_nnodes();
1640 iproc = me;
1641 }
1642
1643 pnga_inquire(g_a, &atype, &rank, dims);
1644 VECTORCHECK(rank, dims, adim1, adim2, ailo, aihi, ajlo, ajhi);
1645 pnga_inquire(g_b, &btype, &rank, dims);
1646 VECTORCHECK(rank, dims, bdim1, bdim2, bilo, bihi, bjlo, bjhi);
1647 pnga_inquire(g_c, &ctype, &rank, dims);
1648 VECTORCHECK(rank, dims, cdim1, cdim2, cilo, cihi, cjlo, cjhi);
1649
1650 if(atype != btype || atype != ctype ) pnga_error(" types mismatch ", 0L);
1651 if(atype != C_DCPL && atype != C_DBL && atype != C_FLOAT && atype != C_SCPL)
1652 pnga_error(" type error",atype);
1653
1654
1655
1656 /* check if patch indices and dims match */
1657 if (*transa == 'n' || *transa == 'N'){
1658 if (ailo <= 0 || aihi > adim1 || ajlo <= 0 || ajhi > adim2)
1659 pnga_error(" g_a indices out of range ", g_a);
1660 }else
1661 if (ailo <= 0 || aihi > adim2 || ajlo <= 0 || ajhi > adim1)
1662 pnga_error(" g_a indices out of range ", g_a);
1663
1664 if (*transb == 'n' || *transb == 'N'){
1665 if (bilo <= 0 || bihi > bdim1 || bjlo <= 0 || bjhi > bdim2)
1666 pnga_error(" g_b indices out of range ", g_b);
1667 }else
1668 if (bilo <= 0 || bihi > bdim2 || bjlo <= 0 || bjhi > bdim1)
1669 pnga_error(" g_b indices out of range ", g_b);
1670
1671 if (cilo <= 0 || cihi > cdim1 || cjlo <= 0 || cjhi > cdim2)
1672 pnga_error(" g_c indices out of range ", g_c);
1673
1674 /* verify if patch dimensions are consistent */
1675 m = aihi - ailo +1;
1676 n = bjhi - bjlo +1;
1677 k = ajhi - ajlo +1;
1678 if( (cihi - cilo +1) != m) pnga_error(" a & c dims error",m);
1679 if( (cjhi - cjlo +1) != n) pnga_error(" b & c dims error",n);
1680 if( (bihi - bilo +1) != k) pnga_error(" a & b dims error",k);
1681
1682
1683 /* In 32-bit platforms, k*m*n might exceed the "long" range(2^31),
1684 eg:k=m=n=1600. So casting the temporary value to "double" helps */
1685 chunk_cube = (k*(double)(m*n)) / (min_tasks * nproc);
1686 max_chunk = (Integer)pow(chunk_cube, (DoublePrecision)(1.0/3.0) );
1687 if (max_chunk < 32) max_chunk = 32;
1688
1689 #ifdef STATBUF
1690 if(atype == C_DBL || atype == C_FLOAT){
1691 Ichunk=D_CHUNK, Kchunk=D_CHUNK, Jchunk=D_CHUNK;
1692 }else{
1693 Ichunk=ICHUNK; Kchunk=KCHUNK; Jchunk=JCHUNK;
1694 }
1695 #else
1696 {
1697 /**
1698 * Find out how much memory we can grab. It will be used in
1699 * three chunks, and the result includes only the first one.
1700 */
1701
1702 Integer elems, factor = sizeof(DoubleComplex)/GAsizeofM(atype);
1703 Ichunk = Jchunk = Kchunk = CHUNK_SIZE;
1704
1705 if ( max_chunk > Ichunk) {
1706 /*if memory if very limited, performance degrades for large matrices
1707 as chunk size is very small, which leads to communication overhead)*/
1708 Integer avail = pnga_memory_avail_type(atype);
1709 if (pnga_is_mirrored(g_a)) {
1710 fflush(stdout);
1711 if (sizeof(Integer)/sizeof(int) > 1)
1712 armci_msg_gop_scope(SCOPE_NODE, &avail, 1, "min", ARMCI_LONG);
1713 else
1714 armci_msg_gop_scope(SCOPE_NODE, &avail, 1, "min", ARMCI_INT);
1715 fflush(stdout);
1716 } else {
1717 fflush(stdout);
1718 pnga_gop(pnga_type_f2c(MT_F_INT), &avail, (Integer)1, "min");
1719 }
1720 if(avail<MINMEM && pnga_nodeid()==0) pnga_error("NotEnough memory",avail);
1721 elems = (Integer)(avail*0.9); /* Donot use every last drop */
1722
1723 max_chunk=GA_MIN(max_chunk, (Integer)(sqrt( (double)((elems-EXTRA)/3))));
1724 Ichunk = GA_MIN(m,max_chunk);
1725 Jchunk = GA_MIN(n,max_chunk);
1726 Kchunk = GA_MIN(k,max_chunk);
1727 }
1728 else /* "EXTRA" elems for safety - just in case */
1729 elems = 3*Ichunk*Jchunk + EXTRA*factor;
1730
1731 a = (DoubleComplex*) ga_malloc(elems, atype, "GA mulmat bufs");
1732 b = a + (Ichunk*Kchunk)/factor + 1;
1733 c = b + (Kchunk*Jchunk)/factor + 1;
1734 }
1735 #endif
1736
1737 if(atype==C_DCPL){if((((DoubleComplex*)beta)->real == 0) &&
1738 (((DoubleComplex*)beta)->imag ==0)) need_scaling =0;}
1739 else if(atype==C_SCPL){if((((SingleComplex*)beta)->real == 0) &&
1740 (((SingleComplex*)beta)->imag ==0)) need_scaling =0;}
1741 else if(atype==C_DBL){if(*(DoublePrecision *)beta == 0) need_scaling =0;}
1742 else if( *(float*)beta ==0) need_scaling =0;
1743
1744 pnga_mask_sync(ZERO_I, ZERO_I);
1745 clo[0] = cilo; clo[1] = cjlo;
1746 chi[0] = cihi; chi[1] = cjhi;
1747 if(need_scaling) pnga_scale_patch(g_c, clo, chi, beta);
1748 else pnga_fill_patch(g_c, clo, chi, beta);
1749
1750 for(jlo = 0; jlo < n; jlo += Jchunk){ /* loop through columns of g_c patch */
1751 jhi = GA_MIN(n-1, jlo+Jchunk-1);
1752 jdim= jhi - jlo +1;
1753
1754 for(klo = 0; klo < k; klo += Kchunk){ /* loop cols of g_a patch */
1755 khi = GA_MIN(k-1, klo+Kchunk-1); /* loop rows of g_b patch */
1756 kdim= khi - klo +1;
1757
1758 /** Each pass through the outer two loops means we need a
1759 different patch of B.*/
1760 get_new_B = TRUE;
1761
1762 for(ilo = 0; ilo < m; ilo += Ichunk){ /*loop through rows of g_c patch */
1763
1764 if(ijk%nproc == iproc){
1765
1766 ihi = GA_MIN(m-1, ilo+Ichunk-1);
1767 idim= cdim = ihi - ilo +1;
1768
1769 if(atype == C_FLOAT)
1770 for (i = 0; i < idim*jdim; i++) *(((float*)c)+i)=0;
1771 else if(atype == C_DBL)
1772 for (i = 0; i < idim*jdim; i++) *(((double*)c)+i)=0;
1773 else if(atype == C_SCPL)
1774 for (i = 0; i < idim*jdim; i++){
1775 ((SingleComplex*)c)[i].real=0;
1776 ((SingleComplex*)c)[i].imag=0;
1777 }
1778 else
1779 for (i = 0; i < idim*jdim; i++){ c[i].real=0;c[i].imag=0;}
1780
1781 if (*transa == 'n' || *transa == 'N'){
1782 adim = idim;
1783 i0= ailo+ilo; i1= ailo+ihi;
1784 j0= ajlo+klo; j1= ajlo+khi;
1785 clo[0] = i0;
1786 clo[1] = j0;
1787 chi[0] = i1;
1788 chi[1] = j1;
1789 pnga_get(g_a, clo, chi, a, &idim);
1790 }else{
1791 adim = kdim;
1792 i0= ajlo+klo; i1= ajlo+khi;
1793 j0= ailo+ilo; j1= ailo+ihi;
1794 clo[0] = i0;
1795 clo[1] = j0;
1796 chi[0] = i1;
1797 chi[1] = j1;
1798 pnga_get(g_a, clo, chi, a, &kdim);
1799 }
1800
1801
1802 /* Avoid rereading B if it is the same patch as last time. */
1803 if(get_new_B) {
1804 if (*transb == 'n' || *transb == 'N'){
1805 bdim = kdim;
1806 i0= bilo+klo; i1= bilo+khi;
1807 j0= bjlo+jlo; j1= bjlo+jhi;
1808 clo[0] = i0;
1809 clo[1] = j0;
1810 chi[0] = i1;
1811 chi[1] = j1;
1812 pnga_get(g_b, clo, chi, b, &kdim);
1813 }else{
1814 bdim = jdim;
1815 i0= bjlo+jlo; i1= bjlo+jhi;
1816 j0= bilo+klo; j1= bilo+khi;
1817 clo[0] = i0;
1818 clo[1] = j0;
1819 chi[0] = i1;
1820 chi[1] = j1;
1821 pnga_get(g_b, clo, chi, b, &jdim);
1822 }
1823 get_new_B = FALSE; /* Until J or K change again */
1824 }
1825
1826
1827 idim_t=idim; jdim_t=jdim; kdim_t=kdim;
1828 adim_t=adim; bdim_t=bdim; cdim_t=cdim;
1829
1830 switch(atype) {
1831 case C_FLOAT:
1832 BLAS_SGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
1833 (Real *)alpha, (Real *)a, &adim_t,
1834 (Real *)b, &bdim_t, (Real *)&ONE_CF,
1835 (Real *)c, &cdim_t);
1836 break;
1837 case C_DBL:
1838 BLAS_DGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
1839 (DoublePrecision *)alpha, (DoublePrecision *)a, &adim_t,
1840 (DoublePrecision *)b, &bdim_t, (DoublePrecision *)&ONE,
1841 (DoublePrecision *)c, &cdim_t);
1842 break;
1843 case C_DCPL:
1844 BLAS_ZGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
1845 (DoubleComplex *)alpha, (DoubleComplex *)a, &adim_t,
1846 (DoubleComplex *)b, &bdim_t, (DoubleComplex *)&ONE,
1847 (DoubleComplex *)c, &cdim_t);
1848 break;
1849 case C_SCPL:
1850 BLAS_CGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
1851 (SingleComplex *)alpha, (SingleComplex *)a, &adim_t,
1852 (SingleComplex *)b, &bdim_t, (SingleComplex *)&ONE_CF,
1853 (SingleComplex *)c, &cdim_t);
1854 break;
1855 default:
1856 pnga_error("ga_matmul_patch: wrong data type", atype);
1857 }
1858
1859 i0= cilo+ilo; i1= cilo+ihi; j0= cjlo+jlo; j1= cjlo+jhi;
1860 if(atype == C_FLOAT || atype == C_SCPL) {
1861 clo[0] = i0;
1862 clo[1] = j0;
1863 chi[0] = i1;
1864 chi[1] = j1;
1865 pnga_acc(g_c, clo, chi, (float *)c, &cdim, &ONE_CF);
1866 } else {
1867 clo[0] = i0;
1868 clo[1] = j0;
1869 chi[0] = i1;
1870 chi[1] = j1;
1871 pnga_acc(g_c, clo, chi, (DoublePrecision*)c, &cdim, (DoublePrecision*)&ONE);
1872 }
1873 }
1874 ++ijk;
1875 }
1876 }
1877 }
1878
1879 #ifndef STATBUF
1880 ga_free(a);
1881 #endif
1882
1883 if(local_sync_end)pnga_sync();
1884
1885 }
1886
1887
1888 #if 0
1889 void gai_matmul_patch(char *transa, char *transb, void *alpha, void *beta,
1890 Integer g_a,Integer ailo,Integer aihi,Integer ajlo,Integer ajhi,
1891 Integer g_b,Integer bilo,Integer bihi,Integer bjlo,Integer bjhi,
1892 Integer g_c,Integer cilo,Integer cihi,Integer cjlo,Integer cjhi)
1893 {
1894 if(pnga_is_mirrored(g_a))
1895 pnga_matmul_mirrored(transa, transb, alpha, beta,
1896 g_a, ailo, aihi, ajlo, ajhi,
1897 g_b, bilo, bihi, bjlo, bjhi,
1898 g_c, cilo, cihi, cjlo, cjhi);
1899 else {
1900 _gai_matmul_patch_flag = SET;
1901 pnga_matmul(transa, transb, alpha, beta,
1902 g_a, ailo, aihi, ajlo, ajhi,
1903 g_b, bilo, bihi, bjlo, bjhi,
1904 g_c, cilo, cihi, cjlo, cjhi);
1905 _gai_matmul_patch_flag = UNSET;
1906 }
1907 }
1908 #endif
1909
1910
1911 /*\ select the 2d plane to be used in matrix multiplication
1912 * rank: dimension of original array
1913 * trans: character signifying whether to use transpose
1914 * dims: dimensions of original array
1915 * lo, hi: bounds of patch
1916 * ipos: position of first patch dimension greater than 1
1917 * jpos: position of second patch dimension greater than 1
1918 * vec_idx: indicator of which dimension data is originally located for vectors
1919 * (only applies to transpose flag)
1920 \*/
gai_setup_2d_patch(Integer rank,char * trans,Integer dims[],Integer lo[],Integer hi[],Integer * ilo,Integer * ihi,Integer * jlo,Integer * jhi,Integer * dim1,Integer * dim2,int * ipos,int * jpos,int * vpos,int vec_idx)1921 static void gai_setup_2d_patch(Integer rank, char *trans, Integer dims[],
1922 Integer lo[], Integer hi[],
1923 Integer* ilo, Integer* ihi,
1924 Integer* jlo, Integer* jhi,
1925 Integer* dim1, Integer* dim2,
1926 int* ipos, int* jpos, int *vpos, int vec_idx)
1927 {
1928 int d,e=0,f=0;
1929 char t='n';
1930 int reset_t = 0;
1931 int tmp;
1932
1933 /* check to see if more than two dimensions of patch have
1934 * size greater than 1*/
1935 for(d=0; d<rank; d++)
1936 if( (hi[d]-lo[d])>0 && ++e>2 ) pnga_error("3-D Patch Detected", 0L);
1937 /* determine two locations where patch dimensions are greater than 1*/
1938 *ipos = *jpos = *vpos = -1;
1939 for(d=0; d<rank; d++){
1940 if( (*ipos <0) && (hi[d]>lo[d]) ) { *ipos =d; continue; }
1941 if( (*ipos >=0) && (hi[d]>lo[d])) { *jpos =d; break; }
1942 }
1943 /* we have an ambiguous vector; mark its location */
1944 if (1 == e && *ipos != 0 && *ipos != rank-1) {
1945 *vpos = *ipos;
1946 }
1947
1948 /* if(*ipos >*jpos){Integer t=*ipos; *ipos=*jpos; *jpos=t;}
1949 */
1950
1951 /* single element case (trivial) */
1952 if((*ipos <0) && (*jpos <0)){ *ipos =0; *jpos=1; }
1953 else{
1954
1955 if(trans == NULL) {
1956 trans = &t;
1957 reset_t = 1;
1958 }
1959 /* handle almost trivial case of only one dimension with >1 elements.
1960 * None of the conditions in this block are true for a 2D block with more
1961 * than 1 element in each of the 2 dimensions so no need to specify
1962 * condition that e == 1 */
1963 if(*trans == 'n' || *trans == 'N') {
1964 if(*ipos == rank-1) (*ipos)--; /* i cannot be the last dimension */
1965 if(*ipos <0) *ipos = *jpos-1; /* select i dimension based on j */
1966 if(*jpos <0) *jpos = *ipos+1; /* select j dimenison based on i */
1967 }
1968 else {
1969 if(*ipos <0) *ipos = *jpos-1; /* this condition is probably never reached */
1970 if(*jpos <0) {
1971 if (vec_idx < 0) {
1972 if(*ipos==0) *jpos = *ipos + 1;
1973 else *jpos = (*ipos)--;
1974 } else {
1975 if (*ipos < vec_idx) {
1976 *jpos = vec_idx;
1977 } else if (*ipos > vec_idx) {
1978 *jpos = *ipos;
1979 *ipos = vec_idx;
1980 } else {
1981 /* this is probably an error if you get this far */
1982 if (*ipos > 0) {
1983 *jpos = *ipos;
1984 *ipos--;
1985 } else {
1986 *jpos = 1;
1987 *ipos = 0;
1988 }
1989 }
1990 }
1991 }
1992 }
1993 if (reset_t) trans = NULL;
1994 }
1995
1996 *ilo = lo[*ipos]; *ihi = hi[*ipos];
1997 *jlo = lo[*jpos]; *jhi = hi[*jpos];
1998 *dim1 = dims[*ipos];
1999 *dim2 = dims[*jpos];
2000 #if 0
2001 printf("lo/hi=[%ld:%ld", lo[0], hi[0]);
2002 for (d=1; d<rank; ++d) {
2003 printf(",%ld:%ld", lo[d], hi[d]);
2004 }
2005 printf("]\n");
2006 printf("size=[%ld", hi[0]-lo[0]+1);
2007 for (d=1; d<rank; ++d) {
2008 printf(",%ld", hi[d]-lo[d]+1);
2009 }
2010 printf("]\n");
2011 printf("gai_setup_2d_patch(%ld, T, X, X, X, %ld, %ld, %ld, %ld, %ld, %ld, %d, %d, %d)\n",
2012 rank, *ilo, *ihi, *jlo, *jhi, *dim1, *dim2, *ipos, *jpos, *vpos);
2013 #endif
2014 }
2015
2016 #define SETINT(tmp,val,n) {int _i; for(_i=0;_i<n; _i++)tmp[_i]=val;}
2017
2018 /**
2019 * The indices avec_loc, and bvec_loc art designed to handle the special case
2020 * that the patch requested is a 1D vector and the original array contains 3 or
2021 * more dimensions. The original interface cannot handle this case. The
2022 * avec_pos parameter specifies which dimension the original patch (before
2023 * transpose) lies on.
2024 */
gai_matmul_patch(char * transa,char * transb,void * alpha,void * beta,Integer g_a,Integer alo[],Integer ahi[],int avec_pos,Integer g_b,Integer blo[],Integer bhi[],int bvec_pos,Integer g_c,Integer clo[],Integer chi[])2025 void gai_matmul_patch(char *transa, char *transb, void *alpha, void *beta,
2026 Integer g_a, Integer alo[], Integer ahi[], int avec_pos,
2027 Integer g_b, Integer blo[], Integer bhi[], int bvec_pos,
2028 Integer g_c, Integer clo[], Integer chi[])
2029 {
2030 #ifdef STATBUF
2031 DoubleComplex a[ICHUNK*KCHUNK], b[KCHUNK*JCHUNK], c[ICHUNK*JCHUNK];
2032 #else
2033 DoubleComplex *a, *b, *c;
2034 #endif
2035 Integer atype, btype, ctype, adim1, adim2, bdim1, bdim2, cdim1, cdim2;
2036 Integer me= pnga_nodeid(), nproc, inode, iproc;
2037 Integer i, ijk = 0, i0, i1, j0, j1;
2038 Integer ilo, ihi, idim, jlo, jhi, jdim, klo, khi, kdim;
2039 Integer n, m, k, k2, cm, cn, adim, bdim, cdim, arank, brank, crank;
2040 int aipos, ajpos, bipos, bjpos,cipos, cjpos, need_scaling=1;
2041 int avpos, bvpos, cvpos;
2042 Integer Ichunk, Kchunk, Jchunk;
2043 Integer ailo, aihi, ajlo, ajhi; /* 2d plane of g_a */
2044 Integer bilo, bihi, bjlo, bjhi; /* 2d plane of g_b */
2045 Integer cilo, cihi, cjlo, cjhi; /* 2d plane of g_c */
2046 Integer adims[GA_MAX_DIM],bdims[GA_MAX_DIM],cdims[GA_MAX_DIM],tmpld[GA_MAX_DIM];
2047 Integer *tmplo = adims, *tmphi =bdims;
2048 DoubleComplex ONE;
2049 SingleComplex ONE_CF;
2050 Integer ZERO_I = 0;
2051 Integer get_new_B;
2052 DoublePrecision chunk_cube;
2053 Integer min_tasks = 10, max_chunk;
2054 int local_sync_begin,local_sync_end;
2055 BlasInt idim_t, jdim_t, kdim_t, adim_t, bdim_t, cdim_t;
2056
2057 ONE.real =1.; ONE.imag =0.;
2058 ONE_CF.real =1.; ONE_CF.imag =0.;
2059
2060 local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2061 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2062 if(local_sync_begin)pnga_sync();
2063
2064 if (pnga_total_blocks(g_a) > 0 || pnga_total_blocks(g_b) > 0 ||
2065 pnga_total_blocks(g_c) > 0) {
2066 pnga_matmul_basic(transa, transb, alpha, beta, g_a, alo, ahi,
2067 g_b, blo, bhi, g_c, clo, chi);
2068 return;
2069 }
2070
2071 /* Check to make sure all global arrays are of the same type */
2072 if (!(pnga_is_mirrored(g_a) == pnga_is_mirrored(g_b) &&
2073 pnga_is_mirrored(g_a) == pnga_is_mirrored(g_c))) {
2074 pnga_error("Processors do not match for all arrays",pnga_nnodes());
2075 }
2076 if (pnga_is_mirrored(g_a)) {
2077 inode = pnga_cluster_nodeid();
2078 nproc = pnga_cluster_nprocs(inode);
2079 iproc = me - pnga_cluster_procid(inode, ZERO_I);
2080 } else {
2081 nproc = pnga_nnodes();
2082 iproc = me;
2083 }
2084
2085 pnga_inquire(g_a, &atype, &arank, adims);
2086 pnga_inquire(g_b, &btype, &brank, bdims);
2087 pnga_inquire(g_c, &ctype, &crank, cdims);
2088
2089 if(arank<2) pnga_error("rank of A must be at least 2",arank);
2090 if(brank<2) pnga_error("rank of B must be at least 2",brank);
2091 if(crank<2) pnga_error("rank of C must be at least 2",crank);
2092
2093 if(atype != btype || atype != ctype ) pnga_error(" types mismatch ", 0L);
2094 if(atype != C_DCPL && atype != C_DBL && atype != C_FLOAT && atype != C_SCPL)
2095 pnga_error(" type error",atype);
2096
2097 gai_setup_2d_patch(arank, transa, adims, alo, ahi, &ailo, &aihi,
2098 &ajlo, &ajhi, &adim1, &adim2, &aipos, &ajpos, &avpos, avec_pos);
2099 gai_setup_2d_patch(brank, transb, bdims, blo, bhi, &bilo, &bihi,
2100 &bjlo, &bjhi, &bdim1, &bdim2, &bipos, &bjpos, &bvpos, bvec_pos);
2101 gai_setup_2d_patch(crank, NULL, cdims, clo, chi, &cilo, &cihi,
2102 &cjlo, &cjhi, &cdim1, &cdim2, &cipos, &cjpos, &cvpos, -1);
2103
2104
2105 /* check if patch indices and dims match */
2106 if (*transa == 'n' || *transa == 'N'){
2107 if (ailo <= 0 || aihi > adim1 || ajlo <= 0 || ajhi > adim2)
2108 pnga_error(" g_a indices out of range ", g_a);
2109 }else
2110 if (ailo <= 0 || aihi > adim2 || ajlo <= 0 || ajhi > adim1)
2111 pnga_error(" g_a indices out of range ", g_a);
2112
2113 if (*transb == 'n' || *transb == 'N'){
2114 if (bilo <= 0 || bihi > bdim1 || bjlo <= 0 || bjhi > bdim2)
2115 pnga_error(" g_b indices out of range ", g_b);
2116 }else
2117 if (bilo <= 0 || bihi > bdim2 || bjlo <= 0 || bjhi > bdim1)
2118 pnga_error(" g_b indices out of range ", g_b);
2119
2120
2121 if (cilo <= 0 || cihi > cdim1 || cjlo <= 0 || cjhi > cdim2)
2122 pnga_error(" g_c indices out of range ", g_c);
2123
2124 /* verify if patch dimensions are consistent */
2125 #define RESET() do { \
2126 m = aihi - ailo +1; \
2127 k = ajhi - ajlo +1; \
2128 k2= bihi - bilo +1; \
2129 n = bjhi - bjlo +1; \
2130 cm= cihi - cilo +1; \
2131 cn= cjhi - cjlo +1; \
2132 } while (0)
2133 RESET();
2134 #define SHIFT(L,INC) do { \
2135 L##ipos+=INC; \
2136 L##jpos+=INC; \
2137 L##ilo = L##lo[L##ipos]; \
2138 L##ihi = L##hi[L##ipos]; \
2139 L##jlo = L##lo[L##jpos]; \
2140 L##jhi = L##hi[L##jpos]; \
2141 L##dim1 = L##dims[L##ipos]; \
2142 L##dim2 = L##dims[L##jpos]; \
2143 RESET(); \
2144 } while (0)
2145 /* gai_setup_2d_patch may produce ambiguous vectors */
2146 if (!(m==cm && k==k2 && n==cn)) {
2147 /* patches don't agree */
2148 if (avpos>=0 && bvpos<0 && cvpos<0) {
2149 /* only A is an ambiguous vector */
2150 SHIFT(a,-1);
2151 }
2152 else if (avpos<0 && bvpos>=0 && cvpos<0) {
2153 /* only B is an ambiguous vector */
2154 SHIFT(b,-1);
2155 }
2156 else if (avpos<0 && bvpos<0 && cvpos>=0) {
2157 /* only C is an ambiguous vector */
2158 SHIFT(c,-1);
2159 }
2160 else if (avpos>=0 && bvpos>=0 && cvpos<0) {
2161 /* A and B are ambiguous vectors */
2162 if (m != cm) {
2163 SHIFT(a,-1);
2164 }
2165 if (n != cn) {
2166 SHIFT(b,-1);
2167 }
2168 }
2169 else if (avpos>=0 && bvpos<0 && cvpos>=0) {
2170 /* A and C are ambiguous vectors */
2171 if (k != k2) {
2172 SHIFT(a,-1);
2173 }
2174 if (n != cn) {
2175 SHIFT(c,-1);
2176 }
2177 }
2178 else if (avpos<0 && bvpos>=0 && cvpos>=0) {
2179 /* B and C are ambiguous vectors */
2180 if (k != k2) {
2181 SHIFT(b,-1);
2182 }
2183 if (m != cm) {
2184 SHIFT(c,-1);
2185 }
2186 }
2187 else if (avpos>=0 && bvpos>=0 && cvpos>=0) {
2188 /* A and B and C are ambiguous vectors */
2189 pnga_error("a and b and c ambiguous", 1);
2190 }
2191 }
2192 if( (cihi - cilo +1) != m) pnga_error(" a & c dims error",m);
2193 if( (cjhi - cjlo +1) != n) pnga_error(" b & c dims error",n);
2194 if( (bihi - bilo +1) != k) pnga_error(" a & b dims error",k);
2195
2196 chunk_cube = (k*(double)(m*n)) / (min_tasks * nproc);
2197 max_chunk = (Integer)pow(chunk_cube, (DoublePrecision)(1.0/3.0) );
2198 if (max_chunk < 32) max_chunk = 32;
2199
2200 #ifdef STATBUF
2201 if(atype == C_DBL || atype == C_FLOAT){
2202 Ichunk=D_CHUNK, Kchunk=D_CHUNK, Jchunk=D_CHUNK;
2203 }else{
2204 Ichunk=ICHUNK; Kchunk=KCHUNK; Jchunk=JCHUNK;
2205 }
2206 #else
2207 {
2208 Integer elems, factor = sizeof(DoubleComplex)/GAsizeofM(atype);
2209 Ichunk = Jchunk = Kchunk = CHUNK_SIZE;
2210
2211 if ( max_chunk > Ichunk) {
2212 /*if memory if very limited, performance degrades for large matrices
2213 as chunk size is very small, which leads to communication overhead)*/
2214 Integer avail = pnga_memory_avail_type(atype);
2215 pnga_gop(pnga_type_f2c(MT_F_INT), &avail, (Integer)1, "min");
2216 if(avail<MINMEM && pnga_nodeid()==0) pnga_error("Not enough memory",avail);
2217 elems = (Integer)(avail*0.9);/* Donot use every last drop */
2218
2219 max_chunk=GA_MIN(max_chunk, (Integer)(sqrt( (double)((elems-EXTRA)/3))));
2220 Ichunk = GA_MIN(m,max_chunk);
2221 Jchunk = GA_MIN(n,max_chunk);
2222 Kchunk = GA_MIN(k,max_chunk);
2223 }
2224 else /* "EXTRA" elems for safety - just in case */
2225 elems = 3*Ichunk*Jchunk + EXTRA*factor;
2226
2227 a = (DoubleComplex*) ga_malloc(elems, atype, "GA mulmat bufs");
2228 b = a + (Ichunk*Kchunk)/factor + 1;
2229 c = b + (Kchunk*Jchunk)/factor + 1;
2230 }
2231 #endif
2232
2233 if(atype==C_DCPL){if((((DoubleComplex*)beta)->real == 0) &&
2234 (((DoubleComplex*)beta)->imag ==0)) need_scaling =0;}
2235 else if(atype==C_SCPL){if((((SingleComplex*)beta)->real == 0) &&
2236 (((SingleComplex*)beta)->imag ==0)) need_scaling =0;}
2237 else if(atype==C_DBL){if(*(DoublePrecision *)beta == 0)need_scaling =0;}
2238 else if( *(float*)beta ==0) need_scaling =0;
2239
2240 if(need_scaling) pnga_scale_patch(g_c, clo, chi, beta);
2241 else pnga_fill_patch(g_c, clo, chi, beta);
2242
2243 for(jlo = 0; jlo < n; jlo += Jchunk){ /* loop through columns of g_c patch */
2244 jhi = GA_MIN(n-1, jlo+Jchunk-1);
2245 jdim= jhi - jlo +1;
2246
2247 for(klo = 0; klo < k; klo += Kchunk){ /* loop cols of g_a patch */
2248 khi = GA_MIN(k-1, klo+Kchunk-1); /* loop rows of g_b patch */
2249 kdim= khi - klo +1;
2250
2251 get_new_B = TRUE;
2252
2253 for(ilo = 0; ilo < m; ilo += Ichunk){ /*loop through rows of g_c patch */
2254
2255 if(ijk%nproc == iproc){
2256 ihi = GA_MIN(m-1, ilo+Ichunk-1);
2257 idim= cdim = ihi - ilo +1;
2258
2259 if(atype == C_FLOAT)
2260 for (i = 0; i < idim*jdim; i++) *(((float*)c)+i)=0;
2261 else if(atype == C_DBL)
2262 for (i = 0; i < idim*jdim; i++) *(((double*)c)+i)=0;
2263 else if(atype == C_SCPL)
2264 for (i = 0; i < idim*jdim; i++){
2265 ((SingleComplex*)c)[i].real=0;
2266 ((SingleComplex*)c)[i].imag=0;
2267 }
2268 else
2269 for (i = 0; i < idim*jdim; i++){ c[i].real=0;c[i].imag=0;}
2270
2271 if (*transa == 'n' || *transa == 'N'){
2272 adim = idim;
2273 i0= ailo+ilo; i1= ailo+ihi;
2274 j0= ajlo+klo; j1= ajlo+khi;
2275 }else{
2276 adim = kdim;
2277 i0= ajlo+klo; i1= ajlo+khi;
2278 j0= ailo+ilo; j1= ailo+ihi;
2279 }
2280
2281 /* ga_get_(g_a, &i0, &i1, &j0, &j1, a, &adim); */
2282 memcpy(tmplo,alo,arank*sizeof(Integer));
2283 memcpy(tmphi,ahi,arank*sizeof(Integer));
2284 SETINT(tmpld,1,arank-1);
2285 tmplo[aipos]=i0; tmphi[aipos]=i1;
2286 tmplo[ajpos]=j0; tmphi[ajpos]=j1;
2287 tmpld[aipos]=i1-i0+1;
2288 tmpld[ajpos]=j1-j0+1;
2289 pnga_get(g_a,tmplo,tmphi,a,tmpld);
2290
2291 if(get_new_B) {
2292 if (*transb == 'n' || *transb == 'N'){
2293 bdim = kdim;
2294 i0= bilo+klo; i1= bilo+khi;
2295 j0= bjlo+jlo; j1= bjlo+jhi;
2296 }else{
2297 bdim = jdim;
2298 i0= bjlo+jlo; i1= bjlo+jhi;
2299 j0= bilo+klo; j1= bilo+khi;
2300 }
2301 /* ga_get_(g_b, &i0, &i1, &j0, &j1, b, &bdim); */
2302 memcpy(tmplo,blo,brank*sizeof(Integer));
2303 memcpy(tmphi,bhi,brank*sizeof(Integer));
2304 SETINT(tmpld,1,brank-1);
2305 tmplo[bipos]=i0; tmphi[bipos]=i1;
2306 tmplo[bjpos]=j0; tmphi[bjpos]=j1;
2307 tmpld[bipos]=i1-i0+1;
2308 tmpld[bjpos]=j1-j0+1;
2309 pnga_get(g_b,tmplo,tmphi,b,tmpld);
2310 get_new_B = FALSE;
2311 }
2312
2313 idim_t=idim; jdim_t=jdim; kdim_t=kdim;
2314 adim_t=adim; bdim_t=bdim; cdim_t=cdim;
2315
2316 switch(atype) {
2317 case C_FLOAT:
2318 BLAS_SGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
2319 (Real *)alpha, (Real *)a, &adim_t,
2320 (Real *)b, &bdim_t, (Real *)&ONE_CF,
2321 (Real *)c, &cdim_t);
2322 break;
2323 case C_DBL:
2324 BLAS_DGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
2325 (DoublePrecision *)alpha, (DoublePrecision *)a, &adim_t,
2326 (DoublePrecision *)b, &bdim_t, (DoublePrecision *)&ONE,
2327 (DoublePrecision *)c, &cdim_t);
2328 break;
2329 case C_DCPL:
2330 BLAS_ZGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
2331 (DoubleComplex *)alpha, (DoubleComplex *)a, &adim_t,
2332 (DoubleComplex *)b, &bdim_t, (DoubleComplex *)&ONE,
2333 (DoubleComplex *)c, &cdim_t);
2334 break;
2335 case C_SCPL:
2336 BLAS_CGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
2337 (SingleComplex *)alpha, (SingleComplex *)a, &adim_t,
2338 (SingleComplex *)b, &bdim_t, (SingleComplex *)&ONE_CF,
2339 (SingleComplex *)c, &cdim_t);
2340 break;
2341 default:
2342 pnga_error("ga_matmul_patch: wrong data type", atype);
2343 }
2344
2345 i0= cilo+ilo; i1= cilo+ihi; j0= cjlo+jlo; j1= cjlo+jhi;
2346 /* ga_acc_(g_c, &i0, &i1, &j0, &j1, (DoublePrecision*)c,
2347 &cdim, (DoublePrecision*)&ONE); */
2348 memcpy(tmplo,clo,crank*sizeof(Integer));
2349 memcpy(tmphi,chi,crank*sizeof(Integer));
2350 SETINT(tmpld,1,crank-1);
2351 tmplo[cipos]=i0; tmphi[cipos]=i1;
2352 tmplo[cjpos]=j0; tmphi[cjpos]=j1;
2353 tmpld[cipos]=i1-i0+1;
2354 tmpld[cjpos]=j1-j0+1;
2355 if(atype == C_FLOAT || atype == C_SCPL)
2356 pnga_acc(g_c,tmplo,tmphi,(float *)c,tmpld, &ONE_CF);
2357 else
2358 pnga_acc(g_c,tmplo,tmphi,c,tmpld,(DoublePrecision*)&ONE);
2359 }
2360 ++ijk;
2361 }
2362 }
2363 }
2364
2365 #ifndef STATBUF
2366 ga_free(a);
2367 #endif
2368
2369 if(local_sync_end)pnga_sync();
2370 }
2371
2372 /*\ MATRIX MULTIPLICATION for 2d patches of multi-dimensional arrays
2373 *
2374 * C[lo:hi,lo:hi] = alpha*op(A)[lo:hi,lo:hi] * op(B)[lo:hi,lo:hi]
2375 * + beta *C[lo:hi,lo:hi]
2376 *
2377 * where:
2378 * op(A) = A or A' depending on the transpose flag
2379 * [lo:hi,lo:hi] - patch indices _after_ op() operator was applied
2380 *
2381 * In this interface all patch indices refer to the post-transpose block. For
2382 * example, if the transpose of the block [1:4,2:5] is request, the indices
2383 * [2:5,1:4] should be used for lo and hi. Note that this interface can not
2384 * handle a 1D slice from an array containing more than 2 dimensions.
2385 *
2386 *
2387 \*/
2388 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2389 # pragma weak wnga_matmul_patch = pnga_matmul_patch
2390 #endif
pnga_matmul_patch(char * transa,char * transb,void * alpha,void * beta,Integer g_a,Integer alo[],Integer ahi[],Integer g_b,Integer blo[],Integer bhi[],Integer g_c,Integer clo[],Integer chi[])2391 void pnga_matmul_patch(char *transa, char *transb, void *alpha, void *beta,
2392 Integer g_a, Integer alo[], Integer ahi[],
2393 Integer g_b, Integer blo[], Integer bhi[],
2394 Integer g_c, Integer clo[], Integer chi[])
2395 {
2396 gai_matmul_patch(transa,transb,alpha,beta,g_a,alo,ahi,-1,
2397 g_b,blo,bhi,-1,g_c,clo,chi);
2398 }
2399
2400 /*\ MATRIX MULTIPLICATION for 2d patches of multi-dimensional arrays
2401 *
2402 * C[lo:hi,lo:hi] = alpha*op(A)[lo:hi,lo:hi] * op(B)[lo:hi,lo:hi]
2403 * + beta *C[lo:hi,lo:hi]
2404 *
2405 * where:
2406 * op(A) = A or A' depending on the transpose flag
2407 * [lo:hi,lo:hi] - patch indices _after_ op() operator was applied
2408 *
2409 * This interface fixes a bug in the original interface that cannot handle a 1D
2410 * slice from an array that is over 3D if the transpose option is used. For
2411 * this interface, all patch indices refer to the pre-transpose block, if the
2412 * transpose option is used.
2413 *
2414 \*/
2415 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2416 # pragma weak wnga_matmul_patch_alt = pnga_matmul_patch_alt
2417 #endif
pnga_matmul_patch_alt(char * transa,char * transb,void * alpha,void * beta,Integer g_a,Integer alo[],Integer ahi[],Integer g_b,Integer blo[],Integer bhi[],Integer g_c,Integer clo[],Integer chi[])2418 void pnga_matmul_patch_alt(char *transa, char *transb, void *alpha, void *beta,
2419 Integer g_a, Integer alo[], Integer ahi[],
2420 Integer g_b, Integer blo[], Integer bhi[],
2421 Integer g_c, Integer clo[], Integer chi[])
2422 {
2423 int aivec, bivec;
2424 int e, d, ipos, jpos;
2425 Integer atype, btype, arank, brank, lo, hi;
2426 Integer adims[GA_MAX_DIM],bdims[GA_MAX_DIM];
2427 /* if the patch is transposed, find the two indices that correspond to the
2428 * non-unit dimensions */
2429 if (*transa == 't' || *transa == 'T') {
2430 pnga_inquire(g_a, &atype, &arank, adims);
2431 aivec = -1;
2432 if (arank > 2) {
2433 ipos = -1;
2434 jpos = -1;
2435 e = 0;
2436 /* find out how many patch dimensions are greater than 1*/
2437 for (d=0; d<arank; d++) {
2438 if (ahi[d]-alo[d] > 0 && ipos == -1) {
2439 ipos = d;
2440 e++;
2441 } else if (ahi[d]-alo[d] > 0 && jpos == -1) {
2442 jpos = d;
2443 e++;
2444 } else if (ahi[d]-alo[d] > 0) {
2445 pnga_error("Patch A has more than 2 dimensions",0);
2446 }
2447 }
2448 if (e == 0) {
2449 aivec = -1;
2450 } else if (e == 1) {
2451 /* array is a vector */
2452 aivec = ipos;
2453 if (ipos < arank-1) {
2454 jpos = ipos + 1;
2455 } else {
2456 jpos = ipos;
2457 ipos--;
2458 }
2459 } else {
2460 aivec = -1;
2461 }
2462 }
2463 lo = alo[0];
2464 hi = ahi[0];
2465 alo[0] = alo[1];
2466 ahi[0] = ahi[1];
2467 alo[1] = lo;
2468 ahi[1] = hi;
2469 }
2470 if (*transb == 't' || *transb == 'T') {
2471 pnga_inquire(g_b, &btype, &brank, bdims);
2472 bivec = -1;
2473 if (brank > 2) {
2474 ipos = -1;
2475 jpos = -1;
2476 e = 0;
2477 /* find out how many patch dimensions are greater than 1*/
2478 for (d=0; d<brank; d++) {
2479 if (bhi[d]-blo[d] > 0 && ipos == -1) {
2480 ipos = d;
2481 e++;
2482 } else if (bhi[d]-blo[d] > 0 && jpos == -1) {
2483 jpos = d;
2484 e++;
2485 } else if (bhi[d]-blo[d] > 0) {
2486 pnga_error("Patch A has more than 2 dimensions",0);
2487 }
2488 }
2489 if (e == 0) {
2490 bivec = -1;
2491 } else if (e == 1) {
2492 /* array is a vector */
2493 bivec = ipos;
2494 if (ipos < arank-1) {
2495 jpos = ipos + 1;
2496 } else {
2497 jpos = ipos;
2498 ipos--;
2499 }
2500 } else {
2501 bivec = -1;
2502 }
2503 lo = blo[ipos];
2504 hi = bhi[ipos];
2505 blo[ipos] = blo[jpos];
2506 bhi[ipos] = bhi[jpos];
2507 blo[jpos] = lo;
2508 bhi[jpos] = hi;
2509 } else {
2510 lo = blo[0];
2511 hi = bhi[0];
2512 blo[0] = blo[1];
2513 bhi[0] = bhi[1];
2514 blo[1] = lo;
2515 bhi[1] = hi;
2516 }
2517 }
2518
2519
2520 gai_matmul_patch(transa,transb,alpha,beta,g_a,alo,ahi,aivec,
2521 g_b,blo,bhi,bivec,g_c,clo,chi);
2522 }
2523
2524
2525 /**
2526 * 1. remove STATBUF
2527 * 2.
2528 */
printBlock(char * banner,Integer type,void * ptr,Integer lo[],Integer hi[],Integer ld[])2529 void printBlock(char * banner, Integer type, void *ptr, Integer lo[],
2530 Integer hi[], Integer ld[])
2531 {
2532 Integer i,j;
2533 Integer offset;
2534 printf("p[%d] %s lo[0]: %d hi[0]: %d lo[1]: %d hi[1]: %d\n",
2535 (int)pnga_nodeid(),banner,(int)lo[0],(int)hi[0],(int)lo[1],(int)hi[1]);
2536 printf(" ");
2537 for (i=lo[0]; i<=hi[0]; i++) printf(" %12d",(int)i);
2538 printf("\n");
2539 for (j=lo[1]; j<=hi[1]; j++) {
2540 printf("J: %d",(int)j);
2541 for (i=lo[0]; i<=hi[0]; i++) {
2542 offset = (j-lo[1])*ld[0] + i-lo[0];
2543 switch (type) {
2544 case C_FLOAT:
2545 printf(" %12.4f",*((float*)ptr+offset));
2546 break;
2547 case C_DBL:
2548 printf(" %12.4f",*((double*)ptr+offset));
2549 break;
2550 case C_DCPL:
2551 printf(" [%12.4f:%12.4f]",*((double*)ptr+2*offset),
2552 *((double*)ptr+2*offset+1));
2553 break;
2554 case C_SCPL:
2555 printf(" [%12.4f:%12.4f]",*((float*)ptr+2*offset),
2556 *((float*)ptr+2*offset+1));
2557 break;
2558 default:
2559 pnga_error("ga_matmul_basic: wrong data type", type);
2560 }
2561 }
2562 printf("\n");
2563 }
2564 printf("\n\n");
2565 }
2566 /**
2567 * This is a routine that is designed to work for all layouts but may not be
2568 * high performing.
2569 */
2570 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2571 # pragma weak wnga_matmul_basic = pnga_matmul_basic
2572 #endif
pnga_matmul_basic(char * transa,char * transb,void * alpha,void * beta,Integer g_a,Integer alo[],Integer ahi[],Integer g_b,Integer blo[],Integer bhi[],Integer g_c,Integer clo[],Integer chi[])2573 void pnga_matmul_basic(char *transa, char *transb, void *alpha, void *beta,
2574 Integer g_a, Integer alo[], Integer ahi[],
2575 Integer g_b, Integer blo[], Integer bhi[],
2576 Integer g_c, Integer clo[], Integer chi[])
2577 {
2578 Integer atype, btype, ctype;
2579 Integer adims[GA_MAX_DIM],bdims[GA_MAX_DIM],cdims[GA_MAX_DIM],tmpld[GA_MAX_DIM];
2580 Integer n, m, k, nb, adim, bdim, cdim, arank, brank, crank;
2581 Integer loC[MAXDIM], hiC[MAXDIM], lot[MAXDIM], hit[MAXDIM], lC[MAXDIM];
2582 Integer nlo, loA[MAXDIM], hiA[MAXDIM], loB[MAXDIM], hiB[MAXDIM];
2583 DoubleComplex ONE_Z;
2584 SingleComplex ONE_F;
2585 BlasInt idim_t, jdim_t, kdim_t, adim_t, bdim_t, cdim_t;
2586 char *src_ptr;
2587 _iterator_hdl hdl_c;
2588 int local_sync_begin,local_sync_end;
2589
2590 ONE_Z.real = 1.0;
2591 ONE_Z.imag = 0.0;
2592 ONE_F.real = 1.0;
2593 ONE_F.imag = 0.0;
2594
2595 local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2596 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2597 if(local_sync_begin)pnga_sync();
2598
2599 /* For the time being, punt on transposes*/
2600 if (*transa == 't' || *transa == 'T' || *transb == 't' || *transb == 'T') {
2601 pnga_error("Cannot do basic multiply with tranpose ",0);
2602 }
2603 /* For the time being, punt on mirrored arrays */
2604 if (pnga_is_mirrored(g_a) || pnga_is_mirrored(g_b) || pnga_is_mirrored(g_c)) {
2605 pnga_error("Cannot do basic multiply with mirrored arrays ",0);
2606 }
2607
2608
2609 pnga_inquire(g_a, &atype, &arank, adims);
2610 pnga_inquire(g_b, &btype, &brank, bdims);
2611 pnga_inquire(g_c, &ctype, &crank, cdims);
2612
2613 /* Can't handle dimensions other than 2 */
2614 if(arank != 2) pnga_error("rank of A must be 2 ",arank);
2615 if(brank != 2) pnga_error("rank of B must be 2 ",brank);
2616 if(crank != 2) pnga_error("rank of C must be 2 ",crank);
2617
2618 if(atype != btype || atype != ctype ) pnga_error(" types mismatch ", 0L);
2619 if(atype != C_DCPL && atype != C_DBL && atype != C_FLOAT && atype != C_SCPL)
2620 pnga_error(" type error: type not supported ",atype);
2621
2622 /* Check that patch dims are reasonable */
2623 for (n=0; n<arank; n++) {
2624 if (alo[n] <= 0 || alo[n] > adims[n] || ahi[n] <= 0 || ahi[n] > adims[n]) {
2625 pnga_error("Patch is out of bounds for A ",0);
2626 }
2627 }
2628 for (n=0; n<brank; n++) {
2629 if (blo[n] <= 0 || blo[n] > bdims[n] || bhi[n] <= 0 || bhi[n] > bdims[n]) {
2630 pnga_error("Patch is out of bounds for B ",0);
2631 }
2632 }
2633 for (n=0; n<crank; n++) {
2634 if (clo[n] <= 0 || clo[n] > cdims[n] || chi[n] <= 0 || chi[n] > cdims[n]) {
2635 pnga_error("Patch is out of bounds for C ",0);
2636 }
2637 }
2638
2639 /* Check that multiplication is feasible */
2640 for (n=0; n<arank; n++) {
2641 if (ahi[n]<alo[n]) pnga_error("No data in patch on A ",0);
2642 }
2643
2644 for (n=0; n<brank; n++) {
2645 if (bhi[n]<blo[n]) pnga_error("No data in patch on B ",0);
2646 }
2647 for (n=0; n<crank; n++) {
2648 if (chi[n]<clo[n]) pnga_error("No data in patch on C ",0);
2649 }
2650 m = ahi[0]-alo[0]+1;
2651 k = ahi[1]-alo[1]+1;
2652 n = bhi[1]-blo[1]+1;
2653 if (k != bhi[0]-blo[0]+1) {
2654 pnga_error("Inner dimensions of A and B do not match ",0);
2655 }
2656 if (m != chi[0]-clo[0]+1 || n != chi[1]-clo[1]+1) {
2657 pnga_error("Outer dimensions of A and B do not match dimensions of C ",0);
2658 }
2659
2660 /* Scale patch of C by beta */
2661 pnga_scale_patch(g_c,clo,chi,beta);
2662
2663 /* Multiplication is feasible. Start iterating over patches of C */
2664 pnga_local_iterator_init(g_c, &hdl_c);
2665 while (pnga_local_iterator_next(&hdl_c, loC, hiC, &src_ptr, lC)) {
2666 Integer num_blocks;
2667 Integer offset, elemsize, ld;
2668 void *a_buf, *b_buf, *c_buf;
2669 Integer size_a, size_b, size_c;
2670 /*
2671 printf("p[%d] loC[0]: %d hiC[0]: %d loC[1]: %d hiC[1]: %d lC[0]: %d\n",
2672 pnga_nodeid(),loC[0],hiC[0],loC[1],hiC[1],lC[0]);
2673 */
2674 /* Copy limits since patch intersect modifies loC array */
2675 for (n=0; n<crank; n++) {
2676 lot[n] = loC[n];
2677 hit[n] = hiC[n];
2678 }
2679
2680 /* check to see if this block overlaps with patch on C */
2681 if (pnga_patch_intersect(loC,hiC,lot,hit,crank)) {
2682 int istart, in;
2683
2684 /* Calculating number of blocks for inner dimension. Assume that
2685 * number of rows in block returned by iterator is a good size */
2686 num_blocks = (ahi[1]-alo[1]+1)/(hit[0]-lot[0]+1);
2687 if (num_blocks*(hit[0]-lot[0]+1) < ahi[1]-alo[1]+1) num_blocks++;
2688
2689 /* Calculate offset from src_ptr to beginning of lot */
2690 offset = lC[0]*(lot[1]-loC[1])+lot[0]-loC[0];
2691 elemsize = GAsizeofM(atype);
2692 c_buf = (void*)((char*)src_ptr + elemsize*offset);
2693 a_buf = (void*)malloc((hit[0]-lot[0]+1)*(hit[1]-lot[1]+1)*elemsize);
2694 b_buf = (void*)malloc((hit[0]-lot[0]+1)*(hit[1]-lot[1]+1)*elemsize);
2695 size_c = (hit[0]-lot[0]+1)*(hit[1]-lot[1]+1);
2696
2697 /* calculate starting block index */
2698 istart = (lot[0]-clo[0])/(hit[0]-lot[0]+1);
2699
2700 /* loop over block pairs */
2701 for (nb=0; nb<num_blocks; nb++) {
2702 /*
2703 printf("p[%d] loC[0]: %d hiC[0]: %d loC[1]: %d hiC[1]: %d\n",pnga_nodeid(),
2704 lot[0],hit[0],lot[1],hit[1]);
2705 */
2706 in = istart + nb;
2707 in = in%num_blocks;
2708 nlo = alo[1]+in*(hit[0]-lot[0]+1);
2709 loA[0] = lot[0];
2710 hiA[0] = hit[0];
2711 loA[1] = nlo;
2712 hiA[1] = loA[1]+(hit[0]-lot[0]);
2713 if (hiA[1] > ahi[1]) hiA[1] = ahi[1];
2714 ld = hiA[0]-loA[0]+1;
2715 size_a = (hiA[0]-loA[0]+1)*(hiA[1]-loA[1]+1);
2716 if (size_a > size_c) {
2717 printf("p[%d] size_a: %d size_c: %d\n",(int)pnga_nodeid(),(int)size_a,(int)size_c);
2718 }
2719 /*
2720 printf("p[%d] loA[0]: %d hiA[0]: %d loA[1]: %d hiA[1]: %d\n",pnga_nodeid(),
2721 loA[0],hiA[0],loA[1],hiA[1]);
2722 */
2723 pnga_get(g_a,loA,hiA,a_buf,&ld);
2724 /*
2725 printBlock("Matrix A",atype,a_buf,loA,hiA,&ld);
2726 */
2727 loB[1] = lot[1];
2728 hiB[1] = hit[1];
2729 nlo = blo[0]+in*(hit[0]-lot[0]+1);
2730 loB[0] = nlo;
2731 hiB[0] = loB[0]+(hit[0]-lot[0]);
2732 if (hiB[0] > bhi[0]) hiB[0] = bhi[0];
2733 ld = hiB[0]-loB[0]+1;
2734 size_b = (hiB[0]-loB[0]+1)*(hiB[1]-loB[1]+1);
2735 if (size_b > size_c) {
2736 printf("p[%d] size_b: %d size_c: %d\n",(int)pnga_nodeid(),
2737 (int)size_b,(int)size_c);
2738 }
2739 /*
2740 printf("p[%d] loB[0]: %d hiB[0]: %d loB[1]: %d hiB[1]: %d\n",pnga_nodeid(),
2741 loB[0],hiB[0],loB[1],hiB[1]);
2742 */
2743 pnga_get(g_b,loB,hiB,b_buf,&ld);
2744 /*
2745 printBlock("Matrix B",btype,b_buf,loB,hiB,&ld);
2746 */
2747
2748 idim_t=(BlasInt)(hiA[0]-loA[0]+1);
2749 kdim_t=(BlasInt)(hiA[1]-loA[1]+1);
2750 jdim_t=(BlasInt)(hiB[1]-loB[1]+1);
2751 adim_t=hiA[0]-loA[0]+1;
2752 bdim_t=hiB[0]-loB[0]+1;
2753 cdim_t=lC[0];
2754
2755 /*
2756 printf("p[%d] idim: %d jdim: %d kdim: %d adim: %d bdim: %d cdim: %d\n",
2757 pnga_nodeid(),idim_t,jdim_t,kdim_t,adim_t,bdim_t,cdim_t);
2758 */
2759 switch(atype) {
2760 case C_FLOAT:
2761 BLAS_SGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
2762 (Real *)alpha, (Real *)a_buf, &adim_t,
2763 (Real *)b_buf, &bdim_t, (Real *)&ONE_F,
2764 (Real *)c_buf, &cdim_t);
2765 break;
2766 case C_DBL:
2767 BLAS_DGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
2768 (DoublePrecision *)alpha, (DoublePrecision *)a_buf, &adim_t,
2769 (DoublePrecision *)b_buf, &bdim_t, (DoublePrecision *)&ONE_Z,
2770 (DoublePrecision *)c_buf, &cdim_t);
2771 break;
2772 case C_DCPL:
2773 BLAS_ZGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
2774 (DoubleComplex *)alpha, (DoubleComplex *)a_buf, &adim_t,
2775 (DoubleComplex *)b_buf, &bdim_t, (DoubleComplex *)&ONE_Z,
2776 (DoubleComplex *)c_buf, &cdim_t);
2777 break;
2778 case C_SCPL:
2779 BLAS_CGEMM(transa, transb, &idim_t, &jdim_t, &kdim_t,
2780 (SingleComplex *)alpha, (SingleComplex *)a_buf, &adim_t,
2781 (SingleComplex *)b_buf, &bdim_t, (SingleComplex *)&ONE_F,
2782 (SingleComplex *)c_buf, &cdim_t);
2783 break;
2784 default:
2785 pnga_error("ga_matmul_basic: wrong data type", atype);
2786 }
2787 }
2788
2789 /* multiplication is done, free buffers */
2790 free(a_buf);
2791 free(b_buf);
2792 }
2793 }
2794 if(local_sync_end)pnga_sync();
2795 }
2796