1 #if HAVE_CONFIG_H
2 #   include "config.h"
3 #endif
4 
5 /*
6  * module: global.npatch.c
7  * author: Jialin Ju
8  * description: Implements the n-dimensional patch operations:
9  *              - fill patch
10  *              - copy patch
11  *              - scale patch
12  *              - dot patch
13  *              - add patch
14  *
15  * DISCLAIMER
16  *
17  * This material was prepared as an account of work sponsored by an
18  * agency of the United States Government.  Neither the United States
19  * Government nor the United States Department of Energy, nor Battelle,
20  * nor any of their employees, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
21  * ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
22  * COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
23  * SOFTWARE, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT
24  * INFRINGE PRIVATELY OWNED RIGHTS.
25  *
26  *
27  * ACKNOWLEDGMENT
28  *
29  * This software and its documentation were produced with United States
30  * Government support under Contract Number DE-AC06-76RLO-1830 awarded by
31  * the United States Department of Energy.  The United States Government
32  * retains a paid-up non-exclusive, irrevocable worldwide license to
33  * reproduce, prepare derivative works, perform publicly and display
34  * publicly by or for the US Government, including the right to
35  * distribute to other US Government contractors.
36  */
37 
38 #if HAVE_MATH_H
39 #   include <math.h>
40 #endif
41 
42 #include "message.h"
43 #include "globalp.h"
44 #include "armci.h"
45 #include "ga_iterator.h"
46 #include "ga-papi.h"
47 #include "ga-wapi.h"
48 
49 #ifdef MSG_COMMS_MPI
50 extern ARMCI_Group* ga_get_armci_group_(int);
51 #endif
52 
53 /**********************************************************
54  *  n-dimensional utilities                               *
55  **********************************************************/
56 
57 /*\ compute Index from subscript and convert it back to subscript
58  *  in another array
59 \*/
snga_dest_indices(Integer ndims,Integer * los,Integer * blos,Integer * dimss,Integer ndimd,Integer * lod,Integer * blod,Integer * dimsd)60 static void snga_dest_indices(Integer ndims, Integer *los, Integer *blos, Integer *dimss,
61                Integer ndimd, Integer *lod, Integer *blod, Integer *dimsd)
62 {
63     Integer idx = 0, i, factor=1;
64 
65     for(i=0;i<ndims;i++) {
66         idx += (los[i] - blos[i])*factor;
67         factor *= dimss[i];
68     }
69 
70     for(i=0;i<ndims;i++) {
71         lod[i] = idx % dimsd[i] + blod[i];
72         idx /= dimsd[i];
73     }
74 }
75 
76 
77 /* check if I own data in the patch */
pnga_patch_intersect(Integer * lo,Integer * hi,Integer * lop,Integer * hip,Integer ndim)78 logical pnga_patch_intersect(Integer *lo, Integer *hi,
79                         Integer *lop, Integer *hip, Integer ndim)
80 {
81     Integer i;
82 
83     /* check consistency of patch coordinates */
84     for(i=0; i<ndim; i++) {
85         if(hi[i] < lo[i]) return FALSE; /* inconsistent */
86         if(hip[i] < lop[i]) return FALSE; /* inconsistent */
87     }
88 
89     /* find the intersection and update (ilop: ihip, jlop: jhip) */
90     for(i=0; i<ndim; i++) {
91         if(hi[i] < lop[i]) return FALSE; /* don't intersect */
92         if(hip[i] < lo[i]) return FALSE; /* don't intersect */
93     }
94 
95     for(i=0; i<ndim; i++) {
96         lop[i] = GA_MAX(lo[i], lop[i]);
97         hip[i] = GA_MIN(hi[i], hip[i]);
98     }
99 
100     return TRUE;
101 }
102 
103 
104 /*\ check if patches are identical
105 \*/
pnga_comp_patch(Integer andim,Integer * alo,Integer * ahi,Integer bndim,Integer * blo,Integer * bhi)106 logical pnga_comp_patch(Integer andim, Integer *alo, Integer *ahi,
107                           Integer bndim, Integer *blo, Integer *bhi)
108 {
109     Integer i;
110     Integer ndim;
111 
112     if(andim > bndim) {
113         ndim = bndim;
114         for(i=ndim; i<andim; i++)
115             if(alo[i] != ahi[i]) return FALSE;
116     }
117     else if(andim < bndim) {
118         ndim = andim;
119         for(i=ndim; i<bndim; i++)
120             if(blo[i] != bhi[i]) return FALSE;
121     }
122     else ndim = andim;
123 
124     for(i=0; i<ndim; i++)
125         if((alo[i] != blo[i]) || (ahi[i] != bhi[i])) return FALSE;
126 
127     return TRUE;
128 }
129 
130 
131 /* test two GAs to see if they have the same shape */
snga_test_shape(Integer * alo,Integer * ahi,Integer * blo,Integer * bhi,Integer andim,Integer bndim)132 static logical snga_test_shape(Integer *alo, Integer *ahi, Integer *blo,
133                           Integer *bhi, Integer andim, Integer bndim)
134 {
135     Integer i;
136 
137     if(andim != bndim) return FALSE;
138 
139     for(i=0; i<andim; i++)
140         if((ahi[i] - alo[i]) != (bhi[i] - blo[i])) return FALSE;
141 
142     return TRUE;
143 }
144 
145 
146 /**********************************************************
147  *  n-dimensional functions                               *
148  **********************************************************/
149 
150 
151 /*\ COPY A PATCH AND POSSIBLY RESHAPE
152  *
153  *  . the element capacities of two patches must be identical
154  *  . copy by column order - Fortran convention
155 \*/
156 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
157 #   pragma weak wnga_copy_patch = pnga_copy_patch
158 #endif
pnga_copy_patch(char * trans,Integer g_a,Integer * alo,Integer * ahi,Integer g_b,Integer * blo,Integer * bhi)159 void pnga_copy_patch(char *trans,
160                     Integer g_a, Integer *alo, Integer *ahi,
161                     Integer g_b, Integer *blo, Integer *bhi)
162 {
163   Integer i, j;
164   Integer idx, factor;
165   Integer atype, btype, andim, adims[MAXDIM], bndim, bdims[MAXDIM];
166   Integer nelem;
167   Integer atotal, btotal;
168   Integer los[MAXDIM], his[MAXDIM];
169   Integer lod[MAXDIM], hid[MAXDIM];
170   Integer ld[MAXDIM], ald[MAXDIM], bld[MAXDIM];
171   char *src_data_ptr, *tmp_ptr;
172   Integer *src_idx_ptr, *dst_idx_ptr;
173   Integer bvalue[MAXDIM], bunit[MAXDIM];
174   Integer factor_idx1[MAXDIM], factor_idx2[MAXDIM], factor_data[MAXDIM];
175   Integer base;
176   Integer me_a, me_b;
177   Integer a_grp, b_grp, anproc, bnproc;
178   Integer num_blocks_a, num_blocks_b/*, chk*/;
179   int use_put, has_intersection;
180   int local_sync_begin,local_sync_end;
181 
182   local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
183   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
184   a_grp = pnga_get_pgroup(g_a);
185   b_grp = pnga_get_pgroup(g_b);
186   me_a = pnga_pgroup_nodeid(a_grp);
187   me_b = pnga_pgroup_nodeid(b_grp);
188   anproc = pnga_get_pgroup_size(a_grp);
189   bnproc = pnga_get_pgroup_size(b_grp);
190   if (anproc <= bnproc) {
191     use_put = 1;
192   }  else {
193     use_put = 0;
194   }
195 
196   /*if (a_grp != b_grp)
197     pnga_error("All matrices must be on same group for pnga_copy_patch", 0L); */
198   if(local_sync_begin) {
199     if (anproc <= bnproc) {
200       pnga_pgroup_sync(a_grp);
201     } else if (a_grp == pnga_pgroup_get_world() &&
202         b_grp == pnga_pgroup_get_world()) {
203       pnga_sync();
204     } else {
205       pnga_pgroup_sync(b_grp);
206     }
207   }
208 
209 
210   pnga_inquire(g_a, &atype, &andim, adims);
211   pnga_inquire(g_b, &btype, &bndim, bdims);
212 
213   if(g_a == g_b) {
214     /* they are the same patch */
215     if(pnga_comp_patch(andim, alo, ahi, bndim, blo, bhi)) {
216         return;
217     /* they are in the same GA, but not the same patch */
218     } else if (pnga_patch_intersect(alo, ahi, blo, bhi, andim)) {
219       pnga_error("array patches cannot overlap ", 0L);
220     }
221   }
222 
223   if(atype != btype ) pnga_error("array type mismatch ", 0L);
224 
225   /* check if patch indices and dims match */
226   for(i=0; i<andim; i++)
227     if(alo[i] <= 0 || ahi[i] > adims[i])
228       pnga_error("g_a indices out of range ", 0L);
229   for(i=0; i<bndim; i++)
230     if(blo[i] <= 0 || bhi[i] > bdims[i])
231       pnga_error("g_b indices out of range ", 0L);
232 
233   /* check if numbers of elements in two patches match each other */
234   atotal = 1; btotal = 1;
235   for(i=0; i<andim; i++) atotal *= (ahi[i] - alo[i] + 1);
236   for(i=0; i<bndim; i++) btotal *= (bhi[i] - blo[i] + 1);
237   if(atotal != btotal)
238     pnga_error("capacities two of patches do not match ", 0L);
239 
240   /* additional restrictions that apply if one or both arrays use
241      block-cyclic data distributions */
242   num_blocks_a = pnga_total_blocks(g_a);
243   num_blocks_b = pnga_total_blocks(g_b);
244   if (num_blocks_a >= 0 || num_blocks_b >= 0) {
245     if (!(*trans == 'n' || *trans == 'N')) {
246       pnga_error("Transpose option not supported for block-cyclic data", 0L);
247     }
248     if (!snga_test_shape(alo, ahi, blo, bhi, andim, bndim)) {
249       pnga_error("Change in shape not supported for block-cyclic data", 0L);
250     }
251   }
252 
253   if (num_blocks_a < 0 && num_blocks_b <0) {
254     /* now find out cordinates of a patch of g_a that I own */
255     if (use_put) {
256       pnga_distribution(g_a, me_a, los, his);
257     } else {
258       pnga_distribution(g_b, me_b, los, his);
259     }
260 
261     /* copy my share of data */
262     if (use_put) {
263       has_intersection = pnga_patch_intersect(alo, ahi, los, his, andim);
264     } else {
265       has_intersection = pnga_patch_intersect(blo, bhi, los, his, bndim);
266     }
267     if(has_intersection){
268       if (use_put) {
269         pnga_access_ptr(g_a, los, his, &src_data_ptr, ld);
270       } else {
271         pnga_access_ptr(g_b, los, his, &src_data_ptr, ld);
272       }
273 
274       /* calculate the number of elements in the patch that I own */
275       nelem = 1; for(i=0; i<andim; i++) nelem *= (his[i] - los[i] + 1);
276 
277       for(i=0; i<andim; i++) ald[i] = ahi[i] - alo[i] + 1;
278       for(i=0; i<bndim; i++) bld[i] = bhi[i] - blo[i] + 1;
279 
280       base = 0; factor = 1;
281       for(i=0; i<andim; i++) {
282         base += los[i] * factor;
283         factor *= ld[i];
284       }
285 
286       /*** straight copy possible if there's no reshaping or transpose ***/
287       if((*trans == 'n' || *trans == 'N') &&
288           snga_test_shape(alo, ahi, blo, bhi, andim, bndim)) {
289         /* find source[lo:hi] --> destination[lo:hi] */
290         if (use_put) {
291           snga_dest_indices(andim, los, alo, ald, bndim, lod, blo, bld);
292           snga_dest_indices(andim, his, alo, ald, bndim, hid, blo, bld);
293           pnga_put(g_b, lod, hid, src_data_ptr, ld);
294           pnga_release(g_a, los, his);
295         } else {
296           snga_dest_indices(bndim, los, blo, bld, andim, lod, alo, ald);
297           snga_dest_indices(bndim, his, blo, bld, andim, hid, alo, ald);
298           pnga_get(g_a, lod, hid, src_data_ptr, ld);
299           pnga_release(g_b, los, his);
300         }
301         /*** due to generality of this transformation scatter is required ***/
302       } else{
303         if (use_put) {
304           tmp_ptr = ga_malloc(nelem, atype, "v");
305           src_idx_ptr = (Integer*) ga_malloc((andim*nelem), MT_F_INT, "si");
306           dst_idx_ptr = (Integer*) ga_malloc((bndim*nelem), MT_F_INT, "di");
307 
308           /* calculate the destination indices */
309 
310           /* given los and his, find indices for each elements
311            * bvalue: starting index in each dimension
312            * bunit: stride in each dimension
313            */
314           for (i=0; i<andim; i++) {
315             bvalue[i] = los[i];
316             if (i == 0) bunit[i] = 1;
317             else bunit[i] = bunit[i-1] * (his[i-1] - los[i-1] + 1);
318           }
319 
320           /* source indices */
321           for (i=0; i<nelem; i++) {
322             for (j=0; j<andim; j++){
323               src_idx_ptr[i*andim+j] = bvalue[j];
324               /* if the next element is the first element in
325                * one dimension, increment the index by 1
326                */
327               if (((i+1) % bunit[j]) == 0) bvalue[j]++;
328               /* if the index becomes larger than the upper
329                * bound in one dimension, reset it.
330                */
331               if(bvalue[j] > his[j]) bvalue[j] = los[j];
332             }
333           }
334 
335           /* index factor: reshaping without transpose */
336           factor_idx1[0] = 1;
337           for (j=1; j<andim; j++)
338             factor_idx1[j] = factor_idx1[j-1] * ald[j-1];
339 
340           /* index factor: reshaping with transpose */
341           factor_idx2[andim-1] = 1;
342           for (j=(andim-1)-1; j>=0; j--)
343             factor_idx2[j] = factor_idx2[j+1] * ald[j+1];
344 
345           /* data factor */
346           factor_data[0] = 1;
347           for (j=1; j<andim; j++)
348             factor_data[j] = factor_data[j-1] * ld[j-1];
349 
350           /* destination indices */
351           for(i=0; i<nelem; i++) {
352             /* linearize the n-dimensional indices to one dimension */
353             idx = 0;
354             if (*trans == 'n' || *trans == 'N')
355               for (j=0; j<andim; j++)
356                 idx += (src_idx_ptr[i*andim+j] - alo[j]) *
357                   factor_idx1[j];
358             else
359               /* if the patch needs to be transposed, reverse
360                * the indices: (i, j, ...) -> (..., j, i)
361                */
362               for (j=(andim-1); j>=0; j--)
363                 idx += (src_idx_ptr[i*andim+j] - alo[j]) *
364                   factor_idx2[j];
365 
366             /* convert the one dimensional index to n-dimensional
367              * indices of destination
368              */
369             for (j=0; j<bndim; j++) {
370               dst_idx_ptr[i*bndim+j] = idx % bld[j] + blo[j];
371               idx /= bld[j];
372             }
373 
374             /* move the data block to create a new block */
375             /* linearize the data indices */
376             idx = 0;
377             for (j=0; j<andim; j++)
378               idx += (src_idx_ptr[i*andim+j]) * factor_data[j];
379 
380             /* adjust the position
381              * base: starting address of the first element */
382             idx -= base;
383 
384             /* move the element to the temporary location */
385             switch(atype) {
386               case C_DBL: ((double*)tmp_ptr)[i] =
387                           ((double*)src_data_ptr)[idx];
388                           break;
389               case C_INT:
390                           ((int *)tmp_ptr)[i] = ((int *)src_data_ptr)[idx];
391                           break;
392               case C_DCPL:((DoubleComplex *)tmp_ptr)[i] =
393                           ((DoubleComplex *)src_data_ptr)[idx];
394                           break;
395               case C_SCPL:((SingleComplex *)tmp_ptr)[i] =
396                           ((SingleComplex *)src_data_ptr)[idx];
397                           break;
398               case C_FLOAT: ((float *)tmp_ptr)[i] =
399                             ((float *)src_data_ptr)[idx];
400                           break;
401               case C_LONG: ((long *)tmp_ptr)[i] =
402                            ((long *)src_data_ptr)[idx];
403                           break;
404               case C_LONGLONG: ((long long *)tmp_ptr)[i] =
405                            ((long long *)src_data_ptr)[idx];
406             }
407           }
408           pnga_release(g_a, los, his);
409           pnga_scatter(g_b, tmp_ptr, dst_idx_ptr, 0, nelem);
410           ga_free(dst_idx_ptr);
411           ga_free(src_idx_ptr);
412           ga_free(tmp_ptr);
413         } else {
414           tmp_ptr = ga_malloc(nelem, atype, "v");
415           src_idx_ptr = (Integer*) ga_malloc((bndim*nelem), MT_F_INT, "si");
416           dst_idx_ptr = (Integer*) ga_malloc((andim*nelem), MT_F_INT, "di");
417 
418           /* calculate the destination indices */
419 
420           /* given los and his, find indices for each elements
421            * bvalue: starting index in each dimension
422            * bunit: stride in each dimension
423            */
424           for (i=0; i<andim; i++) {
425             bvalue[i] = los[i];
426             if (i == 0) bunit[i] = 1;
427             else bunit[i] = bunit[i-1] * (his[i-1] - los[i-1] + 1);
428           }
429 
430           /* destination indices */
431           for (i=0; i<nelem; i++) {
432             for (j=0; j<bndim; j++){
433               src_idx_ptr[i*bndim+j] = bvalue[j];
434               /* if the next element is the first element in
435                * one dimension, increment the index by 1
436                */
437               if (((i+1) % bunit[j]) == 0) bvalue[j]++;
438               /* if the index becomes larger than the upper
439                * bound in one dimension, reset it.
440                */
441               if(bvalue[j] > his[j]) bvalue[j] = los[j];
442             }
443           }
444 
445           /* index factor: reshaping without transpose */
446           factor_idx1[0] = 1;
447           for (j=1; j<bndim; j++)
448             factor_idx1[j] = factor_idx1[j-1] * bld[j-1];
449 
450           /* index factor: reshaping with transpose */
451           factor_idx2[bndim-1] = 1;
452           for (j=(bndim-1)-1; j>=0; j--)
453             factor_idx2[j] = factor_idx2[j+1] * bld[j+1];
454 
455           /* data factor */
456           factor_data[0] = 1;
457           for (j=1; j<bndim; j++)
458             factor_data[j] = factor_data[j-1] * ld[j-1];
459 
460           /* destination indices */
461           for(i=0; i<nelem; i++) {
462             /* linearize the n-dimensional indices to one dimension */
463             idx = 0;
464             if (*trans == 'n' || *trans == 'N')
465               for (j=0; j<andim; j++)
466                 idx += (src_idx_ptr[i*bndim+j] - blo[j]) *
467                   factor_idx1[j];
468             else
469               /* if the patch needs to be transposed, reverse
470                * the indices: (i, j, ...) -> (..., j, i)
471                */
472               for (j=(andim-1); j>=0; j--)
473                 idx += (src_idx_ptr[i*bndim+j] - blo[j]) *
474                   factor_idx2[j];
475 
476             /* convert the one dimensional index to n-dimensional
477              * indices of destination
478              */
479             for (j=0; j<andim; j++) {
480               dst_idx_ptr[i*bndim+j] = idx % ald[j] + alo[j];
481               idx /= ald[j];
482             }
483 
484             /* move the data block to create a new block */
485             /* linearize the data indices */
486             idx = 0;
487             for (j=0; j<bndim; j++)
488               idx += (src_idx_ptr[i*bndim+j]) * factor_data[j];
489 
490             /* adjust the position
491              * base: starting address of the first element */
492             idx -= base;
493 
494             /* move the element to the temporary location */
495             switch(atype) {
496               case C_DBL: ((double*)tmp_ptr)[i] =
497                           ((double*)src_data_ptr)[idx];
498                           break;
499               case C_INT:
500                           ((int *)tmp_ptr)[i] = ((int *)src_data_ptr)[idx];
501                           break;
502               case C_DCPL:((DoubleComplex *)tmp_ptr)[i] =
503                           ((DoubleComplex *)src_data_ptr)[idx];
504                           break;
505               case C_SCPL:((SingleComplex *)tmp_ptr)[i] =
506                           ((SingleComplex *)src_data_ptr)[idx];
507                           break;
508               case C_FLOAT: ((float *)tmp_ptr)[i] =
509                             ((float *)src_data_ptr)[idx];
510                           break;
511               case C_LONG: ((long *)tmp_ptr)[i] =
512                            ((long *)src_data_ptr)[idx];
513                           break;
514               case C_LONGLONG: ((long long *)tmp_ptr)[i] =
515                            ((long long *)src_data_ptr)[idx];
516             }
517           }
518           pnga_release(g_b, los, his);
519           pnga_gather(g_a, tmp_ptr, dst_idx_ptr, 0, nelem);
520           ga_free(dst_idx_ptr);
521           ga_free(src_idx_ptr);
522           ga_free(tmp_ptr);
523         }
524       }
525     }
526   } else {
527     Integer offset, last, jtot;
528     for (i=0; i<andim; i++) {
529       ald[i] = ahi[i] - alo[i] + 1;
530     }
531     for (i=0; i<bndim; i++) {
532       bld[i] = bhi[i] - blo[i] + 1;
533     }
534     if (use_put) {
535       /* Array a is block-cyclic distributed */
536       if (num_blocks_a >= 0) {
537 #if 1
538         _iterator_hdl hdl_a;
539         pnga_local_iterator_init(g_a, &hdl_a);
540         while (pnga_local_iterator_next(&hdl_a, los, his,
541               &src_data_ptr, ld)) {
542             /* Copy limits since patch intersect modifies los array */
543             for (j=0; j < andim; j++) {
544               lod[j] = los[j];
545               hid[j] = his[j];
546             }
547             if (pnga_patch_intersect(alo,ahi,los,his,andim)) {
548               offset = 0;
549               last = andim - 1;
550               jtot = 1;
551               for (j=0; j<last; j++) {
552                 offset += (los[j]-lod[j])*jtot;
553                 jtot *= ld[j];
554               }
555               offset += (los[last]-lod[last])*jtot;
556               switch(atype) {
557                 case C_DBL:
558                   src_data_ptr = (void*)((double*)(src_data_ptr) + offset);
559                   break;
560                 case C_INT:
561                   src_data_ptr = (void*)((int*)(src_data_ptr) + offset);
562                   break;
563                 case C_DCPL:
564                   src_data_ptr = (void*)((DoubleComplex*)(src_data_ptr) + offset);
565                   break;
566                 case C_SCPL:
567                   src_data_ptr = (void*)((SingleComplex*)(src_data_ptr) + offset);
568                   break;
569                 case C_FLOAT:
570                   src_data_ptr = (void*)((float*)(src_data_ptr) + offset);
571                   break;
572                 case C_LONG:
573                   src_data_ptr = (void*)((long*)(src_data_ptr) + offset);
574                   break;
575                 case C_LONGLONG:
576                   src_data_ptr = (void*)((long long*)(src_data_ptr) + offset);
577                   break;
578                 default:
579                   break;
580               }
581               snga_dest_indices(andim, los, alo, ald, bndim, lod, blo, bld);
582               snga_dest_indices(andim, his, alo, ald, bndim, hid, blo, bld);
583               pnga_put(g_b, lod, hid, src_data_ptr, ld);
584               pnga_release_block(g_a, i);
585             }
586           }
587 #else
588         /* Uses simple block-cyclic data distribution */
589         if (!pnga_uses_proc_grid(g_a)) {
590           for (i = me_a; i < num_blocks_a; i += anproc) {
591             pnga_distribution(g_a, i, los, his);
592             /* make temporory copies of los, his since ngai_patch_intersection
593                destroys original versions */
594             for (j=0; j < andim; j++) {
595               lod[j] = los[j];
596               hid[j] = his[j];
597             }
598             if (pnga_patch_intersect(alo,ahi,los,his,andim)) {
599               pnga_access_block_ptr(g_a, i, &src_data_ptr, ld);
600               offset = 0;
601               last = andim - 1;
602               jtot = 1;
603               for (j=0; j<last; j++) {
604                 offset += (los[j]-lod[j])*jtot;
605                 jtot *= ld[j];
606               }
607               offset += (los[last]-lod[last])*jtot;
608               switch(atype) {
609                 case C_DBL:
610                   src_data_ptr = (void*)((double*)(src_data_ptr) + offset);
611                   break;
612                 case C_INT:
613                   src_data_ptr = (void*)((int*)(src_data_ptr) + offset);
614                   break;
615                 case C_DCPL:
616                   src_data_ptr = (void*)((DoubleComplex*)(src_data_ptr) + offset);
617                   break;
618                 case C_SCPL:
619                   src_data_ptr = (void*)((SingleComplex*)(src_data_ptr) + offset);
620                   break;
621                 case C_FLOAT:
622                   src_data_ptr = (void*)((float*)(src_data_ptr) + offset);
623                   break;
624                 case C_LONG:
625                   src_data_ptr = (void*)((long*)(src_data_ptr) + offset);
626                   break;
627                 case C_LONGLONG:
628                   src_data_ptr = (void*)((long long*)(src_data_ptr) + offset);
629                   break;
630                 default:
631                   break;
632               }
633               snga_dest_indices(andim, los, alo, ald, bndim, lod, blo, bld);
634               snga_dest_indices(andim, his, alo, ald, bndim, hid, blo, bld);
635               pnga_put(g_b, lod, hid, src_data_ptr, ld);
636               pnga_release_block(g_a, i);
637             }
638           }
639         } else {
640           /* Uses scalapack block-cyclic data distribution */
641           Integer proc_index[MAXDIM], index[MAXDIM];
642           Integer topology[MAXDIM];
643           Integer blocks[MAXDIM], block_dims[MAXDIM];
644           pnga_get_proc_index(g_a, me_a, proc_index);
645           pnga_get_proc_index(g_a, me_a, index);
646           pnga_get_block_info(g_a, blocks, block_dims);
647           pnga_get_proc_grid(g_a, topology);
648           while (index[andim-1] < blocks[andim-1]) {
649             /* find bounding coordinates of block */
650             /*chk = 1;*/
651             for (i = 0; i < andim; i++) {
652               los[i] = index[i]*block_dims[i]+1;
653               his[i] = (index[i] + 1)*block_dims[i];
654               if (his[i] > adims[i]) his[i] = adims[i];
655               /*if (his[i] < los[i]) chk = 0;*/
656             }
657             /* make temperary copies of los, his since ngai_patch_intersection
658                destroys original versions */
659             for (j=0; j < andim; j++) {
660               lod[j] = los[j];
661               hid[j] = his[j];
662             }
663             if (pnga_patch_intersect(alo,ahi,los,his,andim)) {
664               pnga_access_block_grid_ptr(g_a, index, &src_data_ptr, ld);
665               offset = 0;
666               last = andim - 1;
667               jtot = 1;
668               for (j=0; j<last; j++) {
669                 offset += (los[j]-lod[j])*jtot;
670                 jtot *= ld[j];
671               }
672               offset += (los[last]-lod[last])*jtot;
673               switch(atype) {
674                 case C_DBL:
675                   src_data_ptr = (void*)((double*)(src_data_ptr) + offset);
676                   break;
677                 case C_INT:
678                   src_data_ptr = (void*)((int*)(src_data_ptr) + offset);
679                   break;
680                 case C_DCPL:
681                   src_data_ptr = (void*)((DoubleComplex*)(src_data_ptr) + offset);
682                   break;
683                 case C_SCPL:
684                   src_data_ptr = (void*)((SingleComplex*)(src_data_ptr) + offset);
685                   break;
686                 case C_FLOAT:
687                   src_data_ptr = (void*)((float*)(src_data_ptr) + offset);
688                   break;
689                 case C_LONG:
690                   src_data_ptr = (void*)((long*)(src_data_ptr) + offset);
691                   break;
692                 case C_LONGLONG:
693                   src_data_ptr = (void*)((long long*)(src_data_ptr) + offset);
694                   break;
695                 default:
696                   break;
697               }
698               snga_dest_indices(andim, los, alo, ald, bndim, lod, blo, bld);
699               snga_dest_indices(andim, his, alo, ald, bndim, hid, blo, bld);
700               pnga_put(g_b, lod, hid, src_data_ptr, ld);
701               pnga_release_block_grid(g_a, index);
702             }
703 
704             /* increment index to get next block on processor */
705             index[0] += topology[0];
706             for (i = 0; i < andim; i++) {
707               if (index[i] >= blocks[i] && i<andim-1) {
708                 index[i] = proc_index[i];
709                 index[i+1] += topology[i+1];
710               }
711             }
712           }
713         }
714 #endif
715       } else {
716         /* Only array b is block-cyclic distributed */
717         pnga_distribution(g_a, me_a, los, his);
718         if (pnga_patch_intersect(alo,ahi,los,his,andim)) {
719           pnga_access_ptr(g_a, los, his, &src_data_ptr, ld);
720           snga_dest_indices(andim, los, alo, ald, bndim, lod, blo, bld);
721           snga_dest_indices(andim, his, alo, ald, bndim, hid, blo, bld);
722           pnga_put(g_b, lod, hid, src_data_ptr, ld);
723           pnga_release(g_a, los, his);
724         }
725       }
726     } else {
727       /* Array b is block-cyclic distributed */
728       if (num_blocks_b >= 0) {
729 #if 1
730         _iterator_hdl hdl_b;
731         pnga_local_iterator_init(g_b, &hdl_b);
732         while (pnga_local_iterator_next(&hdl_b, los, his,
733               &src_data_ptr, ld)) {
734             /* Copy limits since patch intersect modifies los array */
735             for (j=0; j < andim; j++) {
736               lod[j] = los[j];
737               hid[j] = his[j];
738             }
739             if (pnga_patch_intersect(blo,bhi,los,his,andim)) {
740               offset = 0;
741               last = andim - 1;
742               jtot = 1;
743               for (j=0; j<last; j++) {
744                 offset += (los[j]-lod[j])*jtot;
745                 jtot *= ld[j];
746               }
747               offset += (los[last]-lod[last])*jtot;
748               switch(atype) {
749                 case C_DBL:
750                   src_data_ptr = (void*)((double*)(src_data_ptr) + offset);
751                   break;
752                 case C_INT:
753                   src_data_ptr = (void*)((int*)(src_data_ptr) + offset);
754                   break;
755                 case C_DCPL:
756                   src_data_ptr = (void*)((DoubleComplex*)(src_data_ptr) + offset);
757                   break;
758                 case C_SCPL:
759                   src_data_ptr = (void*)((SingleComplex*)(src_data_ptr) + offset);
760                   break;
761                 case C_FLOAT:
762                   src_data_ptr = (void*)((float*)(src_data_ptr) + offset);
763                   break;
764                 case C_LONG:
765                   src_data_ptr = (void*)((long*)(src_data_ptr) + offset);
766                   break;
767                 case C_LONGLONG:
768                   src_data_ptr = (void*)((long long*)(src_data_ptr) + offset);
769                   break;
770                 default:
771                   break;
772               }
773               snga_dest_indices(bndim, los, blo, bld, andim, lod, alo, ald);
774               snga_dest_indices(bndim, his, blo, bld, andim, hid, alo, ald);
775               pnga_get(g_a, lod, hid, src_data_ptr, ld);
776               pnga_release_block(g_b, i);
777             }
778           }
779 #else
780         /* Uses simple block-cyclic data distribution */
781         if (!pnga_uses_proc_grid(g_b)) {
782           for (i = me_b; i < num_blocks_b; i += bnproc) {
783             pnga_distribution(g_b, i, los, his);
784             /* make temporory copies of los, his since ngai_patch_intersection
785                destroys original versions */
786             for (j=0; j < andim; j++) {
787               lod[j] = los[j];
788               hid[j] = his[j];
789             }
790             if (pnga_patch_intersect(blo,bhi,los,his,andim)) {
791               pnga_access_block_ptr(g_b, i, &src_data_ptr, ld);
792               offset = 0;
793               last = bndim - 1;
794               jtot = 1;
795               for (j=0; j<last; j++) {
796                 offset += (los[j]-lod[j])*jtot;
797                 jtot *= ld[j];
798               }
799               offset += (los[last]-lod[last])*jtot;
800               switch(atype) {
801                 case C_DBL:
802                   src_data_ptr = (void*)((double*)(src_data_ptr) + offset);
803                   break;
804                 case C_INT:
805                   src_data_ptr = (void*)((int*)(src_data_ptr) + offset);
806                   break;
807                 case C_DCPL:
808                   src_data_ptr = (void*)((DoubleComplex*)(src_data_ptr) + offset);
809                   break;
810                 case C_SCPL:
811                   src_data_ptr = (void*)((SingleComplex*)(src_data_ptr) + offset);
812                   break;
813                 case C_FLOAT:
814                   src_data_ptr = (void*)((float*)(src_data_ptr) + offset);
815                   break;
816                 case C_LONG:
817                   src_data_ptr = (void*)((long*)(src_data_ptr) + offset);
818                   break;
819                 case C_LONGLONG:
820                   src_data_ptr = (void*)((long long*)(src_data_ptr) + offset);
821                   break;
822                 default:
823                   break;
824               }
825               snga_dest_indices(bndim, los, blo, bld, andim, lod, alo, ald);
826               snga_dest_indices(bndim, his, blo, bld, andim, hid, alo, ald);
827               pnga_get(g_a, lod, hid, src_data_ptr, ld);
828               pnga_release_block(g_b, i);
829             }
830           }
831         } else {
832           /* Uses scalapack block-cyclic data distribution */
833           Integer proc_index[MAXDIM], index[MAXDIM];
834           Integer topology[MAXDIM];
835           Integer blocks[MAXDIM], block_dims[MAXDIM];
836           pnga_get_proc_index(g_b, me_b, proc_index);
837           pnga_get_proc_index(g_b, me_b, index);
838           pnga_get_block_info(g_b, blocks, block_dims);
839           pnga_get_proc_grid(g_b, topology);
840           while (index[bndim-1] < blocks[bndim-1]) {
841             /* find bounding coordinates of block */
842             /*chk = 1;*/
843             for (i = 0; i < bndim; i++) {
844               los[i] = index[i]*block_dims[i]+1;
845               his[i] = (index[i] + 1)*block_dims[i];
846               if (his[i] > bdims[i]) his[i] = bdims[i];
847               /*if (his[i] < los[i]) chk = 0;*/
848             }
849             /* make temporory copies of los, his since ngai_patch_intersection
850                destroys original versions */
851             for (j=0; j < andim; j++) {
852               lod[j] = los[j];
853               hid[j] = his[j];
854             }
855             if (pnga_patch_intersect(blo,bhi,los,his,andim)) {
856               pnga_access_block_grid_ptr(g_b, index, &src_data_ptr, ld);
857               offset = 0;
858               last = bndim - 1;
859               jtot = 1;
860               for (j=0; j<last; j++) {
861                 offset += (los[j]-lod[j])*jtot;
862                 jtot *= ld[j];
863               }
864               offset += (los[last]-lod[last])*jtot;
865               switch(atype) {
866                 case C_DBL:
867                   src_data_ptr = (void*)((double*)(src_data_ptr) + offset);
868                   break;
869                 case C_INT:
870                   src_data_ptr = (void*)((int*)(src_data_ptr) + offset);
871                   break;
872                 case C_DCPL:
873                   src_data_ptr = (void*)((DoubleComplex*)(src_data_ptr) + offset);
874                   break;
875                 case C_SCPL:
876                   src_data_ptr = (void*)((SingleComplex*)(src_data_ptr) + offset);
877                   break;
878                 case C_FLOAT:
879                   src_data_ptr = (void*)((float*)(src_data_ptr) + offset);
880                   break;
881                 case C_LONG:
882                   src_data_ptr = (void*)((long*)(src_data_ptr) + offset);
883                   break;
884                 case C_LONGLONG:
885                   src_data_ptr = (void*)((long long*)(src_data_ptr) + offset);
886                   break;
887                 default:
888                   break;
889               }
890               snga_dest_indices(bndim, los, blo, bld, andim, lod, alo, ald);
891               snga_dest_indices(bndim, his, blo, bld, andim, hid, alo, ald);
892               pnga_get(g_a, lod, hid, src_data_ptr, ld);
893               pnga_release_block_grid(g_b, index);
894             }
895 
896             /* increment index to get next block on processor */
897             index[0] += topology[0];
898             for (i = 0; i < bndim; i++) {
899               if (index[i] >= blocks[i] && i<bndim-1) {
900                 index[i] = proc_index[i];
901                 index[i+1] += topology[i+1];
902               }
903             }
904           }
905         }
906 #endif
907       } else {
908         /* Array a is block-cyclic distributed */
909         pnga_distribution(g_b, me_b, los, his);
910         if (pnga_patch_intersect(blo,bhi,los,his,bndim)) {
911           pnga_access_ptr(g_b, los, his, &src_data_ptr, ld);
912           snga_dest_indices(bndim, los, blo, bld, andim, lod, alo, ald);
913           snga_dest_indices(bndim, his, blo, bld, andim, hid, alo, ald);
914           pnga_get(g_a, lod, hid, src_data_ptr, ld);
915           pnga_release(g_b, los, his);
916         }
917       }
918     }
919   }
920   /* ARMCI_AllFence(); */
921   if(local_sync_end) {
922     if (anproc <= bnproc) {
923       pnga_pgroup_sync(a_grp);
924     } else if (a_grp == pnga_pgroup_get_world() &&
925         b_grp == pnga_pgroup_get_world()) {
926       pnga_sync();
927     } else {
928       pnga_pgroup_sync(b_grp);
929     }
930   }
931 }
932 
933 
snga_dot_local_patch(Integer atype,Integer andim,Integer * loA,Integer * hiA,Integer * ldA,void * A_ptr,void * B_ptr,int * alen,void * retval)934 static void snga_dot_local_patch(Integer atype, Integer andim, Integer *loA,
935                           Integer *hiA, Integer *ldA, void *A_ptr, void *B_ptr,
936                           int *alen, void *retval)
937 {
938   int isum;
939   double dsum;
940   DoubleComplex zsum;
941   SingleComplex csum;
942   float fsum;
943   long lsum;
944   long long llsum;
945   Integer i, j, n1dim, idx;
946   Integer bvalue[MAXDIM], bunit[MAXDIM], baseldA[MAXDIM];
947 
948   isum = 0; dsum = 0.; zsum.real = 0.; zsum.imag = 0.; fsum = 0;lsum=0;llsum=0;
949   csum.real = 0.; csum.imag = 0.;
950 
951   /* number of n-element of the first dimension */
952   n1dim = 1; for(i=1; i<andim; i++) n1dim *= (hiA[i] - loA[i] + 1);
953 
954   /* calculate the destination indices */
955   bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
956   /* baseldA[0] = ldA[0]
957    * baseldA[1] = ldA[0] * ldA[1]
958    * baseldA[2] = ldA[0] * ldA[1] * ldA[2] .....
959    */
960   baseldA[0] = ldA[0]; baseldA[1] = baseldA[0] *ldA[1];
961   for(i=2; i<andim; i++) {
962     bvalue[i] = 0;
963     bunit[i] = bunit[i-1] * (hiA[i-1] - loA[i-1] + 1);
964     baseldA[i] = baseldA[i-1] * ldA[i];
965   }
966 
967   /* compute "local" contribution to the dot product */
968   switch (atype){
969     case C_INT:
970       for(i=0; i<n1dim; i++) {
971         idx = 0;
972         for(j=1; j<andim; j++) {
973           idx += bvalue[j] * baseldA[j-1];
974           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
975           if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
976         }
977 
978         for(j=0; j<(hiA[0]-loA[0]+1); j++)
979           isum += ((int *)A_ptr)[idx+j] *
980             ((int *)B_ptr)[idx+j];
981       }
982       *(int*)retval += isum;
983       break;
984     case C_DCPL:
985       for(i=0; i<n1dim; i++) {
986         idx = 0;
987         for(j=1; j<andim; j++) {
988           idx += bvalue[j] * baseldA[j-1];
989           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
990           if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
991         }
992 
993         for(j=0; j<(hiA[0]-loA[0]+1); j++) {
994           DoubleComplex a = ((DoubleComplex *)A_ptr)[idx+j];
995           DoubleComplex b = ((DoubleComplex *)B_ptr)[idx+j];
996           zsum.real += a.real*b.real  - b.imag * a.imag;
997           zsum.imag += a.imag*b.real  + b.imag * a.real;
998         }
999       }
1000       ((double*)retval)[0] += zsum.real;
1001       ((double*)retval)[1] += zsum.imag;
1002       break;
1003     case C_SCPL:
1004       for(i=0; i<n1dim; i++) {
1005         idx = 0;
1006         for(j=1; j<andim; j++) {
1007           idx += bvalue[j] * baseldA[j-1];
1008           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1009           if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1010         }
1011 
1012         for(j=0; j<(hiA[0]-loA[0]+1); j++) {
1013           SingleComplex a = ((SingleComplex *)A_ptr)[idx+j];
1014           SingleComplex b = ((SingleComplex *)B_ptr)[idx+j];
1015           csum.real += a.real*b.real  - b.imag * a.imag;
1016           csum.imag += a.imag*b.real  + b.imag * a.real;
1017         }
1018       }
1019       ((float*)retval)[0] += csum.real;
1020       ((float*)retval)[1] += csum.imag;
1021       break;
1022     case  C_DBL:
1023       for(i=0; i<n1dim; i++) {
1024         idx = 0;
1025         for(j=1; j<andim; j++) {
1026           idx += bvalue[j] * baseldA[j-1];
1027           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1028           if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1029         }
1030 
1031         for(j=0; j<(hiA[0]-loA[0]+1); j++)
1032           dsum += ((double*)A_ptr)[idx+j] *
1033             ((double*)B_ptr)[idx+j];
1034       }
1035       *(double*)retval += dsum;
1036       break;
1037     case C_FLOAT:
1038       for(i=0; i<n1dim; i++) {
1039         idx = 0;
1040         for(j=1; j<andim; j++) {
1041           idx += bvalue[j] * baseldA[j-1];
1042           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1043           if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1044         }
1045 
1046         for(j=0; j<(hiA[0]-loA[0]+1); j++)
1047           fsum += ((float *)A_ptr)[idx+j] *
1048             ((float *)B_ptr)[idx+j];
1049       }
1050       *(float*)retval += fsum;
1051       break;
1052     case C_LONG:
1053       for(i=0; i<n1dim; i++) {
1054         idx = 0;
1055         for(j=1; j<andim; j++) {
1056           idx += bvalue[j] * baseldA[j-1];
1057           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1058           if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1059         }
1060 
1061         for(j=0; j<(hiA[0]-loA[0]+1); j++)
1062           lsum += ((long *)A_ptr)[idx+j] *
1063             ((long *)B_ptr)[idx+j];
1064       }
1065       *(long*)retval += lsum;
1066       break;
1067     case C_LONGLONG:
1068       for(i=0; i<n1dim; i++) {
1069         idx = 0;
1070         for(j=1; j<andim; j++) {
1071           idx += bvalue[j] * baseldA[j-1];
1072           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1073           if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1074         }
1075 
1076         for(j=0; j<(hiA[0]-loA[0]+1); j++)
1077           llsum += ((long long *)A_ptr)[idx+j] *
1078             ((long long *)B_ptr)[idx+j];
1079       }
1080       *(long long*)retval += llsum;
1081       break;
1082      default:
1083         pnga_error("snga_dot_local_patch: type not supported",atype);
1084 
1085   }
1086 }
1087 
1088 
1089 /*\ generic dot product routine
1090 \*/
1091 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1092 #   pragma weak wnga_dot_patch = pnga_dot_patch
1093 #endif
pnga_dot_patch(Integer g_a,char * t_a,Integer * alo,Integer * ahi,Integer g_b,char * t_b,Integer * blo,Integer * bhi,void * retval)1094 void pnga_dot_patch(Integer g_a, char *t_a, Integer *alo, Integer *ahi, Integer g_b, char *t_b, Integer *blo, Integer *bhi, void *retval)
1095 {
1096   Integer i=0, j=0;
1097   Integer compatible=0;
1098   Integer atype=0, btype=0, andim=0, adims[MAXDIM], bndim=0, bdims[MAXDIM];
1099   Integer loA[MAXDIM], hiA[MAXDIM], ldA[MAXDIM];
1100   Integer loB[MAXDIM], hiB[MAXDIM], ldB[MAXDIM];
1101   Integer g_A = g_a, g_B = g_b;
1102   char *A_ptr=NULL, *B_ptr=NULL;
1103   Integer ctype=0;
1104   Integer atotal=0, btotal=0;
1105   int isum=0, alen=0;
1106   long lsum=0;
1107   long long llsum=0;
1108   double dsum=0;
1109   DoubleComplex zsum={0,0};
1110   DoubleComplex csum={0,0};
1111   float fsum=0;
1112   Integer me= pnga_nodeid(), temp_created=0;
1113   Integer nproc = pnga_nnodes();
1114   Integer num_blocks_a=0, num_blocks_b=0;
1115   char *tempname = "temp", transp, transp_a, transp_b;
1116   int local_sync_begin=0;
1117   Integer a_grp=0, b_grp=0;
1118   _iterator_hdl hdl_a, hdl_b;
1119 
1120   local_sync_begin = _ga_sync_begin;
1121   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
1122   if(local_sync_begin)pnga_sync();
1123 
1124   a_grp = pnga_get_pgroup(g_a);
1125   b_grp = pnga_get_pgroup(g_b);
1126   if (a_grp != b_grp)
1127     pnga_error("Both arrays must be defined on same group",0L);
1128   me = pnga_pgroup_nodeid(a_grp);
1129 
1130   pnga_inquire(g_a, &atype, &andim, adims);
1131   pnga_inquire(g_b, &btype, &bndim, bdims);
1132 
1133   if(atype != btype ) pnga_error(" type mismatch ", 0L);
1134 
1135   /* check if patch indices and g_a dims match */
1136   for(i=0; i<andim; i++)
1137     if(alo[i] <= 0 || ahi[i] > adims[i])
1138       pnga_error("g_a indices out of range ", g_a);
1139   for(i=0; i<bndim; i++)
1140     if(blo[i] <= 0 || bhi[i] > bdims[i])
1141       pnga_error("g_b indices out of range ", g_b);
1142 
1143   /* check if numbers of elements in two patches match each other */
1144   atotal = 1; for(i=0; i<andim; i++) atotal *= (ahi[i] - alo[i] + 1);
1145   btotal = 1; for(i=0; i<bndim; i++) btotal *= (bhi[i] - blo[i] + 1);
1146 
1147   if(atotal != btotal)
1148     pnga_error("  capacities of patches do not match ", 0L);
1149 
1150   /* is transpose operation required ? */
1151   /* -- only if for one array transpose operation requested*/
1152   transp_a = (*t_a == 'n' || *t_a =='N')? 'n' : 't';
1153   transp_b = (*t_b == 'n' || *t_b =='N')? 'n' : 't';
1154   transp   = (transp_a == transp_b)? 'n' : 't';
1155 
1156   /* Find out if distribution is block-cyclic */
1157   num_blocks_a = pnga_total_blocks(g_a);
1158   num_blocks_b = pnga_total_blocks(g_b);
1159 
1160   if (num_blocks_a >= 0 || num_blocks_b >= 0) {
1161     if (transp_a == 't' || transp_b == 't')
1162       pnga_error("transpose not supported for block-cyclic data ", 0);
1163   }
1164 
1165   isum = 0; dsum = 0.; zsum.real = 0.; zsum.imag = 0.; fsum = 0;lsum=0;llsum=0;
1166   csum.real = 0.; csum.imag = 0.;
1167 
1168   switch (atype){
1169     case C_INT:
1170       *(int*)retval = isum;
1171       alen = 1;
1172       break;
1173     case C_DCPL:
1174       ((double*)retval)[0] = zsum.real;
1175       ((double*)retval)[1] = zsum.imag;
1176       alen = 2;
1177       break;
1178     case C_SCPL:
1179       ((float*)retval)[0] = csum.real;
1180       ((float*)retval)[1] = csum.imag;
1181       alen = 2;
1182       break;
1183     case  C_DBL:
1184       *(double*)retval = dsum;
1185       alen = 1;
1186       break;
1187     case  C_FLOAT:
1188       *(float*)retval = fsum;
1189       alen = 1;
1190       break;
1191     case C_LONG:
1192       *(long*)retval = lsum;
1193       alen = 1;
1194       break;
1195     case C_LONGLONG:
1196       *(long long*)retval = llsum;
1197       alen = 1;
1198       break;
1199      default:
1200         pnga_error("snga_dot_local_patch: type not supported",atype);
1201   }
1202 
1203   if (num_blocks_a < 0 && num_blocks_b < 0) {
1204     /* find out coordinates of patches of g_A and g_B that I own */
1205     pnga_distribution(g_A, me, loA, hiA);
1206     pnga_distribution(g_B, me, loB, hiB);
1207 
1208     if(pnga_comp_patch(andim, loA, hiA, bndim, loB, hiB) &&
1209         pnga_comp_patch(andim, alo, ahi, bndim, blo, bhi)) compatible = 1;
1210     else compatible = 0;
1211     /* pnga_gop(pnga_type_f2c(MT_F_INT), &compatible, 1, "*"); */
1212     pnga_gop(pnga_type_f2c(MT_F_INT), &compatible, 1, "&&");
1213     if(!(compatible && (transp=='n'))) {
1214       /* either patches or distributions do not match:
1215        *        - create a temp array that matches distribution of g_a
1216        *        - copy & reshape patch of g_b into g_B
1217        */
1218       if (!pnga_duplicate(g_a, &g_B, tempname))
1219         pnga_error("duplicate failed",0L);
1220 
1221       pnga_copy_patch(&transp, g_b, blo, bhi, g_B, alo, ahi);
1222       bndim = andim;
1223       temp_created = 1;
1224       pnga_distribution(g_B, me, loB, hiB);
1225     }
1226 
1227     if(!pnga_comp_patch(andim, loA, hiA, bndim, loB, hiB))
1228       pnga_error(" patches mismatch ",0);
1229 
1230     /* A[83:125,1:1]  <==> B[83:125] */
1231     if(andim > bndim) andim = bndim; /* need more work */
1232 
1233     /*  determine subsets of my patches to access  */
1234     if(pnga_patch_intersect(alo, ahi, loA, hiA, andim)){
1235       pnga_access_ptr(g_A, loA, hiA, &A_ptr, ldA);
1236       pnga_access_ptr(g_B, loA, hiA, &B_ptr, ldB);
1237 
1238       snga_dot_local_patch(atype, andim, loA, hiA, ldA, A_ptr, B_ptr,
1239           &alen, retval);
1240       /* release access to the data */
1241       pnga_release(g_A, loA, hiA);
1242       pnga_release(g_B, loA, hiA);
1243     }
1244   } else {
1245     /* Create copy of g_b identical with identical distribution as g_a */
1246     if (!pnga_duplicate(g_a, &g_B, tempname))
1247       pnga_error("duplicate failed",0L);
1248     pnga_copy_patch(&transp, g_b, blo, bhi, g_B, alo, ahi);
1249     temp_created = 1;
1250 
1251 #if 1
1252     pnga_local_iterator_init(g_a, &hdl_a);
1253     pnga_local_iterator_init(g_B, &hdl_b);
1254     while(pnga_local_iterator_next(&hdl_a, loA, hiA, &A_ptr, ldA)) {
1255       Integer lo[MAXDIM]/*, hi[MAXDIM]*/;
1256       Integer offset, jtot, last;
1257       pnga_local_iterator_next(&hdl_b, loB, hiB, &B_ptr, ldB);
1258       /* make copies of loA and hiA since pnga_patch_intersect destroys
1259          original versions */
1260       for (j=0; j<andim; j++) {
1261         lo[j] = loA[j];
1262         /*hi[j] = hiA[j];*/
1263       }
1264       if(pnga_patch_intersect(alo, ahi, loA, hiA, andim)){
1265         /* evaluate offsets for system */
1266         offset = 0;
1267         last = andim-1;
1268         jtot = 1;
1269         for (j=0; j<last; j++) {
1270           offset += (loA[j] - lo[j])*jtot;
1271           jtot *= ldA[j];
1272         }
1273         offset += (loA[last]-lo[last])*jtot;
1274 
1275         /* offset pointers by correct amount */
1276         switch (atype){
1277           case C_INT:
1278             A_ptr = (void*)((int*)(A_ptr) + offset);
1279             B_ptr = (void*)((int*)(B_ptr) + offset);
1280             break;
1281           case C_DCPL:
1282             A_ptr = (void*)((DoubleComplex*)(A_ptr) + offset);
1283             B_ptr = (void*)((DoubleComplex*)(B_ptr) + offset);
1284             break;
1285           case C_SCPL:
1286             A_ptr = (void*)((SingleComplex*)(A_ptr) + offset);
1287             B_ptr = (void*)((SingleComplex*)(B_ptr) + offset);
1288             break;
1289           case  C_DBL:
1290             A_ptr = (void*)((double*)(A_ptr) + offset);
1291             B_ptr = (void*)((double*)(B_ptr) + offset);
1292             break;
1293           case  C_FLOAT:
1294             A_ptr = (void*)((float*)(A_ptr) + offset);
1295             B_ptr = (void*)((float*)(B_ptr) + offset);
1296             break;
1297           case C_LONG:
1298             A_ptr = (void*)((long*)(A_ptr) + offset);
1299             B_ptr = (void*)((long*)(B_ptr) + offset);
1300             break;
1301           case C_LONGLONG:
1302             A_ptr = (void*)((long long*)(A_ptr) + offset);
1303             B_ptr = (void*)((long long*)(B_ptr) + offset);
1304             break;
1305         }
1306         snga_dot_local_patch(atype, andim, loA, hiA, ldA, A_ptr, B_ptr,
1307             &alen, retval);
1308       }
1309     }
1310 #else
1311     /* If g_a regular distribution, then just use normal dot product on patch */
1312     if (num_blocks_a < 0) {
1313       /* find out coordinates of patches of g_A and g_B that I own */
1314       pnga_distribution(g_A, me, loA, hiA);
1315       pnga_distribution(g_B, me, loB, hiB);
1316 
1317       if(!pnga_comp_patch(andim, loA, hiA, bndim, loB, hiB))
1318         pnga_error(" patches mismatch ",0);
1319 
1320       /* A[83:125,1:1]  <==> B[83:125] */
1321       if(andim > bndim) andim = bndim; /* need more work */
1322       if(pnga_patch_intersect(alo, ahi, loA, hiA, andim)){
1323         pnga_access_ptr(g_A, loA, hiA, &A_ptr, ldA);
1324         pnga_access_ptr(g_B, loA, hiA, &B_ptr, ldB);
1325 
1326         snga_dot_local_patch(atype, andim, loA, hiA, ldA, A_ptr, B_ptr,
1327             &alen, retval);
1328         /* release access to the data */
1329         pnga_release(g_A, loA, hiA);
1330         pnga_release(g_B, loA, hiA);
1331       }
1332     } else {
1333       Integer lo[MAXDIM]/*, hi[MAXDIM]*/;
1334       Integer offset, jtot, last;
1335       /* simple block cyclic data distribution */
1336       if (!pnga_uses_proc_grid(g_a)) {
1337         for (i=me; i<num_blocks_a; i += nproc) {
1338           pnga_distribution(g_A, i, loA, hiA);
1339           /* make copies of loA and hiA since pnga_patch_intersect destroys
1340              original versions */
1341           for (j=0; j<andim; j++) {
1342             lo[j] = loA[j];
1343             /*hi[j] = hiA[j];*/
1344           }
1345           if(pnga_patch_intersect(alo, ahi, loA, hiA, andim)){
1346             pnga_access_block_ptr(g_A, i, &A_ptr, ldA);
1347             pnga_access_block_ptr(g_B, i, &B_ptr, ldB);
1348 
1349             /* evaluate offsets for system */
1350             offset = 0;
1351             last = andim-1;
1352             jtot = 1;
1353             for (j=0; j<last; j++) {
1354               offset += (loA[j] - lo[j])*jtot;
1355               jtot *= ldA[j];
1356             }
1357             offset += (loA[last]-lo[last])*jtot;
1358 
1359             /* offset pointers by correct amount */
1360             switch (atype){
1361               case C_INT:
1362                 A_ptr = (void*)((int*)(A_ptr) + offset);
1363                 B_ptr = (void*)((int*)(B_ptr) + offset);
1364                 break;
1365               case C_DCPL:
1366                 A_ptr = (void*)((DoubleComplex*)(A_ptr) + offset);
1367                 B_ptr = (void*)((DoubleComplex*)(B_ptr) + offset);
1368                 break;
1369               case C_SCPL:
1370                 A_ptr = (void*)((SingleComplex*)(A_ptr) + offset);
1371                 B_ptr = (void*)((SingleComplex*)(B_ptr) + offset);
1372                 break;
1373               case  C_DBL:
1374                 A_ptr = (void*)((double*)(A_ptr) + offset);
1375                 B_ptr = (void*)((double*)(B_ptr) + offset);
1376                 break;
1377               case  C_FLOAT:
1378                 A_ptr = (void*)((float*)(A_ptr) + offset);
1379                 B_ptr = (void*)((float*)(B_ptr) + offset);
1380                 break;
1381               case C_LONG:
1382                 A_ptr = (void*)((long*)(A_ptr) + offset);
1383                 B_ptr = (void*)((long*)(B_ptr) + offset);
1384                 break;
1385               case C_LONGLONG:
1386                 A_ptr = (void*)((long long*)(A_ptr) + offset);
1387                 B_ptr = (void*)((long long*)(B_ptr) + offset);
1388                 break;
1389             }
1390             snga_dot_local_patch(atype, andim, loA, hiA, ldA, A_ptr, B_ptr,
1391                 &alen, retval);
1392             /* release access to the data */
1393             pnga_release_block(g_A, i);
1394             pnga_release_block(g_B, i);
1395           }
1396         }
1397       } else {
1398         /* Uses scalapack block-cyclic data distribution */
1399         Integer proc_index[MAXDIM], index[MAXDIM];
1400         Integer topology[MAXDIM]/*, chk*/;
1401         Integer blocks[MAXDIM], block_dims[MAXDIM];
1402         pnga_get_proc_index(g_a, me, proc_index);
1403         pnga_get_proc_index(g_a, me, index);
1404         pnga_get_block_info(g_a, blocks, block_dims);
1405         pnga_get_proc_grid(g_a, topology);
1406         while (index[andim-1] < blocks[andim-1]) {
1407           /* find bounding coordinates of block */
1408           /*chk = 1;*/
1409           for (i = 0; i < andim; i++) {
1410             loA[i] = index[i]*block_dims[i]+1;
1411             hiA[i] = (index[i] + 1)*block_dims[i];
1412             if (hiA[i] > adims[i]) hiA[i] = adims[i];
1413             /*if (hiA[i] < loA[i]) chk = 0;*/
1414           }
1415           /* make copies of loA and hiA since pnga_patch_intersect destroys
1416              original versions */
1417           for (j=0; j<andim; j++) {
1418             lo[j] = loA[j];
1419             /*hi[j] = hiA[j];*/
1420           }
1421           if(pnga_patch_intersect(alo, ahi, loA, hiA, andim)){
1422             pnga_access_block_grid_ptr(g_A, index, &A_ptr, ldA);
1423             pnga_access_block_grid_ptr(g_B, index, &B_ptr, ldB);
1424 
1425             /* evaluate offsets for system */
1426             offset = 0;
1427             last = andim-1;
1428             jtot = 1;
1429             for (j=0; j<last; j++) {
1430               offset += (loA[j] - lo[j])*jtot;
1431               jtot *= ldA[j];
1432             }
1433             offset += (loA[last]-lo[last])*jtot;
1434 
1435             /* offset pointers by correct amount */
1436             switch (atype){
1437               case C_INT:
1438                 A_ptr = (void*)((int*)(A_ptr) + offset);
1439                 B_ptr = (void*)((int*)(B_ptr) + offset);
1440                 break;
1441               case C_DCPL:
1442                 A_ptr = (void*)((DoubleComplex*)(A_ptr) + offset);
1443                 B_ptr = (void*)((DoubleComplex*)(B_ptr) + offset);
1444                 break;
1445               case C_SCPL:
1446                 A_ptr = (void*)((SingleComplex*)(A_ptr) + offset);
1447                 B_ptr = (void*)((SingleComplex*)(B_ptr) + offset);
1448                 break;
1449               case  C_DBL:
1450                 A_ptr = (void*)((double*)(A_ptr) + offset);
1451                 B_ptr = (void*)((double*)(B_ptr) + offset);
1452                 break;
1453               case  C_FLOAT:
1454                 A_ptr = (void*)((float*)(A_ptr) + offset);
1455                 B_ptr = (void*)((float*)(B_ptr) + offset);
1456                 break;
1457               case C_LONG:
1458                 A_ptr = (void*)((long*)(A_ptr) + offset);
1459                 B_ptr = (void*)((long*)(B_ptr) + offset);
1460                 break;
1461               case C_LONGLONG:
1462                 A_ptr = (void*)((long long*)(A_ptr) + offset);
1463                 B_ptr = (void*)((long long*)(B_ptr) + offset);
1464                 break;
1465             }
1466             snga_dot_local_patch(atype, andim, loA, hiA, ldA, A_ptr, B_ptr,
1467                 &alen, retval);
1468             /* release access to the data */
1469             pnga_release_block_grid(g_A, index);
1470             pnga_release_block_grid(g_B, index);
1471           }
1472 
1473           /* increment index to get next block on processor */
1474           index[0] += topology[0];
1475           for (i = 0; i < andim; i++) {
1476             if (index[i] >= blocks[i] && i<andim-1) {
1477               index[i] = proc_index[i];
1478               index[i+1] += topology[i+1];
1479             }
1480           }
1481         }
1482       }
1483     }
1484 #endif
1485   }
1486 
1487   /*convert from C data type to ARMCI type */
1488   switch(atype) {
1489     case C_FLOAT: ctype=ARMCI_FLOAT; break;
1490     case C_DBL: ctype=ARMCI_DOUBLE; break;
1491     case C_INT: ctype=ARMCI_INT; break;
1492     case C_LONG: ctype=ARMCI_LONG; break;
1493     case C_LONGLONG: ctype=ARMCI_LONG_LONG; break;
1494     case C_DCPL: ctype=ARMCI_DOUBLE; break;
1495     case C_SCPL: ctype=ARMCI_FLOAT; break;
1496     default: pnga_error("pnga_dot_patch: type not supported",atype);
1497   }
1498 
1499   if (pnga_is_mirrored(g_a) && pnga_is_mirrored(g_b)) {
1500     armci_msg_gop_scope(SCOPE_NODE,retval,alen,"+",ctype);
1501   } else {
1502     if (a_grp == -1) {
1503       armci_msg_gop_scope(SCOPE_ALL,retval,alen,"+",ctype);
1504 #ifdef MSG_COMMS_MPI
1505     } else {
1506       armci_msg_group_gop_scope(SCOPE_ALL,retval,alen,"+",ctype,
1507           ga_get_armci_group_((int)a_grp));
1508 #endif
1509     }
1510   }
1511 
1512   if(temp_created) pnga_destroy(g_B);
1513 }
1514 
1515 
1516 /*****************************************************************************
1517  * ngai_Xdot_patch routines
1518  ****************************************************************************/
1519 
1520 
1521 /*\
1522  *  Set all values in patch to value stored in *val
1523 \*/
snga_set_patch_value(Integer type,Integer ndim,Integer * loA,Integer * hiA,Integer * ld,void * data_ptr,void * val)1524 static void snga_set_patch_value(
1525         Integer type, Integer ndim, Integer *loA, Integer *hiA,
1526         Integer *ld, void *data_ptr, void *val)
1527 {
1528   Integer n1dim, i, j, idx;
1529   Integer bvalue[MAXDIM], bunit[MAXDIM], baseld[MAXDIM];
1530   /* number of n-element of the first dimension */
1531   n1dim = 1; for(i=1; i<ndim; i++) n1dim *= (hiA[i] - loA[i] + 1);
1532 
1533   /* calculate the destination indices */
1534   bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
1535   /* baseld[0] = ld[0]
1536    * baseld[1] = ld[0] * ld[1]
1537    * baseld[2] = ld[0] * ld[1] * ld[2] .....
1538    */
1539   baseld[0] = ld[0]; baseld[1] = baseld[0] *ld[1];
1540   for(i=2; i<ndim; i++) {
1541     bvalue[i] = 0;
1542     bunit[i] = bunit[i-1] * (hiA[i-1] - loA[i-1] + 1);
1543     baseld[i] = baseld[i-1] * ld[i];
1544   }
1545 
1546   switch (type){
1547     case C_INT:
1548       for(i=0; i<n1dim; i++) {
1549         idx = 0;
1550         for(j=1; j<ndim; j++) {
1551           idx += bvalue[j] * baseld[j-1];
1552           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1553           if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1554         }
1555         for(j=0; j<(hiA[0]-loA[0]+1); j++)
1556           ((int *)data_ptr)[idx+j] = *(int*)val;
1557       }
1558       break;
1559     case C_DCPL:
1560       for(i=0; i<n1dim; i++) {
1561         idx = 0;
1562         for(j=1; j<ndim; j++) {
1563           idx += bvalue[j] * baseld[j-1];
1564           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1565           if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1566         }
1567 
1568         for(j=0; j<(hiA[0]-loA[0]+1); j++) {
1569           DoubleComplex tmp = *(DoubleComplex *)val;
1570           ((DoubleComplex *)data_ptr)[idx+j].real = tmp.real;
1571           ((DoubleComplex *)data_ptr)[idx+j].imag = tmp.imag;
1572         }
1573       }
1574       break;
1575     case C_SCPL:
1576       for(i=0; i<n1dim; i++) {
1577         idx = 0;
1578         for(j=1; j<ndim; j++) {
1579           idx += bvalue[j] * baseld[j-1];
1580           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1581           if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1582         }
1583 
1584         for(j=0; j<(hiA[0]-loA[0]+1); j++) {
1585           SingleComplex tmp = *(SingleComplex *)val;
1586           ((SingleComplex *)data_ptr)[idx+j].real = tmp.real;
1587           ((SingleComplex *)data_ptr)[idx+j].imag = tmp.imag;
1588         }
1589       }
1590       break;
1591     case C_DBL:
1592       for(i=0; i<n1dim; i++) {
1593         idx = 0;
1594         for(j=1; j<ndim; j++) {
1595           idx += bvalue[j] * baseld[j-1];
1596           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1597           if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1598         }
1599 
1600         for(j=0; j<(hiA[0]-loA[0]+1); j++)
1601           ((double*)data_ptr)[idx+j] =
1602             *(double*)val;
1603       }
1604       break;
1605     case C_FLOAT:
1606       for(i=0; i<n1dim; i++) {
1607         idx = 0;
1608         for(j=1; j<ndim; j++) {
1609           idx += bvalue[j] * baseld[j-1];
1610           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1611           if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1612         }
1613         for(j=0; j<(hiA[0]-loA[0]+1); j++)
1614           ((float *)data_ptr)[idx+j] = *(float*)val;
1615       }
1616       break;
1617     case C_LONG:
1618       for(i=0; i<n1dim; i++) {
1619         idx = 0;
1620         for(j=1; j<ndim; j++) {
1621           idx += bvalue[j] * baseld[j-1];
1622           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1623           if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1624         }
1625         for(j=0; j<(hiA[0]-loA[0]+1); j++)
1626           ((long *)data_ptr)[idx+j] = *(long*)val;
1627       }
1628       break;
1629     case C_LONGLONG:
1630       for(i=0; i<n1dim; i++) {
1631         idx = 0;
1632         for(j=1; j<ndim; j++) {
1633           idx += bvalue[j] * baseld[j-1];
1634           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1635           if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1636         }
1637         for(j=0; j<(hiA[0]-loA[0]+1); j++)
1638           ((long long*)data_ptr)[idx+j] = *(long long*)val;
1639       }
1640       break;
1641     default: pnga_error(" wrong data type ",type);
1642   }
1643 
1644 }
1645 
1646 
1647 /*\ FILL IN ARRAY WITH VALUE
1648 \*/
1649 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1650 #   pragma weak wnga_fill_patch = pnga_fill_patch
1651 #endif
pnga_fill_patch(Integer g_a,Integer * lo,Integer * hi,void * val)1652 void pnga_fill_patch(Integer g_a, Integer *lo, Integer *hi, void* val)
1653 {
1654   Integer i;
1655   Integer ndim, dims[MAXDIM], type;
1656   Integer loA[MAXDIM], hiA[MAXDIM], ld[MAXDIM];
1657   char *data_ptr;
1658   Integer num_blocks, nproc;
1659   Integer me= pnga_nodeid();
1660   int local_sync_begin,local_sync_end;
1661   _iterator_hdl hdl;
1662 
1663   local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
1664   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
1665   if(local_sync_begin)pnga_sync();
1666 
1667 
1668   pnga_inquire(g_a,  &type, &ndim, dims);
1669   num_blocks = pnga_total_blocks(g_a);
1670 
1671 #if 1
1672   pnga_local_iterator_init(g_a, &hdl);
1673   while (pnga_local_iterator_next(&hdl,loA,hiA,&data_ptr,ld)) {
1674     Integer offset, j, jtmp, chk;
1675     Integer loS[MAXDIM];
1676 
1677     /* loA is changed by pnga_patch_intersect, so
1678        save a copy */
1679     for (j=0; j<ndim; j++) {
1680       loS[j] = loA[j];
1681     }
1682 
1683     /*  determine subset of my local patch to access  */
1684     /*  Output is in loA and hiA */
1685     if(pnga_patch_intersect(lo, hi, loA, hiA, ndim)){
1686       /* Check for partial overlap */
1687       chk = 1;
1688       for (j=0; j<ndim; j++) {
1689         if (loS[j] < loA[j]) {
1690           chk=0;
1691           break;
1692         }
1693       }
1694       if (!chk) {
1695         /* Evaluate additional offset for pointer */
1696         offset = 0;
1697         jtmp = 1;
1698         for (j=0; j<ndim-1; j++) {
1699           offset += (loA[j]-loS[j])*jtmp;
1700           jtmp *= ld[j];
1701         }
1702         offset += (loA[ndim-1]-loS[ndim-1])*jtmp;
1703         switch (type){
1704           case C_INT:
1705             data_ptr = (void*)((int*)data_ptr + offset);
1706             break;
1707           case C_DCPL:
1708             data_ptr = (void*)((double*)data_ptr + 2*offset);
1709             break;
1710           case C_SCPL:
1711             data_ptr = (void*)((float*)data_ptr + 2*offset);
1712             break;
1713           case C_DBL:
1714             data_ptr = (void*)((double*)data_ptr + offset);
1715             break;
1716           case C_FLOAT:
1717             data_ptr = (void*)((float*)data_ptr + offset);
1718             break;
1719           case C_LONG:
1720             data_ptr = (void*)((long*)data_ptr + offset);
1721             break;
1722           case C_LONGLONG:
1723             data_ptr = (void*)((long long*)data_ptr + offset);
1724             break;
1725           default: pnga_error(" wrong data type ",type);
1726         }
1727       }
1728 
1729       /* set all values in patch to *val */
1730       snga_set_patch_value(type, ndim, loA, hiA, ld, data_ptr, val);
1731     }
1732   }
1733 #else
1734   if (num_blocks < 0) {
1735     /* get limits of VISIBLE patch */
1736     pnga_distribution(g_a, me, loA, hiA);
1737 
1738     /*  determine subset of my local patch to access  */
1739     /*  Output is in loA and hiA */
1740     if(pnga_patch_intersect(lo, hi, loA, hiA, ndim)){
1741 
1742       /* get data_ptr to corner of patch */
1743       /* ld are leading dimensions INCLUDING ghost cells */
1744       pnga_access_ptr(g_a, loA, hiA, &data_ptr, ld);
1745 
1746       /* set all values in patch to *val */
1747       snga_set_patch_value(type, ndim, loA, hiA, ld, data_ptr, val);
1748 
1749       /* release access to the data */
1750       pnga_release_update(g_a, loA, hiA);
1751     }
1752   } else {
1753     Integer offset, j, jtmp, chk;
1754     Integer loS[MAXDIM];
1755     nproc = pnga_nnodes();
1756     /* using simple block-cyclic data distribution */
1757     if (!pnga_uses_proc_grid(g_a)){
1758       for (i=me; i<num_blocks; i += nproc) {
1759         /* get limits of patch */
1760         pnga_distribution(g_a, i, loA, hiA);
1761 
1762         /* loA is changed by pnga_patch_intersect, so
1763            save a copy */
1764         for (j=0; j<ndim; j++) {
1765           loS[j] = loA[j];
1766         }
1767 
1768         /*  determine subset of my local patch to access  */
1769         /*  Output is in loA and hiA */
1770         if(pnga_patch_intersect(lo, hi, loA, hiA, ndim)){
1771 
1772           /* get data_ptr to corner of patch */
1773           /* ld are leading dimensions for block */
1774           pnga_access_block_ptr(g_a, i, &data_ptr, ld);
1775 
1776           /* Check for partial overlap */
1777           chk = 1;
1778           for (j=0; j<ndim; j++) {
1779             if (loS[j] < loA[j]) {
1780               chk=0;
1781               break;
1782             }
1783           }
1784           if (!chk) {
1785             /* Evaluate additional offset for pointer */
1786             offset = 0;
1787             jtmp = 1;
1788             for (j=0; j<ndim-1; j++) {
1789               offset += (loA[j]-loS[j])*jtmp;
1790               jtmp *= ld[j];
1791             }
1792             offset += (loA[ndim-1]-loS[ndim-1])*jtmp;
1793             switch (type){
1794               case C_INT:
1795                 data_ptr = (void*)((int*)data_ptr + offset);
1796                 break;
1797               case C_DCPL:
1798                 data_ptr = (void*)((double*)data_ptr + 2*offset);
1799                 break;
1800               case C_SCPL:
1801                 data_ptr = (void*)((float*)data_ptr + 2*offset);
1802                 break;
1803               case C_DBL:
1804                 data_ptr = (void*)((double*)data_ptr + offset);
1805                 break;
1806               case C_FLOAT:
1807                 data_ptr = (void*)((float*)data_ptr + offset);
1808                 break;
1809               case C_LONG:
1810                 data_ptr = (void*)((long*)data_ptr + offset);
1811                 break;
1812               case C_LONGLONG:
1813                 data_ptr = (void*)((long long*)data_ptr + offset);
1814                 break;
1815               default: pnga_error(" wrong data type ",type);
1816             }
1817           }
1818 
1819           /* set all values in patch to *val */
1820           snga_set_patch_value(type, ndim, loA, hiA, ld, data_ptr, val);
1821 
1822           /* release access to the data */
1823           pnga_release_update_block(g_a, i);
1824         }
1825       }
1826     } else {
1827       /* using scalapack block-cyclic data distribution */
1828       Integer proc_index[MAXDIM], index[MAXDIM];
1829       Integer topology[MAXDIM];
1830       Integer blocks[MAXDIM], block_dims[MAXDIM];
1831       pnga_get_proc_index(g_a, me, proc_index);
1832       pnga_get_proc_index(g_a, me, index);
1833       pnga_get_block_info(g_a, blocks, block_dims);
1834       pnga_get_proc_grid(g_a, topology);
1835       while (index[ndim-1] < blocks[ndim-1]) {
1836         /* find bounding coordinates of block */
1837         chk = 1;
1838         for (i = 0; i < ndim; i++) {
1839           loA[i] = index[i]*block_dims[i]+1;
1840           hiA[i] = (index[i] + 1)*block_dims[i];
1841           if (hiA[i] > dims[i]) hiA[i] = dims[i];
1842           if (hiA[i] < loA[i]) chk = 0;
1843         }
1844 
1845         /* loA is changed by pnga_patch_intersect, so
1846            save a copy */
1847         for (j=0; j<ndim; j++) {
1848           loS[j] = loA[j];
1849         }
1850 
1851         /*  determine subset of my local patch to access  */
1852         /*  Output is in loA and hiA */
1853         if(pnga_patch_intersect(lo, hi, loA, hiA, ndim)){
1854 
1855           /* get data_ptr to corner of patch */
1856           /* ld are leading dimensions for block */
1857           pnga_access_block_grid_ptr(g_a, index, &data_ptr, ld);
1858 
1859           /* Check for partial overlap */
1860           chk = 1;
1861           for (j=0; j<ndim; j++) {
1862             if (loS[j] < loA[j]) {
1863               chk=0;
1864               break;
1865             }
1866           }
1867           if (!chk) {
1868             /* Evaluate additional offset for pointer */
1869             offset = 0;
1870             jtmp = 1;
1871             for (j=0; j<ndim-1; j++) {
1872               offset += (loA[j]-loS[j])*jtmp;
1873               jtmp *= ld[j];
1874             }
1875             offset += (loA[ndim-1]-loS[ndim-1])*jtmp;
1876             switch (type){
1877               case C_INT:
1878                 data_ptr = (void*)((int*)data_ptr + offset);
1879                 break;
1880               case C_DCPL:
1881                 data_ptr = (void*)((double*)data_ptr + 2*offset);
1882                 break;
1883               case C_SCPL:
1884                 data_ptr = (void*)((float*)data_ptr + 2*offset);
1885                 break;
1886               case C_DBL:
1887                 data_ptr = (void*)((double*)data_ptr + offset);
1888                 break;
1889               case C_FLOAT:
1890                 data_ptr = (void*)((float*)data_ptr + offset);
1891                 break;
1892               case C_LONG:
1893                 data_ptr = (void*)((long*)data_ptr + offset);
1894                 break;
1895               case C_LONGLONG:
1896                 data_ptr = (void*)((long long*)data_ptr + offset);
1897                 break;
1898               default: pnga_error(" wrong data type ",type);
1899             }
1900           }
1901 
1902           /* set all values in patch to *val */
1903           snga_set_patch_value(type, ndim, loA, hiA, ld, data_ptr, val);
1904 
1905           /* release access to the data */
1906           pnga_release_update_block_grid(g_a, index);
1907         }
1908         /* increment index to get next block on processor */
1909         index[0] += topology[0];
1910         for (i = 0; i < ndim; i++) {
1911           if (index[i] >= blocks[i] && i<ndim-1) {
1912             index[i] = proc_index[i];
1913             index[i+1] += topology[i+1];
1914           }
1915         }
1916       }
1917     }
1918   }
1919 #endif
1920   if(local_sync_end)pnga_sync();
1921 }
1922 
1923 
snga_scale_patch_value(Integer type,Integer ndim,Integer * loA,Integer * hiA,Integer * ld,void * src_data_ptr,void * alpha)1924 static void snga_scale_patch_value(Integer type, Integer ndim, Integer *loA, Integer *hiA,
1925                      Integer *ld, void *src_data_ptr, void *alpha)
1926 {
1927   Integer n1dim, i, j, idx;
1928   Integer bvalue[MAXDIM], bunit[MAXDIM], baseld[MAXDIM];
1929   DoublePrecision tmp1_real, tmp1_imag, tmp2_real, tmp2_imag;
1930   float ftmp1_real, ftmp1_imag, ftmp2_real, ftmp2_imag;
1931   /* number of n-element of the first dimension */
1932   n1dim = 1; for(i=1; i<ndim; i++) n1dim *= (hiA[i] - loA[i] + 1);
1933 
1934   /* calculate the destination indices */
1935   bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
1936   /* baseld[0] = ld[0]
1937    * baseld[1] = ld[0] * ld[1]
1938    * baseld[2] = ld[0] * ld[1] * ld[2] .....
1939    */
1940   baseld[0] = ld[0]; baseld[1] = baseld[0] *ld[1];
1941   for(i=2; i<ndim; i++) {
1942     bvalue[i] = 0;
1943     bunit[i] = bunit[i-1] * (hiA[i-1] - loA[i-1] + 1);
1944     baseld[i] = baseld[i-1] * ld[i];
1945   }
1946 
1947   /* scale local part of g_a */
1948   switch(type){
1949     case C_DBL:
1950       for(i=0; i<n1dim; i++) {
1951         idx = 0;
1952         for(j=1; j<ndim; j++) {
1953           idx += bvalue[j] * baseld[j-1];
1954           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1955           if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1956         }
1957 
1958         for(j=0; j<(hiA[0]-loA[0]+1); j++)
1959           ((double*)src_data_ptr)[idx+j]  *=
1960             *(double*)alpha;
1961       }
1962       break;
1963     case C_DCPL:
1964       for(i=0; i<n1dim; i++) {
1965         idx = 0;
1966         for(j=1; j<ndim; j++) {
1967           idx += bvalue[j] * baseld[j-1];
1968           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1969           if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1970         }
1971 
1972         for(j=0; j<(hiA[0]-loA[0]+1); j++) {
1973           tmp1_real =((DoubleComplex *)src_data_ptr)[idx+j].real;
1974           tmp1_imag =((DoubleComplex *)src_data_ptr)[idx+j].imag;
1975           tmp2_real = (*(DoubleComplex*)alpha).real;
1976           tmp2_imag = (*(DoubleComplex*)alpha).imag;
1977 
1978           ((DoubleComplex *)src_data_ptr)[idx+j].real =
1979             tmp1_real*tmp2_real  - tmp1_imag * tmp2_imag;
1980           ((DoubleComplex *)src_data_ptr)[idx+j].imag =
1981             tmp2_imag*tmp1_real  + tmp1_imag * tmp2_real;
1982         }
1983       }
1984       break;
1985     case C_SCPL:
1986       for(i=0; i<n1dim; i++) {
1987         idx = 0;
1988         for(j=1; j<ndim; j++) {
1989           idx += bvalue[j] * baseld[j-1];
1990           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1991           if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1992         }
1993 
1994         for(j=0; j<(hiA[0]-loA[0]+1); j++) {
1995           ftmp1_real =((SingleComplex *)src_data_ptr)[idx+j].real;
1996           ftmp1_imag =((SingleComplex *)src_data_ptr)[idx+j].imag;
1997           ftmp2_real = (*(SingleComplex*)alpha).real;
1998           ftmp2_imag = (*(SingleComplex*)alpha).imag;
1999 
2000           ((SingleComplex *)src_data_ptr)[idx+j].real =
2001             ftmp1_real*ftmp2_real  - ftmp1_imag * ftmp2_imag;
2002           ((SingleComplex *)src_data_ptr)[idx+j].imag =
2003             ftmp2_imag*ftmp1_real  + ftmp1_imag * ftmp2_real;
2004         }
2005       }
2006       break;
2007     case C_INT:
2008       for(i=0; i<n1dim; i++) {
2009         idx = 0;
2010         for(j=1; j<ndim; j++) {
2011           idx += bvalue[j] * baseld[j-1];
2012           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2013           if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
2014         }
2015 
2016         for(j=0; j<(hiA[0]-loA[0]+1); j++)
2017           ((int*)src_data_ptr)[idx+j]  *= *(int*)alpha;
2018       }
2019       break;
2020     case C_LONG:
2021       for(i=0; i<n1dim; i++) {
2022         idx = 0;
2023         for(j=1; j<ndim; j++) {
2024           idx += bvalue[j] * baseld[j-1];
2025           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2026           if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
2027         }
2028 
2029         for(j=0; j<(hiA[0]-loA[0]+1); j++)
2030           ((long *)src_data_ptr)[idx+j]  *= *(long*)alpha;
2031       }
2032       break;
2033     case C_LONGLONG:
2034       for(i=0; i<n1dim; i++) {
2035         idx = 0;
2036         for(j=1; j<ndim; j++) {
2037           idx += bvalue[j] * baseld[j-1];
2038           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2039           if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
2040         }
2041 
2042         for(j=0; j<(hiA[0]-loA[0]+1); j++)
2043           ((long long*)src_data_ptr)[idx+j]  *= *(long long*)alpha;
2044       }
2045       break;
2046     case C_FLOAT:
2047       for(i=0; i<n1dim; i++) {
2048         idx = 0;
2049         for(j=1; j<ndim; j++) {
2050           idx += bvalue[j] * baseld[j-1];
2051           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2052           if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
2053         }
2054 
2055         for(j=0; j<(hiA[0]-loA[0]+1); j++)
2056           ((float *)src_data_ptr)[idx+j]  *= *(float*)alpha;
2057       }
2058       break;
2059     default: pnga_error(" wrong data type ",type);
2060   }
2061 }
2062 
2063 
2064 /*\ SCALE ARRAY
2065 \*/
2066 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2067 #   pragma weak wnga_scale_patch = pnga_scale_patch
2068 #endif
pnga_scale_patch(Integer g_a,Integer * lo,Integer * hi,void * alpha)2069 void pnga_scale_patch(Integer g_a, Integer *lo, Integer *hi, void *alpha)
2070 {
2071   Integer ndim, dims[MAXDIM], type;
2072   Integer loA[MAXDIM], hiA[MAXDIM];
2073   Integer ld[MAXDIM];
2074   char *src_data_ptr;
2075   Integer num_blocks, nproc;
2076   Integer me= pnga_nodeid();
2077   int local_sync_begin,local_sync_end;
2078   _iterator_hdl hdl;
2079 
2080   local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2081   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2082   if(local_sync_begin)pnga_sync();
2083 
2084 
2085   pnga_inquire(g_a,  &type, &ndim, dims);
2086   num_blocks = pnga_total_blocks(g_a);
2087 
2088 #if 1
2089   pnga_local_iterator_init(g_a, &hdl);
2090   while (pnga_local_iterator_next(&hdl,loA,hiA,&src_data_ptr,ld)) {
2091     Integer offset, j, jtmp, chk;
2092     Integer loS[MAXDIM];
2093     /* loA is changed by pnga_patch_intersect, so
2094        save a copy */
2095     for (j=0; j<ndim; j++) {
2096       loS[j] = loA[j];
2097     }
2098 
2099     /*  determine subset of my local patch to access  */
2100     /*  Output is in loA and hiA */
2101     if(pnga_patch_intersect(lo, hi, loA, hiA, ndim)){
2102 
2103       /* Check for partial overlap */
2104       chk = 1;
2105       for (j=0; j<ndim; j++) {
2106         if (loS[j] < loA[j]) {
2107           chk=0;
2108           break;
2109         }
2110       }
2111       if (!chk) {
2112         /* Evaluate additional offset for pointer */
2113         offset = 0;
2114         jtmp = 1;
2115         for (j=0; j<ndim-1; j++) {
2116           offset += (loA[j]-loS[j])*jtmp;
2117           jtmp *= ld[j];
2118         }
2119         offset += (loA[ndim-1]-loS[ndim-1])*jtmp;
2120         switch (type){
2121           case C_INT:
2122             src_data_ptr = (void*)((int*)src_data_ptr + offset);
2123             break;
2124           case C_DCPL:
2125             src_data_ptr = (void*)((double*)src_data_ptr + 2*offset);
2126             break;
2127           case C_SCPL:
2128             src_data_ptr = (void*)((float*)src_data_ptr + 2*offset);
2129             break;
2130           case C_DBL:
2131             src_data_ptr = (void*)((double*)src_data_ptr + offset);
2132             break;
2133           case C_FLOAT:
2134             src_data_ptr = (void*)((float*)src_data_ptr + offset);
2135             break;
2136           case C_LONG:
2137             src_data_ptr = (void*)((long*)src_data_ptr + offset);
2138             break;
2139           case C_LONGLONG:
2140             src_data_ptr = (void*)((long long*)src_data_ptr + offset);
2141             break;
2142           default: pnga_error(" wrong data type ",type);
2143         }
2144       }
2145 
2146       /* set all values in patch to *val */
2147       snga_scale_patch_value(type, ndim, loA, hiA, ld, src_data_ptr, alpha);
2148     }
2149 
2150   }
2151 #else
2152   if (num_blocks < 0) {
2153     pnga_distribution(g_a, me, loA, hiA);
2154 
2155     /* determine subset of my patch to access */
2156     if (pnga_patch_intersect(lo, hi, loA, hiA, ndim)){
2157       pnga_access_ptr(g_a, loA, hiA, &src_data_ptr, ld);
2158 
2159       snga_scale_patch_value(type, ndim, loA, hiA, ld, src_data_ptr, alpha);
2160 
2161       /* release access to the data */
2162       pnga_release_update(g_a, loA, hiA);
2163     }
2164   } else {
2165     Integer offset, i, j, jtmp, chk;
2166     Integer loS[MAXDIM];
2167     nproc = pnga_nnodes();
2168     /* using simple block-cyclic data distribution */
2169     if (!pnga_uses_proc_grid(g_a)){
2170       for (i=me; i<num_blocks; i += nproc) {
2171         /* get limits of VISIBLE patch */
2172         pnga_distribution(g_a, i, loA, hiA);
2173 
2174         /* loA is changed by pnga_patch_intersect, so
2175            save a copy */
2176         for (j=0; j<ndim; j++) {
2177           loS[j] = loA[j];
2178         }
2179 
2180         /*  determine subset of my local patch to access  */
2181         /*  Output is in loA and hiA */
2182         if(pnga_patch_intersect(lo, hi, loA, hiA, ndim)){
2183 
2184           /* get src_data_ptr to corner of patch */
2185           /* ld are leading dimensions INCLUDING ghost cells */
2186           pnga_access_block_ptr(g_a, i, &src_data_ptr, ld);
2187 
2188           /* Check for partial overlap */
2189           chk = 1;
2190           for (j=0; j<ndim; j++) {
2191             if (loS[j] < loA[j]) {
2192               chk=0;
2193               break;
2194             }
2195           }
2196           if (!chk) {
2197             /* Evaluate additional offset for pointer */
2198             offset = 0;
2199             jtmp = 1;
2200             for (j=0; j<ndim-1; j++) {
2201               offset += (loA[j]-loS[j])*jtmp;
2202               jtmp *= ld[j];
2203             }
2204             offset += (loA[ndim-1]-loS[ndim-1])*jtmp;
2205             switch (type){
2206               case C_INT:
2207                 src_data_ptr = (void*)((int*)src_data_ptr + offset);
2208                 break;
2209               case C_DCPL:
2210                 src_data_ptr = (void*)((double*)src_data_ptr + 2*offset);
2211                 break;
2212               case C_SCPL:
2213                 src_data_ptr = (void*)((float*)src_data_ptr + 2*offset);
2214                 break;
2215               case C_DBL:
2216                 src_data_ptr = (void*)((double*)src_data_ptr + offset);
2217                 break;
2218               case C_FLOAT:
2219                 src_data_ptr = (void*)((float*)src_data_ptr + offset);
2220                 break;
2221               case C_LONG:
2222                 src_data_ptr = (void*)((long*)src_data_ptr + offset);
2223                 break;
2224               case C_LONGLONG:
2225                 src_data_ptr = (void*)((long long*)src_data_ptr + offset);
2226                 break;
2227               default: pnga_error(" wrong data type ",type);
2228             }
2229           }
2230 
2231           /* set all values in patch to *val */
2232           snga_scale_patch_value(type, ndim, loA, hiA, ld, src_data_ptr, alpha);
2233 
2234           /* release access to the data */
2235           pnga_release_update_block(g_a, i);
2236         }
2237       }
2238     } else {
2239       /* using scalapack block-cyclic data distribution */
2240       Integer proc_index[MAXDIM], index[MAXDIM];
2241       Integer topology[MAXDIM];
2242       Integer blocks[MAXDIM], block_dims[MAXDIM];
2243       pnga_get_proc_index(g_a, me, proc_index);
2244       pnga_get_proc_index(g_a, me, index);
2245       pnga_get_block_info(g_a, blocks, block_dims);
2246       pnga_get_proc_grid(g_a, topology);
2247       while (index[ndim-1] < blocks[ndim-1]) {
2248         /* find bounding coordinates of block */
2249         chk = 1;
2250         for (i = 0; i < ndim; i++) {
2251           loA[i] = index[i]*block_dims[i]+1;
2252           hiA[i] = (index[i] + 1)*block_dims[i];
2253           if (hiA[i] > dims[i]) hiA[i] = dims[i];
2254           if (hiA[i] < loA[i]) chk = 0;
2255         }
2256 
2257         /* loA is changed by pnga_patch_intersect, so
2258            save a copy */
2259         for (j=0; j<ndim; j++) {
2260           loS[j] = loA[j];
2261         }
2262 
2263         /*  determine subset of my local patch to access  */
2264         /*  Output is in loA and hiA */
2265         if(pnga_patch_intersect(lo, hi, loA, hiA, ndim)){
2266 
2267           /* get data_ptr to corner of patch */
2268           /* ld are leading dimensions for block */
2269           pnga_access_block_grid_ptr(g_a, index, &src_data_ptr, ld);
2270 
2271           /* Check for partial overlap */
2272           chk = 1;
2273           for (j=0; j<ndim; j++) {
2274             if (loS[j] < loA[j]) {
2275               chk=0;
2276               break;
2277             }
2278           }
2279           if (!chk) {
2280             /* Evaluate additional offset for pointer */
2281             offset = 0;
2282             jtmp = 1;
2283             for (j=0; j<ndim-1; j++) {
2284               offset += (loA[j]-loS[j])*jtmp;
2285               jtmp *= ld[j];
2286             }
2287             offset += (loA[ndim-1]-loS[ndim-1])*jtmp;
2288             switch (type){
2289               case C_INT:
2290                 src_data_ptr = (void*)((int*)src_data_ptr + offset);
2291                 break;
2292               case C_DCPL:
2293                 src_data_ptr = (void*)((double*)src_data_ptr + 2*offset);
2294                 break;
2295               case C_SCPL:
2296                 src_data_ptr = (void*)((float*)src_data_ptr + 2*offset);
2297                 break;
2298               case C_DBL:
2299                 src_data_ptr = (void*)((double*)src_data_ptr + offset);
2300                 break;
2301               case C_FLOAT:
2302                 src_data_ptr = (void*)((float*)src_data_ptr + offset);
2303                 break;
2304               case C_LONG:
2305                 src_data_ptr = (void*)((long*)src_data_ptr + offset);
2306                 break;
2307               case C_LONGLONG:
2308                 src_data_ptr = (void*)((long long*)src_data_ptr + offset);
2309                 break;
2310               default: pnga_error(" wrong data type ",type);
2311             }
2312           }
2313 
2314           /* set all values in patch to *val */
2315           snga_scale_patch_value(type, ndim, loA, hiA, ld, src_data_ptr, alpha);
2316 
2317           /* release access to the data */
2318           pnga_release_update_block_grid(g_a, index);
2319         }
2320         /* increment index to get next block on processor */
2321         index[0] += topology[0];
2322         for (i = 0; i < ndim; i++) {
2323           if (index[i] >= blocks[i] && i<ndim-1) {
2324             index[i] = proc_index[i];
2325             index[i+1] += topology[i+1];
2326           }
2327         }
2328       }
2329     }
2330   }
2331 #endif
2332   if(local_sync_end)pnga_sync();
2333 }
2334 
2335 /*\ Utility function to accumulate patch values C = C + alpha*A
2336 \*/
snga_acc_patch_values(Integer type,void * alpha,Integer ndim,Integer * loC,Integer * hiC,Integer * ldC,void * A_ptr,void * C_ptr)2337 static void snga_acc_patch_values(Integer type, void* alpha, Integer ndim,
2338                                   Integer *loC, Integer *hiC, Integer *ldC,
2339                                   void *A_ptr, void *C_ptr)
2340 {
2341   Integer bvalue[MAXDIM], bunit[MAXDIM], baseldC[MAXDIM];
2342   Integer idx, n1dim;
2343   Integer i, j;
2344   /* compute "local" add */
2345 
2346   /* number of n-element of the first dimension */
2347   n1dim = 1; for(i=1; i<ndim; i++) n1dim *= (hiC[i] - loC[i] + 1);
2348 
2349   /* calculate the destination indices */
2350   bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
2351   /* baseld[0] = ld[0]
2352    * baseld[1] = ld[0] * ld[1]
2353    * baseld[2] = ld[0] * ld[1] * ld[2] .....
2354    */
2355   baseldC[0] = ldC[0]; baseldC[1] = baseldC[0] *ldC[1];
2356   for(i=2; i<ndim; i++) {
2357     bvalue[i] = 0;
2358     bunit[i] = bunit[i-1] * (hiC[i-1] - loC[i-1] + 1);
2359     baseldC[i] = baseldC[i-1] * ldC[i];
2360   }
2361 
2362   switch(type){
2363     case C_DBL:
2364       for(i=0; i<n1dim; i++) {
2365         idx = 0;
2366         for(j=1; j<ndim; j++) {
2367           idx += bvalue[j] * baseldC[j-1];
2368           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2369           if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2370         }
2371 
2372         for(j=0; j<(hiC[0]-loC[0]+1); j++)
2373           ((double*)C_ptr)[idx+j] = ((double*)C_ptr)[idx+j]
2374             + *(double*)alpha * ((double*)A_ptr)[idx+j];
2375       }
2376       break;
2377     case C_DCPL:
2378       for(i=0; i<n1dim; i++) {
2379         idx = 0;
2380         for(j=1; j<ndim; j++) {
2381           idx += bvalue[j] * baseldC[j-1];
2382           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2383           if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2384         }
2385 
2386         for(j=0; j<(hiC[0]-loC[0]+1); j++) {
2387           DoubleComplex a = ((DoubleComplex *)A_ptr)[idx+j];
2388           DoubleComplex x= *(DoubleComplex*)alpha;
2389           ((DoubleComplex *)C_ptr)[idx+j].real =
2390             ((DoubleComplex *)C_ptr)[idx+j].real
2391             + x.real*a.real - x.imag*a.imag;
2392           ((DoubleComplex *)C_ptr)[idx+j].imag =
2393             ((DoubleComplex *)C_ptr)[idx+j].imag
2394             + x.real*a.imag + x.imag*a.real;
2395         }
2396       }
2397       break;
2398     case C_SCPL:
2399       for(i=0; i<n1dim; i++) {
2400         idx = 0;
2401         for(j=1; j<ndim; j++) {
2402           idx += bvalue[j] * baseldC[j-1];
2403           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2404           if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2405         }
2406 
2407         for(j=0; j<(hiC[0]-loC[0]+1); j++) {
2408           SingleComplex a = ((SingleComplex *)A_ptr)[idx+j];
2409           SingleComplex x= *(SingleComplex*)alpha;
2410           ((SingleComplex *)C_ptr)[idx+j].real =
2411             ((SingleComplex *)C_ptr)[idx+j].real
2412             + x.real*a.real - x.imag*a.imag;
2413           ((SingleComplex *)C_ptr)[idx+j].imag =
2414             ((SingleComplex *)C_ptr)[idx+j].imag
2415             + x.real*a.imag + x.imag*a.real;
2416         }
2417       }
2418       break;
2419     case C_INT:
2420       for(i=0; i<n1dim; i++) {
2421         idx = 0;
2422         for(j=1; j<ndim; j++) {
2423           idx += bvalue[j] * baseldC[j-1];
2424           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2425           if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2426         }
2427 
2428         for(j=0; j<(hiC[0]-loC[0]+1); j++)
2429           ((int*)C_ptr)[idx+j] = ((int*)C_ptr)[idx+j]
2430             + *(int *)alpha * ((int*)A_ptr)[idx+j];
2431       }
2432       break;
2433     case C_FLOAT:
2434       for(i=0; i<n1dim; i++) {
2435         idx = 0;
2436         for(j=1; j<ndim; j++) {
2437           idx += bvalue[j] * baseldC[j-1];
2438           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2439           if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2440         }
2441 
2442         for(j=0; j<(hiC[0]-loC[0]+1); j++)
2443           ((float *)C_ptr)[idx+j] = ((float *)C_ptr)[idx+j]
2444             + *(float *)alpha * ((float *)A_ptr)[idx+j];
2445       }
2446       break;
2447     case C_LONG:
2448       for(i=0; i<n1dim; i++) {
2449         idx = 0;
2450         for(j=1; j<ndim; j++) {
2451           idx += bvalue[j] * baseldC[j-1];
2452           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2453           if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2454         }
2455 
2456         for(j=0; j<(hiC[0]-loC[0]+1); j++)
2457           ((long *)C_ptr)[idx+j] = ((long *)C_ptr)[idx+j]
2458             + *(long *)alpha * ((long *)A_ptr)[idx+j];
2459       }
2460       break;
2461     case C_LONGLONG:
2462       for(i=0; i<n1dim; i++) {
2463         idx = 0;
2464         for(j=1; j<ndim; j++) {
2465           idx += bvalue[j] * baseldC[j-1];
2466           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2467           if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2468         }
2469 
2470         for(j=0; j<(hiC[0]-loC[0]+1); j++)
2471           ((long long*)C_ptr)[idx+j] = ((long long*)C_ptr)[idx+j]
2472             + *(long long*)alpha * ((long long*)A_ptr)[idx+j];
2473       }
2474       break;
2475     default: pnga_error(" wrong data type ",type);
2476   }
2477 }
2478 
2479 /*\ Utility function to add patch values together
2480 \*/
snga_add_patch_values(Integer type,void * alpha,void * beta,Integer ndim,Integer * loC,Integer * hiC,Integer * ldC,void * A_ptr,void * B_ptr,void * C_ptr)2481 static void snga_add_patch_values(Integer type, void* alpha, void *beta,
2482                            Integer ndim, Integer *loC, Integer *hiC, Integer *ldC,
2483                            void *A_ptr, void *B_ptr, void *C_ptr)
2484 {
2485   Integer bvalue[MAXDIM], bunit[MAXDIM], baseldC[MAXDIM];
2486   Integer idx, n1dim;
2487   Integer i, j;
2488   /* compute "local" add */
2489 
2490   /* number of n-element of the first dimension */
2491   n1dim = 1; for(i=1; i<ndim; i++) n1dim *= (hiC[i] - loC[i] + 1);
2492 
2493   /* calculate the destination indices */
2494   bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
2495   /* baseld[0] = ld[0]
2496    * baseld[1] = ld[0] * ld[1]
2497    * baseld[2] = ld[0] * ld[1] * ld[2] .....
2498    */
2499   baseldC[0] = ldC[0]; baseldC[1] = baseldC[0] *ldC[1];
2500   for(i=2; i<ndim; i++) {
2501     bvalue[i] = 0;
2502     bunit[i] = bunit[i-1] * (hiC[i-1] - loC[i-1] + 1);
2503     baseldC[i] = baseldC[i-1] * ldC[i];
2504   }
2505 
2506   switch(type){
2507     case C_DBL:
2508       for(i=0; i<n1dim; i++) {
2509         idx = 0;
2510         for(j=1; j<ndim; j++) {
2511           idx += bvalue[j] * baseldC[j-1];
2512           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2513           if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2514         }
2515 
2516         for(j=0; j<(hiC[0]-loC[0]+1); j++)
2517           ((double*)C_ptr)[idx+j] =
2518             *(double*)alpha *
2519             ((double*)A_ptr)[idx+j] +
2520             *(double*)beta *
2521             ((double*)B_ptr)[idx+j];
2522       }
2523       break;
2524     case C_DCPL:
2525       for(i=0; i<n1dim; i++) {
2526         idx = 0;
2527         for(j=1; j<ndim; j++) {
2528           idx += bvalue[j] * baseldC[j-1];
2529           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2530           if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2531         }
2532 
2533         for(j=0; j<(hiC[0]-loC[0]+1); j++) {
2534           DoubleComplex a = ((DoubleComplex *)A_ptr)[idx+j];
2535           DoubleComplex b = ((DoubleComplex *)B_ptr)[idx+j];
2536           DoubleComplex x= *(DoubleComplex*)alpha;
2537           DoubleComplex y= *(DoubleComplex*)beta;
2538           ((DoubleComplex *)C_ptr)[idx+j].real = x.real*a.real -
2539             x.imag*a.imag + y.real*b.real - y.imag*b.imag;
2540           ((DoubleComplex *)C_ptr)[idx+j].imag = x.real*a.imag +
2541             x.imag*a.real + y.real*b.imag + y.imag*b.real;
2542         }
2543       }
2544       break;
2545     case C_SCPL:
2546       for(i=0; i<n1dim; i++) {
2547         idx = 0;
2548         for(j=1; j<ndim; j++) {
2549           idx += bvalue[j] * baseldC[j-1];
2550           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2551           if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2552         }
2553 
2554         for(j=0; j<(hiC[0]-loC[0]+1); j++) {
2555           SingleComplex a = ((SingleComplex *)A_ptr)[idx+j];
2556           SingleComplex b = ((SingleComplex *)B_ptr)[idx+j];
2557           SingleComplex x= *(SingleComplex*)alpha;
2558           SingleComplex y= *(SingleComplex*)beta;
2559           ((SingleComplex *)C_ptr)[idx+j].real = x.real*a.real -
2560             x.imag*a.imag + y.real*b.real - y.imag*b.imag;
2561           ((SingleComplex *)C_ptr)[idx+j].imag = x.real*a.imag +
2562             x.imag*a.real + y.real*b.imag + y.imag*b.real;
2563         }
2564       }
2565       break;
2566     case C_INT:
2567       for(i=0; i<n1dim; i++) {
2568         idx = 0;
2569         for(j=1; j<ndim; j++) {
2570           idx += bvalue[j] * baseldC[j-1];
2571           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2572           if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2573         }
2574 
2575         for(j=0; j<(hiC[0]-loC[0]+1); j++)
2576           ((int*)C_ptr)[idx+j] = *(int *)alpha *
2577             ((int*)A_ptr)[idx+j] + *(int*)beta *
2578             ((int*)B_ptr)[idx+j];
2579       }
2580       break;
2581     case C_FLOAT:
2582       for(i=0; i<n1dim; i++) {
2583         idx = 0;
2584         for(j=1; j<ndim; j++) {
2585           idx += bvalue[j] * baseldC[j-1];
2586           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2587           if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2588         }
2589 
2590         for(j=0; j<(hiC[0]-loC[0]+1); j++)
2591           ((float *)C_ptr)[idx+j] = *(float *)alpha *
2592             ((float *)A_ptr)[idx+j] + *(float *)beta *
2593             ((float *)B_ptr)[idx+j];
2594       }
2595       break;
2596     case C_LONG:
2597       for(i=0; i<n1dim; i++) {
2598         idx = 0;
2599         for(j=1; j<ndim; j++) {
2600           idx += bvalue[j] * baseldC[j-1];
2601           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2602           if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2603         }
2604 
2605         for(j=0; j<(hiC[0]-loC[0]+1); j++)
2606           ((long *)C_ptr)[idx+j] = *(long *)alpha *
2607             ((long *)A_ptr)[idx+j] + *(long *)beta *
2608             ((long *)B_ptr)[idx+j];
2609       }
2610       break;
2611     case C_LONGLONG:
2612       for(i=0; i<n1dim; i++) {
2613         idx = 0;
2614         for(j=1; j<ndim; j++) {
2615           idx += bvalue[j] * baseldC[j-1];
2616           if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2617           if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2618         }
2619 
2620         for(j=0; j<(hiC[0]-loC[0]+1); j++)
2621           ((long long*)C_ptr)[idx+j] = *(long long*)alpha *
2622             ((long long*)A_ptr)[idx+j] + *(long long*)beta *
2623             ((long long*)B_ptr)[idx+j];
2624       }
2625       break;
2626     default: pnga_error(" wrong data type ",type);
2627   }
2628 }
2629 
2630 
2631 /*\  SCALED ADDITION of two patches
2632 \*/
2633 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2634 #   pragma weak wnga_add_patch = pnga_add_patch
2635 #endif
pnga_add_patch(alpha,g_a,alo,ahi,beta,g_b,blo,bhi,g_c,clo,chi)2636 void pnga_add_patch(alpha, g_a, alo, ahi, beta,  g_b, blo, bhi,
2637                          g_c, clo, chi)
2638 Integer g_a, *alo, *ahi;    /* patch of g_a */
2639 Integer g_b, *blo, *bhi;    /* patch of g_b */
2640 Integer g_c, *clo, *chi;    /* patch of g_c */
2641 void *alpha, *beta;
2642 {
2643   Integer i, j;
2644   Integer compatible_a, compatible_b;
2645   Integer atype, btype, ctype;
2646   Integer andim, adims[MAXDIM], bndim, bdims[MAXDIM], cndim, cdims[MAXDIM];
2647   Integer loA[MAXDIM], hiA[MAXDIM], ldA[MAXDIM];
2648   Integer loB[MAXDIM], hiB[MAXDIM], ldB[MAXDIM];
2649   Integer loC[MAXDIM], hiC[MAXDIM], ldC[MAXDIM];
2650   char *A_ptr, *B_ptr, *C_ptr;
2651   Integer n1dim;
2652   Integer atotal, btotal;
2653   Integer g_A = g_a, g_B = g_b;
2654   Integer me= pnga_nodeid(), A_created=0, B_created=0;
2655   Integer nproc = pnga_nnodes();
2656   Integer num_blocks_a, num_blocks_b, num_blocks_c;
2657   char *tempname = "temp", notrans='n';
2658   int local_sync_begin,local_sync_end;
2659 
2660   local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2661   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2662   if(local_sync_begin)pnga_sync();
2663 
2664 
2665   pnga_inquire(g_a, &atype, &andim, adims);
2666   pnga_inquire(g_b, &btype, &bndim, bdims);
2667   pnga_inquire(g_c, &ctype, &cndim, cdims);
2668 
2669   if(atype != btype || atype != ctype ) pnga_error(" types mismatch ", 0L);
2670 
2671   /* check if patch indices and dims match */
2672   for(i=0; i<andim; i++)
2673     if(alo[i] <= 0 || ahi[i] > adims[i])
2674       pnga_error("g_a indices out of range ", g_a);
2675   for(i=0; i<bndim; i++)
2676     if(blo[i] <= 0 || bhi[i] > bdims[i])
2677       pnga_error("g_b indices out of range ", g_b);
2678   for(i=0; i<cndim; i++)
2679     if(clo[i] <= 0 || chi[i] > cdims[i])
2680       pnga_error("g_c indices out of range ", g_c);
2681 
2682   /* check if numbers of elements in patches match each other */
2683   n1dim = 1; for(i=0; i<cndim; i++) n1dim *= (chi[i] - clo[i] + 1);
2684   atotal = 1; for(i=0; i<andim; i++) atotal *= (ahi[i] - alo[i] + 1);
2685   btotal = 1; for(i=0; i<bndim; i++) btotal *= (bhi[i] - blo[i] + 1);
2686 
2687   if((atotal != n1dim) || (btotal != n1dim))
2688     pnga_error("  capacities of patches do not match ", 0L);
2689 
2690   num_blocks_a = pnga_total_blocks(g_a);
2691   num_blocks_b = pnga_total_blocks(g_b);
2692   num_blocks_c = pnga_total_blocks(g_c);
2693 
2694   if (num_blocks_a < 0 && num_blocks_b < 0 && num_blocks_c < 0) {
2695     /* find out coordinates of patches of g_a, g_b and g_c that I own */
2696     pnga_distribution( g_A, me, loA, hiA);
2697     pnga_distribution( g_B, me, loB, hiB);
2698     pnga_distribution( g_c, me, loC, hiC);
2699 
2700     /* test if the local portion of patches matches */
2701     if(pnga_comp_patch(andim, loA, hiA, cndim, loC, hiC) &&
2702        pnga_comp_patch(andim, alo, ahi, cndim, clo, chi)) compatible_a = 1;
2703     else compatible_a = 0;
2704     /* pnga_gop(pnga_type_f2c(MT_F_INT), &compatible_a, 1, "*"); */
2705     pnga_gop(pnga_type_f2c(MT_F_INT), &compatible_a, 1, "&&");
2706     if(pnga_comp_patch(bndim, loB, hiB, cndim, loC, hiC) &&
2707        pnga_comp_patch(bndim, blo, bhi, cndim, clo, chi)) compatible_b = 1;
2708     else compatible_b = 0;
2709     /* pnga_gop(pnga_type_f2c(MT_F_INT), &compatible_b, 1, "*"); */
2710     pnga_gop(pnga_type_f2c(MT_F_INT), &compatible_b, 1, "&&");
2711     if (compatible_a && compatible_b) {
2712       if(andim > bndim) cndim = bndim;
2713       if(andim < bndim) cndim = andim;
2714 
2715       if(!pnga_comp_patch(andim, loA, hiA, cndim, loC, hiC))
2716         pnga_error(" A patch mismatch ", g_A);
2717       if(!pnga_comp_patch(bndim, loB, hiB, cndim, loC, hiC))
2718         pnga_error(" B patch mismatch ", g_B);
2719 
2720       /*  determine subsets of my patches to access  */
2721       if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)){
2722         pnga_access_ptr(g_A, loC, hiC, &A_ptr, ldA);
2723         pnga_access_ptr(g_B, loC, hiC, &B_ptr, ldB);
2724         pnga_access_ptr(g_c, loC, hiC, &C_ptr, ldC);
2725 
2726         snga_add_patch_values(atype, alpha, beta, cndim,
2727             loC, hiC, ldC, A_ptr, B_ptr, C_ptr);
2728 
2729         /* release access to the data */
2730         pnga_release       (g_A, loC, hiC);
2731         pnga_release       (g_B, loC, hiC);
2732         pnga_release_update(g_c, loC, hiC);
2733       }
2734     } else if (!compatible_a && compatible_b) {
2735       /* either patches or distributions do not match:
2736        *        - create a temp array that matches distribution of g_c
2737        *        - do C<= A
2738        */
2739       if(g_b != g_c) {
2740         pnga_copy_patch(&notrans, g_a, alo, ahi, g_c, clo, chi);
2741         andim = cndim;
2742         g_A = g_c;
2743         pnga_distribution(g_A, me, loA, hiA);
2744       } else {
2745         if (!pnga_duplicate(g_c, &g_A, tempname))
2746           pnga_error("ga_dadd_patch: dup failed", 0L);
2747         pnga_copy_patch(&notrans, g_a, alo, ahi, g_A, clo, chi);
2748         andim = cndim;
2749         A_created = 1;
2750         pnga_distribution(g_A, me, loA, hiA);
2751       }
2752       if(andim > bndim) cndim = bndim;
2753       if(andim < bndim) cndim = andim;
2754 
2755       if(!pnga_comp_patch(andim, loA, hiA, cndim, loC, hiC))
2756         pnga_error(" A patch mismatch ", g_A);
2757       if(!pnga_comp_patch(bndim, loB, hiB, cndim, loC, hiC))
2758         pnga_error(" B patch mismatch ", g_B);
2759 
2760       /*  determine subsets of my patches to access  */
2761       if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)){
2762         pnga_access_ptr(g_A, loC, hiC, &A_ptr, ldA);
2763         pnga_access_ptr(g_B, loC, hiC, &B_ptr, ldB);
2764         pnga_access_ptr(g_c, loC, hiC, &C_ptr, ldC);
2765 
2766         snga_add_patch_values(atype, alpha, beta, cndim,
2767             loC, hiC, ldC, A_ptr, B_ptr, C_ptr);
2768 
2769         /* release access to the data */
2770         pnga_release       (g_A, loC, hiC);
2771         pnga_release       (g_B, loC, hiC);
2772         pnga_release_update(g_c, loC, hiC);
2773       }
2774     } else if (compatible_a && !compatible_b) {
2775       /* either patches or distributions do not match:
2776        *        - create a temp array that matches distribution of g_c
2777        *        - copy & reshape patch of g_b into g_B
2778        */
2779       if (!pnga_duplicate(g_c, &g_B, tempname))
2780         pnga_error("ga_dadd_patch: dup failed", 0L);
2781       pnga_copy_patch(&notrans, g_b, blo, bhi, g_B, clo, chi);
2782       bndim = cndim;
2783       B_created = 1;
2784       pnga_distribution(g_B, me, loB, hiB);
2785 
2786       if(andim > bndim) cndim = bndim;
2787       if(andim < bndim) cndim = andim;
2788 
2789       if(!pnga_comp_patch(andim, loA, hiA, cndim, loC, hiC))
2790         pnga_error(" A patch mismatch ", g_A);
2791       if(!pnga_comp_patch(bndim, loB, hiB, cndim, loC, hiC))
2792         pnga_error(" B patch mismatch ", g_B);
2793 
2794       /*  determine subsets of my patches to access  */
2795       if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)){
2796         pnga_access_ptr(g_A, loC, hiC, &A_ptr, ldA);
2797         pnga_access_ptr(g_B, loC, hiC, &B_ptr, ldB);
2798         pnga_access_ptr(g_c, loC, hiC, &C_ptr, ldC);
2799 
2800         snga_add_patch_values(atype, alpha, beta, cndim,
2801             loC, hiC, ldC, A_ptr, B_ptr, C_ptr);
2802 
2803         /* release access to the data */
2804         pnga_release       (g_A, loC, hiC);
2805         pnga_release       (g_B, loC, hiC);
2806         pnga_release_update(g_c, loC, hiC);
2807       }
2808     } else if (!compatible_a && !compatible_b) {
2809       /* there is no match between any of the global arrays */
2810       if (!pnga_duplicate(g_c, &g_B, tempname))
2811         pnga_error("ga_dadd_patch: dup failed", 0L);
2812       pnga_copy_patch(&notrans, g_b, blo, bhi, g_B, clo, chi);
2813       bndim = cndim;
2814       B_created = 1;
2815       if(andim > bndim) cndim = bndim;
2816       if(andim < bndim) cndim = andim;
2817       cndim = bndim;
2818       pnga_copy_patch(&notrans, g_a, alo, ahi, g_c, clo, chi);
2819       pnga_scale_patch(g_c, clo, chi, alpha);
2820       /*  determine subsets of my patches to access  */
2821       if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)){
2822         pnga_access_ptr(g_B, loC, hiC, &B_ptr, ldB);
2823         pnga_access_ptr(g_c, loC, hiC, &C_ptr, ldC);
2824 
2825         snga_acc_patch_values(atype, beta, cndim, loC, hiC, ldC,
2826                               B_ptr, C_ptr);
2827         /* release access to the data */
2828         pnga_release       (g_B, loC, hiC);
2829         pnga_release_update(g_c, loC, hiC);
2830       }
2831     }
2832   } else {
2833     _iterator_hdl hdl_a, hdl_b, hdl_c;
2834     /* create copies of arrays A and B that are identically distributed
2835        as C*/
2836     if (!pnga_duplicate(g_c, &g_A, tempname))
2837       pnga_error("ga_dadd_patch: dup failed", 0L);
2838     pnga_copy_patch(&notrans, g_a, alo, ahi, g_A, clo, chi);
2839     andim = cndim;
2840     A_created = 1;
2841 
2842     if (!pnga_duplicate(g_c, &g_B, tempname))
2843       pnga_error("ga_dadd_patch: dup failed", 0L);
2844     pnga_copy_patch(&notrans, g_b, blo, bhi, g_B, clo, chi);
2845     bndim = cndim;
2846     B_created = 1;
2847 
2848 #if 1
2849     pnga_local_iterator_init(g_A, &hdl_a);
2850     pnga_local_iterator_init(g_B, &hdl_b);
2851     pnga_local_iterator_init(g_c, &hdl_c);
2852     while (pnga_local_iterator_next(&hdl_c,loC,hiC,&C_ptr,ldC)) {
2853       pnga_local_iterator_next(&hdl_a,loA,hiA,&A_ptr,ldA);
2854       pnga_local_iterator_next(&hdl_b,loB,hiB,&B_ptr,ldB);
2855       Integer idx, lod[MAXDIM]/*, hid[MAXDIM]*/;
2856       Integer offset, jtot, last;
2857       /* make temporary copies of loC and hiC since pnga_patch_intersect
2858          destroys original versions */
2859       for (j=0; j<cndim; j++) {
2860         lod[j] = loC[j];
2861         /*hid[j] = hiC[j];*/
2862       }
2863       if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)) {
2864 
2865         /* evaluate offsets for system */
2866         offset = 0;
2867         last = cndim - 1;
2868         jtot = 1;
2869         for (j=0; j<last; j++) {
2870           offset += (loC[j] - lod[j])*jtot;
2871           jtot *= ldC[j];
2872         }
2873         offset += (loC[last]-lod[last])*jtot;
2874 
2875         switch(ctype) {
2876           case C_DBL:
2877             A_ptr = (void*)((double*)(A_ptr) + offset);
2878             B_ptr = (void*)((double*)(B_ptr) + offset);
2879             C_ptr = (void*)((double*)(C_ptr) + offset);
2880             break;
2881           case C_INT:
2882             A_ptr = (void*)((int*)(A_ptr) + offset);
2883             B_ptr = (void*)((int*)(B_ptr) + offset);
2884             C_ptr = (void*)((int*)(C_ptr) + offset);
2885             break;
2886           case C_DCPL:
2887             A_ptr = (void*)((DoubleComplex*)(A_ptr) + offset);
2888             B_ptr = (void*)((DoubleComplex*)(B_ptr) + offset);
2889             C_ptr = (void*)((DoubleComplex*)(C_ptr) + offset);
2890             break;
2891           case C_SCPL:
2892             A_ptr = (void*)((SingleComplex*)(A_ptr) + offset);
2893             B_ptr = (void*)((SingleComplex*)(B_ptr) + offset);
2894             C_ptr = (void*)((SingleComplex*)(C_ptr) + offset);
2895             break;
2896           case C_FLOAT:
2897             A_ptr = (void*)((float*)(A_ptr) + offset);
2898             B_ptr = (void*)((float*)(B_ptr) + offset);
2899             C_ptr = (void*)((float*)(C_ptr) + offset);
2900             break;
2901           case C_LONG:
2902             A_ptr = (void*)((long*)(A_ptr) + offset);
2903             B_ptr = (void*)((long*)(B_ptr) + offset);
2904             C_ptr = (void*)((long*)(C_ptr) + offset);
2905             break;
2906           case C_LONGLONG:
2907             A_ptr = (void*)((long long*)(A_ptr) + offset);
2908             B_ptr = (void*)((long long*)(B_ptr) + offset);
2909             C_ptr = (void*)((long long*)(C_ptr) + offset);
2910             break;
2911           default:
2912             break;
2913         }
2914         snga_add_patch_values(atype, alpha, beta, cndim,
2915             loC, hiC, ldC, A_ptr, B_ptr, C_ptr);
2916       }
2917     }
2918 #else
2919     /* C is normally distributed so just add copies together for regular
2920        arrays */
2921     if (num_blocks_c < 0) {
2922       pnga_distribution(g_c, me, loC, hiC);
2923       if(andim > bndim) cndim = bndim;
2924       if(andim < bndim) cndim = andim;
2925       if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)){
2926         pnga_access_ptr(g_A, loC, hiC, &A_ptr, ldA);
2927         pnga_access_ptr(g_B, loC, hiC, &B_ptr, ldB);
2928         pnga_access_ptr(g_c, loC, hiC, &C_ptr, ldC);
2929 
2930         snga_add_patch_values(atype, alpha, beta, cndim,
2931             loC, hiC, ldC, A_ptr, B_ptr, C_ptr);
2932 
2933         /* release access to the data */
2934         pnga_release       (g_A, loC, hiC);
2935         pnga_release       (g_B, loC, hiC);
2936         pnga_release_update(g_c, loC, hiC);
2937       }
2938     } else {
2939       Integer idx, lod[MAXDIM]/*, hid[MAXDIM]*/;
2940       Integer offset, jtot, last;
2941       /* Simple block-cyclic data disribution */
2942       if (!pnga_uses_proc_grid(g_c)) {
2943         for (idx = me; idx < num_blocks_c; idx += nproc) {
2944           pnga_distribution(g_c, idx, loC, hiC);
2945           /* make temporary copies of loC and hiC since pnga_patch_intersect
2946              destroys original versions */
2947           for (j=0; j<cndim; j++) {
2948             lod[j] = loC[j];
2949             /*hid[j] = hiC[j];*/
2950           }
2951           if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)) {
2952             pnga_access_block_ptr(g_A, idx, &A_ptr, ldA);
2953             pnga_access_block_ptr(g_B, idx, &B_ptr, ldB);
2954             pnga_access_block_ptr(g_c, idx, &C_ptr, ldC);
2955 
2956             /* evaluate offsets for system */
2957             offset = 0;
2958             last = cndim - 1;
2959             jtot = 1;
2960             for (j=0; j<last; j++) {
2961               offset += (loC[j] - lod[j])*jtot;
2962               jtot *= ldC[j];
2963             }
2964             offset += (loC[last]-lod[last])*jtot;
2965 
2966             switch(ctype) {
2967               case C_DBL:
2968                 A_ptr = (void*)((double*)(A_ptr) + offset);
2969                 B_ptr = (void*)((double*)(B_ptr) + offset);
2970                 C_ptr = (void*)((double*)(C_ptr) + offset);
2971                 break;
2972               case C_INT:
2973                 A_ptr = (void*)((int*)(A_ptr) + offset);
2974                 B_ptr = (void*)((int*)(B_ptr) + offset);
2975                 C_ptr = (void*)((int*)(C_ptr) + offset);
2976                 break;
2977               case C_DCPL:
2978                 A_ptr = (void*)((DoubleComplex*)(A_ptr) + offset);
2979                 B_ptr = (void*)((DoubleComplex*)(B_ptr) + offset);
2980                 C_ptr = (void*)((DoubleComplex*)(C_ptr) + offset);
2981                 break;
2982               case C_SCPL:
2983                 A_ptr = (void*)((SingleComplex*)(A_ptr) + offset);
2984                 B_ptr = (void*)((SingleComplex*)(B_ptr) + offset);
2985                 C_ptr = (void*)((SingleComplex*)(C_ptr) + offset);
2986                 break;
2987               case C_FLOAT:
2988                 A_ptr = (void*)((float*)(A_ptr) + offset);
2989                 B_ptr = (void*)((float*)(B_ptr) + offset);
2990                 C_ptr = (void*)((float*)(C_ptr) + offset);
2991                 break;
2992               case C_LONG:
2993                 A_ptr = (void*)((long*)(A_ptr) + offset);
2994                 B_ptr = (void*)((long*)(B_ptr) + offset);
2995                 C_ptr = (void*)((long*)(C_ptr) + offset);
2996                 break;
2997               case C_LONGLONG:
2998                 A_ptr = (void*)((long long*)(A_ptr) + offset);
2999                 B_ptr = (void*)((long long*)(B_ptr) + offset);
3000                 C_ptr = (void*)((long long*)(C_ptr) + offset);
3001                 break;
3002               default:
3003                 break;
3004             }
3005             snga_add_patch_values(atype, alpha, beta, cndim,
3006                 loC, hiC, ldC, A_ptr, B_ptr, C_ptr);
3007 
3008             /* release access to the data */
3009             pnga_release_block       (g_A, idx);
3010             pnga_release_block       (g_B, idx);
3011             pnga_release_update_block(g_c, idx);
3012           }
3013         }
3014       } else {
3015         /* Uses scalapack block-cyclic data distribution */
3016         Integer lod[MAXDIM]/*, hid[MAXDIM], chk*/;
3017         Integer proc_index[MAXDIM], index[MAXDIM];
3018         Integer topology[MAXDIM];
3019         Integer blocks[MAXDIM], block_dims[MAXDIM];
3020         pnga_get_proc_index(g_c, me, proc_index);
3021         pnga_get_proc_index(g_c, me, index);
3022         pnga_get_block_info(g_c, blocks, block_dims);
3023         pnga_get_proc_grid(g_c, topology);
3024         while (index[cndim-1] < blocks[cndim-1]) {
3025           /* find bounding coordinates of block */
3026           /*chk = 1;*/
3027           for (i = 0; i < cndim; i++) {
3028             loC[i] = index[i]*block_dims[i]+1;
3029             hiC[i] = (index[i] + 1)*block_dims[i];
3030             if (hiC[i] > cdims[i]) hiC[i] = cdims[i];
3031             /*if (hiC[i] < loC[i]) chk = 0;*/
3032           }
3033           /* make temporary copies of loC and hiC since pnga_patch_intersect
3034              destroys original versions */
3035           for (j=0; j<cndim; j++) {
3036             lod[j] = loC[j];
3037             /*hid[j] = hiC[j];*/
3038           }
3039           if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)) {
3040             pnga_access_block_grid_ptr(g_A, index, &A_ptr, ldA);
3041             pnga_access_block_grid_ptr(g_B, index, &B_ptr, ldB);
3042             pnga_access_block_grid_ptr(g_c, index, &C_ptr, ldC);
3043 
3044             /* evaluate offsets for system */
3045             offset = 0;
3046             last = cndim - 1;
3047             jtot = 1;
3048             for (j=0; j<last; j++) {
3049               offset += (loC[j] - lod[j])*jtot;
3050               jtot *= ldC[j];
3051             }
3052             offset += (loC[last]-lod[last])*jtot;
3053 
3054             switch(ctype) {
3055               case C_DBL:
3056                 A_ptr = (void*)((double*)(A_ptr) + offset);
3057                 B_ptr = (void*)((double*)(B_ptr) + offset);
3058                 C_ptr = (void*)((double*)(C_ptr) + offset);
3059                 break;
3060               case C_INT:
3061                 A_ptr = (void*)((int*)(A_ptr) + offset);
3062                 B_ptr = (void*)((int*)(B_ptr) + offset);
3063                 C_ptr = (void*)((int*)(C_ptr) + offset);
3064                 break;
3065               case C_DCPL:
3066                 A_ptr = (void*)((DoubleComplex*)(A_ptr) + offset);
3067                 B_ptr = (void*)((DoubleComplex*)(B_ptr) + offset);
3068                 C_ptr = (void*)((DoubleComplex*)(C_ptr) + offset);
3069                 break;
3070               case C_SCPL:
3071                 A_ptr = (void*)((SingleComplex*)(A_ptr) + offset);
3072                 B_ptr = (void*)((SingleComplex*)(B_ptr) + offset);
3073                 C_ptr = (void*)((SingleComplex*)(C_ptr) + offset);
3074                 break;
3075               case C_FLOAT:
3076                 A_ptr = (void*)((float*)(A_ptr) + offset);
3077                 B_ptr = (void*)((float*)(B_ptr) + offset);
3078                 C_ptr = (void*)((float*)(C_ptr) + offset);
3079                 break;
3080               case C_LONG:
3081                 A_ptr = (void*)((long*)(A_ptr) + offset);
3082                 B_ptr = (void*)((long*)(B_ptr) + offset);
3083                 C_ptr = (void*)((long*)(C_ptr) + offset);
3084                 break;
3085               case C_LONGLONG:
3086                 A_ptr = (void*)((long long*)(A_ptr) + offset);
3087                 B_ptr = (void*)((long long*)(B_ptr) + offset);
3088                 C_ptr = (void*)((long long*)(C_ptr) + offset);
3089                 break;
3090               default:
3091                 break;
3092             }
3093             snga_add_patch_values(atype, alpha, beta, cndim,
3094                 loC, hiC, ldC, A_ptr, B_ptr, C_ptr);
3095 
3096             /* release access to the data */
3097             pnga_release_block_grid       ( g_A, index);
3098             pnga_release_block_grid       ( g_B, index);
3099             pnga_release_update_block_grid(g_c, index);
3100           }
3101 
3102           /* increment index to get next block on processor */
3103           index[0] += topology[0];
3104           for (i = 0; i < cndim; i++) {
3105             if (index[i] >= blocks[i] && i<cndim-1) {
3106               index[i] = proc_index[i];
3107               index[i+1] += topology[i+1];
3108             }
3109           }
3110         }
3111       }
3112     }
3113 #endif
3114   }
3115 
3116   if(A_created) pnga_destroy(g_A);
3117   if(B_created) pnga_destroy(g_B);
3118 
3119   if(local_sync_end)pnga_sync();
3120 }
3121 
3122 
3123 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3124 #   pragma weak wnga_zero_patch = pnga_zero_patch
3125 #endif
pnga_zero_patch(Integer g_a,Integer * lo,Integer * hi)3126 void pnga_zero_patch(Integer g_a, Integer *lo, Integer *hi)
3127 {
3128     Integer ndim, dims[MAXDIM], type;
3129     int ival = 0;
3130     long lval = 0;
3131     long llval = 0;
3132     double dval = 0.0;
3133     DoubleComplex cval;
3134     SingleComplex cfval;
3135     float fval = 0.0;
3136     void *valptr = NULL;
3137     int local_sync_begin,local_sync_end;
3138 
3139     local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
3140     _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
3141     if(local_sync_begin)pnga_sync();
3142 
3143 
3144     pnga_inquire(g_a,  &type, &ndim, dims);
3145 
3146     switch (type){
3147         case C_INT:
3148             valptr = (void *)(&ival);
3149             break;
3150         case C_DBL:
3151             valptr = (void *)(&dval);
3152             break;
3153         case C_DCPL:
3154         {
3155             cval.real = 0.0; cval.imag = 0.0;
3156             valptr = (void *)(&cval);
3157             break;
3158         }
3159         case C_SCPL:
3160         {
3161             cfval.real = 0.0; cfval.imag = 0.0;
3162             valptr = (void *)(&cfval);
3163             break;
3164         }
3165         case C_FLOAT:
3166             valptr = (void *)(&fval);
3167             break;
3168        case C_LONG:
3169             valptr = (void *)(&lval);
3170             break;
3171        case C_LONGLONG:
3172             valptr = (void *)(&llval);
3173             break;
3174         default: pnga_error(" wrong data type ",type);
3175     }
3176     pnga_fill_patch(g_a, lo, hi, valptr);
3177 
3178     if(local_sync_end)pnga_sync();
3179 }
3180