1 #if HAVE_CONFIG_H
2 #   include "config.h"
3 #endif
4 
5 /*$Id: matrix.c,v 1.7.8.11 2007/12/18 18:49:36 d3g293 Exp $******************************************************
6 File: matrix.c
7 
8 Author: Limin Zhang, Ph.D.
9         Mathematics Department
10         Columbia Basin College
11         Pasco, WA 99301
12         Limin.Zhang@cbc2.org
13 
14 Mentor: Jarek Naplocha, Ph.D.
15         Environmental Molecular Science Laboratory
16         Richland, WA 99352
17 
18 Date: 2/28/2002
19 
20 Purpose:
21       matrix interfaces between TAO and
22       global arrays.
23 **************************************************************/
24 
25 #include "globalp.h"
26 #if HAVE_MATH_H
27 #   include <math.h>
28 #endif
29 #if HAVE_STDLIB_H
30 #   include <stdlib.h>
31 #endif
32 #if HAVE_STRING_H
33 #   include <string.h>
34 #endif
35 #include "message.h"
36 #include "ga-papi.h"
37 #include "ga-wapi.h"
38 #include "ga_iterator.h"
39 
40 #define auxi_median(a,b,c,m)                         \
41 {                                                    \
42   if ((c < a) && (a < b))        m = a;              \
43   else if ((a <= c) && (c <= b)) m = c;              \
44   else                           m = b;              \
45 }
46 
47 #define median(a,b,c,m)                              \
48 {                                                    \
49   if(a == b)     m = a;                              \
50   else if(a < b) auxi_median(a, b, c, m)             \
51   else           auxi_median(b, a, c, m)             \
52 }
53 
54 #define auxi_median_dcpl(na, nb, nc, za, zb, zc, zm) \
55 {                                                    \
56    if ((nc < na) && (na < nb))       zm = za;        \
57   else if ((na <= nc) && (nc <= nb)) zm = zc;        \
58   else                               zm = zb;        \
59 }
60 
61 #define median_dcpl(na, nb, nc, za, zb, zc, zm)                    \
62 {                                                                  \
63   if (na == nb)     zm = za;                                       \
64   else if (na < nb) auxi_median_dcpl (na, nb, nc, za, zb, zc, zm)  \
65   else              auxi_median_dcpl (nb, na, nc, zb, za, zc, zm)  \
66 }
67 
68 #define auxi_median_scpl(na, nb, nc, ca, cb, cc, cm) \
69 {                                                    \
70    if ((nc < na) && (na < nb))       cm = ca;        \
71   else if ((na <= nc) && (nc <= nb)) cm = cc;        \
72   else                               cm = cb;        \
73 }
74 
75 #define median_scpl(na, nb, nc, ca, cb, cc, cm)                    \
76 {                                                                  \
77   if (na == nb)     cm = ca;                                       \
78   else if (na < nb) auxi_median_scpl (na, nb, nc, ca, cb, cc, cm)  \
79   else              auxi_median_scpl (nb, na, nc, cb, ca, cc, cm)  \
80 }
81 
82 /*\ Utility function to find median values of patch
83 \*/
sgai_median_patch_values(Integer type,Integer ndim,Integer * loA,Integer * hiA,Integer * ldA,void * A_ptr,void * B_ptr,void * C_ptr,void * M_ptr,Integer offset)84 static void sgai_median_patch_values(Integer type, Integer ndim, Integer *loA,
85                               Integer *hiA, Integer *ldA, void *A_ptr,
86                               void *B_ptr, void *C_ptr, void *M_ptr,
87                               Integer offset)
88 {
89   Integer bvalue[MAXDIM], bunit[MAXDIM], baseldA[MAXDIM];
90   Integer idx, n1dim;
91   Integer i, j;
92 
93   double na, nb, nc;            /*norm of a, norm of b, norm of c */
94   int ia, ib, ic, im;
95   float fa, fb, fc, fm;
96   double da, db, dc, dm;
97   long la, lb, lc, lm;
98   DoubleComplex za, zb, zc, zm;
99   SingleComplex ca, cb, cc, cm;
100 
101   /* number of n-element of the first dimension */
102   n1dim = 1;
103   for (i = 1; i < ndim; i++)
104     n1dim *= (hiA[i] - loA[i] + 1);
105 
106   /* calculate the destination indices */
107   bvalue[0] = 0;
108   bvalue[1] = 0;
109   bunit[0] = 1;
110   bunit[1] = 1;
111   /* baseldA[0] = ldA[0]
112    * baseldA[1] = ldA[0] * ldA[1]
113    * baseldA[2] = ldA[0] * ldA[1] * ldA[2] .....
114    */
115   baseldA[0] = ldA[0];
116   baseldA[1] = baseldA[0] * ldA[1];
117   for (i = 2; i < ndim; i++)
118   {
119     bvalue[i] = 0;
120     bunit[i] = bunit[i - 1] * (hiA[i - 1] - loA[i - 1] + 1);
121     baseldA[i] = baseldA[i - 1] * ldA[i];
122   }
123   switch(type) {
124     case C_DBL:
125       A_ptr = (void*)((double*)(A_ptr) + offset);
126       B_ptr = (void*)((double*)(B_ptr) + offset);
127       C_ptr = (void*)((double*)(C_ptr) + offset);
128       M_ptr = (void*)((double*)(M_ptr) + offset);
129       break;
130     case C_INT:
131       A_ptr = (void*)((int*)(A_ptr) + offset);
132       B_ptr = (void*)((int*)(B_ptr) + offset);
133       C_ptr = (void*)((int*)(C_ptr) + offset);
134       M_ptr = (void*)((int*)(M_ptr) + offset);
135       break;
136     case C_DCPL:
137       A_ptr = (void*)((DoubleComplex*)(A_ptr) + offset);
138       B_ptr = (void*)((DoubleComplex*)(B_ptr) + offset);
139       C_ptr = (void*)((DoubleComplex*)(C_ptr) + offset);
140       M_ptr = (void*)((DoubleComplex*)(M_ptr) + offset);
141       break;
142     case C_SCPL:
143       A_ptr = (void*)((SingleComplex*)(A_ptr) + offset);
144       B_ptr = (void*)((SingleComplex*)(B_ptr) + offset);
145       C_ptr = (void*)((SingleComplex*)(C_ptr) + offset);
146       M_ptr = (void*)((SingleComplex*)(M_ptr) + offset);
147       break;
148     case C_FLOAT:
149       A_ptr = (void*)((float*)(A_ptr) + offset);
150       B_ptr = (void*)((float*)(B_ptr) + offset);
151       C_ptr = (void*)((float*)(C_ptr) + offset);
152       M_ptr = (void*)((float*)(M_ptr) + offset);
153       break;
154     case C_LONG:
155       A_ptr = (void*)((long*)(A_ptr) + offset);
156       B_ptr = (void*)((long*)(B_ptr) + offset);
157       C_ptr = (void*)((long*)(C_ptr) + offset);
158       M_ptr = (void*)((long*)(M_ptr) + offset);
159       break;
160     case C_LONGLONG:
161       A_ptr = (void*)((long long*)(A_ptr) + offset);
162       B_ptr = (void*)((long long*)(B_ptr) + offset);
163       C_ptr = (void*)((long long*)(C_ptr) + offset);
164       M_ptr = (void*)((long long*)(M_ptr) + offset);
165       break;
166     default:
167       break;
168   }
169 
170   /*compute elementwise median */
171 
172   switch (type) {
173 
174     case C_INT:
175       for (i = 0; i < n1dim; i++) {
176         idx = 0;
177         for (j = 1; j < ndim; j++) {
178           idx += bvalue[j] * baseldA[j - 1];
179           if (((i + 1) % bunit[j]) == 0)     bvalue[j]++;
180           if (bvalue[j] > (hiA[j] - loA[j])) bvalue[j] = 0;
181         }
182         for (j = 0; j < (hiA[0] - loA[0] + 1); j++) {
183           ia = ((int *) A_ptr)[idx + j];
184           ib = ((int *) B_ptr)[idx + j];
185           ic = ((int *) C_ptr)[idx + j];
186           median(ia, ib, ic, im);
187           ((int *) M_ptr)[idx + j] = im;
188         }
189       }
190       break;
191     case C_LONG:
192       for (i = 0; i < n1dim; i++) {
193         idx = 0;
194         for (j = 1; j < ndim; j++) {
195           idx += bvalue[j] * baseldA[j - 1];
196           if (((i + 1) % bunit[j]) == 0)     bvalue[j]++;
197           if (bvalue[j] > (hiA[j] - loA[j])) bvalue[j] = 0;
198         }
199         for (j = 0; j < (hiA[0] - loA[0] + 1); j++) {
200           la = ((long *) A_ptr)[idx + j];
201           lb = ((long *) B_ptr)[idx + j];
202           lc = ((long *) C_ptr)[idx + j];
203           median(la, lb, lc, lm);
204           ((long *) M_ptr)[idx + j] = lm;
205         }
206       }
207       break;
208     case C_FLOAT:
209       for (i = 0; i < n1dim; i++) {
210         idx = 0;
211         for (j = 1; j < ndim; j++) {
212           idx += bvalue[j] * baseldA[j - 1];
213           if (((i + 1) % bunit[j]) == 0)     bvalue[j]++;
214           if (bvalue[j] > (hiA[j] - loA[j])) bvalue[j] = 0;
215         }
216         for (j = 0; j < (hiA[0] - loA[0] + 1); j++) {
217           fa = ((float *) A_ptr)[idx + j];
218           fb = ((float *) B_ptr)[idx + j];
219           fc = ((float *) C_ptr)[idx + j];
220           median(fa, fb, fc, fm);
221           ((float *) M_ptr)[idx + j] = fm;
222         }
223       }
224       break;
225     case C_DBL:
226       for (i = 0; i < n1dim; i++) {
227         idx = 0;
228         for (j = 1; j < ndim; j++) {
229           idx += bvalue[j] * baseldA[j - 1];
230           if (((i + 1) % bunit[j]) == 0)     bvalue[j]++;
231           if (bvalue[j] > (hiA[j] - loA[j])) bvalue[j] = 0;
232         }
233         for (j = 0; j < (hiA[0] - loA[0] + 1); j++) {
234           da = ((double *) A_ptr)[idx + j];
235           db = ((double *) B_ptr)[idx + j];
236           dc = ((double *) C_ptr)[idx + j];
237           median(da, db, dc, dm);
238           ((double *) M_ptr)[idx + j] = dm;
239         }
240       }
241       break;
242     case C_DCPL:
243       for (i = 0; i < n1dim; i++) {
244         idx = 0;
245         for (j = 1; j < ndim; j++) {
246           idx += bvalue[j] * baseldA[j - 1];
247           if (((i + 1) % bunit[j]) == 0)     bvalue[j]++;
248           if (bvalue[j] > (hiA[j] - loA[j])) bvalue[j] = 0;
249         }
250         for (j = 0; j < (hiA[0] - loA[0] + 1); j++) {
251           za = ((DoubleComplex *) A_ptr)[idx + j];
252           zb = ((DoubleComplex *) B_ptr)[idx + j];
253           zc = ((DoubleComplex *) C_ptr)[idx + j];
254           na = sqrt ((za.real) * (za.real) + (za.imag) * (za.imag));
255           nb = sqrt ((zb.real) * (zb.real) + (zb.imag) * (zb.imag));
256           nc = sqrt ((zc.real) * (zc.real) + (zc.imag) * (zc.imag));
257           median_dcpl(na, nb, nc, za, zb, zc, zm);
258           ((DoubleComplex *) M_ptr)[idx + j] = zm;
259         }
260       }
261       break;
262 
263     case C_SCPL:
264       for (i = 0; i < n1dim; i++) {
265         idx = 0;
266         for (j = 1; j < ndim; j++) {
267           idx += bvalue[j] * baseldA[j - 1];
268           if (((i + 1) % bunit[j]) == 0)     bvalue[j]++;
269           if (bvalue[j] > (hiA[j] - loA[j])) bvalue[j] = 0;
270         }
271         for (j = 0; j < (hiA[0] - loA[0] + 1); j++) {
272           ca = ((SingleComplex *) A_ptr)[idx + j];
273           cb = ((SingleComplex *) B_ptr)[idx + j];
274           cc = ((SingleComplex *) C_ptr)[idx + j];
275           na = sqrt ((ca.real) * (ca.real) + (ca.imag) * (ca.imag));
276           nb = sqrt ((cb.real) * (cb.real) + (cb.imag) * (cb.imag));
277           nc = sqrt ((cc.real) * (cc.real) + (cc.imag) * (cc.imag));
278           median_scpl(na, nb, nc, ca, cb, cc, cm);
279           ((SingleComplex *) M_ptr)[idx + j] = cm;
280         }
281       }
282       break;
283     default:
284       pnga_error("median: wrong data type", type);
285   }
286 }
287 
288 /*\ median routine. Not sure what this function is suppose to do
289 \*/
290 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
291 #   pragma weak wnga_median_patch = pnga_median_patch
292 #endif
pnga_median_patch(g_a,alo,ahi,g_b,blo,bhi,g_c,clo,chi,g_m,mlo,mhi)293 void pnga_median_patch(
294         g_a, alo, ahi, g_b, blo, bhi, g_c, clo, chi, g_m, mlo, mhi)
295      Integer g_a, *alo, *ahi;  /* patch of g_a */
296      Integer g_b, *blo, *bhi;  /* patch of g_b */
297      Integer g_c, *clo, *chi;  /* patch of g_c */
298      Integer g_m, *mlo, *mhi;  /* patch of g_m */
299 {
300   Integer i, j;
301   Integer atype, btype, andim, adims[MAXDIM], bndim, bdims[MAXDIM];
302   Integer ctype, mtype, cndim, cdims[MAXDIM], mndim, mdims[MAXDIM];
303   Integer loA[MAXDIM], hiA[MAXDIM], ldA[MAXDIM];
304   Integer loB[MAXDIM], hiB[MAXDIM], ldB[MAXDIM];
305   Integer loC[MAXDIM], hiC[MAXDIM], ldC[MAXDIM];
306   Integer loM[MAXDIM], hiM[MAXDIM], ldM[MAXDIM];
307   Integer g_A = g_a, g_B = g_b;
308   Integer g_C = g_c, g_M = g_m;
309   char *A_ptr, *B_ptr;
310   char *C_ptr, *M_ptr;
311   Integer offset;
312   Integer atotal, btotal;
313   Integer ctotal, mtotal;
314   Integer me = pnga_nodeid (), a_temp_created = 0, b_temp_created = 0, c_temp_created = 0;
315   Integer type = GA_TYPE_GSM, compatible;
316   char *tempname = "temp", transp = 'n';        /*no transpose */
317   Integer num_blocks_a, num_blocks_b, num_blocks_c, num_blocks_m;
318   int local_sync_begin,local_sync_end;
319   _iterator_hdl hdl_a, hdl_b, hdl_c, hdl_m;
320 
321   local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
322   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
323   if(local_sync_begin)pnga_sync();
324 
325 
326   pnga_inquire (g_a, &atype, &andim, adims);
327   pnga_inquire (g_b, &btype, &bndim, bdims);
328   pnga_inquire (g_c, &ctype, &cndim, cdims);
329   pnga_inquire (g_m, &mtype, &mndim, mdims);
330 
331   /* I have to inquire the data type again since nga_inquire and
332    * nga_inquire_internal_ treat data type differently */
333   pnga_inquire(g_m, &type, &mndim, mdims);
334 
335   if (mtype != atype)
336     pnga_error(" pnga_median_patch:type mismatch ", 0L);
337   if (mtype != btype)
338     pnga_error(" pnga_median_patch:type mismatch ", 0L);
339   if (mtype != ctype)
340     pnga_error(" pnga_median_patch:type mismatch ", 0L);
341 
342   /* check if patch indices and g_a dims match */
343   for (i = 0; i < andim; i++)
344     if (alo[i] <= 0 || ahi[i] > adims[i])
345     {
346       pnga_error("pnga_median_patch: g_a indices out of range ", g_a);
347     }
348 
349   for (i = 0; i < bndim; i++)
350     if (blo[i] <= 0 || bhi[i] > bdims[i])
351     {
352       pnga_error("pnga_median_patch:g_b indices out of range ", g_b);
353     }
354 
355   for (i = 0; i < cndim; i++)
356     if (clo[i] <= 0 || chi[i] > cdims[i])
357     {
358       pnga_error("pnga_median_patch:g_c indices out of range ", g_c);
359     }
360   for (i = 0; i < mndim; i++)
361     if (mlo[i] <= 0 || mhi[i] > mdims[i])
362     {
363       pnga_error("pnga_median_patch:g_m indices out of range ", g_m);
364     }
365 
366   /* check if numbers of elements in two patches match each other */
367 
368   atotal = 1;
369   for (i = 0; i < andim; i++)
370     atotal *= (ahi[i] - alo[i] + 1);
371 
372   btotal = 1;
373   for (i = 0; i < bndim; i++)
374     btotal *= (bhi[i] - blo[i] + 1);
375 
376   ctotal = 1;
377   for (i = 0; i < cndim; i++)
378     ctotal *= (chi[i] - clo[i] + 1);
379 
380   mtotal = 1;
381   for (i = 0; i < mndim; i++)
382     mtotal *= (mhi[i] - mlo[i] + 1);
383 
384 
385   if (mtotal != atotal)
386     pnga_error("pnga_median_patch:  capacities of patches do not match ", 0L);
387 
388   if (mtotal != btotal)
389     pnga_error("pnga_median_patch:  capacities of patches do not match ", 0L);
390 
391   if (mtotal != ctotal)
392     pnga_error("pnga_median_patch:  capacities of patches do not match ", 0L);
393 
394   num_blocks_a = pnga_total_blocks(g_a);
395   num_blocks_b = pnga_total_blocks(g_b);
396   num_blocks_c = pnga_total_blocks(g_c);
397   num_blocks_m = pnga_total_blocks(g_m);
398 
399   if (num_blocks_a < 0 && num_blocks_b < 0 &&
400       num_blocks_c < 0 && num_blocks_m < 0) {
401     /* find out coordinates of patches of g_A, g_B, g_C, and g_M that I own */
402     pnga_distribution (g_A, me, loA, hiA);
403     pnga_distribution (g_B, me, loB, hiB);
404     pnga_distribution (g_C, me, loC, hiC);
405     pnga_distribution (g_M, me, loM, hiM);
406 
407     if (!pnga_comp_patch (andim, loA, hiA, mndim, loM, hiM)) compatible = 1;
408     else compatible = 0;
409     /* pnga_gop(pnga_type_f2c(MT_F_INT), &compatible, 1, "*"); */
410     pnga_gop(pnga_type_f2c(MT_F_INT), &compatible, 1, "&&");
411     if (!compatible) {
412       /* either patches or distributions do not match:
413        *        - create a temp array that matches distribution of g_a
414        *        - copy & reshape patch of g_b into g_B
415        */
416       if (!pnga_duplicate(g_m, &g_A, tempname))
417         pnga_error("pnga_median_patch:duplicate failed", 0L);
418 
419       pnga_copy_patch(&transp, g_a, alo, ahi, g_A, mlo, mhi);
420       andim = mndim;
421       a_temp_created = 1;
422       pnga_distribution (g_A, me, loA, hiA);
423     }
424 
425     if (!pnga_comp_patch (bndim, loB, hiB, mndim, loM, hiM)) compatible = 1;
426     else compatible = 0;
427     /* pnga_gop(pnga_type_f2c(MT_F_INT), &compatible, 1, "*"); */
428     pnga_gop(pnga_type_f2c(MT_F_INT), &compatible, 1, "&&");
429     if (!compatible) {
430       /* either patches or distributions do not match:
431        *        - create a temp array that matches distribution of g_a
432        *        - copy & reshape patch of g_c into g_C
433        */
434       if (!pnga_duplicate(g_m, &g_B, tempname))
435         pnga_error("pnga_median_patch:duplicate failed", 0L);
436 
437       pnga_copy_patch(&transp, g_b, blo, bhi, g_B, mlo, mhi);
438       bndim = mndim;
439       b_temp_created = 1;
440       pnga_distribution (g_B, me, loB, hiB);
441     }
442 
443     if (!pnga_comp_patch (cndim, loC, hiC, mndim, loM, hiM)) compatible = 1;
444     else compatible = 0;
445     /* pnga_gop(pnga_type_f2c(MT_F_INT), &compatible, 1, "*"); */
446     pnga_gop(pnga_type_f2c(MT_F_INT), &compatible, 1, "&&");
447     if (!compatible) {
448       /* either patches or distributions do not match:
449        *        - create a temp array that matches distribution of g_a
450        *        - copy & reshape patch of g_m into g_M
451        */
452       if (!pnga_duplicate(g_m, &g_C, tempname))
453         pnga_error("pnga_median_patch:duplicate failed", 0L);
454 
455       /*no need to copy g_m since it is the output matrix. */
456       cndim = mndim;
457       c_temp_created = 1;
458       pnga_copy_patch(&transp, g_c, clo, chi, g_C, mlo, mhi);
459       pnga_distribution (g_C, me, loC, hiC);
460     }
461 
462 
463     if (!pnga_comp_patch (mndim, loM, hiM, andim, loA, hiA))
464       pnga_error(" patches mismatch ", 0);
465     if (!pnga_comp_patch (mndim, loM, hiM, bndim, loB, hiB))
466       pnga_error(" patches mismatch ", 0);
467     if (!pnga_comp_patch (mndim, loM, hiM, cndim, loC, hiC))
468       pnga_error(" patches mismatch ", 0);
469 
470 
471     /* A[83:125,1:1]  <==> B[83:125] */
472     if (mndim > andim)
473       mndim = andim;            /* need more work */
474     if (mndim > bndim)
475       mndim = bndim;            /* need more work */
476     if (mndim > cndim)
477       mndim = cndim;            /* need more work */
478 
479 
480     /*  determine subsets of my patches to access  */
481     if (pnga_patch_intersect (mlo, mhi, loM, hiM, mndim)) {
482       offset = 0;
483       pnga_access_ptr (g_A, loM, hiM, &A_ptr, ldA);
484       pnga_access_ptr (g_B, loM, hiM, &B_ptr, ldB);
485       pnga_access_ptr (g_C, loM, hiM, &C_ptr, ldC);
486       pnga_access_ptr (g_M, loM, hiM, &M_ptr, ldM);
487 
488       sgai_median_patch_values(type, mndim, loM, hiM, ldM,
489           A_ptr, B_ptr, C_ptr, M_ptr, offset);
490 
491       /* release access to the data */
492       pnga_release (g_A, loM, hiM);
493       pnga_release (g_B, loM, hiM);
494       pnga_release (g_C, loM, hiM);
495       pnga_release_update (g_M, loM, hiM);
496     }
497   } else {
498     /* create copies of A, B, and C that are identically distributed
499        as M */
500     if (!pnga_duplicate(g_m, &g_A, tempname))
501       pnga_error("ga_add_patch: dup failed", 0L);
502     pnga_copy_patch(&transp, g_a, alo, ahi, g_A, mlo, mhi);
503     andim = mndim;
504     a_temp_created = 1;
505 
506     if (!pnga_duplicate(g_m, &g_B, tempname))
507       pnga_error("ga_add_patch: dup failed", 0L);
508     pnga_copy_patch(&transp, g_b, blo, bhi, g_B, mlo, mhi);
509     bndim = mndim;
510     b_temp_created = 1;
511 
512     if (!pnga_duplicate(g_m, &g_C, tempname))
513       pnga_error("ga_add_patch: dup failed", 0L);
514         pnga_copy_patch(&transp, g_c, clo, chi, g_C, mlo, mhi);
515     cndim = mndim;
516     c_temp_created = 1;
517 
518     /* M is normally distributed so just get the mean using standard approach */
519 #if 1
520     pnga_local_iterator_init(g_A, &hdl_a);
521     pnga_local_iterator_init(g_B, &hdl_b);
522     pnga_local_iterator_init(g_C, &hdl_c);
523     pnga_local_iterator_init(g_m, &hdl_m);
524     while(pnga_local_iterator_next(&hdl_a,loA,hiA,&A_ptr,ldA)) {
525       pnga_local_iterator_next(&hdl_b,loB,hiB,&B_ptr,ldB);
526       pnga_local_iterator_next(&hdl_c,loC,hiC,&C_ptr,ldC);
527       pnga_local_iterator_next(&hdl_m,loM,hiM,&M_ptr,ldM);
528       Integer idx, lod[MAXDIM]/*, hid[MAXDIM]*/;
529       Integer jtot, last;
530       /* make temporary copies of loM and hiM since pnga_patch_intersect
531          destroys original versions */
532       for (j=0; j<mndim; j++) {
533         lod[j] = loM[j];
534         /*hid[j] = hiM[j];*/
535       }
536 
537       /* evaluate offsets for system */
538       offset = 0;
539       last = mndim - 1;
540       jtot = 1;
541       for (j=0; j<last; j++) {
542         offset += (loM[j] - lod[j])*jtot;
543         jtot *= ldM[j];
544       }
545       offset += (loM[last]-lod[last])*jtot;
546 
547       sgai_median_patch_values(type, mndim, loM, hiM, ldM,
548           A_ptr, B_ptr, C_ptr, M_ptr, offset);
549 
550     }
551 #else
552     if (num_blocks_m < 0) {
553       offset = 0;
554       pnga_distribution(g_m, me, loM, hiM);
555       if (pnga_patch_intersect (mlo, mhi, loM, hiM, mndim)) {
556         pnga_access_ptr (g_A, loM, hiM, &A_ptr, ldA);
557         pnga_access_ptr (g_B, loM, hiM, &B_ptr, ldB);
558         pnga_access_ptr (g_C, loM, hiM, &C_ptr, ldC);
559         pnga_access_ptr (g_M, loM, hiM, &M_ptr, ldM);
560 
561         sgai_median_patch_values(type, mndim, loM, hiM, ldM,
562             A_ptr, B_ptr, C_ptr, M_ptr, offset);
563 
564         /* release access to the data */
565         pnga_release (g_A, loM, hiM);
566         pnga_release (g_B, loM, hiM);
567         pnga_release (g_C, loM, hiM);
568         pnga_release_update (g_M, loM, hiM);
569       }
570     } else {
571       Integer idx, lod[MAXDIM]/*, hid[MAXDIM]*/;
572       Integer jtot, last, nproc;
573       /* Simple block-cyclic data distribution */
574       if (!pnga_uses_proc_grid(g_m)) {
575         nproc = pnga_nnodes();
576         for (idx = me; idx < num_blocks_m; idx += nproc) {
577           pnga_distribution(g_m, idx, loM, hiM);
578           /* make temporary copies of loM and hiM since pnga_patch_intersect
579              destroys original versions */
580           for (j=0; j<mndim; j++) {
581             lod[j] = loM[j];
582             /*hid[j] = hiM[j];*/
583           }
584           if (pnga_patch_intersect(mlo, mhi, loM, hiM, mndim)) {
585             pnga_access_block_ptr(g_A, idx, &A_ptr, ldA);
586             pnga_access_block_ptr(g_B, idx, &B_ptr, ldB);
587             pnga_access_block_ptr(g_C, idx, &C_ptr, ldC);
588             pnga_access_block_ptr(g_m, idx, &M_ptr, ldM);
589 
590             /* evaluate offsets for system */
591             offset = 0;
592             last = mndim - 1;
593             jtot = 1;
594             for (j=0; j<last; j++) {
595               offset += (loM[j] - lod[j])*jtot;
596               jtot *= ldM[j];
597             }
598             offset += (loM[last]-lod[last])*jtot;
599 
600             sgai_median_patch_values(type, mndim, loM, hiM, ldM,
601                 A_ptr, B_ptr, C_ptr, M_ptr, offset);
602 
603             /* release access to the data */
604             pnga_release_block (g_A, idx);
605             pnga_release_block (g_B, idx);
606             pnga_release_block (g_C, idx);
607             pnga_release_update_block (g_M, idx);
608           }
609         }
610       } else {
611         /* Uses scalapack block-cyclic data distribution */
612         Integer lod[MAXDIM]/*, hid[MAXDIM], chk*/;
613         Integer proc_index[MAXDIM], index[MAXDIM];
614         Integer topology[MAXDIM];
615         Integer blocks[MAXDIM], block_dims[MAXDIM];
616         pnga_get_proc_index(g_m, me, proc_index);
617         pnga_get_proc_index(g_m, me, index);
618         pnga_get_block_info(g_m, blocks, block_dims);
619         pnga_get_proc_grid(g_m, topology);
620 
621         while (index[mndim-1] < blocks[mndim-1]) {
622           /* find bounding coordinates of block */
623           /*chk = 1;*/
624           for (i = 0; i < mndim; i++) {
625             loM[i] = index[i]*block_dims[i]+1;
626             hiM[i] = (index[i] + 1)*block_dims[i];
627             if (hiM[i] > mdims[i]) hiM[i] = mdims[i];
628             /*if (hiM[i] < loM[i]) chk = 0;*/
629           }
630           /* make temporary copies of loC and hiC since pnga_patch_intersect
631              destroys original versions */
632           for (j=0; j<mndim; j++) {
633             lod[j] = loM[j];
634             /*hid[j] = hiM[j];*/
635           }
636           if (pnga_patch_intersect(mlo, mhi, loM, hiM, mndim)) {
637             pnga_access_block_grid_ptr(g_A, index, &A_ptr, ldA);
638             pnga_access_block_grid_ptr(g_B, index, &B_ptr, ldB);
639             pnga_access_block_grid_ptr(g_C, index, &C_ptr, ldC);
640             pnga_access_block_grid_ptr(g_m, index, &M_ptr, ldM);
641 
642             /* evaluate offsets for system */
643             offset = 0;
644             last = mndim - 1;
645             jtot = 1;
646             for (j=0; j<last; j++) {
647               offset += (loM[j] - lod[j])*jtot;
648               jtot *= ldM[j];
649             }
650             offset += (loM[last]-lod[last])*jtot;
651 
652             sgai_median_patch_values(type, mndim, loM, hiM, ldM,
653                 A_ptr, B_ptr, C_ptr, M_ptr, offset);
654 
655             /* release access to the data */
656             pnga_release_block_grid (g_A, index);
657             pnga_release_block_grid (g_B, index);
658             pnga_release_block_grid (g_C, index);
659             pnga_release_update_block_grid (g_M, index);
660           }
661 
662           /* increment index to get next block on processor */
663           index[0] += topology[0];
664           for (i = 0; i < mndim; i++) {
665             if (index[i] >= blocks[i] && i<mndim-1) {
666               index[i] = proc_index[i];
667               index[i+1] += topology[i+1];
668             }
669           }
670         }
671       }
672     }
673 #endif
674   }
675   if (a_temp_created)
676     pnga_destroy (g_A);
677   if (b_temp_created)
678     pnga_destroy (g_B);
679   if (c_temp_created)
680     pnga_destroy (g_C);
681   if(local_sync_end)pnga_sync();
682 }
683 
684 
685 
686 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
687 #   pragma weak wnga_median = pnga_median
688 #endif
pnga_median(Integer g_a,Integer g_b,Integer g_c,Integer g_m)689 void pnga_median(Integer g_a, Integer g_b, Integer g_c, Integer g_m){
690 
691 
692    Integer atype, andim;
693    Integer alo[MAXDIM],ahi[MAXDIM];
694    Integer btype, bndim;
695    Integer blo[MAXDIM],bhi[MAXDIM];
696    Integer ctype, cndim;
697    Integer clo[MAXDIM],chi[MAXDIM];
698    Integer mtype, mndim;
699    Integer mlo[MAXDIM],mhi[MAXDIM];
700 
701     pnga_inquire(g_a,  &atype, &andim, ahi);
702     pnga_inquire(g_b,  &btype, &bndim, bhi);
703     pnga_inquire(g_c,  &ctype, &cndim, chi);
704     pnga_inquire(g_m,  &mtype, &mndim, mhi);
705 
706     while(andim){
707         alo[andim-1]=1;
708         andim--;
709     }
710 
711     while(bndim){
712         blo[bndim-1]=1;
713         bndim--;
714     }
715 
716     while(cndim){
717         clo[cndim-1]=1;
718         cndim--;
719     }
720 
721     while(mndim){
722         mlo[mndim-1]=1;
723         mndim--;
724     }
725     _ga_sync_begin = 1;
726     pnga_median_patch(g_a, alo, ahi, g_b, blo, bhi, g_c, clo, chi, g_m, mlo, mhi);
727 
728 }
729 
sgai_norm_infinity_block(Integer g_a,void * ptr,Integer * lo,Integer * hi,Integer ld,Integer type,Integer ndim,Integer * dims,void * buf)730 static void sgai_norm_infinity_block(Integer g_a, void *ptr,
731                              Integer *lo, Integer *hi, Integer ld,
732                              Integer type,
733                              Integer ndim, Integer *dims, void *buf)
734 {
735   /*Integer size, nelem, dim2;*/
736   Integer iloA=0, ihiA=0, jloA=0, jhiA=0;
737   Integer i, j;
738   int *isum = NULL;
739   long *lsum = NULL;
740   double *dsum = NULL;
741   float *fsum = NULL;
742   DoubleComplex *zsum = NULL;
743   SingleComplex *csum = NULL;
744 
745   switch (type)
746   {
747     case C_INT:
748       isum = (int *) buf;
749       break;
750     case C_LONG:
751       lsum = (long *) buf;
752       break;
753     case C_FLOAT:
754       fsum = (float *) buf;
755       break;
756     case C_DBL:
757       dsum = (double *) buf;
758       break;
759     case C_DCPL:
760       zsum = (DoubleComplex *) buf;
761       break;
762     case C_SCPL:
763       csum = (SingleComplex *) buf;
764       break;
765     default:
766       pnga_error("ga_norm_infinity_: wrong data type:", type);
767   }
768 
769   if(ndim<=0)
770     pnga_error("ga_norm_infinity: wrong dimension", ndim);
771   else if(ndim == 1) {
772     iloA=lo[0];
773     ihiA=hi[0];
774     jloA=1;
775     jhiA=1;
776   }
777   else if(ndim == 2) {
778     iloA=lo[0];
779     ihiA=hi[0];
780     jloA=lo[1];
781     jhiA=hi[1];
782   }
783   else
784     pnga_error("ga_norm_infinity: wrong dimension", ndim);
785 
786   /* determine subset of my patch to access */
787   if (ihiA > 0 && jhiA > 0)
788   {
789     switch (type)
790     {
791       int *pi;
792       double *pd;
793       long *pl;
794       float *pf;
795       DoubleComplex *pz;
796       SingleComplex *pc;
797       DoubleComplex zval;
798       SingleComplex cval;
799       double dtemp;
800       float ftemp;
801       case C_INT:
802       pi = (int *) ptr;
803       *isum = GA_ABS(pi[0]);
804       for (j = 0; j < jhiA - jloA + 1; j++) {
805         for (i = 0; i < ihiA - iloA + 1; i++)
806           if (GA_ABS(pi[j*ld + i]) > *isum) *isum = GA_ABS(pi[j*ld+i]);
807       }
808       break;
809       case C_LONG:
810       pl = (long *) ptr;
811       *lsum = GA_ABS(pl[0]);
812       for (j = 0; j < jhiA - jloA + 1; j++)
813         for (i = 0; i < ihiA - iloA + 1; i++)
814           if (GA_ABS(pl[j*ld + i]) > *lsum) *lsum = GA_ABS(pl[j*ld+i]);
815       break;
816       case C_DCPL:
817       pz = (DoubleComplex *) ptr;
818       zval = pz[0];
819       dtemp =
820         sqrt (zval.real * zval.real + zval.imag * zval.imag);
821       (*zsum).real = dtemp;
822       for (j = 0; j < jhiA - jloA + 1; j++)
823         for (i = 0; i < ihiA - iloA + 1; i++)
824         {
825           zval = pz[j * ld + i];
826           dtemp =
827             sqrt (zval.real * zval.real + zval.imag * zval.imag);
828           if (dtemp > (*zsum).real) (*zsum).real = dtemp;
829         }
830       break;
831       case C_SCPL:
832       pc = (SingleComplex *) ptr;
833       cval =  pc[0];
834       ftemp =
835         sqrt (cval.real * cval.real + cval.imag * cval.imag);
836       (*csum).real = ftemp;
837       for (j = 0; j < jhiA - jloA + 1; j++)
838         for (i = 0; i < ihiA - iloA + 1; i++)
839         {
840           cval = pc[j * ld + i];
841           ftemp =
842             sqrt (cval.real * cval.real + cval.imag * cval.imag);
843           if (ftemp > (*csum).real) (*csum).real = ftemp;
844         }
845       break;
846       case C_FLOAT:
847       pf = (float *) ptr;
848       *fsum = pf[0];
849       for (j = 0; j < jhiA - jloA + 1; j++)
850         for (i = 0; i < ihiA - iloA + 1; i++)
851           if (GA_ABS(pf[j*ld+i]) > *fsum) *fsum = GA_ABS(pf[j*ld+i]);
852       break;
853       case C_DBL:
854       pd = (double *) ptr;
855       *dsum = pd[0];
856       for (i = 0; i < ihiA - iloA + 1; i++)
857         for (j = 0; j < jhiA - jloA + 1; j++)
858           if (GA_ABS(pd[j*ld+i]) > *dsum) *dsum = GA_ABS(pd[j*ld+i]);
859       break;
860       default:
861       pnga_error("sgai_norm_infinity_block: wrong data type ", type);
862     }
863   }
864 }
865 
866 /* Evaluate the infinity norm of an array. This is the maximum absolute
867  * value of the the elements in the array */
868 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
869 #   pragma weak wnga_norm_infinity = pnga_norm_infinity
870 #endif
pnga_norm_infinity(Integer g_a,double * nm)871 void pnga_norm_infinity(Integer g_a, double *nm)
872 {
873   Integer type;
874   Integer me = pnga_nodeid (), i, j, nproc = pnga_nnodes();
875   Integer ndim, dims[MAXDIM], lo[2], hi[2], ld;
876   Integer num_blocks_a;
877   int local_sync_begin,local_sync_end;
878   int isum = 0;
879   long lsum = 0;
880   double dsum = 0.0;
881   float fsum = 0.0;
882   float fval;
883   double dval;
884   DoubleComplex zsum;
885   SingleComplex csum;
886   char *buf = NULL;
887   char *ptr = NULL;
888   zsum.real = 0.0;
889   zsum.imag = 0.0;
890   csum.real = 0.0;
891   csum.imag = 0.0;
892   _iterator_hdl hdl;
893 
894 
895   local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
896   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
897   if(local_sync_begin)pnga_sync();
898 
899   pnga_check_handle (g_a, "ga_norm_infinity_");
900 
901   pnga_inquire (g_a, &type, &ndim, dims);
902 
903   if(ndim<=0)
904     pnga_error("ga_norm_infinity: wrong dimension", ndim);
905   else if (ndim >= 3)
906     pnga_error("ga_norm_infinity: wrong dimension", ndim);
907 
908 
909   switch (type)
910   {
911     case C_INT:
912       buf = (void*)(&isum);
913       break;
914     case C_LONG:
915       buf = (void*)(&lsum);
916       break;
917     case C_FLOAT:
918       buf = (void*)(&fsum);
919       break;
920     case C_DBL:
921       buf = (void*)(&dsum);
922       break;
923     case C_DCPL:
924       buf = (void*)(&zsum);
925       break;
926     case C_SCPL:
927       buf = (void*)(&csum);
928       break;
929     default:
930       pnga_error("ga_norm_infinity_: wrong data type:", type);
931   }
932 
933 #if 1
934   pnga_local_iterator_init(g_a, &hdl);
935   while (pnga_local_iterator_next(&hdl,lo,hi,&ptr,&ld)) {
936     sgai_norm_infinity_block(g_a, ptr, lo, hi, ld, type, ndim, dims, buf);
937   }
938 #else
939   num_blocks_a = pnga_total_blocks(g_a);
940 
941   if (num_blocks_a < 0) {
942     pnga_distribution(g_a, me, lo, hi);
943     pnga_access_ptr(g_a, lo, hi, &ptr, &ld);
944     sgai_norm_infinity_block(g_a, ptr, lo, hi, ld, type, ndim, dims, buf);
945     pnga_release_update(g_a, lo, hi);
946   } else {
947     Integer idx;
948     /* Simple block-cyclic data distribution */
949     if (!pnga_uses_proc_grid(g_a)) {
950       for (idx = me; idx < num_blocks_a; idx += nproc) {
951         pnga_distribution(g_a, idx, lo, hi);
952         pnga_access_block_ptr(g_a, idx, &ptr, &ld);
953         sgai_norm_infinity_block(g_a, ptr, lo, hi, ld, type, ndim, dims, buf);
954         pnga_release_update_block(g_a, idx);
955       }
956     } else {
957       /* Uses scalapack block-cyclic data distribution */
958       Integer chk;
959       Integer proc_index[MAXDIM], index[MAXDIM];
960       Integer topology[MAXDIM];
961       Integer blocks[MAXDIM], block_dims[MAXDIM];
962       pnga_get_proc_index(g_a, me, proc_index);
963       pnga_get_proc_index(g_a, me, index);
964       pnga_get_block_info(g_a, blocks, block_dims);
965       pnga_get_proc_grid(g_a, topology);
966       while (index[ndim-1] < blocks[ndim-1]) {
967         /* find bounding coordinates of block */
968         chk = 1;
969         for (i = 0; i < ndim; i++) {
970           lo[i] = index[i]*block_dims[i]+1;
971           hi[i] = (index[i] + 1)*block_dims[i];
972           if (hi[i] > dims[i]) hi[i] = dims[i];
973           if (hi[i] < lo[i]) chk = 0;
974         }
975         if (chk) {
976           pnga_access_block_grid_ptr(g_a, index, &ptr, &ld);
977           sgai_norm_infinity_block(g_a, ptr, lo, hi, ld, type, ndim, dims, buf);
978           pnga_release_update_block_grid(g_a, index);
979         }
980         /* increment index to get next block on processor */
981         index[0] += topology[0];
982         for (i = 0; i < ndim; i++) {
983           if (index[i] >= blocks[i] && i<ndim-1) {
984             index[i] = proc_index[i];
985             index[i+1] += topology[i+1];
986           }
987         }
988       }
989     }
990   }
991 #endif
992 
993 
994   /*calculate the global value buf[j] for each column */
995   switch (type)
996   {
997     case C_INT:
998       armci_msg_igop (&isum, 1, "max");
999       break;
1000     case C_DBL:
1001       armci_msg_dgop (&dsum, 1, "max");
1002       break;
1003     case C_DCPL:
1004       dval = zsum.real;
1005       armci_msg_dgop (&dval, 1, "max");
1006       zsum.real = dval;
1007       break;
1008     case C_FLOAT:
1009       armci_msg_fgop (&fsum, 1, "max");
1010       break;
1011     case C_SCPL:
1012       fval = csum.real;
1013       armci_msg_fgop (&fval, 1, "max");
1014       csum.real = fval;
1015       break;
1016     case C_LONG:
1017       armci_msg_lgop (&lsum, 1, "max");
1018       break;
1019     default:
1020       pnga_error("ga_norm_infinity_: wrong data type ", type);
1021   }
1022 
1023   /*evaluate the norm infinity for the matrix g_a */
1024   switch (type)
1025   {
1026     case C_INT:
1027       *nm = (double)isum;
1028       break;
1029     case C_LONG:
1030       *nm = (double)lsum;
1031       break;
1032     case C_FLOAT:
1033       *nm = (double)fsum;
1034       break;
1035     case C_DBL:
1036       *nm = (double)dsum;
1037       break;
1038     case C_DCPL:
1039       *nm = (double)(zsum.real);
1040       break;
1041     case C_SCPL:
1042       *nm = (double)(csum.real);
1043       break;
1044     default:
1045       pnga_error("ga_norm_infinity_:wrong data type.", type);
1046   }
1047 
1048   if (local_sync_end)pnga_sync();
1049 }
1050 
sgai_norm1_block(Integer g_a,void * ptr,Integer * lo,Integer * hi,Integer ld,Integer type,Integer ndim,Integer * dims,void * buf)1051 static void sgai_norm1_block(Integer g_a, void *ptr,
1052                      Integer *lo, Integer *hi, Integer ld,
1053                      Integer type,
1054                      Integer ndim, Integer *dims, void *buf)
1055 {
1056   Integer iloA=0, ihiA=0, jloA=0, jhiA=0;
1057   Integer i, j;
1058   int *isum = NULL;
1059   long *lsum = NULL;
1060   double *dsum = NULL;
1061   float *fsum = NULL;
1062   DoubleComplex *zsum = NULL;
1063   SingleComplex *csum = NULL;
1064 
1065   switch (type)
1066   {
1067     case C_INT:
1068       isum = (int *) buf;
1069       break;
1070     case C_LONG:
1071       lsum = (long *) buf;
1072       break;
1073     case C_FLOAT:
1074       fsum = (float *) buf;
1075       break;
1076     case C_DBL:
1077       dsum = (double *) buf;
1078       break;
1079     case C_DCPL:
1080       zsum = (DoubleComplex *) buf;
1081       break;
1082     case C_SCPL:
1083       csum = (SingleComplex *) buf;
1084       break;
1085     default:
1086       pnga_error("ga1_norm1_block: wrong data type:", type);
1087   }
1088 
1089   if(ndim<=0)
1090     pnga_error("sgai_norm1_block: wrong dimension", ndim);
1091   else if(ndim == 1) {
1092     iloA=lo[0];
1093     ihiA=hi[0];
1094     jloA=1;
1095     jhiA=1;
1096   }
1097   else if(ndim == 2)
1098   {
1099     iloA=lo[0];
1100     ihiA=hi[0];
1101     jloA=lo[1];
1102     jhiA=hi[1];
1103   }
1104   else
1105     pnga_error("sgai_norm1_block: wrong dimension", ndim);
1106 
1107   /* determine subset of my patch to access */
1108   if (ihiA > 0 && jhiA > 0)
1109   {
1110     switch (type)
1111     {
1112       int *pi;
1113       double *pd;
1114       long *pl;
1115       float *pf;
1116       DoubleComplex *pz;
1117       SingleComplex *pc;
1118       case C_INT:
1119         pi = (int *) ptr;
1120         *isum = 0;
1121         for (i = 0; i < ihiA - iloA + 1; i++)
1122           for (j = 0; j < jhiA - jloA + 1; j++)
1123             *isum += GA_ABS (pi[j * ld + i]);
1124         break;
1125       case C_LONG:
1126         pl = (long *) ptr;
1127         *lsum = 0;
1128         for (i = 0; i < ihiA - iloA + 1; i++)
1129           for (j = 0; j < jhiA - jloA + 1; j++)
1130             *lsum += GA_ABS (pl[j * ld + i]);
1131         break;
1132       case C_DCPL:
1133         pz = (DoubleComplex *) ptr;
1134         (*zsum).real = 0.0;
1135         (*zsum).imag = 0.0;
1136         for (i = 0; i < ihiA - iloA + 1; i++)
1137           for (j = 0; j < jhiA - jloA + 1; j++)
1138           {
1139             DoubleComplex zval = pz[j * ld + i];
1140             double temp =
1141               sqrt (zval.real * zval.real + zval.imag * zval.imag);
1142             (*zsum).real += temp;
1143           }
1144         break;
1145       case C_SCPL:
1146         pc = (SingleComplex *) ptr;
1147         (*csum).real = 0.0;
1148         (*csum).imag = 0.0;
1149         for (i = 0; i < ihiA - iloA + 1; i++)
1150           for (j = 0; j < jhiA - jloA + 1; j++)
1151           {
1152             SingleComplex cval = pc[j * ld + i];
1153             float temp =
1154               sqrt (cval.real * cval.real + cval.imag * cval.imag);
1155             (*csum).real += temp;
1156           }
1157         break;
1158       case C_FLOAT:
1159         pf = (float *) ptr;
1160         *fsum = 0.0;
1161         for (i = 0; i < ihiA - iloA + 1; i++)
1162           for (j = 0; j < jhiA - jloA + 1; j++)
1163             *fsum += GA_ABS (pf[j * ld + i]);
1164         break;
1165       case C_DBL:
1166         pd = (double *) ptr;
1167         *dsum = 0.0;
1168         for (i = 0; i < ihiA - iloA + 1; i++)
1169           for (j = 0; j < jhiA - jloA + 1; j++)
1170             *dsum += GA_ABS (pd[j * ld + i]);
1171         break;
1172       default:
1173         pnga_error("sgai_norm1_block: wrong data type ", type);
1174     }
1175   }
1176 }
1177 
1178 /* Evaluate the L_1 norm of an array. This should be the sum of the absolute
1179  * value of the individual array elements*/
1180 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1181 #   pragma weak wnga_norm1 = pnga_norm1
1182 #endif
pnga_norm1(Integer g_a,double * nm)1183 void pnga_norm1(Integer g_a, double *nm)
1184 {
1185   Integer type=0;
1186   Integer me = pnga_nodeid (), i, j, nproc = pnga_nnodes();
1187   Integer ndim, dims[MAXDIM], lo[2], hi[2], ld;
1188   Integer num_blocks_a;
1189   int local_sync_begin,local_sync_end;
1190   int isum = 0;
1191   long lsum = 0;
1192   double dsum = 0.0;
1193   float fsum = 0.0;
1194   double dval;
1195   float fval;
1196   DoubleComplex zsum;
1197   SingleComplex csum;
1198   char *buf = NULL;                    /*temporary buffer */
1199   char *ptr = NULL;
1200   zsum.real = 0.0;
1201   zsum.imag = 0.0;
1202   csum.real = 0.0;
1203   csum.imag = 0.0;
1204   _iterator_hdl hdl;
1205 
1206   local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
1207   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
1208   if(local_sync_begin)pnga_sync();
1209 
1210   pnga_check_handle (g_a, "ga_norm1_");
1211 
1212   pnga_inquire (g_a, &type, &ndim, dims);
1213 
1214 
1215   if(ndim<=0 || ndim > 2)
1216     pnga_error("ga_norm1: wrong dimension", ndim);
1217 
1218   switch (type)
1219   {
1220     case C_INT:
1221       buf = (void*)(&isum);
1222       break;
1223     case C_LONG:
1224       buf = (void*)(&lsum);
1225       break;
1226     case C_FLOAT:
1227       buf = (void*)(&fsum);
1228       break;
1229     case C_DBL:
1230       buf = (void*)(&dsum);
1231       break;
1232     case C_DCPL:
1233       buf = (void*)(&zsum);
1234       break;
1235     case C_SCPL:
1236       buf = (void*)(&csum);
1237       break;
1238     default:
1239       pnga_error("ga_norm1_: wrong data type:", type);
1240   }
1241 
1242 #if 1
1243   pnga_local_iterator_init(g_a, &hdl);
1244   while (pnga_local_iterator_next(&hdl,lo,hi,&ptr,&ld)) {
1245     sgai_norm1_block(g_a, ptr, lo, hi, ld, type, ndim, dims, buf);
1246   }
1247 #else
1248   num_blocks_a = pnga_total_blocks(g_a);
1249 
1250   if (num_blocks_a < 0) {
1251     pnga_distribution(g_a, me, lo, hi);
1252     pnga_access_ptr(g_a, lo, hi, &ptr, &ld);
1253     sgai_norm1_block(g_a, ptr, lo, hi, ld, type, ndim, dims, buf);
1254     pnga_release_update(g_a, lo, hi);
1255   } else {
1256     Integer idx;
1257     /* Simple block-cyclic data distribution */
1258     if (!pnga_uses_proc_grid(g_a)) {
1259       for (idx = me; idx < num_blocks_a; idx += nproc) {
1260         pnga_distribution(g_a, idx, lo, hi);
1261         pnga_access_block_ptr(g_a, idx, &ptr, &ld);
1262         sgai_norm1_block(g_a, ptr, lo, hi, ld, type, ndim, dims, buf);
1263         pnga_release_update_block(g_a, idx);
1264       }
1265     } else {
1266       /* Uses scalapack block-cyclic data distribution */
1267       Integer chk;
1268       Integer proc_index[MAXDIM], index[MAXDIM];
1269       Integer topology[MAXDIM];
1270       Integer blocks[MAXDIM], block_dims[MAXDIM];
1271       pnga_get_proc_index(g_a, me, proc_index);
1272       pnga_get_proc_index(g_a, me, index);
1273       pnga_get_block_info(g_a, blocks, block_dims);
1274       pnga_get_proc_grid(g_a, topology);
1275       while (index[ndim-1] < blocks[ndim-1]) {
1276         /* find bounding coordinates of block */
1277         chk = 1;
1278         for (i = 0; i < ndim; i++) {
1279           lo[i] = index[i]*block_dims[i]+1;
1280           hi[i] = (index[i] + 1)*block_dims[i];
1281           if (hi[i] > dims[i]) hi[i] = dims[i];
1282           if (hi[i] < lo[i]) chk = 0;
1283         }
1284         if (chk) {
1285           pnga_access_block_grid_ptr(g_a, index, &ptr, &ld);
1286           sgai_norm1_block(g_a, ptr, lo, hi, ld, type, ndim, dims, buf);
1287           pnga_release_update_block_grid(g_a, index);
1288         }
1289         /* increment index to get next block on processor */
1290         index[0] += topology[0];
1291         for (i = 0; i < ndim; i++) {
1292           if (index[i] >= blocks[i] && i<ndim-1) {
1293             index[i] = proc_index[i];
1294             index[i+1] += topology[i+1];
1295           }
1296         }
1297       }
1298     }
1299   }
1300 #endif
1301 
1302   /*calculate the global value buf[j] for each column */
1303   switch (type)
1304   {
1305     case C_INT:
1306       armci_msg_igop (&isum, 1, "+");
1307       break;
1308     case C_DBL:
1309       armci_msg_dgop (&dsum, 1, "+");
1310       break;
1311     case C_DCPL:
1312       dval = zsum.real;
1313       armci_msg_dgop (&dval, 1, "+");
1314       zsum.real = dval;
1315       break;
1316     case C_FLOAT:
1317       armci_msg_fgop (&fsum, 1, "+");
1318       break;
1319     case C_SCPL:
1320       fval = csum.real;
1321       armci_msg_fgop (&fval, 1, "+");
1322       csum.real = fval;
1323       break;
1324     case C_LONG:
1325       armci_msg_lgop (&lsum, 1, "+");
1326       break;
1327     default:
1328       pnga_error("ga_norm1_: wrong data type ", type);
1329   }
1330 
1331   /*evaluate the norm1 for the matrix g_a */
1332   switch (type)
1333   {
1334     case C_INT:
1335       *nm = (double)isum;
1336       break;
1337     case C_LONG:
1338       *nm = (double)lsum;
1339       break;
1340     case C_FLOAT:
1341       *nm = (double)fsum;
1342       break;
1343     case C_DBL:
1344       *nm = (double)dsum;
1345       break;
1346     case C_DCPL:
1347       *nm = (double)(zsum.real);
1348       break;
1349     case C_SCPL:
1350       *nm = (double)(csum.real);
1351       break;
1352     default:
1353       pnga_error("ga_norm1_:wrong data type.", type);
1354   }
1355 
1356   if(local_sync_end)pnga_sync();
1357 }
1358 
sgai_get_diagonal_block(Integer g_a,void * ptr,Integer g_v,Integer * loA,Integer * hiA,Integer ld,Integer type)1359 static void sgai_get_diagonal_block(Integer g_a, void *ptr, Integer g_v,
1360                             Integer *loA, Integer *hiA, Integer ld,
1361                             Integer type)
1362 {
1363   Integer nelem, size;
1364   Integer vlo, vhi, iloA, ihiA, jloA, jhiA, lo[2], hi[2];
1365   Integer i;
1366   void *buf;
1367   int *ia;
1368   float *fa;
1369   double *da;
1370   long *la;
1371   DoubleComplex *dca;
1372   SingleComplex *fca;
1373 
1374   iloA = loA[0];
1375   ihiA = hiA[0];
1376   jloA = loA[1];
1377   jhiA = hiA[1];
1378 
1379   /* determine subset of my patch to access */
1380   if (iloA > 0)
1381   {
1382     lo[0] = GA_MAX (iloA, jloA);
1383     lo[1] = GA_MAX (iloA, jloA);
1384     hi[0] = GA_MIN (ihiA, jhiA);
1385     hi[1] = GA_MIN (ihiA, jhiA);
1386 
1387 
1388 
1389     if (hi[0] >= lo[0]) /*make sure the equality symbol is there!!! */
1390     {                   /* we got a block containing diagonal elements */
1391 
1392       /*allocate a buffer for the given vector g_v */
1393       size = GAsizeof (type);
1394       vlo = GA_MAX (iloA, jloA);
1395       vhi = GA_MIN (ihiA, jhiA);
1396       nelem = vhi - vlo + 1;
1397       buf = malloc (nelem * size);
1398       if (buf == NULL)
1399         pnga_error
1400           ("ga_get_diag_:failed to allocate memory for the local buffer.",
1401            9999);
1402 
1403       /* get the vector from the global array g_a, put that in the the local memory buffer buf */
1404       switch (type)
1405       {
1406         case C_INT:
1407           ia = (int *) ptr;
1408           ia += ld*(lo[1]-jloA) + lo[0]-iloA;
1409           for (i = 0; i < hi[0] - lo[0] + 1; i++)
1410           {
1411             ((int *) buf)[i] = *ia;
1412             ia += ld + 1;
1413           }
1414           break;
1415         case C_LONG:
1416           la = (long *) ptr;
1417           la += ld*(lo[1]-jloA) + lo[0]-iloA;
1418           for (i = 0; i < hi[0] - lo[0] + 1; i++)
1419           {
1420             ((long *) buf)[i] = *la;
1421             la += ld + 1;
1422           }
1423           break;
1424         case C_FLOAT:
1425           fa = (float *) ptr;
1426           fa += ld*(lo[1]-jloA) + lo[0]-iloA;
1427           for (i = 0; i < hi[0] - lo[0] + 1; i++)
1428           {
1429             ((float *) buf)[i] = *fa;
1430             fa += ld + 1;
1431           }
1432           break;
1433         case C_DBL:
1434           da = (double *) ptr;
1435           da += ld*(lo[1]-jloA) + lo[0]-iloA;
1436           for (i = 0; i < hi[0] - lo[0] + 1; i++)
1437           {
1438             ((double *) buf)[i] = *da;
1439             da += ld + 1;
1440           }
1441           break;
1442         case C_DCPL:
1443           dca = (DoubleComplex *) ptr;
1444           dca += ld*(lo[1]-jloA) + lo[0]-iloA;
1445           for (i = 0; i < hi[0] - lo[0] + 1; i++)
1446           {
1447             (((DoubleComplex *) buf)[i]).real = (*dca).real;
1448             (((DoubleComplex *) buf)[i]).imag = (*dca).imag;
1449             dca += ld + 1;
1450           }
1451           break;
1452 
1453         case C_SCPL:
1454           fca = (SingleComplex *) ptr;
1455           fca += ld*(lo[1]-jloA) + lo[0]-iloA;
1456           for (i = 0; i < hi[0] - lo[0] + 1; i++)
1457           {
1458             (((SingleComplex *) buf)[i]).real = (*fca).real;
1459             (((SingleComplex *) buf)[i]).imag = (*fca).imag;
1460             fca += ld + 1;
1461           }
1462           break;
1463 
1464         default:
1465           pnga_error("get_diagonal_zero: wrong data type:", type);
1466       }
1467 
1468       /* copy the local memory buffer buf to g_v */
1469       pnga_put(g_v, &vlo, &vhi, buf, &vhi);
1470 
1471       /*free the memory */
1472       free (buf);
1473     }
1474   }
1475 }
1476 
1477 /* Get the diagonal elements of a matrix and copy them into a one dimensional
1478  * array */
1479 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1480 #   pragma weak wnga_get_diag = pnga_get_diag
1481 #endif
pnga_get_diag(Integer g_a,Integer g_v)1482 void pnga_get_diag(Integer g_a, Integer g_v)
1483 {
1484   Integer vndim, vdims, dim1, dim2, vtype, atype, type;
1485   Integer me = pnga_nodeid (), i, nproc = pnga_nnodes();
1486   Integer andim, adims[2];
1487   Integer loA[2], hiA[2], ld;
1488   Integer num_blocks_a;
1489   int local_sync_begin,local_sync_end;
1490   char *ptr;
1491   _iterator_hdl hdl;
1492 
1493   local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
1494   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
1495   if(local_sync_begin)pnga_sync();
1496 
1497   pnga_check_handle (g_a, "ga_get_diag_");
1498   pnga_check_handle (g_v, "ga_get_diag_");
1499 
1500   pnga_inquire (g_a, &atype, &andim, adims);
1501   dim1 = adims[0];
1502   dim2 = adims[1];
1503   type = atype;
1504   pnga_inquire (g_v, &vtype, &vndim, &vdims);
1505 
1506   /* Perform some error checking */
1507   if (andim != 2)
1508     pnga_error("ga_get_diag: wrong dimension for g_a.", andim);
1509   if (vndim != 1)
1510     pnga_error("ga_get_diag: wrong dimension for g_v.", vndim);
1511 
1512   if (vdims != GA_MIN (dim1, dim2))
1513     pnga_error
1514       ("ga_get_diag: The size of the first array's diagonal is greater than the size of the second array.",
1515        type);
1516 
1517   if (vtype != atype)
1518   {
1519     pnga_error
1520       ("ga_get_diag: input global arrays do not have the same data type. Global array type =",
1521        atype);
1522   }
1523 
1524 #if 1
1525   pnga_local_iterator_init(g_a, &hdl);
1526   while (pnga_local_iterator_next(&hdl,loA,hiA,&ptr,&ld)) {
1527     sgai_get_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
1528   }
1529 #else
1530   num_blocks_a = pnga_total_blocks(g_a);
1531 
1532   if (num_blocks_a < 0) {
1533     pnga_distribution(g_a, me, loA, hiA);
1534     pnga_access_ptr(g_a, loA, hiA, &ptr, &ld);
1535     sgai_get_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
1536     pnga_release_update(g_a, loA, hiA);
1537   } else {
1538     Integer idx;
1539     /* Simple block-cyclic data distribution */
1540     if (!pnga_uses_proc_grid(g_a)) {
1541       for (idx = me; idx < num_blocks_a; idx += nproc) {
1542         pnga_distribution(g_a, idx, loA, hiA);
1543         pnga_access_block_ptr(g_a, idx, &ptr, &ld);
1544         sgai_get_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
1545         pnga_release_update_block(g_a, idx);
1546       }
1547     } else {
1548       /* Uses scalapack block-cyclic data distribution */
1549       Integer chk;
1550       Integer proc_index[MAXDIM], index[MAXDIM];
1551       Integer topology[MAXDIM];
1552       Integer blocks[MAXDIM], block_dims[MAXDIM];
1553       pnga_get_proc_index(g_a, me, proc_index);
1554       pnga_get_proc_index(g_a, me, index);
1555       pnga_get_block_info(g_a, blocks, block_dims);
1556       pnga_get_proc_grid(g_a, topology);
1557       while (index[andim-1] < blocks[andim-1]) {
1558         /* find bounding coordinates of block */
1559         chk = 1;
1560         for (i = 0; i < andim; i++) {
1561           loA[i] = index[i]*block_dims[i]+1;
1562           hiA[i] = (index[i] + 1)*block_dims[i];
1563           if (hiA[i] > adims[i]) hiA[i] = adims[i];
1564           if (hiA[i] < loA[i]) chk = 0;
1565         }
1566         if (chk) {
1567           pnga_access_block_grid_ptr(g_a, index, &ptr, &ld);
1568           sgai_get_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
1569           pnga_release_update_block_grid(g_a, index);
1570         }
1571         /* increment index to get next block on processor */
1572         index[0] += topology[0];
1573         for (i = 0; i < andim; i++) {
1574           if (index[i] >= blocks[i] && i<andim-1) {
1575             index[i] = proc_index[i];
1576             index[i+1] += topology[i+1];
1577           }
1578         }
1579       }
1580     }
1581   }
1582 #endif
1583 
1584   if(local_sync_end)pnga_sync();
1585 }
1586 
1587 
sgai_add_diagonal_block(Integer g_a,void * ptr,Integer g_v,Integer * loA,Integer * hiA,Integer ld,Integer type)1588 static void sgai_add_diagonal_block(Integer g_a, void *ptr, Integer g_v,
1589                             Integer *loA, Integer *hiA, Integer ld,
1590                             Integer type)
1591 {
1592   Integer nelem, size;
1593   Integer vlo, vhi, iloA, ihiA, jloA, jhiA, lo[2], hi[2];
1594   Integer i;
1595   void *buf;
1596   int *ia;
1597   float *fa;
1598   double *da;
1599   long *la;
1600   DoubleComplex *dca;
1601   SingleComplex *fca;
1602 
1603   iloA = loA[0];
1604   ihiA = hiA[0];
1605   jloA = loA[1];
1606   jhiA = hiA[1];
1607 
1608   /* determine subset of my patch to access */
1609   if (iloA > 0)
1610   {
1611     lo[0] = GA_MAX (iloA, jloA);
1612     lo[1] = GA_MAX (iloA, jloA);
1613     hi[0] = GA_MIN (ihiA, jhiA);
1614     hi[1] = GA_MIN (ihiA, jhiA);
1615 
1616 
1617 
1618     if (hi[0] >= lo[0]) /*make sure the equality symbol is there!!! */
1619     {                   /* we got a block containing diagonal elements */
1620 
1621       /*allocate a buffer for the given vector g_v */
1622       size = GAsizeof (type);
1623       vlo = GA_MAX (iloA, jloA);
1624       vhi = GA_MIN (ihiA, jhiA);
1625       nelem = vhi - vlo + 1;
1626       buf = malloc (nelem * size);
1627       if (buf == NULL)
1628         pnga_error
1629           ("ga_add_diagonal_:failed to allocate memory for the local buffer.",
1630            0);
1631 
1632       /* get the vector from the global array to the local memory buffer */
1633       pnga_get (g_v, &vlo, &vhi, buf, &vhi);
1634 
1635       switch (type)
1636       {
1637         case C_INT:
1638           ia = (int *) ptr;
1639           ia += ld*(lo[1]-jloA) + lo[0]-iloA;
1640           for (i = 0; i < hi[0] - lo[0] + 1; i++)
1641           {
1642             *ia += ((int *) buf)[i];
1643             ia += ld + 1;
1644           }
1645           break;
1646         case C_LONG:
1647           la = (long *) ptr;
1648           la += ld*(lo[1]-jloA) + lo[0]-iloA;
1649           for (i = 0; i < hi[0] - lo[0] + 1; i++)
1650           {
1651             *la += ((long *) buf)[i];
1652             la += ld + 1;
1653           }
1654           break;
1655         case C_FLOAT:
1656           fa = (float *) ptr;
1657           fa += ld*(lo[1]-jloA) + lo[0]-iloA;
1658           for (i = 0; i < hi[0] - lo[0] + 1; i++)
1659           {
1660             *fa += ((float *) buf)[i];
1661             fa += ld + 1;
1662           }
1663           break;
1664         case C_DBL:
1665           da = (double *) ptr;
1666           da += ld*(lo[1]-jloA) + lo[0]-iloA;
1667           for (i = 0; i < hi[0] - lo[0] + 1; i++)
1668           {
1669             *da += ((double *) buf)[i];
1670             da += ld + 1;
1671           }
1672           break;
1673         case C_DCPL:
1674           dca = (DoubleComplex *) ptr;
1675           dca += ld*(lo[1]-jloA) + lo[0]-iloA;
1676           for (i = 0; i < hi[0] - lo[0] + 1; i++)
1677           {
1678             (*dca).real += (((DoubleComplex *) buf)[i]).real;
1679             (*dca).imag += (((DoubleComplex *) buf)[i]).imag;
1680             dca += ld + 1;
1681           }
1682           break;
1683 
1684         case C_SCPL:
1685           fca = (SingleComplex *) ptr;
1686           fca += ld*(lo[1]-jloA) + lo[0]-iloA;
1687           for (i = 0; i < hi[0] - lo[0] + 1; i++)
1688           {
1689             (*fca).real += (((SingleComplex *) buf)[i]).real;
1690             (*fca).imag += (((SingleComplex *) buf)[i]).imag;
1691             fca += ld + 1;
1692           }
1693           break;
1694 
1695         default:
1696           pnga_error("ga_add_diagonal_: wrong data type:", type);
1697       }
1698 
1699       /*free the memory */
1700       free (buf);
1701     }
1702   }
1703 }
1704 
1705 /* Add values in vector g_v to diagonal elements of g_a */
1706 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1707 #   pragma weak wnga_add_diagonal = pnga_add_diagonal
1708 #endif
pnga_add_diagonal(Integer g_a,Integer g_v)1709 void pnga_add_diagonal(Integer g_a, Integer g_v)
1710 {
1711   Integer vndim, vdims, dim1, dim2, vtype, atype, type;
1712   Integer me = pnga_nodeid (), i, nproc = pnga_nnodes();
1713   Integer andim, adims[2];
1714   Integer loA[2], hiA[2], ld;
1715   Integer num_blocks_a;
1716   int local_sync_begin,local_sync_end;
1717   char *ptr;
1718   _iterator_hdl hdl;
1719 
1720   local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
1721   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
1722   if(local_sync_begin)pnga_sync();
1723 
1724   pnga_check_handle (g_a, "ga_add_diagonal_");
1725   pnga_check_handle (g_v, "ga_add_diagonal_");
1726 
1727   pnga_inquire(g_a, &atype, &andim, adims);
1728   dim1 = adims[0];
1729   dim2 = adims[1];
1730   type = atype;
1731   pnga_inquire(g_v, &vtype, &vndim, &vdims);
1732 
1733   /* Perform some error checking */
1734   if (andim != 2)
1735     pnga_error("ga_add_diagonal: wrong dimension for g_a.", andim);
1736   if (vndim != 1)
1737     pnga_error("ga_add_diagonal: wrong dimension for g_v.", vndim);
1738 
1739 
1740   if (vdims != GA_MIN (dim1, dim2))
1741     pnga_error
1742       ("ga_add_diagonal: The size of the first array's diagonal is greater than the size of the second array.",
1743        type);
1744 
1745   if (vtype != atype)
1746   {
1747     pnga_error
1748       ("ga_add_diagonal: input global arrays do not have the same data type. Global array type =",
1749        atype);
1750   }
1751 
1752 #if 1
1753   pnga_local_iterator_init(g_a, &hdl);
1754   while (pnga_local_iterator_next(&hdl,loA,hiA,&ptr,&ld)) {
1755     sgai_add_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
1756   }
1757 #else
1758   num_blocks_a = pnga_total_blocks(g_a);
1759 
1760   if (num_blocks_a < 0) {
1761     pnga_distribution(g_a, me, loA, hiA);
1762     pnga_access_ptr(g_a, loA, hiA, &ptr, &ld);
1763     sgai_add_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
1764   } else {
1765     Integer idx;
1766     /* Simple block-cyclic data distribution */
1767     if (!pnga_uses_proc_grid(g_a)) {
1768       for (idx = me; idx < num_blocks_a; idx += nproc) {
1769         pnga_distribution(g_a, idx, loA, hiA);
1770         pnga_access_block_ptr(g_a, idx, &ptr, &ld);
1771         sgai_add_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
1772         pnga_release_update_block(g_a, idx);
1773       }
1774     } else {
1775       /* Uses scalapack block-cyclic data distribution */
1776       Integer chk;
1777       Integer proc_index[MAXDIM], index[MAXDIM];
1778       Integer topology[MAXDIM];
1779       Integer blocks[MAXDIM], block_dims[MAXDIM];
1780       pnga_get_proc_index(g_a, me, proc_index);
1781       pnga_get_proc_index(g_a, me, index);
1782       pnga_get_block_info(g_a, blocks, block_dims);
1783       pnga_get_proc_grid(g_a, topology);
1784       while (index[andim-1] < blocks[andim-1]) {
1785         /* find bounding coordinates of block */
1786         chk = 1;
1787         for (i = 0; i < andim; i++) {
1788           loA[i] = index[i]*block_dims[i]+1;
1789           hiA[i] = (index[i] + 1)*block_dims[i];
1790           if (hiA[i] > adims[i]) hiA[i] = adims[i];
1791           if (hiA[i] < loA[i]) chk = 0;
1792         }
1793         if (chk) {
1794           pnga_access_block_grid_ptr(g_a, index, &ptr, &ld);
1795           sgai_add_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
1796           pnga_release_update_block_grid(g_a, index);
1797         }
1798         /* increment index to get next block on processor */
1799         index[0] += topology[0];
1800         for (i = 0; i < andim; i++) {
1801           if (index[i] >= blocks[i] && i<andim-1) {
1802             index[i] = proc_index[i];
1803             index[i+1] += topology[i+1];
1804           }
1805         }
1806       }
1807     }
1808   }
1809 #endif
1810   if(local_sync_end)pnga_sync();
1811 }
1812 
sgai_set_diagonal_block(Integer g_a,void * ptr,Integer g_v,Integer * loA,Integer * hiA,Integer ld,Integer type)1813 static void sgai_set_diagonal_block(Integer g_a, void *ptr, Integer g_v, Integer *loA,
1814                             Integer *hiA, Integer ld, Integer type)
1815 {
1816   Integer nelem, size;
1817   Integer vlo, vhi, iloA, ihiA, jloA, jhiA, lo[2], hi[2];
1818   Integer i;
1819   void *buf;
1820   int *ia;
1821   float *fa;
1822   double *da;
1823   long *la;
1824   DoubleComplex *dca;
1825   SingleComplex *fca;
1826 
1827   iloA = loA[0];
1828   ihiA = hiA[0];
1829   jloA = loA[1];
1830   jhiA = hiA[1];
1831 
1832   /* determine subset of my patch to access */
1833   if (iloA > 0)
1834   {
1835     lo[0] = GA_MAX (iloA, jloA);
1836     lo[1] = GA_MAX (iloA, jloA);
1837     hi[0] = GA_MIN (ihiA, jhiA);
1838     hi[1] = GA_MIN (ihiA, jhiA);
1839 
1840 
1841 
1842     if (hi[0] >= lo[0]) /*make sure the equality symbol is there!!! */
1843     {                   /* we got a block containing diagonal elements*/
1844 
1845       /*allocate a buffer for the given vector g_v */
1846       size = GAsizeof (type);
1847       vlo = GA_MAX (iloA, jloA);
1848       vhi = GA_MIN (ihiA, jhiA);
1849       nelem = vhi - vlo + 1;
1850       buf = malloc (nelem * size);
1851       if (buf == NULL)
1852         pnga_error
1853           ("ga_set_diagonal_:failed to allocate memory for local buffer",0);
1854 
1855       /* get the vector from the global array to the local memory buffer */
1856       pnga_get (g_v, &vlo, &vhi, buf, &vhi);
1857 
1858       switch (type)
1859       {
1860         case C_INT:
1861           ia = (int *) ptr;
1862           ia += ld*(lo[1]-jloA) + lo[0]-iloA;
1863           for (i = 0; i < hi[0] - lo[0] + 1; i++)
1864           {
1865             *ia = ((int *) buf)[i];
1866             ia += ld + 1;
1867           }
1868           break;
1869         case C_LONG:
1870           la = (long *) ptr;
1871           la += ld*(lo[1]-jloA) + lo[0]-iloA;
1872           for (i = 0; i < hi[0] - lo[0] + 1; i++)
1873           {
1874             *la = ((long *) buf)[i];
1875             la += ld + 1;
1876           }
1877           break;
1878         case C_FLOAT:
1879           fa = (float *) ptr;
1880           fa += ld*(lo[1]-jloA) + lo[0]-iloA;
1881           for (i = 0; i < hi[0] - lo[0] + 1; i++)
1882           {
1883             *fa = ((float *) buf)[i];
1884             fa += ld + 1;
1885           }
1886           break;
1887         case C_DBL:
1888           da = (double *) ptr;
1889           da += ld*(lo[1]-jloA) + lo[0]-iloA;
1890           for (i = 0; i < hi[0] - lo[0] + 1; i++)
1891           {
1892             *da = ((double *) buf)[i];
1893             da += ld + 1;
1894           }
1895           break;
1896         case C_DCPL:
1897           dca = (DoubleComplex *) ptr;
1898           dca += ld*(lo[1]-jloA) + lo[0]-iloA;
1899           for (i = 0; i < hi[0] - lo[0] + 1; i++)
1900           {
1901             (*dca).real = (((DoubleComplex *) buf)[i]).real;
1902             (*dca).imag = (((DoubleComplex *) buf)[i]).imag;
1903             dca += ld + 1;
1904           }
1905           break;
1906 
1907         case C_SCPL:
1908           fca = (SingleComplex *) ptr;
1909           fca += ld*(lo[1]-jloA) + lo[0]-iloA;
1910           for (i = 0; i < hi[0] - lo[0] + 1; i++)
1911           {
1912             (*fca).real = (((SingleComplex *) buf)[i]).real;
1913             (*fca).imag = (((SingleComplex *) buf)[i]).imag;
1914             fca += ld + 1;
1915           }
1916           break;
1917 
1918         default:
1919           pnga_error("ga_set_diagonal_: wrong data type:", type);
1920       }
1921 
1922       /*free the memory */
1923       free (buf);
1924 
1925       /* release access to the data */
1926       lo[0] = iloA;
1927       lo[1] = jloA;
1928       hi[0] = ihiA;
1929       hi[1] = jhiA;
1930       pnga_release_update (g_a, lo, hi);
1931     }
1932   }
1933 }
1934 
1935 /* Set diagonal of g_a to have values in g_v */
1936 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1937 #   pragma weak wnga_set_diagonal = pnga_set_diagonal
1938 #endif
pnga_set_diagonal(Integer g_a,Integer g_v)1939 void pnga_set_diagonal(Integer g_a, Integer g_v)
1940 {
1941   Integer vndim, vdims, dim1, dim2, vtype, atype, type;
1942   Integer me = pnga_nodeid (), i, nproc = pnga_nnodes();
1943   Integer andim, adims[2];
1944   Integer loA[2], hiA[2], ld;
1945   Integer num_blocks_a;
1946   int local_sync_begin,local_sync_end;
1947   char *ptr;
1948   _iterator_hdl hdl;
1949 
1950   local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
1951   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
1952   if(local_sync_begin)pnga_sync();
1953 
1954   pnga_check_handle (g_a, "ga_set_diagonal_");
1955   pnga_check_handle (g_v, "ga_set_diagonal_");
1956 
1957   pnga_inquire (g_a, &atype, &andim, adims);
1958   dim1 = adims[0];
1959   dim2 = adims[1];
1960   type = atype;
1961   pnga_inquire (g_v, &vtype, &vndim, &vdims);
1962 
1963   /* Perform some error checking */
1964   if (andim != 2)
1965     pnga_error("ga_set_diagonal: wrong dimension for g_a.", andim);
1966   if (vndim != 1)
1967     pnga_error("ga_set_diagonal: wrong dimension for g_v.", vndim);
1968 
1969 
1970   if (vdims != GA_MIN (dim1, dim2))
1971     pnga_error
1972       ("ga_set_diagonal: The size of the first array's diagonal is greater than the size of the second array.",
1973        type);
1974 
1975   if (vtype != atype)
1976   {
1977     pnga_error
1978       ("ga_set_diagonal: input global arrays do not have the same data type. Global array type =",
1979        atype);
1980   }
1981 
1982 #if 1
1983   pnga_local_iterator_init(g_a, &hdl);
1984   while (pnga_local_iterator_next(&hdl,loA,hiA,&ptr,&ld)) {
1985     sgai_set_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
1986   }
1987 #else
1988   num_blocks_a = pnga_total_blocks(g_a);
1989 
1990   if (num_blocks_a < 0) {
1991     pnga_distribution(g_a, me, loA, hiA);
1992     pnga_access_ptr(g_a, loA, hiA, &ptr, &ld);
1993     sgai_set_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
1994     pnga_release_update(g_a, loA, hiA);
1995   } else {
1996     Integer idx;
1997     /* Simple block-cyclic data distribution */
1998     if (!pnga_uses_proc_grid(g_a)) {
1999       for (idx = me; idx < num_blocks_a; idx += nproc) {
2000         pnga_distribution(g_a, idx, loA, hiA);
2001         pnga_access_block_ptr(g_a, idx, &ptr, &ld);
2002         sgai_set_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
2003         pnga_release_update_block(g_a, idx);
2004       }
2005     } else {
2006       /* Uses scalapack block-cyclic data distribution */
2007       Integer chk;
2008       Integer proc_index[MAXDIM], index[MAXDIM];
2009       Integer topology[MAXDIM];
2010       Integer blocks[MAXDIM], block_dims[MAXDIM];
2011       pnga_get_proc_index(g_a, me, proc_index);
2012       pnga_get_proc_index(g_a, me, index);
2013       pnga_get_block_info(g_a, blocks, block_dims);
2014       pnga_get_proc_grid(g_a, topology);
2015       while (index[andim-1] < blocks[andim-1]) {
2016         /* find bounding coordinates of block */
2017         chk = 1;
2018         for (i = 0; i < andim; i++) {
2019           loA[i] = index[i]*block_dims[i]+1;
2020           hiA[i] = (index[i] + 1)*block_dims[i];
2021           if (hiA[i] > adims[i]) hiA[i] = adims[i];
2022           if (hiA[i] < loA[i]) chk = 0;
2023         }
2024         if (chk) {
2025           pnga_access_block_grid_ptr(g_a, index, &ptr, &ld);
2026           sgai_set_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
2027           pnga_release_update_block_grid(g_a, index);
2028         }
2029         /* increment index to get next block on processor */
2030         index[0] += topology[0];
2031         for (i = 0; i < andim; i++) {
2032           if (index[i] >= blocks[i] && i<andim-1) {
2033             index[i] = proc_index[i];
2034             index[i+1] += topology[i+1];
2035           }
2036         }
2037       }
2038     }
2039 
2040   }
2041 #endif
2042   if(local_sync_end)pnga_sync();
2043 }
2044 
sgai_shift_diagonal_block(Integer g_a,void * ptr,Integer * loA,Integer * hiA,Integer ld,void * c,Integer type)2045 static void sgai_shift_diagonal_block(Integer g_a, void *ptr, Integer *loA, Integer *hiA,
2046                               Integer ld, void *c, Integer type)
2047 {
2048   Integer iloA, ihiA, jloA, jhiA, lo[2], hi[2];
2049   Integer i;
2050   int *ia;
2051   float *fa;
2052   double *da;
2053   long *la;
2054   DoubleComplex *dca;
2055   SingleComplex *fca;
2056 
2057   iloA = loA[0];
2058   ihiA = hiA[0];
2059   jloA = loA[1];
2060   jhiA = hiA[1];
2061 
2062   /* determine subset of my patch to access */
2063   if (iloA > 0)
2064   {
2065     lo[0] = GA_MAX (iloA, jloA);
2066     lo[1] = GA_MAX (iloA, jloA);
2067     hi[0] = GA_MIN (ihiA, jhiA);
2068     hi[1] = GA_MIN (ihiA, jhiA);
2069     if (hi[0] >= lo[0]) /*make sure the equality sign is there since it is the singleton case */
2070     {                   /* we got a block containing diagonal elements */
2071 
2072       switch (type)
2073       {
2074         case C_INT:
2075           ia = (int *) ptr;
2076           ia += ld*(lo[1]-jloA) + lo[0]-iloA;
2077           for (i = 0; i < hi[0] - lo[0] + 1; i++)
2078           {
2079             *ia += *((int *) c);
2080             ia += ld + 1;
2081           }
2082           break;
2083         case C_LONG:
2084           la = (long *) ptr;
2085           la += ld*(lo[1]-jloA) + lo[0]-iloA;
2086           for (i = 0; i < hi[0] - lo[0] + 1; i++)
2087           {
2088             *la += *((long *) c);
2089             la += ld + 1;
2090           }
2091           break;
2092         case C_FLOAT:
2093           fa = (float *) ptr;
2094           fa += ld*(lo[1]-jloA) + lo[0]-iloA;
2095           for (i = 0; i < hi[0] - lo[0] + 1; i++)
2096           {
2097             *fa += *((float *) c);
2098             fa += ld + 1;
2099           }
2100           break;
2101         case C_DBL:
2102           da = (double *) ptr;
2103           da += ld*(lo[1]-jloA) + lo[0]-iloA;
2104           for (i = 0; i < hi[0] - lo[0] + 1; i++)
2105           {
2106             *da += *((double *) c);
2107             da += ld + 1;
2108           }
2109           break;
2110         case C_DCPL:
2111           dca = (DoubleComplex *) ptr;
2112           dca += ld*(lo[1]-jloA) + lo[0]-iloA;
2113           for (i = 0; i < hi[0] - lo[0] + 1; i++)
2114           {
2115             (*dca).real += (*((DoubleComplex *) c)).real;
2116             (*dca).imag += (*((DoubleComplex *) c)).imag;
2117             dca += ld + 1;
2118           }
2119           break;
2120 
2121         case C_SCPL:
2122           fca = (SingleComplex *) ptr;
2123           fca += ld*(lo[1]-jloA) + lo[0]-iloA;
2124           for (i = 0; i < hi[0] - lo[0] + 1; i++)
2125           {
2126             (*fca).real += (*((SingleComplex *) c)).real;
2127             (*fca).imag += (*((SingleComplex *) c)).imag;
2128             fca += ld + 1;
2129           }
2130           break;
2131 
2132         default:
2133           pnga_error("ga_shift_diagonal_: wrong data type:", type);
2134       }
2135     }
2136   }
2137 }
2138 
2139 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2140 #   pragma weak wnga_shift_diagonal = pnga_shift_diagonal
2141 #endif
pnga_shift_diagonal(Integer g_a,void * c)2142 void pnga_shift_diagonal(Integer g_a, void *c)
2143 {
2144   Integer loA[2], hiA[2]/*, dim1, dim2*/, ld;
2145   Integer andim, adims[2], type, atype;
2146   Integer me = pnga_nodeid (), i, nproc = pnga_nnodes();
2147   char *ptr;
2148   Integer num_blocks_a;
2149   int local_sync_begin,local_sync_end;
2150   _iterator_hdl hdl;
2151 
2152   local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2153   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2154   if(local_sync_begin)pnga_sync();
2155 
2156   pnga_check_handle (g_a, "ga_shift_diagonal_");
2157 
2158   pnga_inquire (g_a, &atype, &andim, adims);
2159   /*dim1 = adims[0];*/
2160   /*dim2 = adims[1];*/
2161   type = atype;
2162   if (andim != 2)
2163     pnga_error("Dimension must be 2 for shift diagonal operation",andim);
2164 
2165 #if 1
2166   pnga_local_iterator_init(g_a, &hdl);
2167   while (pnga_local_iterator_next(&hdl,loA,hiA,&ptr,&ld)) {
2168     sgai_shift_diagonal_block(g_a, ptr, loA, hiA, ld, c, type);
2169   }
2170 #else
2171   num_blocks_a = pnga_total_blocks(g_a);
2172 
2173   if (num_blocks_a < 0) {
2174     pnga_distribution(g_a, me, loA, hiA);
2175     pnga_access_ptr(g_a, loA, hiA, &ptr, &ld);
2176     sgai_shift_diagonal_block(g_a, ptr, loA, hiA, ld, c, type);
2177     pnga_release_update(g_a, loA, hiA);
2178   } else {
2179     Integer idx;
2180     /* Simple block-cyclic data distribution */
2181     if (!pnga_uses_proc_grid(g_a)) {
2182       for (idx = me; idx < num_blocks_a; idx += nproc) {
2183         pnga_distribution(g_a, idx, loA, hiA);
2184         pnga_access_block_ptr(g_a, idx, &ptr, &ld);
2185         sgai_shift_diagonal_block(g_a, ptr, loA, hiA, ld, c, type);
2186         pnga_release_update_block(g_a, idx);
2187       }
2188     } else {
2189       /* Uses scalapack block-cyclic data distribution */
2190       Integer chk;
2191       Integer proc_index[MAXDIM], index[MAXDIM];
2192       Integer topology[MAXDIM];
2193       Integer blocks[MAXDIM], block_dims[MAXDIM];
2194       pnga_get_proc_index(g_a, me, proc_index);
2195       pnga_get_proc_index(g_a, me, index);
2196       pnga_get_block_info(g_a, blocks, block_dims);
2197       pnga_get_proc_grid(g_a, topology);
2198       while (index[andim-1] < blocks[andim-1]) {
2199         /* find bounding coordinates of block */
2200         chk = 1;
2201         for (i = 0; i < andim; i++) {
2202           loA[i] = index[i]*block_dims[i]+1;
2203           hiA[i] = (index[i] + 1)*block_dims[i];
2204           if (hiA[i] > adims[i]) hiA[i] = adims[i];
2205           if (hiA[i] < loA[i]) chk = 0;
2206         }
2207         if (chk) {
2208           pnga_access_block_grid_ptr(g_a, index, &ptr, &ld);
2209           sgai_shift_diagonal_block(g_a, ptr, loA, hiA, ld, c, type);
2210           pnga_release_update_block_grid(g_a, index);
2211         }
2212         /* increment index to get next block on processor */
2213         index[0] += topology[0];
2214         for (i = 0; i < andim; i++) {
2215           if (index[i] >= blocks[i] && i<andim-1) {
2216             index[i] = proc_index[i];
2217             index[i+1] += topology[i+1];
2218           }
2219         }
2220       }
2221     }
2222   }
2223 #endif
2224   if(local_sync_end)pnga_sync();
2225 }
2226 
sgai_zero_diagonal_block(Integer g_a,void * ptr,Integer * lo,Integer * hi,Integer ld,Integer offset,Integer type)2227 static void sgai_zero_diagonal_block(Integer g_a, void *ptr, Integer *lo, Integer *hi,
2228                              Integer ld, Integer offset, Integer type)
2229 {
2230   Integer i;
2231   int *ia;
2232   float *fa;
2233   double *da;
2234   long *la;
2235   DoubleComplex *dca;
2236   SingleComplex *fca;
2237 
2238   switch (type) {
2239     case C_INT:
2240       ia = (int *) ptr + offset;
2241       for (i = 0; i < hi[0] - lo[0] + 1; i++) {
2242         *ia = 0;
2243         ia += ld + 1;
2244       }
2245       break;
2246     case C_LONG:
2247       la = (long *) ptr + offset;
2248       for (i = 0; i < hi[0] - lo[0] + 1; i++) {
2249         *la = 0;
2250         la += ld + 1;
2251       }
2252       break;
2253     case C_FLOAT:
2254       fa = (float *) ptr + offset;
2255       for (i = 0; i < hi[0] - lo[0] + 1; i++) {
2256         *fa = 0.0;
2257         fa += ld + 1;
2258       }
2259       break;
2260     case C_DBL:
2261       da = (double *) ptr + offset;
2262       for (i = 0; i < hi[0] - lo[0] + 1; i++) {
2263         *da = 0.0;
2264         da += ld + 1;
2265       }
2266       break;
2267     case C_DCPL:
2268       dca = (DoubleComplex *) ptr + offset;
2269       for (i = 0; i < hi[0] - lo[0] + 1; i++) {
2270         (*dca).real = 0.0;
2271         (*dca).imag = 0.0;
2272         dca += ld + 1;
2273       }
2274       break;
2275     case C_SCPL:
2276       fca = (SingleComplex *) ptr + offset;
2277       for (i = 0; i < hi[0] - lo[0] + 1; i++) {
2278         (*fca).real = 0.0;
2279         (*fca).imag = 0.0;
2280         fca += ld + 1;
2281       }
2282       break;
2283     default:
2284       pnga_error("set_diagonal_zero: wrong data type:", type);
2285   }
2286 }
2287 
2288 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2289 #   pragma weak wnga_zero_diagonal = pnga_zero_diagonal
2290 #endif
pnga_zero_diagonal(Integer g_a)2291 void pnga_zero_diagonal(Integer g_a)
2292 {
2293   Integer /*dim1, dim2,*/ type;
2294   Integer ld, lo[2], hi[2], loA[2], hiA[2];
2295   Integer me = pnga_nodeid (), i, offset;
2296   char *ptr;
2297   Integer num_blocks_a;
2298   Integer atype, andim, adims[2];
2299   int local_sync_begin,local_sync_end;
2300   _iterator_hdl hdl;
2301 
2302   local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2303   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2304   if(local_sync_begin)pnga_sync();
2305 
2306 
2307   pnga_inquire (g_a, &atype, &andim, adims);
2308   /*dim1 = adims[0];*/
2309   /*dim2 = adims[1];*/
2310   type = atype;
2311 
2312 #if 1
2313   pnga_local_iterator_init(g_a, &hdl);
2314   while (pnga_local_iterator_next(&hdl,loA,hiA,&ptr,&ld)) {
2315     offset = 0;
2316     if (loA[0] > 0) {
2317       lo[0] = GA_MAX (loA[0], loA[1]);
2318       lo[1] = GA_MAX (loA[0], loA[1]);
2319       hi[0] = GA_MIN (hiA[0], hiA[1]);
2320       hi[1] = GA_MIN (hiA[0], hiA[1]);
2321       if (hi[0] >= lo[0]) {
2322         sgai_zero_diagonal_block(g_a, ptr, lo, hi, ld, offset, type);
2323       }
2324     }
2325   }
2326 #else
2327   num_blocks_a = pnga_total_blocks(g_a);
2328 
2329   if (num_blocks_a < 0) {
2330     offset = 0;
2331     pnga_distribution (g_a, me, loA, hiA);
2332     /* determine subset of my patch to access */
2333     if (loA[0] > 0) {
2334       lo[0] = GA_MAX (loA[0], loA[1]);
2335       lo[1] = GA_MAX (loA[0], loA[1]);
2336       hi[0] = GA_MIN (hiA[0], hiA[1]);
2337       hi[1] = GA_MIN (hiA[0], hiA[1]);
2338       if (hi[0] >= lo[0]) {
2339                               /* we got a block containing diagonal elements */
2340         pnga_access_ptr (g_a, lo, hi, &ptr, &ld);
2341         sgai_zero_diagonal_block(g_a, ptr, lo, hi, ld, offset, type);
2342         /* release access to the data */
2343         pnga_release_update (g_a, lo, hi);
2344       }
2345     }
2346   } else {
2347     Integer idx, lld[MAXDIM], offset;
2348     Integer jtot, last, j;
2349     Integer nproc = pnga_nnodes();
2350     /* Simple block-cyclic data distribution */
2351     if (!pnga_uses_proc_grid(g_a)) {
2352       for (idx = me; idx < num_blocks_a; idx += nproc) {
2353         pnga_distribution(g_a, idx, loA, hiA);
2354         lo[0] = GA_MAX (loA[0], loA[1]);
2355         lo[1] = GA_MAX (loA[0], loA[1]);
2356         hi[0] = GA_MIN (hiA[0], hiA[1]);
2357         hi[1] = GA_MIN (hiA[0], hiA[1]);
2358 
2359         if (hi[0] >= lo[0]) {
2360           pnga_access_block_ptr(g_a, idx, &ptr, lld);
2361           /* evaluate offsets for system */
2362           offset = 0;
2363           last = andim - 1;
2364           jtot = 1;
2365           for (j=0; j<last; j++) {
2366             offset += (lo[j] - loA[j])*jtot;
2367             jtot *= lld[j];
2368           }
2369           offset += (lo[last]-loA[last])*jtot;
2370           sgai_zero_diagonal_block(g_a, ptr, lo, hi, lld[0], offset, type);
2371           pnga_release_update_block(g_a, idx);
2372         }
2373       }
2374     } else {
2375       /* Uses scalapack block-cyclic data distribution */
2376       Integer lld[MAXDIM]/*, chk*/;
2377       Integer proc_index[MAXDIM], index[MAXDIM];
2378       Integer topology[MAXDIM];
2379       Integer blocks[MAXDIM], block_dims[MAXDIM];
2380       pnga_get_proc_index(g_a, me, proc_index);
2381       pnga_get_proc_index(g_a, me, index);
2382       pnga_get_block_info(g_a, blocks, block_dims);
2383       pnga_get_proc_grid(g_a, topology);
2384 
2385       while (index[andim-1] < blocks[andim-1]) {
2386         /* find bounding coordinates of block */
2387         /*chk = 1;*/
2388         for (i = 0; i < andim; i++) {
2389           loA[i] = index[i]*block_dims[i]+1;
2390           hiA[i] = (index[i] + 1)*block_dims[i];
2391           if (hiA[i] > adims[i]) hiA[i] = adims[i];
2392           /*if (hiA[i] < loA[i]) chk = 0;*/
2393         }
2394         lo[0] = GA_MAX (loA[0], loA[1]);
2395         lo[1] = GA_MAX (loA[0], loA[1]);
2396         hi[0] = GA_MIN (hiA[0], hiA[1]);
2397         hi[1] = GA_MIN (hiA[0], hiA[1]);
2398 
2399         if (hi[0] >= lo[0]) {
2400           pnga_access_block_grid_ptr(g_a, index, &ptr, lld);
2401           /* evaluate offsets for system */
2402           offset = 0;
2403           last = andim - 1;
2404           jtot = 1;
2405           for (j=0; j<last; j++) {
2406             offset += (lo[j] - loA[j])*jtot;
2407             jtot *= lld[j];
2408           }
2409           offset += (lo[last]-loA[last])*jtot;
2410           sgai_zero_diagonal_block(g_a, ptr, lo, hi, lld[0], offset, type);
2411           pnga_release_update_block_grid(g_a, index);
2412         }
2413 
2414         /* increment index to get next block on processor */
2415         index[0] += topology[0];
2416         for (i = 0; i < andim; i++) {
2417           if (index[i] >= blocks[i] && i<andim-1) {
2418             index[i] = proc_index[i];
2419             index[i+1] += topology[i+1];
2420           }
2421         }
2422       }
2423     }
2424   }
2425 #endif
2426   if(local_sync_end)pnga_sync();
2427 }
2428 
sgai_scale_row_values(Integer type,Integer * lo,Integer * hi,Integer ld,void * ptr,Integer g_v)2429 static void sgai_scale_row_values(Integer type, Integer *lo,
2430                           Integer *hi, Integer ld, void *ptr, Integer g_v)
2431 {
2432   Integer vlo, vhi, size;
2433   int *ia;
2434   float *fa;
2435   double *da;
2436   long *la;
2437   DoubleComplex *dca;
2438   SingleComplex *fca;
2439   void *buf;
2440 
2441   /* determine subset of my patch to access */
2442   if (lo[0] > 0) {
2443 
2444     if (hi[0] >= lo[0]) { /*make sure the equality symbol is there!!! */
2445                           /* we got a block containing diagonal elements */
2446 
2447       Integer myrows = hi[0] - lo[0] + 1;
2448       Integer i, j;
2449       /*number of rows on the patch is jhiA - jloA + 1 */
2450       vlo =lo[0] ;
2451       vhi = hi[0];
2452 
2453       /*allocate a buffer for the given vector g_v */
2454       size = GAsizeof (type);
2455 
2456       buf = malloc (myrows * size);
2457       if (buf == NULL)
2458         pnga_error
2459           ("ga_scale_rows_:failed to allocate memory for the local buffer.",
2460            0);
2461 
2462       /* get the vector from the global array to the local memory buffer */
2463       pnga_get (g_v, &vlo, &vhi, buf, &vhi);
2464 
2465       switch (type) {
2466         case C_INT:
2467           ia = (int *) ptr;
2468           for (i = 0; i < hi[0]-lo[0]+1; i++) /*for each row */
2469             for(j=0;j<hi[1]-lo[1]+1;j++) /*for each column*/
2470               ia[j*ld+i] *= ((int *) buf)[i];
2471           break;
2472         case C_LONG:
2473           la = (long *) ptr;
2474           for (i = 0; i < hi[0]-lo[0]+1; i++)
2475             for(j=0;j<hi[1]-lo[1]+1;j++)
2476               la[j*ld+i] *= ((long *) buf)[i];
2477           break;
2478         case C_FLOAT:
2479           fa = (float *) ptr;
2480           for (i = 0; i < hi[0]-lo[0]+1; i++)
2481             for(j=0;j<hi[1]-lo[1]+1;j++)
2482               fa[j*ld+i] *= ((float *) buf)[i];
2483           break;
2484         case C_DBL:
2485           da = (double *) ptr;
2486           for (i = 0; i < hi[0]-lo[0]+1; i++)
2487             for(j=0;j<hi[1]-lo[1]+1;j++)
2488               da[j*ld+i] *= ((double *) buf)[i];
2489           break;
2490         case C_DCPL:
2491           dca = (DoubleComplex *) ptr;
2492           for (i = 0; i < hi[0]-lo[0]+1; i++)
2493             for(j=0;j<hi[1]-lo[1]+1;j++) {
2494               (dca[j*ld+i]).real *= (((DoubleComplex *) buf)[i]).real;
2495               (dca[j*ld+i]).imag *= (((DoubleComplex *) buf)[i]).imag;
2496             }
2497           break;
2498         case C_SCPL:
2499           fca = (SingleComplex *) ptr;
2500           for (i = 0; i < hi[0]-lo[0]+1; i++)
2501             for(j=0;j<hi[1]-lo[1]+1;j++) {
2502               (fca[j*ld+i]).real *= (((SingleComplex *) buf)[i]).real;
2503               (fca[j*ld+i]).imag *= (((SingleComplex *) buf)[i]).imag;
2504             }
2505           break;
2506         default:
2507           pnga_error("ga_scale_rows_: wrong data type:", type);
2508       }
2509 
2510       /*free the memory */
2511       free (buf);
2512     }
2513   }
2514 }
2515 
2516 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2517 #   pragma weak wnga_scale_rows = pnga_scale_rows
2518 #endif
pnga_scale_rows(Integer g_a,Integer g_v)2519 void pnga_scale_rows(Integer g_a, Integer g_v)
2520 {
2521   Integer vndim, vdims, dim1/*, dim2*/, vtype, atype, type;
2522   Integer ld, lo[2], hi[2];
2523   Integer me = pnga_nodeid (), i, chk;
2524   char *ptr;
2525   Integer andim, adims[2];
2526   Integer num_blocks_a;
2527   int local_sync_begin,local_sync_end;
2528   _iterator_hdl hdl;
2529 
2530   local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2531   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2532   if(local_sync_begin)pnga_sync();
2533 
2534   pnga_check_handle (g_a, "ga_scale_rows_");
2535   pnga_check_handle (g_v, "ga_scale_rows_");
2536 
2537   pnga_inquire (g_a, &atype, &andim, adims);
2538   dim1 = adims[0];
2539   /*dim2 = adims[1];*/
2540   type = atype;
2541   pnga_inquire (g_v, &vtype, &vndim, &vdims);
2542 
2543   /* Perform some error checking */
2544   if (andim != 2)
2545     pnga_error("ga_scale_rows_: wrong dimension for g_a.", andim);
2546   if (vndim != 1)
2547     pnga_error("ga_scale_rows_: wrong dimension for g_v.", vndim);
2548 
2549   /*in internal functions, dim1 = number of rows of the matrix g_a*/
2550   /*in internal functions, dim2 = number of columns of the matrix g_a*/
2551   if (vdims != dim1)
2552     pnga_error
2553       ("ga_scale_rows_: The size of the scalar array is not the same as the number of the rows of g_a.",
2554        vdims);
2555 
2556   if (vtype != atype)
2557   {
2558     pnga_error
2559       ("ga_scale_rows_: input global arrays do not have the same data type. Global array type =",
2560        atype);
2561   }
2562 
2563 #if 1
2564   pnga_local_iterator_init(g_a, &hdl);
2565   while (pnga_local_iterator_next(&hdl,lo,hi,&ptr,&ld)) {
2566     sgai_scale_row_values(type, lo, hi, ld, ptr, g_v);
2567   }
2568 #else
2569   num_blocks_a = pnga_total_blocks(g_a);
2570 
2571   if (num_blocks_a < 0) {
2572     pnga_distribution (g_a, me, lo, hi);
2573 
2574     chk = 1;
2575     for (i=0; i<andim; i++) {
2576       if (lo[i]>hi[i]) chk = 0;
2577     }
2578     if (chk) {
2579       pnga_access_ptr (g_a, lo, hi, &ptr, &ld);
2580 
2581       sgai_scale_row_values(type, lo, hi, ld, ptr, g_v);
2582 
2583       /* release access to the data */
2584       pnga_release_update (g_a, lo, hi);
2585     }
2586   } else {
2587     Integer nproc = pnga_nnodes();
2588     /* Simple block-cyclic data distribution */
2589     if (!pnga_uses_proc_grid(g_a)) {
2590       Integer idx;
2591       for (idx=me; idx<num_blocks_a; idx += nproc) {
2592         pnga_distribution(g_a, idx, lo, hi);
2593         pnga_access_block_ptr(g_a, idx, &ptr, &ld);
2594         sgai_scale_row_values(type, lo, hi, ld, ptr, g_v);
2595         pnga_release_update_block(g_a, idx);
2596       }
2597     } else {
2598       Integer proc_index[MAXDIM], index[MAXDIM];
2599       Integer topology[MAXDIM];
2600       Integer blocks[MAXDIM], block_dims[MAXDIM];
2601       pnga_get_proc_index(g_a, me, proc_index);
2602       pnga_get_proc_index(g_a, me, index);
2603       pnga_get_block_info(g_a, blocks, block_dims);
2604       pnga_get_proc_grid(g_a, topology);
2605 
2606       while (index[andim-1] < blocks[andim-1]) {
2607         /* find bounding coordinates of block */
2608         chk = 1;
2609         for (i = 0; i < andim; i++) {
2610           lo[i] = index[i]*block_dims[i]+1;
2611           hi[i] = (index[i] + 1)*block_dims[i];
2612           if (hi[i] > adims[i]) hi[i] = adims[i];
2613           if (hi[i] < lo[i]) chk = 0;
2614         }
2615         if (chk) {
2616           pnga_access_block_grid_ptr(g_a, index, &ptr, &ld);
2617           sgai_scale_row_values(type, lo, hi, ld, ptr, g_v);
2618           pnga_release_update_block_grid(g_a, index);
2619         }
2620         /* increment index to get next block on processor */
2621         index[0] += topology[0];
2622         for (i = 0; i < andim; i++) {
2623           if (index[i] >= blocks[i] && i<andim-1) {
2624             index[i] = proc_index[i];
2625             index[i+1] += topology[i+1];
2626           }
2627         }
2628       }
2629     }
2630   }
2631 #endif
2632   if(local_sync_end)pnga_sync();
2633 }
2634 
sgai_scale_col_values(Integer type,Integer * lo,Integer * hi,Integer ld,void * ptr,Integer g_v)2635 static void sgai_scale_col_values(Integer type, Integer *lo,
2636                           Integer *hi, Integer ld, void *ptr, Integer g_v)
2637 {
2638   Integer vlo, vhi, size;
2639   void *buf;
2640   int *ia;
2641   float *fa;
2642   double *da;
2643   long *la;
2644   DoubleComplex *dca;
2645   SingleComplex *fca;
2646 
2647   /* determine subset of my patch to access */
2648   if (lo[1] > 0) {
2649     if (hi[0] >= lo[0]) {     /*make sure the equality symbol is there!!! */
2650                               /* we got a block containing diagonal elements*/
2651 
2652       Integer mycols = hi[1] - lo[1] + 1;
2653       Integer i, j;
2654       /*number of rows on the patch is jhiA - jloA + 1 */
2655       vlo =lo[1] ;
2656       vhi = hi[1];
2657 
2658       /*allocate a buffer for the given vector g_v */
2659       size = GAsizeof (type);
2660 
2661       buf = malloc (mycols * size);
2662       if (buf == NULL)
2663         pnga_error
2664           ("ga_scale_cols_:failed to allocate memory for the local buffer.",
2665            0);
2666 
2667       /* get the vector from the global array to the local memory buffer */
2668       pnga_get (g_v, &vlo, &vhi, buf, &vhi);
2669 
2670       switch (type) {
2671         case C_INT:
2672           ia = (int *) ptr;
2673           for(j=0;j<hi[1]-lo[1]+1;j++) /*for each column*/
2674             for (i = 0; i < hi[0]-lo[0]+1; i++) /*for each row */
2675               ia[j*ld+i] *= ((int *) buf)[j];
2676           break;
2677         case C_LONG:
2678           la = (long *) ptr;
2679           for(j=0;j<hi[1]-lo[1]+1;j++)
2680             for (i = 0; i < hi[0]-lo[0]+1; i++)
2681               la[j*ld+i] *= ((long *) buf)[j];
2682           break;
2683         case C_FLOAT:
2684           fa = (float *) ptr;
2685           for(j=0;j<hi[1]-lo[1]+1;j++)
2686             for (i = 0; i < hi[0]-lo[0]+1; i++)
2687               fa[j*ld+i] *= ((float *) buf)[j];
2688           break;
2689         case C_DBL:
2690           da = (double *) ptr;
2691           for(j=0;j<hi[1]-lo[1]+1;j++)
2692             for (i = 0; i < hi[0]-lo[0]+1; i++)
2693               da[j*ld+i] *= ((double *) buf)[j];
2694           break;
2695         case C_DCPL:
2696           dca = (DoubleComplex *) ptr;
2697           for(j=0;j<hi[1]-lo[1]+1;j++)
2698             for (i = 0; i < hi[0]-lo[0]+1; i++)
2699             {
2700               (dca[j*ld+i]).real *= (((DoubleComplex *) buf)[j]).real;
2701               (dca[j*ld+i]).imag *= (((DoubleComplex *) buf)[j]).imag;
2702             }
2703           break;
2704         case C_SCPL:
2705           fca = (SingleComplex *) ptr;
2706           for(j=0;j<hi[1]-lo[1]+1;j++)
2707             for (i = 0; i < hi[0]-lo[0]+1; i++)
2708             {
2709               (fca[j*ld+i]).real *= (((SingleComplex *) buf)[j]).real;
2710               (fca[j*ld+i]).imag *= (((SingleComplex *) buf)[j]).imag;
2711             }
2712           break;
2713         default:
2714           pnga_error("ga_scale_cols_: wrong data type:", type);
2715       }
2716       /*free the memory */
2717       free (buf);
2718     }
2719   }
2720 }
2721 
2722 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2723 #   pragma weak wnga_scale_cols = pnga_scale_cols
2724 #endif
pnga_scale_cols(Integer g_a,Integer g_v)2725 void pnga_scale_cols(Integer g_a, Integer g_v)
2726 {
2727   Integer vndim, vdims/*, dim1*/, dim2, vtype, atype, type;
2728   Integer ld, lo[2], hi[2];
2729   Integer me = pnga_nodeid (), i;
2730   char *ptr;
2731   Integer andim, adims[2];
2732   int local_sync_begin,local_sync_end;
2733   Integer num_blocks_a;
2734   Integer chk;
2735   _iterator_hdl hdl;
2736 
2737   local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2738   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2739   if(local_sync_begin)pnga_sync();
2740 
2741   pnga_check_handle (g_a, "ga_scale_cols_");
2742   pnga_check_handle (g_v, "ga_scale_cols_");
2743 
2744   pnga_inquire(g_a, &atype, &andim, adims);
2745   /*dim1 = adims[0];*/
2746   dim2 = adims[1];
2747   type = atype;
2748   pnga_inquire(g_v, &vtype, &vndim, &vdims);
2749 
2750   /* Perform some error checking */
2751   if (andim != 2)
2752     pnga_error("ga_scale_cols_: wrong dimension for g_a.", andim);
2753   if (vndim != 1)
2754     pnga_error("ga_scale_cols_: wrong dimension for g_v.", vndim);
2755 
2756   /*in internal functions, dim1 = number of rows of the matrix g_a*/
2757   /*in internal functions, dim2 = number of columns of the matrix g_a*/
2758   if (vdims != dim2)
2759     pnga_error
2760       ("ga_scale_cols_: The size of the scalar array is not the same as the number of the rows of g_a.",
2761        vdims);
2762 
2763   if (vtype != atype)
2764     {
2765       pnga_error
2766         ("ga_scale_cols_: input global arrays do not have the same data type. Global array type =",
2767          atype);
2768     }
2769 
2770 #if 1
2771   pnga_local_iterator_init(g_a, &hdl);
2772   while (pnga_local_iterator_next(&hdl,lo,hi,&ptr,&ld)) {
2773     sgai_scale_col_values(type, lo, hi, ld, ptr, g_v);
2774   }
2775 #else
2776   num_blocks_a = pnga_total_blocks(g_a);
2777 
2778   if (num_blocks_a < 0) {
2779     pnga_distribution (g_a, me, lo, hi);
2780 
2781     chk = 1;
2782     for (i=0; i<andim; i++) {
2783       if (lo[i]>hi[i]) chk = 0;
2784     }
2785     if (chk) {
2786       pnga_access_ptr (g_a, lo, hi, &ptr, &ld);
2787 
2788       sgai_scale_col_values(type, lo, hi, ld, ptr, g_v);
2789 
2790       /* release access to the data */
2791       pnga_release_update (g_a, lo, hi);
2792     }
2793   } else {
2794     Integer nproc = pnga_nnodes();
2795     /* Simple block-cyclic data distribution */
2796     if (!pnga_uses_proc_grid(g_a)) {
2797       Integer idx;
2798       for (idx=me; idx<num_blocks_a; idx += nproc) {
2799         pnga_distribution(g_a, idx, lo, hi);
2800         pnga_access_block_ptr(g_a, idx, &ptr, &ld);
2801         sgai_scale_col_values(type, lo, hi, ld, ptr, g_v);
2802         pnga_release_update_block(g_a, idx);
2803       }
2804     } else {
2805       Integer proc_index[MAXDIM], index[MAXDIM];
2806       Integer topology[MAXDIM];
2807       Integer blocks[MAXDIM], block_dims[MAXDIM];
2808       pnga_get_proc_index(g_a, me, proc_index);
2809       pnga_get_proc_index(g_a, me, index);
2810       pnga_get_block_info(g_a, blocks, block_dims);
2811       pnga_get_proc_grid(g_a, topology);
2812 
2813       while (index[andim-1] < blocks[andim-1]) {
2814         /* find bounding coordinates of block */
2815         chk = 1;
2816         for (i = 0; i < andim; i++) {
2817           lo[i] = index[i]*block_dims[i]+1;
2818           hi[i] = (index[i] + 1)*block_dims[i];
2819           if (hi[i] > adims[i]) hi[i] = adims[i];
2820           if (hi[i] < lo[i]) chk = 0;
2821         }
2822         if (chk) {
2823           pnga_access_block_grid_ptr(g_a, index, &ptr, &ld);
2824           sgai_scale_col_values(type, lo, hi, ld, ptr, g_v);
2825           pnga_release_update_block_grid(g_a, index);
2826         }
2827         /* increment index to get next block on processor */
2828         index[0] += topology[0];
2829         for (i = 0; i < andim; i++) {
2830           if (index[i] >= blocks[i] && i<andim-1) {
2831             index[i] = proc_index[i];
2832             index[i+1] += topology[i+1];
2833           }
2834         }
2835       }
2836     }
2837   }
2838 #endif
2839   if(local_sync_end)pnga_sync();
2840 }
2841