1 #if HAVE_CONFIG_H
2 # include "config.h"
3 #endif
4
5 /*$Id: matrix.c,v 1.7.8.11 2007/12/18 18:49:36 d3g293 Exp $******************************************************
6 File: matrix.c
7
8 Author: Limin Zhang, Ph.D.
9 Mathematics Department
10 Columbia Basin College
11 Pasco, WA 99301
12 Limin.Zhang@cbc2.org
13
14 Mentor: Jarek Naplocha, Ph.D.
15 Environmental Molecular Science Laboratory
16 Richland, WA 99352
17
18 Date: 2/28/2002
19
20 Purpose:
21 matrix interfaces between TAO and
22 global arrays.
23 **************************************************************/
24
25 #include "globalp.h"
26 #if HAVE_MATH_H
27 # include <math.h>
28 #endif
29 #if HAVE_STDLIB_H
30 # include <stdlib.h>
31 #endif
32 #if HAVE_STRING_H
33 # include <string.h>
34 #endif
35 #include "message.h"
36 #include "ga-papi.h"
37 #include "ga-wapi.h"
38 #include "ga_iterator.h"
39
40 #define auxi_median(a,b,c,m) \
41 { \
42 if ((c < a) && (a < b)) m = a; \
43 else if ((a <= c) && (c <= b)) m = c; \
44 else m = b; \
45 }
46
47 #define median(a,b,c,m) \
48 { \
49 if(a == b) m = a; \
50 else if(a < b) auxi_median(a, b, c, m) \
51 else auxi_median(b, a, c, m) \
52 }
53
54 #define auxi_median_dcpl(na, nb, nc, za, zb, zc, zm) \
55 { \
56 if ((nc < na) && (na < nb)) zm = za; \
57 else if ((na <= nc) && (nc <= nb)) zm = zc; \
58 else zm = zb; \
59 }
60
61 #define median_dcpl(na, nb, nc, za, zb, zc, zm) \
62 { \
63 if (na == nb) zm = za; \
64 else if (na < nb) auxi_median_dcpl (na, nb, nc, za, zb, zc, zm) \
65 else auxi_median_dcpl (nb, na, nc, zb, za, zc, zm) \
66 }
67
68 #define auxi_median_scpl(na, nb, nc, ca, cb, cc, cm) \
69 { \
70 if ((nc < na) && (na < nb)) cm = ca; \
71 else if ((na <= nc) && (nc <= nb)) cm = cc; \
72 else cm = cb; \
73 }
74
75 #define median_scpl(na, nb, nc, ca, cb, cc, cm) \
76 { \
77 if (na == nb) cm = ca; \
78 else if (na < nb) auxi_median_scpl (na, nb, nc, ca, cb, cc, cm) \
79 else auxi_median_scpl (nb, na, nc, cb, ca, cc, cm) \
80 }
81
82 /*\ Utility function to find median values of patch
83 \*/
sgai_median_patch_values(Integer type,Integer ndim,Integer * loA,Integer * hiA,Integer * ldA,void * A_ptr,void * B_ptr,void * C_ptr,void * M_ptr,Integer offset)84 static void sgai_median_patch_values(Integer type, Integer ndim, Integer *loA,
85 Integer *hiA, Integer *ldA, void *A_ptr,
86 void *B_ptr, void *C_ptr, void *M_ptr,
87 Integer offset)
88 {
89 Integer bvalue[MAXDIM], bunit[MAXDIM], baseldA[MAXDIM];
90 Integer idx, n1dim;
91 Integer i, j;
92
93 double na, nb, nc; /*norm of a, norm of b, norm of c */
94 int ia, ib, ic, im;
95 float fa, fb, fc, fm;
96 double da, db, dc, dm;
97 long la, lb, lc, lm;
98 DoubleComplex za, zb, zc, zm;
99 SingleComplex ca, cb, cc, cm;
100
101 /* number of n-element of the first dimension */
102 n1dim = 1;
103 for (i = 1; i < ndim; i++)
104 n1dim *= (hiA[i] - loA[i] + 1);
105
106 /* calculate the destination indices */
107 bvalue[0] = 0;
108 bvalue[1] = 0;
109 bunit[0] = 1;
110 bunit[1] = 1;
111 /* baseldA[0] = ldA[0]
112 * baseldA[1] = ldA[0] * ldA[1]
113 * baseldA[2] = ldA[0] * ldA[1] * ldA[2] .....
114 */
115 baseldA[0] = ldA[0];
116 baseldA[1] = baseldA[0] * ldA[1];
117 for (i = 2; i < ndim; i++)
118 {
119 bvalue[i] = 0;
120 bunit[i] = bunit[i - 1] * (hiA[i - 1] - loA[i - 1] + 1);
121 baseldA[i] = baseldA[i - 1] * ldA[i];
122 }
123 switch(type) {
124 case C_DBL:
125 A_ptr = (void*)((double*)(A_ptr) + offset);
126 B_ptr = (void*)((double*)(B_ptr) + offset);
127 C_ptr = (void*)((double*)(C_ptr) + offset);
128 M_ptr = (void*)((double*)(M_ptr) + offset);
129 break;
130 case C_INT:
131 A_ptr = (void*)((int*)(A_ptr) + offset);
132 B_ptr = (void*)((int*)(B_ptr) + offset);
133 C_ptr = (void*)((int*)(C_ptr) + offset);
134 M_ptr = (void*)((int*)(M_ptr) + offset);
135 break;
136 case C_DCPL:
137 A_ptr = (void*)((DoubleComplex*)(A_ptr) + offset);
138 B_ptr = (void*)((DoubleComplex*)(B_ptr) + offset);
139 C_ptr = (void*)((DoubleComplex*)(C_ptr) + offset);
140 M_ptr = (void*)((DoubleComplex*)(M_ptr) + offset);
141 break;
142 case C_SCPL:
143 A_ptr = (void*)((SingleComplex*)(A_ptr) + offset);
144 B_ptr = (void*)((SingleComplex*)(B_ptr) + offset);
145 C_ptr = (void*)((SingleComplex*)(C_ptr) + offset);
146 M_ptr = (void*)((SingleComplex*)(M_ptr) + offset);
147 break;
148 case C_FLOAT:
149 A_ptr = (void*)((float*)(A_ptr) + offset);
150 B_ptr = (void*)((float*)(B_ptr) + offset);
151 C_ptr = (void*)((float*)(C_ptr) + offset);
152 M_ptr = (void*)((float*)(M_ptr) + offset);
153 break;
154 case C_LONG:
155 A_ptr = (void*)((long*)(A_ptr) + offset);
156 B_ptr = (void*)((long*)(B_ptr) + offset);
157 C_ptr = (void*)((long*)(C_ptr) + offset);
158 M_ptr = (void*)((long*)(M_ptr) + offset);
159 break;
160 case C_LONGLONG:
161 A_ptr = (void*)((long long*)(A_ptr) + offset);
162 B_ptr = (void*)((long long*)(B_ptr) + offset);
163 C_ptr = (void*)((long long*)(C_ptr) + offset);
164 M_ptr = (void*)((long long*)(M_ptr) + offset);
165 break;
166 default:
167 break;
168 }
169
170 /*compute elementwise median */
171
172 switch (type) {
173
174 case C_INT:
175 for (i = 0; i < n1dim; i++) {
176 idx = 0;
177 for (j = 1; j < ndim; j++) {
178 idx += bvalue[j] * baseldA[j - 1];
179 if (((i + 1) % bunit[j]) == 0) bvalue[j]++;
180 if (bvalue[j] > (hiA[j] - loA[j])) bvalue[j] = 0;
181 }
182 for (j = 0; j < (hiA[0] - loA[0] + 1); j++) {
183 ia = ((int *) A_ptr)[idx + j];
184 ib = ((int *) B_ptr)[idx + j];
185 ic = ((int *) C_ptr)[idx + j];
186 median(ia, ib, ic, im);
187 ((int *) M_ptr)[idx + j] = im;
188 }
189 }
190 break;
191 case C_LONG:
192 for (i = 0; i < n1dim; i++) {
193 idx = 0;
194 for (j = 1; j < ndim; j++) {
195 idx += bvalue[j] * baseldA[j - 1];
196 if (((i + 1) % bunit[j]) == 0) bvalue[j]++;
197 if (bvalue[j] > (hiA[j] - loA[j])) bvalue[j] = 0;
198 }
199 for (j = 0; j < (hiA[0] - loA[0] + 1); j++) {
200 la = ((long *) A_ptr)[idx + j];
201 lb = ((long *) B_ptr)[idx + j];
202 lc = ((long *) C_ptr)[idx + j];
203 median(la, lb, lc, lm);
204 ((long *) M_ptr)[idx + j] = lm;
205 }
206 }
207 break;
208 case C_FLOAT:
209 for (i = 0; i < n1dim; i++) {
210 idx = 0;
211 for (j = 1; j < ndim; j++) {
212 idx += bvalue[j] * baseldA[j - 1];
213 if (((i + 1) % bunit[j]) == 0) bvalue[j]++;
214 if (bvalue[j] > (hiA[j] - loA[j])) bvalue[j] = 0;
215 }
216 for (j = 0; j < (hiA[0] - loA[0] + 1); j++) {
217 fa = ((float *) A_ptr)[idx + j];
218 fb = ((float *) B_ptr)[idx + j];
219 fc = ((float *) C_ptr)[idx + j];
220 median(fa, fb, fc, fm);
221 ((float *) M_ptr)[idx + j] = fm;
222 }
223 }
224 break;
225 case C_DBL:
226 for (i = 0; i < n1dim; i++) {
227 idx = 0;
228 for (j = 1; j < ndim; j++) {
229 idx += bvalue[j] * baseldA[j - 1];
230 if (((i + 1) % bunit[j]) == 0) bvalue[j]++;
231 if (bvalue[j] > (hiA[j] - loA[j])) bvalue[j] = 0;
232 }
233 for (j = 0; j < (hiA[0] - loA[0] + 1); j++) {
234 da = ((double *) A_ptr)[idx + j];
235 db = ((double *) B_ptr)[idx + j];
236 dc = ((double *) C_ptr)[idx + j];
237 median(da, db, dc, dm);
238 ((double *) M_ptr)[idx + j] = dm;
239 }
240 }
241 break;
242 case C_DCPL:
243 for (i = 0; i < n1dim; i++) {
244 idx = 0;
245 for (j = 1; j < ndim; j++) {
246 idx += bvalue[j] * baseldA[j - 1];
247 if (((i + 1) % bunit[j]) == 0) bvalue[j]++;
248 if (bvalue[j] > (hiA[j] - loA[j])) bvalue[j] = 0;
249 }
250 for (j = 0; j < (hiA[0] - loA[0] + 1); j++) {
251 za = ((DoubleComplex *) A_ptr)[idx + j];
252 zb = ((DoubleComplex *) B_ptr)[idx + j];
253 zc = ((DoubleComplex *) C_ptr)[idx + j];
254 na = sqrt ((za.real) * (za.real) + (za.imag) * (za.imag));
255 nb = sqrt ((zb.real) * (zb.real) + (zb.imag) * (zb.imag));
256 nc = sqrt ((zc.real) * (zc.real) + (zc.imag) * (zc.imag));
257 median_dcpl(na, nb, nc, za, zb, zc, zm);
258 ((DoubleComplex *) M_ptr)[idx + j] = zm;
259 }
260 }
261 break;
262
263 case C_SCPL:
264 for (i = 0; i < n1dim; i++) {
265 idx = 0;
266 for (j = 1; j < ndim; j++) {
267 idx += bvalue[j] * baseldA[j - 1];
268 if (((i + 1) % bunit[j]) == 0) bvalue[j]++;
269 if (bvalue[j] > (hiA[j] - loA[j])) bvalue[j] = 0;
270 }
271 for (j = 0; j < (hiA[0] - loA[0] + 1); j++) {
272 ca = ((SingleComplex *) A_ptr)[idx + j];
273 cb = ((SingleComplex *) B_ptr)[idx + j];
274 cc = ((SingleComplex *) C_ptr)[idx + j];
275 na = sqrt ((ca.real) * (ca.real) + (ca.imag) * (ca.imag));
276 nb = sqrt ((cb.real) * (cb.real) + (cb.imag) * (cb.imag));
277 nc = sqrt ((cc.real) * (cc.real) + (cc.imag) * (cc.imag));
278 median_scpl(na, nb, nc, ca, cb, cc, cm);
279 ((SingleComplex *) M_ptr)[idx + j] = cm;
280 }
281 }
282 break;
283 default:
284 pnga_error("median: wrong data type", type);
285 }
286 }
287
288 /*\ median routine. Not sure what this function is suppose to do
289 \*/
290 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
291 # pragma weak wnga_median_patch = pnga_median_patch
292 #endif
pnga_median_patch(g_a,alo,ahi,g_b,blo,bhi,g_c,clo,chi,g_m,mlo,mhi)293 void pnga_median_patch(
294 g_a, alo, ahi, g_b, blo, bhi, g_c, clo, chi, g_m, mlo, mhi)
295 Integer g_a, *alo, *ahi; /* patch of g_a */
296 Integer g_b, *blo, *bhi; /* patch of g_b */
297 Integer g_c, *clo, *chi; /* patch of g_c */
298 Integer g_m, *mlo, *mhi; /* patch of g_m */
299 {
300 Integer i, j;
301 Integer atype, btype, andim, adims[MAXDIM], bndim, bdims[MAXDIM];
302 Integer ctype, mtype, cndim, cdims[MAXDIM], mndim, mdims[MAXDIM];
303 Integer loA[MAXDIM], hiA[MAXDIM], ldA[MAXDIM];
304 Integer loB[MAXDIM], hiB[MAXDIM], ldB[MAXDIM];
305 Integer loC[MAXDIM], hiC[MAXDIM], ldC[MAXDIM];
306 Integer loM[MAXDIM], hiM[MAXDIM], ldM[MAXDIM];
307 Integer g_A = g_a, g_B = g_b;
308 Integer g_C = g_c, g_M = g_m;
309 char *A_ptr, *B_ptr;
310 char *C_ptr, *M_ptr;
311 Integer offset;
312 Integer atotal, btotal;
313 Integer ctotal, mtotal;
314 Integer me = pnga_nodeid (), a_temp_created = 0, b_temp_created = 0, c_temp_created = 0;
315 Integer type = GA_TYPE_GSM, compatible;
316 char *tempname = "temp", transp = 'n'; /*no transpose */
317 Integer num_blocks_a, num_blocks_b, num_blocks_c, num_blocks_m;
318 int local_sync_begin,local_sync_end;
319 _iterator_hdl hdl_a, hdl_b, hdl_c, hdl_m;
320
321 local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
322 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
323 if(local_sync_begin)pnga_sync();
324
325
326 pnga_inquire (g_a, &atype, &andim, adims);
327 pnga_inquire (g_b, &btype, &bndim, bdims);
328 pnga_inquire (g_c, &ctype, &cndim, cdims);
329 pnga_inquire (g_m, &mtype, &mndim, mdims);
330
331 /* I have to inquire the data type again since nga_inquire and
332 * nga_inquire_internal_ treat data type differently */
333 pnga_inquire(g_m, &type, &mndim, mdims);
334
335 if (mtype != atype)
336 pnga_error(" pnga_median_patch:type mismatch ", 0L);
337 if (mtype != btype)
338 pnga_error(" pnga_median_patch:type mismatch ", 0L);
339 if (mtype != ctype)
340 pnga_error(" pnga_median_patch:type mismatch ", 0L);
341
342 /* check if patch indices and g_a dims match */
343 for (i = 0; i < andim; i++)
344 if (alo[i] <= 0 || ahi[i] > adims[i])
345 {
346 pnga_error("pnga_median_patch: g_a indices out of range ", g_a);
347 }
348
349 for (i = 0; i < bndim; i++)
350 if (blo[i] <= 0 || bhi[i] > bdims[i])
351 {
352 pnga_error("pnga_median_patch:g_b indices out of range ", g_b);
353 }
354
355 for (i = 0; i < cndim; i++)
356 if (clo[i] <= 0 || chi[i] > cdims[i])
357 {
358 pnga_error("pnga_median_patch:g_c indices out of range ", g_c);
359 }
360 for (i = 0; i < mndim; i++)
361 if (mlo[i] <= 0 || mhi[i] > mdims[i])
362 {
363 pnga_error("pnga_median_patch:g_m indices out of range ", g_m);
364 }
365
366 /* check if numbers of elements in two patches match each other */
367
368 atotal = 1;
369 for (i = 0; i < andim; i++)
370 atotal *= (ahi[i] - alo[i] + 1);
371
372 btotal = 1;
373 for (i = 0; i < bndim; i++)
374 btotal *= (bhi[i] - blo[i] + 1);
375
376 ctotal = 1;
377 for (i = 0; i < cndim; i++)
378 ctotal *= (chi[i] - clo[i] + 1);
379
380 mtotal = 1;
381 for (i = 0; i < mndim; i++)
382 mtotal *= (mhi[i] - mlo[i] + 1);
383
384
385 if (mtotal != atotal)
386 pnga_error("pnga_median_patch: capacities of patches do not match ", 0L);
387
388 if (mtotal != btotal)
389 pnga_error("pnga_median_patch: capacities of patches do not match ", 0L);
390
391 if (mtotal != ctotal)
392 pnga_error("pnga_median_patch: capacities of patches do not match ", 0L);
393
394 num_blocks_a = pnga_total_blocks(g_a);
395 num_blocks_b = pnga_total_blocks(g_b);
396 num_blocks_c = pnga_total_blocks(g_c);
397 num_blocks_m = pnga_total_blocks(g_m);
398
399 if (num_blocks_a < 0 && num_blocks_b < 0 &&
400 num_blocks_c < 0 && num_blocks_m < 0) {
401 /* find out coordinates of patches of g_A, g_B, g_C, and g_M that I own */
402 pnga_distribution (g_A, me, loA, hiA);
403 pnga_distribution (g_B, me, loB, hiB);
404 pnga_distribution (g_C, me, loC, hiC);
405 pnga_distribution (g_M, me, loM, hiM);
406
407 if (!pnga_comp_patch (andim, loA, hiA, mndim, loM, hiM)) compatible = 1;
408 else compatible = 0;
409 /* pnga_gop(pnga_type_f2c(MT_F_INT), &compatible, 1, "*"); */
410 pnga_gop(pnga_type_f2c(MT_F_INT), &compatible, 1, "&&");
411 if (!compatible) {
412 /* either patches or distributions do not match:
413 * - create a temp array that matches distribution of g_a
414 * - copy & reshape patch of g_b into g_B
415 */
416 if (!pnga_duplicate(g_m, &g_A, tempname))
417 pnga_error("pnga_median_patch:duplicate failed", 0L);
418
419 pnga_copy_patch(&transp, g_a, alo, ahi, g_A, mlo, mhi);
420 andim = mndim;
421 a_temp_created = 1;
422 pnga_distribution (g_A, me, loA, hiA);
423 }
424
425 if (!pnga_comp_patch (bndim, loB, hiB, mndim, loM, hiM)) compatible = 1;
426 else compatible = 0;
427 /* pnga_gop(pnga_type_f2c(MT_F_INT), &compatible, 1, "*"); */
428 pnga_gop(pnga_type_f2c(MT_F_INT), &compatible, 1, "&&");
429 if (!compatible) {
430 /* either patches or distributions do not match:
431 * - create a temp array that matches distribution of g_a
432 * - copy & reshape patch of g_c into g_C
433 */
434 if (!pnga_duplicate(g_m, &g_B, tempname))
435 pnga_error("pnga_median_patch:duplicate failed", 0L);
436
437 pnga_copy_patch(&transp, g_b, blo, bhi, g_B, mlo, mhi);
438 bndim = mndim;
439 b_temp_created = 1;
440 pnga_distribution (g_B, me, loB, hiB);
441 }
442
443 if (!pnga_comp_patch (cndim, loC, hiC, mndim, loM, hiM)) compatible = 1;
444 else compatible = 0;
445 /* pnga_gop(pnga_type_f2c(MT_F_INT), &compatible, 1, "*"); */
446 pnga_gop(pnga_type_f2c(MT_F_INT), &compatible, 1, "&&");
447 if (!compatible) {
448 /* either patches or distributions do not match:
449 * - create a temp array that matches distribution of g_a
450 * - copy & reshape patch of g_m into g_M
451 */
452 if (!pnga_duplicate(g_m, &g_C, tempname))
453 pnga_error("pnga_median_patch:duplicate failed", 0L);
454
455 /*no need to copy g_m since it is the output matrix. */
456 cndim = mndim;
457 c_temp_created = 1;
458 pnga_copy_patch(&transp, g_c, clo, chi, g_C, mlo, mhi);
459 pnga_distribution (g_C, me, loC, hiC);
460 }
461
462
463 if (!pnga_comp_patch (mndim, loM, hiM, andim, loA, hiA))
464 pnga_error(" patches mismatch ", 0);
465 if (!pnga_comp_patch (mndim, loM, hiM, bndim, loB, hiB))
466 pnga_error(" patches mismatch ", 0);
467 if (!pnga_comp_patch (mndim, loM, hiM, cndim, loC, hiC))
468 pnga_error(" patches mismatch ", 0);
469
470
471 /* A[83:125,1:1] <==> B[83:125] */
472 if (mndim > andim)
473 mndim = andim; /* need more work */
474 if (mndim > bndim)
475 mndim = bndim; /* need more work */
476 if (mndim > cndim)
477 mndim = cndim; /* need more work */
478
479
480 /* determine subsets of my patches to access */
481 if (pnga_patch_intersect (mlo, mhi, loM, hiM, mndim)) {
482 offset = 0;
483 pnga_access_ptr (g_A, loM, hiM, &A_ptr, ldA);
484 pnga_access_ptr (g_B, loM, hiM, &B_ptr, ldB);
485 pnga_access_ptr (g_C, loM, hiM, &C_ptr, ldC);
486 pnga_access_ptr (g_M, loM, hiM, &M_ptr, ldM);
487
488 sgai_median_patch_values(type, mndim, loM, hiM, ldM,
489 A_ptr, B_ptr, C_ptr, M_ptr, offset);
490
491 /* release access to the data */
492 pnga_release (g_A, loM, hiM);
493 pnga_release (g_B, loM, hiM);
494 pnga_release (g_C, loM, hiM);
495 pnga_release_update (g_M, loM, hiM);
496 }
497 } else {
498 /* create copies of A, B, and C that are identically distributed
499 as M */
500 if (!pnga_duplicate(g_m, &g_A, tempname))
501 pnga_error("ga_add_patch: dup failed", 0L);
502 pnga_copy_patch(&transp, g_a, alo, ahi, g_A, mlo, mhi);
503 andim = mndim;
504 a_temp_created = 1;
505
506 if (!pnga_duplicate(g_m, &g_B, tempname))
507 pnga_error("ga_add_patch: dup failed", 0L);
508 pnga_copy_patch(&transp, g_b, blo, bhi, g_B, mlo, mhi);
509 bndim = mndim;
510 b_temp_created = 1;
511
512 if (!pnga_duplicate(g_m, &g_C, tempname))
513 pnga_error("ga_add_patch: dup failed", 0L);
514 pnga_copy_patch(&transp, g_c, clo, chi, g_C, mlo, mhi);
515 cndim = mndim;
516 c_temp_created = 1;
517
518 /* M is normally distributed so just get the mean using standard approach */
519 #if 1
520 pnga_local_iterator_init(g_A, &hdl_a);
521 pnga_local_iterator_init(g_B, &hdl_b);
522 pnga_local_iterator_init(g_C, &hdl_c);
523 pnga_local_iterator_init(g_m, &hdl_m);
524 while(pnga_local_iterator_next(&hdl_a,loA,hiA,&A_ptr,ldA)) {
525 pnga_local_iterator_next(&hdl_b,loB,hiB,&B_ptr,ldB);
526 pnga_local_iterator_next(&hdl_c,loC,hiC,&C_ptr,ldC);
527 pnga_local_iterator_next(&hdl_m,loM,hiM,&M_ptr,ldM);
528 Integer idx, lod[MAXDIM]/*, hid[MAXDIM]*/;
529 Integer jtot, last;
530 /* make temporary copies of loM and hiM since pnga_patch_intersect
531 destroys original versions */
532 for (j=0; j<mndim; j++) {
533 lod[j] = loM[j];
534 /*hid[j] = hiM[j];*/
535 }
536
537 /* evaluate offsets for system */
538 offset = 0;
539 last = mndim - 1;
540 jtot = 1;
541 for (j=0; j<last; j++) {
542 offset += (loM[j] - lod[j])*jtot;
543 jtot *= ldM[j];
544 }
545 offset += (loM[last]-lod[last])*jtot;
546
547 sgai_median_patch_values(type, mndim, loM, hiM, ldM,
548 A_ptr, B_ptr, C_ptr, M_ptr, offset);
549
550 }
551 #else
552 if (num_blocks_m < 0) {
553 offset = 0;
554 pnga_distribution(g_m, me, loM, hiM);
555 if (pnga_patch_intersect (mlo, mhi, loM, hiM, mndim)) {
556 pnga_access_ptr (g_A, loM, hiM, &A_ptr, ldA);
557 pnga_access_ptr (g_B, loM, hiM, &B_ptr, ldB);
558 pnga_access_ptr (g_C, loM, hiM, &C_ptr, ldC);
559 pnga_access_ptr (g_M, loM, hiM, &M_ptr, ldM);
560
561 sgai_median_patch_values(type, mndim, loM, hiM, ldM,
562 A_ptr, B_ptr, C_ptr, M_ptr, offset);
563
564 /* release access to the data */
565 pnga_release (g_A, loM, hiM);
566 pnga_release (g_B, loM, hiM);
567 pnga_release (g_C, loM, hiM);
568 pnga_release_update (g_M, loM, hiM);
569 }
570 } else {
571 Integer idx, lod[MAXDIM]/*, hid[MAXDIM]*/;
572 Integer jtot, last, nproc;
573 /* Simple block-cyclic data distribution */
574 if (!pnga_uses_proc_grid(g_m)) {
575 nproc = pnga_nnodes();
576 for (idx = me; idx < num_blocks_m; idx += nproc) {
577 pnga_distribution(g_m, idx, loM, hiM);
578 /* make temporary copies of loM and hiM since pnga_patch_intersect
579 destroys original versions */
580 for (j=0; j<mndim; j++) {
581 lod[j] = loM[j];
582 /*hid[j] = hiM[j];*/
583 }
584 if (pnga_patch_intersect(mlo, mhi, loM, hiM, mndim)) {
585 pnga_access_block_ptr(g_A, idx, &A_ptr, ldA);
586 pnga_access_block_ptr(g_B, idx, &B_ptr, ldB);
587 pnga_access_block_ptr(g_C, idx, &C_ptr, ldC);
588 pnga_access_block_ptr(g_m, idx, &M_ptr, ldM);
589
590 /* evaluate offsets for system */
591 offset = 0;
592 last = mndim - 1;
593 jtot = 1;
594 for (j=0; j<last; j++) {
595 offset += (loM[j] - lod[j])*jtot;
596 jtot *= ldM[j];
597 }
598 offset += (loM[last]-lod[last])*jtot;
599
600 sgai_median_patch_values(type, mndim, loM, hiM, ldM,
601 A_ptr, B_ptr, C_ptr, M_ptr, offset);
602
603 /* release access to the data */
604 pnga_release_block (g_A, idx);
605 pnga_release_block (g_B, idx);
606 pnga_release_block (g_C, idx);
607 pnga_release_update_block (g_M, idx);
608 }
609 }
610 } else {
611 /* Uses scalapack block-cyclic data distribution */
612 Integer lod[MAXDIM]/*, hid[MAXDIM], chk*/;
613 Integer proc_index[MAXDIM], index[MAXDIM];
614 Integer topology[MAXDIM];
615 Integer blocks[MAXDIM], block_dims[MAXDIM];
616 pnga_get_proc_index(g_m, me, proc_index);
617 pnga_get_proc_index(g_m, me, index);
618 pnga_get_block_info(g_m, blocks, block_dims);
619 pnga_get_proc_grid(g_m, topology);
620
621 while (index[mndim-1] < blocks[mndim-1]) {
622 /* find bounding coordinates of block */
623 /*chk = 1;*/
624 for (i = 0; i < mndim; i++) {
625 loM[i] = index[i]*block_dims[i]+1;
626 hiM[i] = (index[i] + 1)*block_dims[i];
627 if (hiM[i] > mdims[i]) hiM[i] = mdims[i];
628 /*if (hiM[i] < loM[i]) chk = 0;*/
629 }
630 /* make temporary copies of loC and hiC since pnga_patch_intersect
631 destroys original versions */
632 for (j=0; j<mndim; j++) {
633 lod[j] = loM[j];
634 /*hid[j] = hiM[j];*/
635 }
636 if (pnga_patch_intersect(mlo, mhi, loM, hiM, mndim)) {
637 pnga_access_block_grid_ptr(g_A, index, &A_ptr, ldA);
638 pnga_access_block_grid_ptr(g_B, index, &B_ptr, ldB);
639 pnga_access_block_grid_ptr(g_C, index, &C_ptr, ldC);
640 pnga_access_block_grid_ptr(g_m, index, &M_ptr, ldM);
641
642 /* evaluate offsets for system */
643 offset = 0;
644 last = mndim - 1;
645 jtot = 1;
646 for (j=0; j<last; j++) {
647 offset += (loM[j] - lod[j])*jtot;
648 jtot *= ldM[j];
649 }
650 offset += (loM[last]-lod[last])*jtot;
651
652 sgai_median_patch_values(type, mndim, loM, hiM, ldM,
653 A_ptr, B_ptr, C_ptr, M_ptr, offset);
654
655 /* release access to the data */
656 pnga_release_block_grid (g_A, index);
657 pnga_release_block_grid (g_B, index);
658 pnga_release_block_grid (g_C, index);
659 pnga_release_update_block_grid (g_M, index);
660 }
661
662 /* increment index to get next block on processor */
663 index[0] += topology[0];
664 for (i = 0; i < mndim; i++) {
665 if (index[i] >= blocks[i] && i<mndim-1) {
666 index[i] = proc_index[i];
667 index[i+1] += topology[i+1];
668 }
669 }
670 }
671 }
672 }
673 #endif
674 }
675 if (a_temp_created)
676 pnga_destroy (g_A);
677 if (b_temp_created)
678 pnga_destroy (g_B);
679 if (c_temp_created)
680 pnga_destroy (g_C);
681 if(local_sync_end)pnga_sync();
682 }
683
684
685
686 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
687 # pragma weak wnga_median = pnga_median
688 #endif
pnga_median(Integer g_a,Integer g_b,Integer g_c,Integer g_m)689 void pnga_median(Integer g_a, Integer g_b, Integer g_c, Integer g_m){
690
691
692 Integer atype, andim;
693 Integer alo[MAXDIM],ahi[MAXDIM];
694 Integer btype, bndim;
695 Integer blo[MAXDIM],bhi[MAXDIM];
696 Integer ctype, cndim;
697 Integer clo[MAXDIM],chi[MAXDIM];
698 Integer mtype, mndim;
699 Integer mlo[MAXDIM],mhi[MAXDIM];
700
701 pnga_inquire(g_a, &atype, &andim, ahi);
702 pnga_inquire(g_b, &btype, &bndim, bhi);
703 pnga_inquire(g_c, &ctype, &cndim, chi);
704 pnga_inquire(g_m, &mtype, &mndim, mhi);
705
706 while(andim){
707 alo[andim-1]=1;
708 andim--;
709 }
710
711 while(bndim){
712 blo[bndim-1]=1;
713 bndim--;
714 }
715
716 while(cndim){
717 clo[cndim-1]=1;
718 cndim--;
719 }
720
721 while(mndim){
722 mlo[mndim-1]=1;
723 mndim--;
724 }
725 _ga_sync_begin = 1;
726 pnga_median_patch(g_a, alo, ahi, g_b, blo, bhi, g_c, clo, chi, g_m, mlo, mhi);
727
728 }
729
sgai_norm_infinity_block(Integer g_a,void * ptr,Integer * lo,Integer * hi,Integer ld,Integer type,Integer ndim,Integer * dims,void * buf)730 static void sgai_norm_infinity_block(Integer g_a, void *ptr,
731 Integer *lo, Integer *hi, Integer ld,
732 Integer type,
733 Integer ndim, Integer *dims, void *buf)
734 {
735 /*Integer size, nelem, dim2;*/
736 Integer iloA=0, ihiA=0, jloA=0, jhiA=0;
737 Integer i, j;
738 int *isum = NULL;
739 long *lsum = NULL;
740 double *dsum = NULL;
741 float *fsum = NULL;
742 DoubleComplex *zsum = NULL;
743 SingleComplex *csum = NULL;
744
745 switch (type)
746 {
747 case C_INT:
748 isum = (int *) buf;
749 break;
750 case C_LONG:
751 lsum = (long *) buf;
752 break;
753 case C_FLOAT:
754 fsum = (float *) buf;
755 break;
756 case C_DBL:
757 dsum = (double *) buf;
758 break;
759 case C_DCPL:
760 zsum = (DoubleComplex *) buf;
761 break;
762 case C_SCPL:
763 csum = (SingleComplex *) buf;
764 break;
765 default:
766 pnga_error("ga_norm_infinity_: wrong data type:", type);
767 }
768
769 if(ndim<=0)
770 pnga_error("ga_norm_infinity: wrong dimension", ndim);
771 else if(ndim == 1) {
772 iloA=lo[0];
773 ihiA=hi[0];
774 jloA=1;
775 jhiA=1;
776 }
777 else if(ndim == 2) {
778 iloA=lo[0];
779 ihiA=hi[0];
780 jloA=lo[1];
781 jhiA=hi[1];
782 }
783 else
784 pnga_error("ga_norm_infinity: wrong dimension", ndim);
785
786 /* determine subset of my patch to access */
787 if (ihiA > 0 && jhiA > 0)
788 {
789 switch (type)
790 {
791 int *pi;
792 double *pd;
793 long *pl;
794 float *pf;
795 DoubleComplex *pz;
796 SingleComplex *pc;
797 DoubleComplex zval;
798 SingleComplex cval;
799 double dtemp;
800 float ftemp;
801 case C_INT:
802 pi = (int *) ptr;
803 *isum = GA_ABS(pi[0]);
804 for (j = 0; j < jhiA - jloA + 1; j++) {
805 for (i = 0; i < ihiA - iloA + 1; i++)
806 if (GA_ABS(pi[j*ld + i]) > *isum) *isum = GA_ABS(pi[j*ld+i]);
807 }
808 break;
809 case C_LONG:
810 pl = (long *) ptr;
811 *lsum = GA_ABS(pl[0]);
812 for (j = 0; j < jhiA - jloA + 1; j++)
813 for (i = 0; i < ihiA - iloA + 1; i++)
814 if (GA_ABS(pl[j*ld + i]) > *lsum) *lsum = GA_ABS(pl[j*ld+i]);
815 break;
816 case C_DCPL:
817 pz = (DoubleComplex *) ptr;
818 zval = pz[0];
819 dtemp =
820 sqrt (zval.real * zval.real + zval.imag * zval.imag);
821 (*zsum).real = dtemp;
822 for (j = 0; j < jhiA - jloA + 1; j++)
823 for (i = 0; i < ihiA - iloA + 1; i++)
824 {
825 zval = pz[j * ld + i];
826 dtemp =
827 sqrt (zval.real * zval.real + zval.imag * zval.imag);
828 if (dtemp > (*zsum).real) (*zsum).real = dtemp;
829 }
830 break;
831 case C_SCPL:
832 pc = (SingleComplex *) ptr;
833 cval = pc[0];
834 ftemp =
835 sqrt (cval.real * cval.real + cval.imag * cval.imag);
836 (*csum).real = ftemp;
837 for (j = 0; j < jhiA - jloA + 1; j++)
838 for (i = 0; i < ihiA - iloA + 1; i++)
839 {
840 cval = pc[j * ld + i];
841 ftemp =
842 sqrt (cval.real * cval.real + cval.imag * cval.imag);
843 if (ftemp > (*csum).real) (*csum).real = ftemp;
844 }
845 break;
846 case C_FLOAT:
847 pf = (float *) ptr;
848 *fsum = pf[0];
849 for (j = 0; j < jhiA - jloA + 1; j++)
850 for (i = 0; i < ihiA - iloA + 1; i++)
851 if (GA_ABS(pf[j*ld+i]) > *fsum) *fsum = GA_ABS(pf[j*ld+i]);
852 break;
853 case C_DBL:
854 pd = (double *) ptr;
855 *dsum = pd[0];
856 for (i = 0; i < ihiA - iloA + 1; i++)
857 for (j = 0; j < jhiA - jloA + 1; j++)
858 if (GA_ABS(pd[j*ld+i]) > *dsum) *dsum = GA_ABS(pd[j*ld+i]);
859 break;
860 default:
861 pnga_error("sgai_norm_infinity_block: wrong data type ", type);
862 }
863 }
864 }
865
866 /* Evaluate the infinity norm of an array. This is the maximum absolute
867 * value of the the elements in the array */
868 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
869 # pragma weak wnga_norm_infinity = pnga_norm_infinity
870 #endif
pnga_norm_infinity(Integer g_a,double * nm)871 void pnga_norm_infinity(Integer g_a, double *nm)
872 {
873 Integer type;
874 Integer me = pnga_nodeid (), i, j, nproc = pnga_nnodes();
875 Integer ndim, dims[MAXDIM], lo[2], hi[2], ld;
876 Integer num_blocks_a;
877 int local_sync_begin,local_sync_end;
878 int isum = 0;
879 long lsum = 0;
880 double dsum = 0.0;
881 float fsum = 0.0;
882 float fval;
883 double dval;
884 DoubleComplex zsum;
885 SingleComplex csum;
886 char *buf = NULL;
887 char *ptr = NULL;
888 zsum.real = 0.0;
889 zsum.imag = 0.0;
890 csum.real = 0.0;
891 csum.imag = 0.0;
892 _iterator_hdl hdl;
893
894
895 local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
896 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
897 if(local_sync_begin)pnga_sync();
898
899 pnga_check_handle (g_a, "ga_norm_infinity_");
900
901 pnga_inquire (g_a, &type, &ndim, dims);
902
903 if(ndim<=0)
904 pnga_error("ga_norm_infinity: wrong dimension", ndim);
905 else if (ndim >= 3)
906 pnga_error("ga_norm_infinity: wrong dimension", ndim);
907
908
909 switch (type)
910 {
911 case C_INT:
912 buf = (void*)(&isum);
913 break;
914 case C_LONG:
915 buf = (void*)(&lsum);
916 break;
917 case C_FLOAT:
918 buf = (void*)(&fsum);
919 break;
920 case C_DBL:
921 buf = (void*)(&dsum);
922 break;
923 case C_DCPL:
924 buf = (void*)(&zsum);
925 break;
926 case C_SCPL:
927 buf = (void*)(&csum);
928 break;
929 default:
930 pnga_error("ga_norm_infinity_: wrong data type:", type);
931 }
932
933 #if 1
934 pnga_local_iterator_init(g_a, &hdl);
935 while (pnga_local_iterator_next(&hdl,lo,hi,&ptr,&ld)) {
936 sgai_norm_infinity_block(g_a, ptr, lo, hi, ld, type, ndim, dims, buf);
937 }
938 #else
939 num_blocks_a = pnga_total_blocks(g_a);
940
941 if (num_blocks_a < 0) {
942 pnga_distribution(g_a, me, lo, hi);
943 pnga_access_ptr(g_a, lo, hi, &ptr, &ld);
944 sgai_norm_infinity_block(g_a, ptr, lo, hi, ld, type, ndim, dims, buf);
945 pnga_release_update(g_a, lo, hi);
946 } else {
947 Integer idx;
948 /* Simple block-cyclic data distribution */
949 if (!pnga_uses_proc_grid(g_a)) {
950 for (idx = me; idx < num_blocks_a; idx += nproc) {
951 pnga_distribution(g_a, idx, lo, hi);
952 pnga_access_block_ptr(g_a, idx, &ptr, &ld);
953 sgai_norm_infinity_block(g_a, ptr, lo, hi, ld, type, ndim, dims, buf);
954 pnga_release_update_block(g_a, idx);
955 }
956 } else {
957 /* Uses scalapack block-cyclic data distribution */
958 Integer chk;
959 Integer proc_index[MAXDIM], index[MAXDIM];
960 Integer topology[MAXDIM];
961 Integer blocks[MAXDIM], block_dims[MAXDIM];
962 pnga_get_proc_index(g_a, me, proc_index);
963 pnga_get_proc_index(g_a, me, index);
964 pnga_get_block_info(g_a, blocks, block_dims);
965 pnga_get_proc_grid(g_a, topology);
966 while (index[ndim-1] < blocks[ndim-1]) {
967 /* find bounding coordinates of block */
968 chk = 1;
969 for (i = 0; i < ndim; i++) {
970 lo[i] = index[i]*block_dims[i]+1;
971 hi[i] = (index[i] + 1)*block_dims[i];
972 if (hi[i] > dims[i]) hi[i] = dims[i];
973 if (hi[i] < lo[i]) chk = 0;
974 }
975 if (chk) {
976 pnga_access_block_grid_ptr(g_a, index, &ptr, &ld);
977 sgai_norm_infinity_block(g_a, ptr, lo, hi, ld, type, ndim, dims, buf);
978 pnga_release_update_block_grid(g_a, index);
979 }
980 /* increment index to get next block on processor */
981 index[0] += topology[0];
982 for (i = 0; i < ndim; i++) {
983 if (index[i] >= blocks[i] && i<ndim-1) {
984 index[i] = proc_index[i];
985 index[i+1] += topology[i+1];
986 }
987 }
988 }
989 }
990 }
991 #endif
992
993
994 /*calculate the global value buf[j] for each column */
995 switch (type)
996 {
997 case C_INT:
998 armci_msg_igop (&isum, 1, "max");
999 break;
1000 case C_DBL:
1001 armci_msg_dgop (&dsum, 1, "max");
1002 break;
1003 case C_DCPL:
1004 dval = zsum.real;
1005 armci_msg_dgop (&dval, 1, "max");
1006 zsum.real = dval;
1007 break;
1008 case C_FLOAT:
1009 armci_msg_fgop (&fsum, 1, "max");
1010 break;
1011 case C_SCPL:
1012 fval = csum.real;
1013 armci_msg_fgop (&fval, 1, "max");
1014 csum.real = fval;
1015 break;
1016 case C_LONG:
1017 armci_msg_lgop (&lsum, 1, "max");
1018 break;
1019 default:
1020 pnga_error("ga_norm_infinity_: wrong data type ", type);
1021 }
1022
1023 /*evaluate the norm infinity for the matrix g_a */
1024 switch (type)
1025 {
1026 case C_INT:
1027 *nm = (double)isum;
1028 break;
1029 case C_LONG:
1030 *nm = (double)lsum;
1031 break;
1032 case C_FLOAT:
1033 *nm = (double)fsum;
1034 break;
1035 case C_DBL:
1036 *nm = (double)dsum;
1037 break;
1038 case C_DCPL:
1039 *nm = (double)(zsum.real);
1040 break;
1041 case C_SCPL:
1042 *nm = (double)(csum.real);
1043 break;
1044 default:
1045 pnga_error("ga_norm_infinity_:wrong data type.", type);
1046 }
1047
1048 if (local_sync_end)pnga_sync();
1049 }
1050
sgai_norm1_block(Integer g_a,void * ptr,Integer * lo,Integer * hi,Integer ld,Integer type,Integer ndim,Integer * dims,void * buf)1051 static void sgai_norm1_block(Integer g_a, void *ptr,
1052 Integer *lo, Integer *hi, Integer ld,
1053 Integer type,
1054 Integer ndim, Integer *dims, void *buf)
1055 {
1056 Integer iloA=0, ihiA=0, jloA=0, jhiA=0;
1057 Integer i, j;
1058 int *isum = NULL;
1059 long *lsum = NULL;
1060 double *dsum = NULL;
1061 float *fsum = NULL;
1062 DoubleComplex *zsum = NULL;
1063 SingleComplex *csum = NULL;
1064
1065 switch (type)
1066 {
1067 case C_INT:
1068 isum = (int *) buf;
1069 break;
1070 case C_LONG:
1071 lsum = (long *) buf;
1072 break;
1073 case C_FLOAT:
1074 fsum = (float *) buf;
1075 break;
1076 case C_DBL:
1077 dsum = (double *) buf;
1078 break;
1079 case C_DCPL:
1080 zsum = (DoubleComplex *) buf;
1081 break;
1082 case C_SCPL:
1083 csum = (SingleComplex *) buf;
1084 break;
1085 default:
1086 pnga_error("ga1_norm1_block: wrong data type:", type);
1087 }
1088
1089 if(ndim<=0)
1090 pnga_error("sgai_norm1_block: wrong dimension", ndim);
1091 else if(ndim == 1) {
1092 iloA=lo[0];
1093 ihiA=hi[0];
1094 jloA=1;
1095 jhiA=1;
1096 }
1097 else if(ndim == 2)
1098 {
1099 iloA=lo[0];
1100 ihiA=hi[0];
1101 jloA=lo[1];
1102 jhiA=hi[1];
1103 }
1104 else
1105 pnga_error("sgai_norm1_block: wrong dimension", ndim);
1106
1107 /* determine subset of my patch to access */
1108 if (ihiA > 0 && jhiA > 0)
1109 {
1110 switch (type)
1111 {
1112 int *pi;
1113 double *pd;
1114 long *pl;
1115 float *pf;
1116 DoubleComplex *pz;
1117 SingleComplex *pc;
1118 case C_INT:
1119 pi = (int *) ptr;
1120 *isum = 0;
1121 for (i = 0; i < ihiA - iloA + 1; i++)
1122 for (j = 0; j < jhiA - jloA + 1; j++)
1123 *isum += GA_ABS (pi[j * ld + i]);
1124 break;
1125 case C_LONG:
1126 pl = (long *) ptr;
1127 *lsum = 0;
1128 for (i = 0; i < ihiA - iloA + 1; i++)
1129 for (j = 0; j < jhiA - jloA + 1; j++)
1130 *lsum += GA_ABS (pl[j * ld + i]);
1131 break;
1132 case C_DCPL:
1133 pz = (DoubleComplex *) ptr;
1134 (*zsum).real = 0.0;
1135 (*zsum).imag = 0.0;
1136 for (i = 0; i < ihiA - iloA + 1; i++)
1137 for (j = 0; j < jhiA - jloA + 1; j++)
1138 {
1139 DoubleComplex zval = pz[j * ld + i];
1140 double temp =
1141 sqrt (zval.real * zval.real + zval.imag * zval.imag);
1142 (*zsum).real += temp;
1143 }
1144 break;
1145 case C_SCPL:
1146 pc = (SingleComplex *) ptr;
1147 (*csum).real = 0.0;
1148 (*csum).imag = 0.0;
1149 for (i = 0; i < ihiA - iloA + 1; i++)
1150 for (j = 0; j < jhiA - jloA + 1; j++)
1151 {
1152 SingleComplex cval = pc[j * ld + i];
1153 float temp =
1154 sqrt (cval.real * cval.real + cval.imag * cval.imag);
1155 (*csum).real += temp;
1156 }
1157 break;
1158 case C_FLOAT:
1159 pf = (float *) ptr;
1160 *fsum = 0.0;
1161 for (i = 0; i < ihiA - iloA + 1; i++)
1162 for (j = 0; j < jhiA - jloA + 1; j++)
1163 *fsum += GA_ABS (pf[j * ld + i]);
1164 break;
1165 case C_DBL:
1166 pd = (double *) ptr;
1167 *dsum = 0.0;
1168 for (i = 0; i < ihiA - iloA + 1; i++)
1169 for (j = 0; j < jhiA - jloA + 1; j++)
1170 *dsum += GA_ABS (pd[j * ld + i]);
1171 break;
1172 default:
1173 pnga_error("sgai_norm1_block: wrong data type ", type);
1174 }
1175 }
1176 }
1177
1178 /* Evaluate the L_1 norm of an array. This should be the sum of the absolute
1179 * value of the individual array elements*/
1180 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1181 # pragma weak wnga_norm1 = pnga_norm1
1182 #endif
pnga_norm1(Integer g_a,double * nm)1183 void pnga_norm1(Integer g_a, double *nm)
1184 {
1185 Integer type=0;
1186 Integer me = pnga_nodeid (), i, j, nproc = pnga_nnodes();
1187 Integer ndim, dims[MAXDIM], lo[2], hi[2], ld;
1188 Integer num_blocks_a;
1189 int local_sync_begin,local_sync_end;
1190 int isum = 0;
1191 long lsum = 0;
1192 double dsum = 0.0;
1193 float fsum = 0.0;
1194 double dval;
1195 float fval;
1196 DoubleComplex zsum;
1197 SingleComplex csum;
1198 char *buf = NULL; /*temporary buffer */
1199 char *ptr = NULL;
1200 zsum.real = 0.0;
1201 zsum.imag = 0.0;
1202 csum.real = 0.0;
1203 csum.imag = 0.0;
1204 _iterator_hdl hdl;
1205
1206 local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
1207 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
1208 if(local_sync_begin)pnga_sync();
1209
1210 pnga_check_handle (g_a, "ga_norm1_");
1211
1212 pnga_inquire (g_a, &type, &ndim, dims);
1213
1214
1215 if(ndim<=0 || ndim > 2)
1216 pnga_error("ga_norm1: wrong dimension", ndim);
1217
1218 switch (type)
1219 {
1220 case C_INT:
1221 buf = (void*)(&isum);
1222 break;
1223 case C_LONG:
1224 buf = (void*)(&lsum);
1225 break;
1226 case C_FLOAT:
1227 buf = (void*)(&fsum);
1228 break;
1229 case C_DBL:
1230 buf = (void*)(&dsum);
1231 break;
1232 case C_DCPL:
1233 buf = (void*)(&zsum);
1234 break;
1235 case C_SCPL:
1236 buf = (void*)(&csum);
1237 break;
1238 default:
1239 pnga_error("ga_norm1_: wrong data type:", type);
1240 }
1241
1242 #if 1
1243 pnga_local_iterator_init(g_a, &hdl);
1244 while (pnga_local_iterator_next(&hdl,lo,hi,&ptr,&ld)) {
1245 sgai_norm1_block(g_a, ptr, lo, hi, ld, type, ndim, dims, buf);
1246 }
1247 #else
1248 num_blocks_a = pnga_total_blocks(g_a);
1249
1250 if (num_blocks_a < 0) {
1251 pnga_distribution(g_a, me, lo, hi);
1252 pnga_access_ptr(g_a, lo, hi, &ptr, &ld);
1253 sgai_norm1_block(g_a, ptr, lo, hi, ld, type, ndim, dims, buf);
1254 pnga_release_update(g_a, lo, hi);
1255 } else {
1256 Integer idx;
1257 /* Simple block-cyclic data distribution */
1258 if (!pnga_uses_proc_grid(g_a)) {
1259 for (idx = me; idx < num_blocks_a; idx += nproc) {
1260 pnga_distribution(g_a, idx, lo, hi);
1261 pnga_access_block_ptr(g_a, idx, &ptr, &ld);
1262 sgai_norm1_block(g_a, ptr, lo, hi, ld, type, ndim, dims, buf);
1263 pnga_release_update_block(g_a, idx);
1264 }
1265 } else {
1266 /* Uses scalapack block-cyclic data distribution */
1267 Integer chk;
1268 Integer proc_index[MAXDIM], index[MAXDIM];
1269 Integer topology[MAXDIM];
1270 Integer blocks[MAXDIM], block_dims[MAXDIM];
1271 pnga_get_proc_index(g_a, me, proc_index);
1272 pnga_get_proc_index(g_a, me, index);
1273 pnga_get_block_info(g_a, blocks, block_dims);
1274 pnga_get_proc_grid(g_a, topology);
1275 while (index[ndim-1] < blocks[ndim-1]) {
1276 /* find bounding coordinates of block */
1277 chk = 1;
1278 for (i = 0; i < ndim; i++) {
1279 lo[i] = index[i]*block_dims[i]+1;
1280 hi[i] = (index[i] + 1)*block_dims[i];
1281 if (hi[i] > dims[i]) hi[i] = dims[i];
1282 if (hi[i] < lo[i]) chk = 0;
1283 }
1284 if (chk) {
1285 pnga_access_block_grid_ptr(g_a, index, &ptr, &ld);
1286 sgai_norm1_block(g_a, ptr, lo, hi, ld, type, ndim, dims, buf);
1287 pnga_release_update_block_grid(g_a, index);
1288 }
1289 /* increment index to get next block on processor */
1290 index[0] += topology[0];
1291 for (i = 0; i < ndim; i++) {
1292 if (index[i] >= blocks[i] && i<ndim-1) {
1293 index[i] = proc_index[i];
1294 index[i+1] += topology[i+1];
1295 }
1296 }
1297 }
1298 }
1299 }
1300 #endif
1301
1302 /*calculate the global value buf[j] for each column */
1303 switch (type)
1304 {
1305 case C_INT:
1306 armci_msg_igop (&isum, 1, "+");
1307 break;
1308 case C_DBL:
1309 armci_msg_dgop (&dsum, 1, "+");
1310 break;
1311 case C_DCPL:
1312 dval = zsum.real;
1313 armci_msg_dgop (&dval, 1, "+");
1314 zsum.real = dval;
1315 break;
1316 case C_FLOAT:
1317 armci_msg_fgop (&fsum, 1, "+");
1318 break;
1319 case C_SCPL:
1320 fval = csum.real;
1321 armci_msg_fgop (&fval, 1, "+");
1322 csum.real = fval;
1323 break;
1324 case C_LONG:
1325 armci_msg_lgop (&lsum, 1, "+");
1326 break;
1327 default:
1328 pnga_error("ga_norm1_: wrong data type ", type);
1329 }
1330
1331 /*evaluate the norm1 for the matrix g_a */
1332 switch (type)
1333 {
1334 case C_INT:
1335 *nm = (double)isum;
1336 break;
1337 case C_LONG:
1338 *nm = (double)lsum;
1339 break;
1340 case C_FLOAT:
1341 *nm = (double)fsum;
1342 break;
1343 case C_DBL:
1344 *nm = (double)dsum;
1345 break;
1346 case C_DCPL:
1347 *nm = (double)(zsum.real);
1348 break;
1349 case C_SCPL:
1350 *nm = (double)(csum.real);
1351 break;
1352 default:
1353 pnga_error("ga_norm1_:wrong data type.", type);
1354 }
1355
1356 if(local_sync_end)pnga_sync();
1357 }
1358
sgai_get_diagonal_block(Integer g_a,void * ptr,Integer g_v,Integer * loA,Integer * hiA,Integer ld,Integer type)1359 static void sgai_get_diagonal_block(Integer g_a, void *ptr, Integer g_v,
1360 Integer *loA, Integer *hiA, Integer ld,
1361 Integer type)
1362 {
1363 Integer nelem, size;
1364 Integer vlo, vhi, iloA, ihiA, jloA, jhiA, lo[2], hi[2];
1365 Integer i;
1366 void *buf;
1367 int *ia;
1368 float *fa;
1369 double *da;
1370 long *la;
1371 DoubleComplex *dca;
1372 SingleComplex *fca;
1373
1374 iloA = loA[0];
1375 ihiA = hiA[0];
1376 jloA = loA[1];
1377 jhiA = hiA[1];
1378
1379 /* determine subset of my patch to access */
1380 if (iloA > 0)
1381 {
1382 lo[0] = GA_MAX (iloA, jloA);
1383 lo[1] = GA_MAX (iloA, jloA);
1384 hi[0] = GA_MIN (ihiA, jhiA);
1385 hi[1] = GA_MIN (ihiA, jhiA);
1386
1387
1388
1389 if (hi[0] >= lo[0]) /*make sure the equality symbol is there!!! */
1390 { /* we got a block containing diagonal elements */
1391
1392 /*allocate a buffer for the given vector g_v */
1393 size = GAsizeof (type);
1394 vlo = GA_MAX (iloA, jloA);
1395 vhi = GA_MIN (ihiA, jhiA);
1396 nelem = vhi - vlo + 1;
1397 buf = malloc (nelem * size);
1398 if (buf == NULL)
1399 pnga_error
1400 ("ga_get_diag_:failed to allocate memory for the local buffer.",
1401 9999);
1402
1403 /* get the vector from the global array g_a, put that in the the local memory buffer buf */
1404 switch (type)
1405 {
1406 case C_INT:
1407 ia = (int *) ptr;
1408 ia += ld*(lo[1]-jloA) + lo[0]-iloA;
1409 for (i = 0; i < hi[0] - lo[0] + 1; i++)
1410 {
1411 ((int *) buf)[i] = *ia;
1412 ia += ld + 1;
1413 }
1414 break;
1415 case C_LONG:
1416 la = (long *) ptr;
1417 la += ld*(lo[1]-jloA) + lo[0]-iloA;
1418 for (i = 0; i < hi[0] - lo[0] + 1; i++)
1419 {
1420 ((long *) buf)[i] = *la;
1421 la += ld + 1;
1422 }
1423 break;
1424 case C_FLOAT:
1425 fa = (float *) ptr;
1426 fa += ld*(lo[1]-jloA) + lo[0]-iloA;
1427 for (i = 0; i < hi[0] - lo[0] + 1; i++)
1428 {
1429 ((float *) buf)[i] = *fa;
1430 fa += ld + 1;
1431 }
1432 break;
1433 case C_DBL:
1434 da = (double *) ptr;
1435 da += ld*(lo[1]-jloA) + lo[0]-iloA;
1436 for (i = 0; i < hi[0] - lo[0] + 1; i++)
1437 {
1438 ((double *) buf)[i] = *da;
1439 da += ld + 1;
1440 }
1441 break;
1442 case C_DCPL:
1443 dca = (DoubleComplex *) ptr;
1444 dca += ld*(lo[1]-jloA) + lo[0]-iloA;
1445 for (i = 0; i < hi[0] - lo[0] + 1; i++)
1446 {
1447 (((DoubleComplex *) buf)[i]).real = (*dca).real;
1448 (((DoubleComplex *) buf)[i]).imag = (*dca).imag;
1449 dca += ld + 1;
1450 }
1451 break;
1452
1453 case C_SCPL:
1454 fca = (SingleComplex *) ptr;
1455 fca += ld*(lo[1]-jloA) + lo[0]-iloA;
1456 for (i = 0; i < hi[0] - lo[0] + 1; i++)
1457 {
1458 (((SingleComplex *) buf)[i]).real = (*fca).real;
1459 (((SingleComplex *) buf)[i]).imag = (*fca).imag;
1460 fca += ld + 1;
1461 }
1462 break;
1463
1464 default:
1465 pnga_error("get_diagonal_zero: wrong data type:", type);
1466 }
1467
1468 /* copy the local memory buffer buf to g_v */
1469 pnga_put(g_v, &vlo, &vhi, buf, &vhi);
1470
1471 /*free the memory */
1472 free (buf);
1473 }
1474 }
1475 }
1476
1477 /* Get the diagonal elements of a matrix and copy them into a one dimensional
1478 * array */
1479 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1480 # pragma weak wnga_get_diag = pnga_get_diag
1481 #endif
pnga_get_diag(Integer g_a,Integer g_v)1482 void pnga_get_diag(Integer g_a, Integer g_v)
1483 {
1484 Integer vndim, vdims, dim1, dim2, vtype, atype, type;
1485 Integer me = pnga_nodeid (), i, nproc = pnga_nnodes();
1486 Integer andim, adims[2];
1487 Integer loA[2], hiA[2], ld;
1488 Integer num_blocks_a;
1489 int local_sync_begin,local_sync_end;
1490 char *ptr;
1491 _iterator_hdl hdl;
1492
1493 local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
1494 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
1495 if(local_sync_begin)pnga_sync();
1496
1497 pnga_check_handle (g_a, "ga_get_diag_");
1498 pnga_check_handle (g_v, "ga_get_diag_");
1499
1500 pnga_inquire (g_a, &atype, &andim, adims);
1501 dim1 = adims[0];
1502 dim2 = adims[1];
1503 type = atype;
1504 pnga_inquire (g_v, &vtype, &vndim, &vdims);
1505
1506 /* Perform some error checking */
1507 if (andim != 2)
1508 pnga_error("ga_get_diag: wrong dimension for g_a.", andim);
1509 if (vndim != 1)
1510 pnga_error("ga_get_diag: wrong dimension for g_v.", vndim);
1511
1512 if (vdims != GA_MIN (dim1, dim2))
1513 pnga_error
1514 ("ga_get_diag: The size of the first array's diagonal is greater than the size of the second array.",
1515 type);
1516
1517 if (vtype != atype)
1518 {
1519 pnga_error
1520 ("ga_get_diag: input global arrays do not have the same data type. Global array type =",
1521 atype);
1522 }
1523
1524 #if 1
1525 pnga_local_iterator_init(g_a, &hdl);
1526 while (pnga_local_iterator_next(&hdl,loA,hiA,&ptr,&ld)) {
1527 sgai_get_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
1528 }
1529 #else
1530 num_blocks_a = pnga_total_blocks(g_a);
1531
1532 if (num_blocks_a < 0) {
1533 pnga_distribution(g_a, me, loA, hiA);
1534 pnga_access_ptr(g_a, loA, hiA, &ptr, &ld);
1535 sgai_get_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
1536 pnga_release_update(g_a, loA, hiA);
1537 } else {
1538 Integer idx;
1539 /* Simple block-cyclic data distribution */
1540 if (!pnga_uses_proc_grid(g_a)) {
1541 for (idx = me; idx < num_blocks_a; idx += nproc) {
1542 pnga_distribution(g_a, idx, loA, hiA);
1543 pnga_access_block_ptr(g_a, idx, &ptr, &ld);
1544 sgai_get_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
1545 pnga_release_update_block(g_a, idx);
1546 }
1547 } else {
1548 /* Uses scalapack block-cyclic data distribution */
1549 Integer chk;
1550 Integer proc_index[MAXDIM], index[MAXDIM];
1551 Integer topology[MAXDIM];
1552 Integer blocks[MAXDIM], block_dims[MAXDIM];
1553 pnga_get_proc_index(g_a, me, proc_index);
1554 pnga_get_proc_index(g_a, me, index);
1555 pnga_get_block_info(g_a, blocks, block_dims);
1556 pnga_get_proc_grid(g_a, topology);
1557 while (index[andim-1] < blocks[andim-1]) {
1558 /* find bounding coordinates of block */
1559 chk = 1;
1560 for (i = 0; i < andim; i++) {
1561 loA[i] = index[i]*block_dims[i]+1;
1562 hiA[i] = (index[i] + 1)*block_dims[i];
1563 if (hiA[i] > adims[i]) hiA[i] = adims[i];
1564 if (hiA[i] < loA[i]) chk = 0;
1565 }
1566 if (chk) {
1567 pnga_access_block_grid_ptr(g_a, index, &ptr, &ld);
1568 sgai_get_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
1569 pnga_release_update_block_grid(g_a, index);
1570 }
1571 /* increment index to get next block on processor */
1572 index[0] += topology[0];
1573 for (i = 0; i < andim; i++) {
1574 if (index[i] >= blocks[i] && i<andim-1) {
1575 index[i] = proc_index[i];
1576 index[i+1] += topology[i+1];
1577 }
1578 }
1579 }
1580 }
1581 }
1582 #endif
1583
1584 if(local_sync_end)pnga_sync();
1585 }
1586
1587
sgai_add_diagonal_block(Integer g_a,void * ptr,Integer g_v,Integer * loA,Integer * hiA,Integer ld,Integer type)1588 static void sgai_add_diagonal_block(Integer g_a, void *ptr, Integer g_v,
1589 Integer *loA, Integer *hiA, Integer ld,
1590 Integer type)
1591 {
1592 Integer nelem, size;
1593 Integer vlo, vhi, iloA, ihiA, jloA, jhiA, lo[2], hi[2];
1594 Integer i;
1595 void *buf;
1596 int *ia;
1597 float *fa;
1598 double *da;
1599 long *la;
1600 DoubleComplex *dca;
1601 SingleComplex *fca;
1602
1603 iloA = loA[0];
1604 ihiA = hiA[0];
1605 jloA = loA[1];
1606 jhiA = hiA[1];
1607
1608 /* determine subset of my patch to access */
1609 if (iloA > 0)
1610 {
1611 lo[0] = GA_MAX (iloA, jloA);
1612 lo[1] = GA_MAX (iloA, jloA);
1613 hi[0] = GA_MIN (ihiA, jhiA);
1614 hi[1] = GA_MIN (ihiA, jhiA);
1615
1616
1617
1618 if (hi[0] >= lo[0]) /*make sure the equality symbol is there!!! */
1619 { /* we got a block containing diagonal elements */
1620
1621 /*allocate a buffer for the given vector g_v */
1622 size = GAsizeof (type);
1623 vlo = GA_MAX (iloA, jloA);
1624 vhi = GA_MIN (ihiA, jhiA);
1625 nelem = vhi - vlo + 1;
1626 buf = malloc (nelem * size);
1627 if (buf == NULL)
1628 pnga_error
1629 ("ga_add_diagonal_:failed to allocate memory for the local buffer.",
1630 0);
1631
1632 /* get the vector from the global array to the local memory buffer */
1633 pnga_get (g_v, &vlo, &vhi, buf, &vhi);
1634
1635 switch (type)
1636 {
1637 case C_INT:
1638 ia = (int *) ptr;
1639 ia += ld*(lo[1]-jloA) + lo[0]-iloA;
1640 for (i = 0; i < hi[0] - lo[0] + 1; i++)
1641 {
1642 *ia += ((int *) buf)[i];
1643 ia += ld + 1;
1644 }
1645 break;
1646 case C_LONG:
1647 la = (long *) ptr;
1648 la += ld*(lo[1]-jloA) + lo[0]-iloA;
1649 for (i = 0; i < hi[0] - lo[0] + 1; i++)
1650 {
1651 *la += ((long *) buf)[i];
1652 la += ld + 1;
1653 }
1654 break;
1655 case C_FLOAT:
1656 fa = (float *) ptr;
1657 fa += ld*(lo[1]-jloA) + lo[0]-iloA;
1658 for (i = 0; i < hi[0] - lo[0] + 1; i++)
1659 {
1660 *fa += ((float *) buf)[i];
1661 fa += ld + 1;
1662 }
1663 break;
1664 case C_DBL:
1665 da = (double *) ptr;
1666 da += ld*(lo[1]-jloA) + lo[0]-iloA;
1667 for (i = 0; i < hi[0] - lo[0] + 1; i++)
1668 {
1669 *da += ((double *) buf)[i];
1670 da += ld + 1;
1671 }
1672 break;
1673 case C_DCPL:
1674 dca = (DoubleComplex *) ptr;
1675 dca += ld*(lo[1]-jloA) + lo[0]-iloA;
1676 for (i = 0; i < hi[0] - lo[0] + 1; i++)
1677 {
1678 (*dca).real += (((DoubleComplex *) buf)[i]).real;
1679 (*dca).imag += (((DoubleComplex *) buf)[i]).imag;
1680 dca += ld + 1;
1681 }
1682 break;
1683
1684 case C_SCPL:
1685 fca = (SingleComplex *) ptr;
1686 fca += ld*(lo[1]-jloA) + lo[0]-iloA;
1687 for (i = 0; i < hi[0] - lo[0] + 1; i++)
1688 {
1689 (*fca).real += (((SingleComplex *) buf)[i]).real;
1690 (*fca).imag += (((SingleComplex *) buf)[i]).imag;
1691 fca += ld + 1;
1692 }
1693 break;
1694
1695 default:
1696 pnga_error("ga_add_diagonal_: wrong data type:", type);
1697 }
1698
1699 /*free the memory */
1700 free (buf);
1701 }
1702 }
1703 }
1704
1705 /* Add values in vector g_v to diagonal elements of g_a */
1706 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1707 # pragma weak wnga_add_diagonal = pnga_add_diagonal
1708 #endif
pnga_add_diagonal(Integer g_a,Integer g_v)1709 void pnga_add_diagonal(Integer g_a, Integer g_v)
1710 {
1711 Integer vndim, vdims, dim1, dim2, vtype, atype, type;
1712 Integer me = pnga_nodeid (), i, nproc = pnga_nnodes();
1713 Integer andim, adims[2];
1714 Integer loA[2], hiA[2], ld;
1715 Integer num_blocks_a;
1716 int local_sync_begin,local_sync_end;
1717 char *ptr;
1718 _iterator_hdl hdl;
1719
1720 local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
1721 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
1722 if(local_sync_begin)pnga_sync();
1723
1724 pnga_check_handle (g_a, "ga_add_diagonal_");
1725 pnga_check_handle (g_v, "ga_add_diagonal_");
1726
1727 pnga_inquire(g_a, &atype, &andim, adims);
1728 dim1 = adims[0];
1729 dim2 = adims[1];
1730 type = atype;
1731 pnga_inquire(g_v, &vtype, &vndim, &vdims);
1732
1733 /* Perform some error checking */
1734 if (andim != 2)
1735 pnga_error("ga_add_diagonal: wrong dimension for g_a.", andim);
1736 if (vndim != 1)
1737 pnga_error("ga_add_diagonal: wrong dimension for g_v.", vndim);
1738
1739
1740 if (vdims != GA_MIN (dim1, dim2))
1741 pnga_error
1742 ("ga_add_diagonal: The size of the first array's diagonal is greater than the size of the second array.",
1743 type);
1744
1745 if (vtype != atype)
1746 {
1747 pnga_error
1748 ("ga_add_diagonal: input global arrays do not have the same data type. Global array type =",
1749 atype);
1750 }
1751
1752 #if 1
1753 pnga_local_iterator_init(g_a, &hdl);
1754 while (pnga_local_iterator_next(&hdl,loA,hiA,&ptr,&ld)) {
1755 sgai_add_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
1756 }
1757 #else
1758 num_blocks_a = pnga_total_blocks(g_a);
1759
1760 if (num_blocks_a < 0) {
1761 pnga_distribution(g_a, me, loA, hiA);
1762 pnga_access_ptr(g_a, loA, hiA, &ptr, &ld);
1763 sgai_add_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
1764 } else {
1765 Integer idx;
1766 /* Simple block-cyclic data distribution */
1767 if (!pnga_uses_proc_grid(g_a)) {
1768 for (idx = me; idx < num_blocks_a; idx += nproc) {
1769 pnga_distribution(g_a, idx, loA, hiA);
1770 pnga_access_block_ptr(g_a, idx, &ptr, &ld);
1771 sgai_add_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
1772 pnga_release_update_block(g_a, idx);
1773 }
1774 } else {
1775 /* Uses scalapack block-cyclic data distribution */
1776 Integer chk;
1777 Integer proc_index[MAXDIM], index[MAXDIM];
1778 Integer topology[MAXDIM];
1779 Integer blocks[MAXDIM], block_dims[MAXDIM];
1780 pnga_get_proc_index(g_a, me, proc_index);
1781 pnga_get_proc_index(g_a, me, index);
1782 pnga_get_block_info(g_a, blocks, block_dims);
1783 pnga_get_proc_grid(g_a, topology);
1784 while (index[andim-1] < blocks[andim-1]) {
1785 /* find bounding coordinates of block */
1786 chk = 1;
1787 for (i = 0; i < andim; i++) {
1788 loA[i] = index[i]*block_dims[i]+1;
1789 hiA[i] = (index[i] + 1)*block_dims[i];
1790 if (hiA[i] > adims[i]) hiA[i] = adims[i];
1791 if (hiA[i] < loA[i]) chk = 0;
1792 }
1793 if (chk) {
1794 pnga_access_block_grid_ptr(g_a, index, &ptr, &ld);
1795 sgai_add_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
1796 pnga_release_update_block_grid(g_a, index);
1797 }
1798 /* increment index to get next block on processor */
1799 index[0] += topology[0];
1800 for (i = 0; i < andim; i++) {
1801 if (index[i] >= blocks[i] && i<andim-1) {
1802 index[i] = proc_index[i];
1803 index[i+1] += topology[i+1];
1804 }
1805 }
1806 }
1807 }
1808 }
1809 #endif
1810 if(local_sync_end)pnga_sync();
1811 }
1812
sgai_set_diagonal_block(Integer g_a,void * ptr,Integer g_v,Integer * loA,Integer * hiA,Integer ld,Integer type)1813 static void sgai_set_diagonal_block(Integer g_a, void *ptr, Integer g_v, Integer *loA,
1814 Integer *hiA, Integer ld, Integer type)
1815 {
1816 Integer nelem, size;
1817 Integer vlo, vhi, iloA, ihiA, jloA, jhiA, lo[2], hi[2];
1818 Integer i;
1819 void *buf;
1820 int *ia;
1821 float *fa;
1822 double *da;
1823 long *la;
1824 DoubleComplex *dca;
1825 SingleComplex *fca;
1826
1827 iloA = loA[0];
1828 ihiA = hiA[0];
1829 jloA = loA[1];
1830 jhiA = hiA[1];
1831
1832 /* determine subset of my patch to access */
1833 if (iloA > 0)
1834 {
1835 lo[0] = GA_MAX (iloA, jloA);
1836 lo[1] = GA_MAX (iloA, jloA);
1837 hi[0] = GA_MIN (ihiA, jhiA);
1838 hi[1] = GA_MIN (ihiA, jhiA);
1839
1840
1841
1842 if (hi[0] >= lo[0]) /*make sure the equality symbol is there!!! */
1843 { /* we got a block containing diagonal elements*/
1844
1845 /*allocate a buffer for the given vector g_v */
1846 size = GAsizeof (type);
1847 vlo = GA_MAX (iloA, jloA);
1848 vhi = GA_MIN (ihiA, jhiA);
1849 nelem = vhi - vlo + 1;
1850 buf = malloc (nelem * size);
1851 if (buf == NULL)
1852 pnga_error
1853 ("ga_set_diagonal_:failed to allocate memory for local buffer",0);
1854
1855 /* get the vector from the global array to the local memory buffer */
1856 pnga_get (g_v, &vlo, &vhi, buf, &vhi);
1857
1858 switch (type)
1859 {
1860 case C_INT:
1861 ia = (int *) ptr;
1862 ia += ld*(lo[1]-jloA) + lo[0]-iloA;
1863 for (i = 0; i < hi[0] - lo[0] + 1; i++)
1864 {
1865 *ia = ((int *) buf)[i];
1866 ia += ld + 1;
1867 }
1868 break;
1869 case C_LONG:
1870 la = (long *) ptr;
1871 la += ld*(lo[1]-jloA) + lo[0]-iloA;
1872 for (i = 0; i < hi[0] - lo[0] + 1; i++)
1873 {
1874 *la = ((long *) buf)[i];
1875 la += ld + 1;
1876 }
1877 break;
1878 case C_FLOAT:
1879 fa = (float *) ptr;
1880 fa += ld*(lo[1]-jloA) + lo[0]-iloA;
1881 for (i = 0; i < hi[0] - lo[0] + 1; i++)
1882 {
1883 *fa = ((float *) buf)[i];
1884 fa += ld + 1;
1885 }
1886 break;
1887 case C_DBL:
1888 da = (double *) ptr;
1889 da += ld*(lo[1]-jloA) + lo[0]-iloA;
1890 for (i = 0; i < hi[0] - lo[0] + 1; i++)
1891 {
1892 *da = ((double *) buf)[i];
1893 da += ld + 1;
1894 }
1895 break;
1896 case C_DCPL:
1897 dca = (DoubleComplex *) ptr;
1898 dca += ld*(lo[1]-jloA) + lo[0]-iloA;
1899 for (i = 0; i < hi[0] - lo[0] + 1; i++)
1900 {
1901 (*dca).real = (((DoubleComplex *) buf)[i]).real;
1902 (*dca).imag = (((DoubleComplex *) buf)[i]).imag;
1903 dca += ld + 1;
1904 }
1905 break;
1906
1907 case C_SCPL:
1908 fca = (SingleComplex *) ptr;
1909 fca += ld*(lo[1]-jloA) + lo[0]-iloA;
1910 for (i = 0; i < hi[0] - lo[0] + 1; i++)
1911 {
1912 (*fca).real = (((SingleComplex *) buf)[i]).real;
1913 (*fca).imag = (((SingleComplex *) buf)[i]).imag;
1914 fca += ld + 1;
1915 }
1916 break;
1917
1918 default:
1919 pnga_error("ga_set_diagonal_: wrong data type:", type);
1920 }
1921
1922 /*free the memory */
1923 free (buf);
1924
1925 /* release access to the data */
1926 lo[0] = iloA;
1927 lo[1] = jloA;
1928 hi[0] = ihiA;
1929 hi[1] = jhiA;
1930 pnga_release_update (g_a, lo, hi);
1931 }
1932 }
1933 }
1934
1935 /* Set diagonal of g_a to have values in g_v */
1936 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1937 # pragma weak wnga_set_diagonal = pnga_set_diagonal
1938 #endif
pnga_set_diagonal(Integer g_a,Integer g_v)1939 void pnga_set_diagonal(Integer g_a, Integer g_v)
1940 {
1941 Integer vndim, vdims, dim1, dim2, vtype, atype, type;
1942 Integer me = pnga_nodeid (), i, nproc = pnga_nnodes();
1943 Integer andim, adims[2];
1944 Integer loA[2], hiA[2], ld;
1945 Integer num_blocks_a;
1946 int local_sync_begin,local_sync_end;
1947 char *ptr;
1948 _iterator_hdl hdl;
1949
1950 local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
1951 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
1952 if(local_sync_begin)pnga_sync();
1953
1954 pnga_check_handle (g_a, "ga_set_diagonal_");
1955 pnga_check_handle (g_v, "ga_set_diagonal_");
1956
1957 pnga_inquire (g_a, &atype, &andim, adims);
1958 dim1 = adims[0];
1959 dim2 = adims[1];
1960 type = atype;
1961 pnga_inquire (g_v, &vtype, &vndim, &vdims);
1962
1963 /* Perform some error checking */
1964 if (andim != 2)
1965 pnga_error("ga_set_diagonal: wrong dimension for g_a.", andim);
1966 if (vndim != 1)
1967 pnga_error("ga_set_diagonal: wrong dimension for g_v.", vndim);
1968
1969
1970 if (vdims != GA_MIN (dim1, dim2))
1971 pnga_error
1972 ("ga_set_diagonal: The size of the first array's diagonal is greater than the size of the second array.",
1973 type);
1974
1975 if (vtype != atype)
1976 {
1977 pnga_error
1978 ("ga_set_diagonal: input global arrays do not have the same data type. Global array type =",
1979 atype);
1980 }
1981
1982 #if 1
1983 pnga_local_iterator_init(g_a, &hdl);
1984 while (pnga_local_iterator_next(&hdl,loA,hiA,&ptr,&ld)) {
1985 sgai_set_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
1986 }
1987 #else
1988 num_blocks_a = pnga_total_blocks(g_a);
1989
1990 if (num_blocks_a < 0) {
1991 pnga_distribution(g_a, me, loA, hiA);
1992 pnga_access_ptr(g_a, loA, hiA, &ptr, &ld);
1993 sgai_set_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
1994 pnga_release_update(g_a, loA, hiA);
1995 } else {
1996 Integer idx;
1997 /* Simple block-cyclic data distribution */
1998 if (!pnga_uses_proc_grid(g_a)) {
1999 for (idx = me; idx < num_blocks_a; idx += nproc) {
2000 pnga_distribution(g_a, idx, loA, hiA);
2001 pnga_access_block_ptr(g_a, idx, &ptr, &ld);
2002 sgai_set_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
2003 pnga_release_update_block(g_a, idx);
2004 }
2005 } else {
2006 /* Uses scalapack block-cyclic data distribution */
2007 Integer chk;
2008 Integer proc_index[MAXDIM], index[MAXDIM];
2009 Integer topology[MAXDIM];
2010 Integer blocks[MAXDIM], block_dims[MAXDIM];
2011 pnga_get_proc_index(g_a, me, proc_index);
2012 pnga_get_proc_index(g_a, me, index);
2013 pnga_get_block_info(g_a, blocks, block_dims);
2014 pnga_get_proc_grid(g_a, topology);
2015 while (index[andim-1] < blocks[andim-1]) {
2016 /* find bounding coordinates of block */
2017 chk = 1;
2018 for (i = 0; i < andim; i++) {
2019 loA[i] = index[i]*block_dims[i]+1;
2020 hiA[i] = (index[i] + 1)*block_dims[i];
2021 if (hiA[i] > adims[i]) hiA[i] = adims[i];
2022 if (hiA[i] < loA[i]) chk = 0;
2023 }
2024 if (chk) {
2025 pnga_access_block_grid_ptr(g_a, index, &ptr, &ld);
2026 sgai_set_diagonal_block(g_a, ptr, g_v, loA, hiA, ld, type);
2027 pnga_release_update_block_grid(g_a, index);
2028 }
2029 /* increment index to get next block on processor */
2030 index[0] += topology[0];
2031 for (i = 0; i < andim; i++) {
2032 if (index[i] >= blocks[i] && i<andim-1) {
2033 index[i] = proc_index[i];
2034 index[i+1] += topology[i+1];
2035 }
2036 }
2037 }
2038 }
2039
2040 }
2041 #endif
2042 if(local_sync_end)pnga_sync();
2043 }
2044
sgai_shift_diagonal_block(Integer g_a,void * ptr,Integer * loA,Integer * hiA,Integer ld,void * c,Integer type)2045 static void sgai_shift_diagonal_block(Integer g_a, void *ptr, Integer *loA, Integer *hiA,
2046 Integer ld, void *c, Integer type)
2047 {
2048 Integer iloA, ihiA, jloA, jhiA, lo[2], hi[2];
2049 Integer i;
2050 int *ia;
2051 float *fa;
2052 double *da;
2053 long *la;
2054 DoubleComplex *dca;
2055 SingleComplex *fca;
2056
2057 iloA = loA[0];
2058 ihiA = hiA[0];
2059 jloA = loA[1];
2060 jhiA = hiA[1];
2061
2062 /* determine subset of my patch to access */
2063 if (iloA > 0)
2064 {
2065 lo[0] = GA_MAX (iloA, jloA);
2066 lo[1] = GA_MAX (iloA, jloA);
2067 hi[0] = GA_MIN (ihiA, jhiA);
2068 hi[1] = GA_MIN (ihiA, jhiA);
2069 if (hi[0] >= lo[0]) /*make sure the equality sign is there since it is the singleton case */
2070 { /* we got a block containing diagonal elements */
2071
2072 switch (type)
2073 {
2074 case C_INT:
2075 ia = (int *) ptr;
2076 ia += ld*(lo[1]-jloA) + lo[0]-iloA;
2077 for (i = 0; i < hi[0] - lo[0] + 1; i++)
2078 {
2079 *ia += *((int *) c);
2080 ia += ld + 1;
2081 }
2082 break;
2083 case C_LONG:
2084 la = (long *) ptr;
2085 la += ld*(lo[1]-jloA) + lo[0]-iloA;
2086 for (i = 0; i < hi[0] - lo[0] + 1; i++)
2087 {
2088 *la += *((long *) c);
2089 la += ld + 1;
2090 }
2091 break;
2092 case C_FLOAT:
2093 fa = (float *) ptr;
2094 fa += ld*(lo[1]-jloA) + lo[0]-iloA;
2095 for (i = 0; i < hi[0] - lo[0] + 1; i++)
2096 {
2097 *fa += *((float *) c);
2098 fa += ld + 1;
2099 }
2100 break;
2101 case C_DBL:
2102 da = (double *) ptr;
2103 da += ld*(lo[1]-jloA) + lo[0]-iloA;
2104 for (i = 0; i < hi[0] - lo[0] + 1; i++)
2105 {
2106 *da += *((double *) c);
2107 da += ld + 1;
2108 }
2109 break;
2110 case C_DCPL:
2111 dca = (DoubleComplex *) ptr;
2112 dca += ld*(lo[1]-jloA) + lo[0]-iloA;
2113 for (i = 0; i < hi[0] - lo[0] + 1; i++)
2114 {
2115 (*dca).real += (*((DoubleComplex *) c)).real;
2116 (*dca).imag += (*((DoubleComplex *) c)).imag;
2117 dca += ld + 1;
2118 }
2119 break;
2120
2121 case C_SCPL:
2122 fca = (SingleComplex *) ptr;
2123 fca += ld*(lo[1]-jloA) + lo[0]-iloA;
2124 for (i = 0; i < hi[0] - lo[0] + 1; i++)
2125 {
2126 (*fca).real += (*((SingleComplex *) c)).real;
2127 (*fca).imag += (*((SingleComplex *) c)).imag;
2128 fca += ld + 1;
2129 }
2130 break;
2131
2132 default:
2133 pnga_error("ga_shift_diagonal_: wrong data type:", type);
2134 }
2135 }
2136 }
2137 }
2138
2139 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2140 # pragma weak wnga_shift_diagonal = pnga_shift_diagonal
2141 #endif
pnga_shift_diagonal(Integer g_a,void * c)2142 void pnga_shift_diagonal(Integer g_a, void *c)
2143 {
2144 Integer loA[2], hiA[2]/*, dim1, dim2*/, ld;
2145 Integer andim, adims[2], type, atype;
2146 Integer me = pnga_nodeid (), i, nproc = pnga_nnodes();
2147 char *ptr;
2148 Integer num_blocks_a;
2149 int local_sync_begin,local_sync_end;
2150 _iterator_hdl hdl;
2151
2152 local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2153 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2154 if(local_sync_begin)pnga_sync();
2155
2156 pnga_check_handle (g_a, "ga_shift_diagonal_");
2157
2158 pnga_inquire (g_a, &atype, &andim, adims);
2159 /*dim1 = adims[0];*/
2160 /*dim2 = adims[1];*/
2161 type = atype;
2162 if (andim != 2)
2163 pnga_error("Dimension must be 2 for shift diagonal operation",andim);
2164
2165 #if 1
2166 pnga_local_iterator_init(g_a, &hdl);
2167 while (pnga_local_iterator_next(&hdl,loA,hiA,&ptr,&ld)) {
2168 sgai_shift_diagonal_block(g_a, ptr, loA, hiA, ld, c, type);
2169 }
2170 #else
2171 num_blocks_a = pnga_total_blocks(g_a);
2172
2173 if (num_blocks_a < 0) {
2174 pnga_distribution(g_a, me, loA, hiA);
2175 pnga_access_ptr(g_a, loA, hiA, &ptr, &ld);
2176 sgai_shift_diagonal_block(g_a, ptr, loA, hiA, ld, c, type);
2177 pnga_release_update(g_a, loA, hiA);
2178 } else {
2179 Integer idx;
2180 /* Simple block-cyclic data distribution */
2181 if (!pnga_uses_proc_grid(g_a)) {
2182 for (idx = me; idx < num_blocks_a; idx += nproc) {
2183 pnga_distribution(g_a, idx, loA, hiA);
2184 pnga_access_block_ptr(g_a, idx, &ptr, &ld);
2185 sgai_shift_diagonal_block(g_a, ptr, loA, hiA, ld, c, type);
2186 pnga_release_update_block(g_a, idx);
2187 }
2188 } else {
2189 /* Uses scalapack block-cyclic data distribution */
2190 Integer chk;
2191 Integer proc_index[MAXDIM], index[MAXDIM];
2192 Integer topology[MAXDIM];
2193 Integer blocks[MAXDIM], block_dims[MAXDIM];
2194 pnga_get_proc_index(g_a, me, proc_index);
2195 pnga_get_proc_index(g_a, me, index);
2196 pnga_get_block_info(g_a, blocks, block_dims);
2197 pnga_get_proc_grid(g_a, topology);
2198 while (index[andim-1] < blocks[andim-1]) {
2199 /* find bounding coordinates of block */
2200 chk = 1;
2201 for (i = 0; i < andim; i++) {
2202 loA[i] = index[i]*block_dims[i]+1;
2203 hiA[i] = (index[i] + 1)*block_dims[i];
2204 if (hiA[i] > adims[i]) hiA[i] = adims[i];
2205 if (hiA[i] < loA[i]) chk = 0;
2206 }
2207 if (chk) {
2208 pnga_access_block_grid_ptr(g_a, index, &ptr, &ld);
2209 sgai_shift_diagonal_block(g_a, ptr, loA, hiA, ld, c, type);
2210 pnga_release_update_block_grid(g_a, index);
2211 }
2212 /* increment index to get next block on processor */
2213 index[0] += topology[0];
2214 for (i = 0; i < andim; i++) {
2215 if (index[i] >= blocks[i] && i<andim-1) {
2216 index[i] = proc_index[i];
2217 index[i+1] += topology[i+1];
2218 }
2219 }
2220 }
2221 }
2222 }
2223 #endif
2224 if(local_sync_end)pnga_sync();
2225 }
2226
sgai_zero_diagonal_block(Integer g_a,void * ptr,Integer * lo,Integer * hi,Integer ld,Integer offset,Integer type)2227 static void sgai_zero_diagonal_block(Integer g_a, void *ptr, Integer *lo, Integer *hi,
2228 Integer ld, Integer offset, Integer type)
2229 {
2230 Integer i;
2231 int *ia;
2232 float *fa;
2233 double *da;
2234 long *la;
2235 DoubleComplex *dca;
2236 SingleComplex *fca;
2237
2238 switch (type) {
2239 case C_INT:
2240 ia = (int *) ptr + offset;
2241 for (i = 0; i < hi[0] - lo[0] + 1; i++) {
2242 *ia = 0;
2243 ia += ld + 1;
2244 }
2245 break;
2246 case C_LONG:
2247 la = (long *) ptr + offset;
2248 for (i = 0; i < hi[0] - lo[0] + 1; i++) {
2249 *la = 0;
2250 la += ld + 1;
2251 }
2252 break;
2253 case C_FLOAT:
2254 fa = (float *) ptr + offset;
2255 for (i = 0; i < hi[0] - lo[0] + 1; i++) {
2256 *fa = 0.0;
2257 fa += ld + 1;
2258 }
2259 break;
2260 case C_DBL:
2261 da = (double *) ptr + offset;
2262 for (i = 0; i < hi[0] - lo[0] + 1; i++) {
2263 *da = 0.0;
2264 da += ld + 1;
2265 }
2266 break;
2267 case C_DCPL:
2268 dca = (DoubleComplex *) ptr + offset;
2269 for (i = 0; i < hi[0] - lo[0] + 1; i++) {
2270 (*dca).real = 0.0;
2271 (*dca).imag = 0.0;
2272 dca += ld + 1;
2273 }
2274 break;
2275 case C_SCPL:
2276 fca = (SingleComplex *) ptr + offset;
2277 for (i = 0; i < hi[0] - lo[0] + 1; i++) {
2278 (*fca).real = 0.0;
2279 (*fca).imag = 0.0;
2280 fca += ld + 1;
2281 }
2282 break;
2283 default:
2284 pnga_error("set_diagonal_zero: wrong data type:", type);
2285 }
2286 }
2287
2288 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2289 # pragma weak wnga_zero_diagonal = pnga_zero_diagonal
2290 #endif
pnga_zero_diagonal(Integer g_a)2291 void pnga_zero_diagonal(Integer g_a)
2292 {
2293 Integer /*dim1, dim2,*/ type;
2294 Integer ld, lo[2], hi[2], loA[2], hiA[2];
2295 Integer me = pnga_nodeid (), i, offset;
2296 char *ptr;
2297 Integer num_blocks_a;
2298 Integer atype, andim, adims[2];
2299 int local_sync_begin,local_sync_end;
2300 _iterator_hdl hdl;
2301
2302 local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2303 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2304 if(local_sync_begin)pnga_sync();
2305
2306
2307 pnga_inquire (g_a, &atype, &andim, adims);
2308 /*dim1 = adims[0];*/
2309 /*dim2 = adims[1];*/
2310 type = atype;
2311
2312 #if 1
2313 pnga_local_iterator_init(g_a, &hdl);
2314 while (pnga_local_iterator_next(&hdl,loA,hiA,&ptr,&ld)) {
2315 offset = 0;
2316 if (loA[0] > 0) {
2317 lo[0] = GA_MAX (loA[0], loA[1]);
2318 lo[1] = GA_MAX (loA[0], loA[1]);
2319 hi[0] = GA_MIN (hiA[0], hiA[1]);
2320 hi[1] = GA_MIN (hiA[0], hiA[1]);
2321 if (hi[0] >= lo[0]) {
2322 sgai_zero_diagonal_block(g_a, ptr, lo, hi, ld, offset, type);
2323 }
2324 }
2325 }
2326 #else
2327 num_blocks_a = pnga_total_blocks(g_a);
2328
2329 if (num_blocks_a < 0) {
2330 offset = 0;
2331 pnga_distribution (g_a, me, loA, hiA);
2332 /* determine subset of my patch to access */
2333 if (loA[0] > 0) {
2334 lo[0] = GA_MAX (loA[0], loA[1]);
2335 lo[1] = GA_MAX (loA[0], loA[1]);
2336 hi[0] = GA_MIN (hiA[0], hiA[1]);
2337 hi[1] = GA_MIN (hiA[0], hiA[1]);
2338 if (hi[0] >= lo[0]) {
2339 /* we got a block containing diagonal elements */
2340 pnga_access_ptr (g_a, lo, hi, &ptr, &ld);
2341 sgai_zero_diagonal_block(g_a, ptr, lo, hi, ld, offset, type);
2342 /* release access to the data */
2343 pnga_release_update (g_a, lo, hi);
2344 }
2345 }
2346 } else {
2347 Integer idx, lld[MAXDIM], offset;
2348 Integer jtot, last, j;
2349 Integer nproc = pnga_nnodes();
2350 /* Simple block-cyclic data distribution */
2351 if (!pnga_uses_proc_grid(g_a)) {
2352 for (idx = me; idx < num_blocks_a; idx += nproc) {
2353 pnga_distribution(g_a, idx, loA, hiA);
2354 lo[0] = GA_MAX (loA[0], loA[1]);
2355 lo[1] = GA_MAX (loA[0], loA[1]);
2356 hi[0] = GA_MIN (hiA[0], hiA[1]);
2357 hi[1] = GA_MIN (hiA[0], hiA[1]);
2358
2359 if (hi[0] >= lo[0]) {
2360 pnga_access_block_ptr(g_a, idx, &ptr, lld);
2361 /* evaluate offsets for system */
2362 offset = 0;
2363 last = andim - 1;
2364 jtot = 1;
2365 for (j=0; j<last; j++) {
2366 offset += (lo[j] - loA[j])*jtot;
2367 jtot *= lld[j];
2368 }
2369 offset += (lo[last]-loA[last])*jtot;
2370 sgai_zero_diagonal_block(g_a, ptr, lo, hi, lld[0], offset, type);
2371 pnga_release_update_block(g_a, idx);
2372 }
2373 }
2374 } else {
2375 /* Uses scalapack block-cyclic data distribution */
2376 Integer lld[MAXDIM]/*, chk*/;
2377 Integer proc_index[MAXDIM], index[MAXDIM];
2378 Integer topology[MAXDIM];
2379 Integer blocks[MAXDIM], block_dims[MAXDIM];
2380 pnga_get_proc_index(g_a, me, proc_index);
2381 pnga_get_proc_index(g_a, me, index);
2382 pnga_get_block_info(g_a, blocks, block_dims);
2383 pnga_get_proc_grid(g_a, topology);
2384
2385 while (index[andim-1] < blocks[andim-1]) {
2386 /* find bounding coordinates of block */
2387 /*chk = 1;*/
2388 for (i = 0; i < andim; i++) {
2389 loA[i] = index[i]*block_dims[i]+1;
2390 hiA[i] = (index[i] + 1)*block_dims[i];
2391 if (hiA[i] > adims[i]) hiA[i] = adims[i];
2392 /*if (hiA[i] < loA[i]) chk = 0;*/
2393 }
2394 lo[0] = GA_MAX (loA[0], loA[1]);
2395 lo[1] = GA_MAX (loA[0], loA[1]);
2396 hi[0] = GA_MIN (hiA[0], hiA[1]);
2397 hi[1] = GA_MIN (hiA[0], hiA[1]);
2398
2399 if (hi[0] >= lo[0]) {
2400 pnga_access_block_grid_ptr(g_a, index, &ptr, lld);
2401 /* evaluate offsets for system */
2402 offset = 0;
2403 last = andim - 1;
2404 jtot = 1;
2405 for (j=0; j<last; j++) {
2406 offset += (lo[j] - loA[j])*jtot;
2407 jtot *= lld[j];
2408 }
2409 offset += (lo[last]-loA[last])*jtot;
2410 sgai_zero_diagonal_block(g_a, ptr, lo, hi, lld[0], offset, type);
2411 pnga_release_update_block_grid(g_a, index);
2412 }
2413
2414 /* increment index to get next block on processor */
2415 index[0] += topology[0];
2416 for (i = 0; i < andim; i++) {
2417 if (index[i] >= blocks[i] && i<andim-1) {
2418 index[i] = proc_index[i];
2419 index[i+1] += topology[i+1];
2420 }
2421 }
2422 }
2423 }
2424 }
2425 #endif
2426 if(local_sync_end)pnga_sync();
2427 }
2428
sgai_scale_row_values(Integer type,Integer * lo,Integer * hi,Integer ld,void * ptr,Integer g_v)2429 static void sgai_scale_row_values(Integer type, Integer *lo,
2430 Integer *hi, Integer ld, void *ptr, Integer g_v)
2431 {
2432 Integer vlo, vhi, size;
2433 int *ia;
2434 float *fa;
2435 double *da;
2436 long *la;
2437 DoubleComplex *dca;
2438 SingleComplex *fca;
2439 void *buf;
2440
2441 /* determine subset of my patch to access */
2442 if (lo[0] > 0) {
2443
2444 if (hi[0] >= lo[0]) { /*make sure the equality symbol is there!!! */
2445 /* we got a block containing diagonal elements */
2446
2447 Integer myrows = hi[0] - lo[0] + 1;
2448 Integer i, j;
2449 /*number of rows on the patch is jhiA - jloA + 1 */
2450 vlo =lo[0] ;
2451 vhi = hi[0];
2452
2453 /*allocate a buffer for the given vector g_v */
2454 size = GAsizeof (type);
2455
2456 buf = malloc (myrows * size);
2457 if (buf == NULL)
2458 pnga_error
2459 ("ga_scale_rows_:failed to allocate memory for the local buffer.",
2460 0);
2461
2462 /* get the vector from the global array to the local memory buffer */
2463 pnga_get (g_v, &vlo, &vhi, buf, &vhi);
2464
2465 switch (type) {
2466 case C_INT:
2467 ia = (int *) ptr;
2468 for (i = 0; i < hi[0]-lo[0]+1; i++) /*for each row */
2469 for(j=0;j<hi[1]-lo[1]+1;j++) /*for each column*/
2470 ia[j*ld+i] *= ((int *) buf)[i];
2471 break;
2472 case C_LONG:
2473 la = (long *) ptr;
2474 for (i = 0; i < hi[0]-lo[0]+1; i++)
2475 for(j=0;j<hi[1]-lo[1]+1;j++)
2476 la[j*ld+i] *= ((long *) buf)[i];
2477 break;
2478 case C_FLOAT:
2479 fa = (float *) ptr;
2480 for (i = 0; i < hi[0]-lo[0]+1; i++)
2481 for(j=0;j<hi[1]-lo[1]+1;j++)
2482 fa[j*ld+i] *= ((float *) buf)[i];
2483 break;
2484 case C_DBL:
2485 da = (double *) ptr;
2486 for (i = 0; i < hi[0]-lo[0]+1; i++)
2487 for(j=0;j<hi[1]-lo[1]+1;j++)
2488 da[j*ld+i] *= ((double *) buf)[i];
2489 break;
2490 case C_DCPL:
2491 dca = (DoubleComplex *) ptr;
2492 for (i = 0; i < hi[0]-lo[0]+1; i++)
2493 for(j=0;j<hi[1]-lo[1]+1;j++) {
2494 (dca[j*ld+i]).real *= (((DoubleComplex *) buf)[i]).real;
2495 (dca[j*ld+i]).imag *= (((DoubleComplex *) buf)[i]).imag;
2496 }
2497 break;
2498 case C_SCPL:
2499 fca = (SingleComplex *) ptr;
2500 for (i = 0; i < hi[0]-lo[0]+1; i++)
2501 for(j=0;j<hi[1]-lo[1]+1;j++) {
2502 (fca[j*ld+i]).real *= (((SingleComplex *) buf)[i]).real;
2503 (fca[j*ld+i]).imag *= (((SingleComplex *) buf)[i]).imag;
2504 }
2505 break;
2506 default:
2507 pnga_error("ga_scale_rows_: wrong data type:", type);
2508 }
2509
2510 /*free the memory */
2511 free (buf);
2512 }
2513 }
2514 }
2515
2516 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2517 # pragma weak wnga_scale_rows = pnga_scale_rows
2518 #endif
pnga_scale_rows(Integer g_a,Integer g_v)2519 void pnga_scale_rows(Integer g_a, Integer g_v)
2520 {
2521 Integer vndim, vdims, dim1/*, dim2*/, vtype, atype, type;
2522 Integer ld, lo[2], hi[2];
2523 Integer me = pnga_nodeid (), i, chk;
2524 char *ptr;
2525 Integer andim, adims[2];
2526 Integer num_blocks_a;
2527 int local_sync_begin,local_sync_end;
2528 _iterator_hdl hdl;
2529
2530 local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2531 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2532 if(local_sync_begin)pnga_sync();
2533
2534 pnga_check_handle (g_a, "ga_scale_rows_");
2535 pnga_check_handle (g_v, "ga_scale_rows_");
2536
2537 pnga_inquire (g_a, &atype, &andim, adims);
2538 dim1 = adims[0];
2539 /*dim2 = adims[1];*/
2540 type = atype;
2541 pnga_inquire (g_v, &vtype, &vndim, &vdims);
2542
2543 /* Perform some error checking */
2544 if (andim != 2)
2545 pnga_error("ga_scale_rows_: wrong dimension for g_a.", andim);
2546 if (vndim != 1)
2547 pnga_error("ga_scale_rows_: wrong dimension for g_v.", vndim);
2548
2549 /*in internal functions, dim1 = number of rows of the matrix g_a*/
2550 /*in internal functions, dim2 = number of columns of the matrix g_a*/
2551 if (vdims != dim1)
2552 pnga_error
2553 ("ga_scale_rows_: The size of the scalar array is not the same as the number of the rows of g_a.",
2554 vdims);
2555
2556 if (vtype != atype)
2557 {
2558 pnga_error
2559 ("ga_scale_rows_: input global arrays do not have the same data type. Global array type =",
2560 atype);
2561 }
2562
2563 #if 1
2564 pnga_local_iterator_init(g_a, &hdl);
2565 while (pnga_local_iterator_next(&hdl,lo,hi,&ptr,&ld)) {
2566 sgai_scale_row_values(type, lo, hi, ld, ptr, g_v);
2567 }
2568 #else
2569 num_blocks_a = pnga_total_blocks(g_a);
2570
2571 if (num_blocks_a < 0) {
2572 pnga_distribution (g_a, me, lo, hi);
2573
2574 chk = 1;
2575 for (i=0; i<andim; i++) {
2576 if (lo[i]>hi[i]) chk = 0;
2577 }
2578 if (chk) {
2579 pnga_access_ptr (g_a, lo, hi, &ptr, &ld);
2580
2581 sgai_scale_row_values(type, lo, hi, ld, ptr, g_v);
2582
2583 /* release access to the data */
2584 pnga_release_update (g_a, lo, hi);
2585 }
2586 } else {
2587 Integer nproc = pnga_nnodes();
2588 /* Simple block-cyclic data distribution */
2589 if (!pnga_uses_proc_grid(g_a)) {
2590 Integer idx;
2591 for (idx=me; idx<num_blocks_a; idx += nproc) {
2592 pnga_distribution(g_a, idx, lo, hi);
2593 pnga_access_block_ptr(g_a, idx, &ptr, &ld);
2594 sgai_scale_row_values(type, lo, hi, ld, ptr, g_v);
2595 pnga_release_update_block(g_a, idx);
2596 }
2597 } else {
2598 Integer proc_index[MAXDIM], index[MAXDIM];
2599 Integer topology[MAXDIM];
2600 Integer blocks[MAXDIM], block_dims[MAXDIM];
2601 pnga_get_proc_index(g_a, me, proc_index);
2602 pnga_get_proc_index(g_a, me, index);
2603 pnga_get_block_info(g_a, blocks, block_dims);
2604 pnga_get_proc_grid(g_a, topology);
2605
2606 while (index[andim-1] < blocks[andim-1]) {
2607 /* find bounding coordinates of block */
2608 chk = 1;
2609 for (i = 0; i < andim; i++) {
2610 lo[i] = index[i]*block_dims[i]+1;
2611 hi[i] = (index[i] + 1)*block_dims[i];
2612 if (hi[i] > adims[i]) hi[i] = adims[i];
2613 if (hi[i] < lo[i]) chk = 0;
2614 }
2615 if (chk) {
2616 pnga_access_block_grid_ptr(g_a, index, &ptr, &ld);
2617 sgai_scale_row_values(type, lo, hi, ld, ptr, g_v);
2618 pnga_release_update_block_grid(g_a, index);
2619 }
2620 /* increment index to get next block on processor */
2621 index[0] += topology[0];
2622 for (i = 0; i < andim; i++) {
2623 if (index[i] >= blocks[i] && i<andim-1) {
2624 index[i] = proc_index[i];
2625 index[i+1] += topology[i+1];
2626 }
2627 }
2628 }
2629 }
2630 }
2631 #endif
2632 if(local_sync_end)pnga_sync();
2633 }
2634
sgai_scale_col_values(Integer type,Integer * lo,Integer * hi,Integer ld,void * ptr,Integer g_v)2635 static void sgai_scale_col_values(Integer type, Integer *lo,
2636 Integer *hi, Integer ld, void *ptr, Integer g_v)
2637 {
2638 Integer vlo, vhi, size;
2639 void *buf;
2640 int *ia;
2641 float *fa;
2642 double *da;
2643 long *la;
2644 DoubleComplex *dca;
2645 SingleComplex *fca;
2646
2647 /* determine subset of my patch to access */
2648 if (lo[1] > 0) {
2649 if (hi[0] >= lo[0]) { /*make sure the equality symbol is there!!! */
2650 /* we got a block containing diagonal elements*/
2651
2652 Integer mycols = hi[1] - lo[1] + 1;
2653 Integer i, j;
2654 /*number of rows on the patch is jhiA - jloA + 1 */
2655 vlo =lo[1] ;
2656 vhi = hi[1];
2657
2658 /*allocate a buffer for the given vector g_v */
2659 size = GAsizeof (type);
2660
2661 buf = malloc (mycols * size);
2662 if (buf == NULL)
2663 pnga_error
2664 ("ga_scale_cols_:failed to allocate memory for the local buffer.",
2665 0);
2666
2667 /* get the vector from the global array to the local memory buffer */
2668 pnga_get (g_v, &vlo, &vhi, buf, &vhi);
2669
2670 switch (type) {
2671 case C_INT:
2672 ia = (int *) ptr;
2673 for(j=0;j<hi[1]-lo[1]+1;j++) /*for each column*/
2674 for (i = 0; i < hi[0]-lo[0]+1; i++) /*for each row */
2675 ia[j*ld+i] *= ((int *) buf)[j];
2676 break;
2677 case C_LONG:
2678 la = (long *) ptr;
2679 for(j=0;j<hi[1]-lo[1]+1;j++)
2680 for (i = 0; i < hi[0]-lo[0]+1; i++)
2681 la[j*ld+i] *= ((long *) buf)[j];
2682 break;
2683 case C_FLOAT:
2684 fa = (float *) ptr;
2685 for(j=0;j<hi[1]-lo[1]+1;j++)
2686 for (i = 0; i < hi[0]-lo[0]+1; i++)
2687 fa[j*ld+i] *= ((float *) buf)[j];
2688 break;
2689 case C_DBL:
2690 da = (double *) ptr;
2691 for(j=0;j<hi[1]-lo[1]+1;j++)
2692 for (i = 0; i < hi[0]-lo[0]+1; i++)
2693 da[j*ld+i] *= ((double *) buf)[j];
2694 break;
2695 case C_DCPL:
2696 dca = (DoubleComplex *) ptr;
2697 for(j=0;j<hi[1]-lo[1]+1;j++)
2698 for (i = 0; i < hi[0]-lo[0]+1; i++)
2699 {
2700 (dca[j*ld+i]).real *= (((DoubleComplex *) buf)[j]).real;
2701 (dca[j*ld+i]).imag *= (((DoubleComplex *) buf)[j]).imag;
2702 }
2703 break;
2704 case C_SCPL:
2705 fca = (SingleComplex *) ptr;
2706 for(j=0;j<hi[1]-lo[1]+1;j++)
2707 for (i = 0; i < hi[0]-lo[0]+1; i++)
2708 {
2709 (fca[j*ld+i]).real *= (((SingleComplex *) buf)[j]).real;
2710 (fca[j*ld+i]).imag *= (((SingleComplex *) buf)[j]).imag;
2711 }
2712 break;
2713 default:
2714 pnga_error("ga_scale_cols_: wrong data type:", type);
2715 }
2716 /*free the memory */
2717 free (buf);
2718 }
2719 }
2720 }
2721
2722 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2723 # pragma weak wnga_scale_cols = pnga_scale_cols
2724 #endif
pnga_scale_cols(Integer g_a,Integer g_v)2725 void pnga_scale_cols(Integer g_a, Integer g_v)
2726 {
2727 Integer vndim, vdims/*, dim1*/, dim2, vtype, atype, type;
2728 Integer ld, lo[2], hi[2];
2729 Integer me = pnga_nodeid (), i;
2730 char *ptr;
2731 Integer andim, adims[2];
2732 int local_sync_begin,local_sync_end;
2733 Integer num_blocks_a;
2734 Integer chk;
2735 _iterator_hdl hdl;
2736
2737 local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2738 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2739 if(local_sync_begin)pnga_sync();
2740
2741 pnga_check_handle (g_a, "ga_scale_cols_");
2742 pnga_check_handle (g_v, "ga_scale_cols_");
2743
2744 pnga_inquire(g_a, &atype, &andim, adims);
2745 /*dim1 = adims[0];*/
2746 dim2 = adims[1];
2747 type = atype;
2748 pnga_inquire(g_v, &vtype, &vndim, &vdims);
2749
2750 /* Perform some error checking */
2751 if (andim != 2)
2752 pnga_error("ga_scale_cols_: wrong dimension for g_a.", andim);
2753 if (vndim != 1)
2754 pnga_error("ga_scale_cols_: wrong dimension for g_v.", vndim);
2755
2756 /*in internal functions, dim1 = number of rows of the matrix g_a*/
2757 /*in internal functions, dim2 = number of columns of the matrix g_a*/
2758 if (vdims != dim2)
2759 pnga_error
2760 ("ga_scale_cols_: The size of the scalar array is not the same as the number of the rows of g_a.",
2761 vdims);
2762
2763 if (vtype != atype)
2764 {
2765 pnga_error
2766 ("ga_scale_cols_: input global arrays do not have the same data type. Global array type =",
2767 atype);
2768 }
2769
2770 #if 1
2771 pnga_local_iterator_init(g_a, &hdl);
2772 while (pnga_local_iterator_next(&hdl,lo,hi,&ptr,&ld)) {
2773 sgai_scale_col_values(type, lo, hi, ld, ptr, g_v);
2774 }
2775 #else
2776 num_blocks_a = pnga_total_blocks(g_a);
2777
2778 if (num_blocks_a < 0) {
2779 pnga_distribution (g_a, me, lo, hi);
2780
2781 chk = 1;
2782 for (i=0; i<andim; i++) {
2783 if (lo[i]>hi[i]) chk = 0;
2784 }
2785 if (chk) {
2786 pnga_access_ptr (g_a, lo, hi, &ptr, &ld);
2787
2788 sgai_scale_col_values(type, lo, hi, ld, ptr, g_v);
2789
2790 /* release access to the data */
2791 pnga_release_update (g_a, lo, hi);
2792 }
2793 } else {
2794 Integer nproc = pnga_nnodes();
2795 /* Simple block-cyclic data distribution */
2796 if (!pnga_uses_proc_grid(g_a)) {
2797 Integer idx;
2798 for (idx=me; idx<num_blocks_a; idx += nproc) {
2799 pnga_distribution(g_a, idx, lo, hi);
2800 pnga_access_block_ptr(g_a, idx, &ptr, &ld);
2801 sgai_scale_col_values(type, lo, hi, ld, ptr, g_v);
2802 pnga_release_update_block(g_a, idx);
2803 }
2804 } else {
2805 Integer proc_index[MAXDIM], index[MAXDIM];
2806 Integer topology[MAXDIM];
2807 Integer blocks[MAXDIM], block_dims[MAXDIM];
2808 pnga_get_proc_index(g_a, me, proc_index);
2809 pnga_get_proc_index(g_a, me, index);
2810 pnga_get_block_info(g_a, blocks, block_dims);
2811 pnga_get_proc_grid(g_a, topology);
2812
2813 while (index[andim-1] < blocks[andim-1]) {
2814 /* find bounding coordinates of block */
2815 chk = 1;
2816 for (i = 0; i < andim; i++) {
2817 lo[i] = index[i]*block_dims[i]+1;
2818 hi[i] = (index[i] + 1)*block_dims[i];
2819 if (hi[i] > adims[i]) hi[i] = adims[i];
2820 if (hi[i] < lo[i]) chk = 0;
2821 }
2822 if (chk) {
2823 pnga_access_block_grid_ptr(g_a, index, &ptr, &ld);
2824 sgai_scale_col_values(type, lo, hi, ld, ptr, g_v);
2825 pnga_release_update_block_grid(g_a, index);
2826 }
2827 /* increment index to get next block on processor */
2828 index[0] += topology[0];
2829 for (i = 0; i < andim; i++) {
2830 if (index[i] >= blocks[i] && i<andim-1) {
2831 index[i] = proc_index[i];
2832 index[i+1] += topology[i+1];
2833 }
2834 }
2835 }
2836 }
2837 }
2838 #endif
2839 if(local_sync_end)pnga_sync();
2840 }
2841