1 #if HAVE_CONFIG_H
2 # include "config.h"
3 #endif
4
5 /*
6 * module: global.npatch.c
7 * author: Jialin Ju
8 * description: Implements the n-dimensional patch operations:
9 * - fill patch
10 * - copy patch
11 * - scale patch
12 * - dot patch
13 * - add patch
14 *
15 * DISCLAIMER
16 *
17 * This material was prepared as an account of work sponsored by an
18 * agency of the United States Government. Neither the United States
19 * Government nor the United States Department of Energy, nor Battelle,
20 * nor any of their employees, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
21 * ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
22 * COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
23 * SOFTWARE, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT
24 * INFRINGE PRIVATELY OWNED RIGHTS.
25 *
26 *
27 * ACKNOWLEDGMENT
28 *
29 * This software and its documentation were produced with United States
30 * Government support under Contract Number DE-AC06-76RLO-1830 awarded by
31 * the United States Department of Energy. The United States Government
32 * retains a paid-up non-exclusive, irrevocable worldwide license to
33 * reproduce, prepare derivative works, perform publicly and display
34 * publicly by or for the US Government, including the right to
35 * distribute to other US Government contractors.
36 */
37
38 #if HAVE_MATH_H
39 # include <math.h>
40 #endif
41
42 #include "message.h"
43 #include "globalp.h"
44 #include "armci.h"
45 #include "ga_iterator.h"
46 #include "ga-papi.h"
47 #include "ga-wapi.h"
48
49 #ifdef MSG_COMMS_MPI
50 extern ARMCI_Group* ga_get_armci_group_(int);
51 #endif
52
53 /**********************************************************
54 * n-dimensional utilities *
55 **********************************************************/
56
57 /*\ compute Index from subscript and convert it back to subscript
58 * in another array
59 \*/
/**
 * Map a subscript in a source patch to the corresponding subscript in a
 * destination patch that holds the same elements in column-major
 * (Fortran) order.
 *
 * ndims : rank of the source patch
 * los   : subscript within the source patch to convert
 * blos  : lower bounds of the source patch
 * dimss : extents of the source patch
 * ndimd : rank of the destination patch
 * lod   : [out] converted subscript within the destination patch
 * blod  : lower bounds of the destination patch
 * dimsd : extents of the destination patch
 */
static void snga_dest_indices(Integer ndims, Integer *los, Integer *blos,
                              Integer *dimss, Integer ndimd, Integer *lod,
                              Integer *blod, Integer *dimsd)
{
  Integer idx = 0, i, factor = 1;

  /* linearize the source subscript (column-major order) */
  for (i = 0; i < ndims; i++) {
    idx += (los[i] - blos[i]) * factor;
    factor *= dimss[i];
  }

  /* de-linearize into the destination patch.  The destination has ndimd
     dimensions, so the loop must run over ndimd: the previous version
     iterated over the source rank ndims, which reads past dimsd/blod when
     ndimd < ndims and leaves trailing destination indices unset when
     ndimd > ndims. */
  for (i = 0; i < ndimd; i++) {
    lod[i] = idx % dimsd[i] + blod[i];
    idx /= dimsd[i];
  }
}
75
76
77 /* check if I own data in the patch */
pnga_patch_intersect(Integer * lo,Integer * hi,Integer * lop,Integer * hip,Integer ndim)78 logical pnga_patch_intersect(Integer *lo, Integer *hi,
79 Integer *lop, Integer *hip, Integer ndim)
80 {
81 Integer i;
82
83 /* check consistency of patch coordinates */
84 for(i=0; i<ndim; i++) {
85 if(hi[i] < lo[i]) return FALSE; /* inconsistent */
86 if(hip[i] < lop[i]) return FALSE; /* inconsistent */
87 }
88
89 /* find the intersection and update (ilop: ihip, jlop: jhip) */
90 for(i=0; i<ndim; i++) {
91 if(hi[i] < lop[i]) return FALSE; /* don't intersect */
92 if(hip[i] < lo[i]) return FALSE; /* don't intersect */
93 }
94
95 for(i=0; i<ndim; i++) {
96 lop[i] = GA_MAX(lo[i], lop[i]);
97 hip[i] = GA_MIN(hi[i], hip[i]);
98 }
99
100 return TRUE;
101 }
102
103
104 /*\ check if patches are identical
105 \*/
pnga_comp_patch(Integer andim,Integer * alo,Integer * ahi,Integer bndim,Integer * blo,Integer * bhi)106 logical pnga_comp_patch(Integer andim, Integer *alo, Integer *ahi,
107 Integer bndim, Integer *blo, Integer *bhi)
108 {
109 Integer i;
110 Integer ndim;
111
112 if(andim > bndim) {
113 ndim = bndim;
114 for(i=ndim; i<andim; i++)
115 if(alo[i] != ahi[i]) return FALSE;
116 }
117 else if(andim < bndim) {
118 ndim = andim;
119 for(i=ndim; i<bndim; i++)
120 if(blo[i] != bhi[i]) return FALSE;
121 }
122 else ndim = andim;
123
124 for(i=0; i<ndim; i++)
125 if((alo[i] != blo[i]) || (ahi[i] != bhi[i])) return FALSE;
126
127 return TRUE;
128 }
129
130
131 /* test two GAs to see if they have the same shape */
snga_test_shape(Integer * alo,Integer * ahi,Integer * blo,Integer * bhi,Integer andim,Integer bndim)132 static logical snga_test_shape(Integer *alo, Integer *ahi, Integer *blo,
133 Integer *bhi, Integer andim, Integer bndim)
134 {
135 Integer i;
136
137 if(andim != bndim) return FALSE;
138
139 for(i=0; i<andim; i++)
140 if((ahi[i] - alo[i]) != (bhi[i] - blo[i])) return FALSE;
141
142 return TRUE;
143 }
144
145
146 /**********************************************************
147 * n-dimensional functions *
148 **********************************************************/
149
150
151 /*\ COPY A PATCH AND POSSIBLY RESHAPE
152 *
153 * . the element capacities of two patches must be identical
154 * . copy by column order - Fortran convention
155 \*/
156 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
157 # pragma weak wnga_copy_patch = pnga_copy_patch
158 #endif
/**
 * Copy patch [alo:ahi] of g_a into patch [blo:bhi] of g_b, possibly
 * reshaping the data and (for regularly distributed arrays only)
 * transposing it.
 *
 * trans       : "n"/"N" for a straight copy; any other value requests a
 *               transpose (rejected for block-cyclic arrays)
 * g_a,alo,ahi : source array handle and inclusive patch bounds
 * g_b,blo,bhi : destination array handle and inclusive patch bounds
 *
 * The two patches must hold the same number of elements; elements are
 * matched up in column-major (Fortran) order.  The transfer direction is
 * chosen so the smaller process group does the communication: when the
 * group of g_a is no larger than that of g_b each process puts its local
 * piece of g_a into g_b, otherwise each process gets from g_a into its
 * local piece of g_b.  Collective within the participating group(s).
 */
void pnga_copy_patch(char *trans,
                     Integer g_a, Integer *alo, Integer *ahi,
                     Integer g_b, Integer *blo, Integer *bhi)
{
  Integer i, j;
  Integer idx, factor;
  Integer atype, btype, andim, adims[MAXDIM], bndim, bdims[MAXDIM];
  Integer nelem;                       /* elements in my local piece       */
  Integer atotal, btotal;              /* total elements in each patch     */
  Integer los[MAXDIM], his[MAXDIM];    /* bounds of locally held region    */
  Integer lod[MAXDIM], hid[MAXDIM];    /* bounds mapped into the far array */
  Integer ld[MAXDIM], ald[MAXDIM], bld[MAXDIM]; /* strides / patch extents */
  char *src_data_ptr, *tmp_ptr;
  Integer *src_idx_ptr, *dst_idx_ptr;
  Integer bvalue[MAXDIM], bunit[MAXDIM];
  Integer factor_idx1[MAXDIM], factor_idx2[MAXDIM], factor_data[MAXDIM];
  Integer base;
  Integer me_a, me_b;
  Integer a_grp, b_grp, anproc, bnproc;
  Integer num_blocks_a, num_blocks_b/*, chk*/;
  int use_put, has_intersection;
  int local_sync_begin,local_sync_end;

  local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
  _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
  a_grp = pnga_get_pgroup(g_a);
  b_grp = pnga_get_pgroup(g_b);
  me_a = pnga_pgroup_nodeid(a_grp);
  me_b = pnga_pgroup_nodeid(b_grp);
  anproc = pnga_get_pgroup_size(a_grp);
  bnproc = pnga_get_pgroup_size(b_grp);
  /* let the smaller (or equal-size) group drive the transfer: put from the
     g_a side when its group is not larger, otherwise get from the g_b side */
  if (anproc <= bnproc) {
    use_put = 1;
  } else {
    use_put = 0;
  }

  /*if (a_grp != b_grp)
    pnga_error("All matrices must be on same group for pnga_copy_patch", 0L); */
  if(local_sync_begin) {
    /* synchronize on whichever group's processes will issue the transfers;
       fall back to a world sync when both handles live on the world group */
    if (anproc <= bnproc) {
      pnga_pgroup_sync(a_grp);
    } else if (a_grp == pnga_pgroup_get_world() &&
               b_grp == pnga_pgroup_get_world()) {
      pnga_sync();
    } else {
      pnga_pgroup_sync(b_grp);
    }
  }


  pnga_inquire(g_a, &atype, &andim, adims);
  pnga_inquire(g_b, &btype, &bndim, bdims);

  if(g_a == g_b) {
    /* they are the same patch */
    if(pnga_comp_patch(andim, alo, ahi, bndim, blo, bhi)) {
      return;
      /* they are in the same GA, but not the same patch */
    } else if (pnga_patch_intersect(alo, ahi, blo, bhi, andim)) {
      pnga_error("array patches cannot overlap ", 0L);
    }
  }

  if(atype != btype ) pnga_error("array type mismatch ", 0L);

  /* check if patch indices and dims match */
  for(i=0; i<andim; i++)
    if(alo[i] <= 0 || ahi[i] > adims[i])
      pnga_error("g_a indices out of range ", 0L);
  for(i=0; i<bndim; i++)
    if(blo[i] <= 0 || bhi[i] > bdims[i])
      pnga_error("g_b indices out of range ", 0L);

  /* check if numbers of elements in two patches match each other */
  atotal = 1; btotal = 1;
  for(i=0; i<andim; i++) atotal *= (ahi[i] - alo[i] + 1);
  for(i=0; i<bndim; i++) btotal *= (bhi[i] - blo[i] + 1);
  if(atotal != btotal)
    /* NOTE(review): message has a word-order typo ("capacities two of");
       left as-is since callers/tests may match on the string */
    pnga_error("capacities two of patches do not match ", 0L);

  /* additional restrictions that apply if one or both arrays use
     block-cyclic data distributions */
  num_blocks_a = pnga_total_blocks(g_a);
  num_blocks_b = pnga_total_blocks(g_b);
  if (num_blocks_a >= 0 || num_blocks_b >= 0) {
    if (!(*trans == 'n' || *trans == 'N')) {
      pnga_error("Transpose option not supported for block-cyclic data", 0L);
    }
    if (!snga_test_shape(alo, ahi, blo, bhi, andim, bndim)) {
      pnga_error("Change in shape not supported for block-cyclic data", 0L);
    }
  }

  /* ---- case 1: both arrays use regular (non-block-cyclic) distributions */
  if (num_blocks_a < 0 && num_blocks_b <0) {
    /* now find out cordinates of a patch of g_a that I own */
    if (use_put) {
      pnga_distribution(g_a, me_a, los, his);
    } else {
      pnga_distribution(g_b, me_b, los, his);
    }

    /* copy my share of data */
    if (use_put) {
      has_intersection = pnga_patch_intersect(alo, ahi, los, his, andim);
    } else {
      has_intersection = pnga_patch_intersect(blo, bhi, los, his, bndim);
    }
    if(has_intersection){
      /* direct access to my local region of the "source" side of the
         transfer (g_a when putting, g_b when getting) */
      if (use_put) {
        pnga_access_ptr(g_a, los, his, &src_data_ptr, ld);
      } else {
        pnga_access_ptr(g_b, los, his, &src_data_ptr, ld);
      }

      /* calculate the number of elements in the patch that I own */
      /* NOTE(review): loops over andim even in the get case, where los/his
         came from g_b and have bndim entries; harmless only when
         andim == bndim -- confirm for rank-changing copies */
      nelem = 1; for(i=0; i<andim; i++) nelem *= (his[i] - los[i] + 1);

      for(i=0; i<andim; i++) ald[i] = ahi[i] - alo[i] + 1;
      for(i=0; i<bndim; i++) bld[i] = bhi[i] - blo[i] + 1;

      /* base = column-major offset of element (los) in the local buffer,
         used later to rebase absolute linearized indices to the buffer */
      base = 0; factor = 1;
      for(i=0; i<andim; i++) {
        base += los[i] * factor;
        factor *= ld[i];
      }

      /*** straight copy possible if there's no reshaping or transpose ***/
      if((*trans == 'n' || *trans == 'N') &&
         snga_test_shape(alo, ahi, blo, bhi, andim, bndim)) {
        /* find source[lo:hi] --> destination[lo:hi] */
        if (use_put) {
          snga_dest_indices(andim, los, alo, ald, bndim, lod, blo, bld);
          snga_dest_indices(andim, his, alo, ald, bndim, hid, blo, bld);
          pnga_put(g_b, lod, hid, src_data_ptr, ld);
          pnga_release(g_a, los, his);
        } else {
          snga_dest_indices(bndim, los, blo, bld, andim, lod, alo, ald);
          snga_dest_indices(bndim, his, blo, bld, andim, hid, alo, ald);
          pnga_get(g_a, lod, hid, src_data_ptr, ld);
          pnga_release(g_b, los, his);
        }
        /*** due to generality of this transformation scatter is required ***/
      } else{
        if (use_put) {
          /* element-wise path: enumerate every local element, compute its
             destination subscript, pack values into tmp_ptr and scatter */
          tmp_ptr = ga_malloc(nelem, atype, "v");
          src_idx_ptr = (Integer*) ga_malloc((andim*nelem), MT_F_INT, "si");
          dst_idx_ptr = (Integer*) ga_malloc((bndim*nelem), MT_F_INT, "di");

          /* calculate the destination indices */

          /* given los and his, find indices for each elements
           * bvalue: starting index in each dimension
           * bunit: stride in each dimension
           */
          for (i=0; i<andim; i++) {
            bvalue[i] = los[i];
            if (i == 0) bunit[i] = 1;
            else bunit[i] = bunit[i-1] * (his[i-1] - los[i-1] + 1);
          }

          /* source indices: odometer-style walk over the local region in
             column-major order */
          for (i=0; i<nelem; i++) {
            for (j=0; j<andim; j++){
              src_idx_ptr[i*andim+j] = bvalue[j];
              /* if the next element is the first element in
               * one dimension, increment the index by 1
               */
              if (((i+1) % bunit[j]) == 0) bvalue[j]++;
              /* if the index becomes larger than the upper
               * bound in one dimension, reset it.
               */
              if(bvalue[j] > his[j]) bvalue[j] = los[j];
            }
          }

          /* index factor: reshaping without transpose */
          factor_idx1[0] = 1;
          for (j=1; j<andim; j++)
            factor_idx1[j] = factor_idx1[j-1] * ald[j-1];

          /* index factor: reshaping with transpose */
          factor_idx2[andim-1] = 1;
          for (j=(andim-1)-1; j>=0; j--)
            factor_idx2[j] = factor_idx2[j+1] * ald[j+1];

          /* data factor */
          factor_data[0] = 1;
          for (j=1; j<andim; j++)
            factor_data[j] = factor_data[j-1] * ld[j-1];

          /* destination indices */
          for(i=0; i<nelem; i++) {
            /* linearize the n-dimensional indices to one dimension */
            idx = 0;
            if (*trans == 'n' || *trans == 'N')
              for (j=0; j<andim; j++)
                idx += (src_idx_ptr[i*andim+j] - alo[j]) *
                  factor_idx1[j];
            else
              /* if the patch needs to be transposed, reverse
               * the indices: (i, j, ...) -> (..., j, i)
               */
              for (j=(andim-1); j>=0; j--)
                idx += (src_idx_ptr[i*andim+j] - alo[j]) *
                  factor_idx2[j];

            /* convert the one dimensional index to n-dimensional
             * indices of destination
             */
            for (j=0; j<bndim; j++) {
              dst_idx_ptr[i*bndim+j] = idx % bld[j] + blo[j];
              idx /= bld[j];
            }

            /* move the data block to create a new block */
            /* linearize the data indices */
            idx = 0;
            for (j=0; j<andim; j++)
              idx += (src_idx_ptr[i*andim+j]) * factor_data[j];

            /* adjust the position
             * base: starting address of the first element */
            idx -= base;

            /* move the element to the temporary location */
            switch(atype) {
              case C_DBL: ((double*)tmp_ptr)[i] =
                ((double*)src_data_ptr)[idx];
                break;
              case C_INT:
                ((int *)tmp_ptr)[i] = ((int *)src_data_ptr)[idx];
                break;
              case C_DCPL:((DoubleComplex *)tmp_ptr)[i] =
                ((DoubleComplex *)src_data_ptr)[idx];
                break;
              case C_SCPL:((SingleComplex *)tmp_ptr)[i] =
                ((SingleComplex *)src_data_ptr)[idx];
                break;
              case C_FLOAT: ((float *)tmp_ptr)[i] =
                ((float *)src_data_ptr)[idx];
                break;
              case C_LONG: ((long *)tmp_ptr)[i] =
                ((long *)src_data_ptr)[idx];
                break;
              case C_LONGLONG: ((long long *)tmp_ptr)[i] =
                ((long long *)src_data_ptr)[idx];
            }
          }
          pnga_release(g_a, los, his);
          pnga_scatter(g_b, tmp_ptr, dst_idx_ptr, 0, nelem);
          ga_free(dst_idx_ptr);
          ga_free(src_idx_ptr);
          ga_free(tmp_ptr);
        } else {
          /* mirror image of the scatter path: enumerate my local region of
             g_b, compute matching g_a subscripts and gather into tmp_ptr */
          tmp_ptr = ga_malloc(nelem, atype, "v");
          src_idx_ptr = (Integer*) ga_malloc((bndim*nelem), MT_F_INT, "si");
          dst_idx_ptr = (Integer*) ga_malloc((andim*nelem), MT_F_INT, "di");

          /* calculate the destination indices */

          /* given los and his, find indices for each elements
           * bvalue: starting index in each dimension
           * bunit: stride in each dimension
           */
          /* NOTE(review): los/his describe my g_b region here, yet the loop
             bound is andim -- only safe when andim == bndim; confirm */
          for (i=0; i<andim; i++) {
            bvalue[i] = los[i];
            if (i == 0) bunit[i] = 1;
            else bunit[i] = bunit[i-1] * (his[i-1] - los[i-1] + 1);
          }

          /* destination indices */
          for (i=0; i<nelem; i++) {
            for (j=0; j<bndim; j++){
              src_idx_ptr[i*bndim+j] = bvalue[j];
              /* if the next element is the first element in
               * one dimension, increment the index by 1
               */
              if (((i+1) % bunit[j]) == 0) bvalue[j]++;
              /* if the index becomes larger than the upper
               * bound in one dimension, reset it.
               */
              if(bvalue[j] > his[j]) bvalue[j] = los[j];
            }
          }

          /* index factor: reshaping without transpose */
          factor_idx1[0] = 1;
          for (j=1; j<bndim; j++)
            factor_idx1[j] = factor_idx1[j-1] * bld[j-1];

          /* index factor: reshaping with transpose */
          factor_idx2[bndim-1] = 1;
          for (j=(bndim-1)-1; j>=0; j--)
            factor_idx2[j] = factor_idx2[j+1] * bld[j+1];

          /* data factor */
          factor_data[0] = 1;
          for (j=1; j<bndim; j++)
            factor_data[j] = factor_data[j-1] * ld[j-1];

          /* destination indices */
          for(i=0; i<nelem; i++) {
            /* linearize the n-dimensional indices to one dimension */
            /* NOTE(review): these loops run over andim while the source
               indices are bndim-strided, and below dst_idx_ptr (allocated
               andim*nelem) is indexed with stride bndim -- correct only
               when andim == bndim; verify against upstream GA sources */
            idx = 0;
            if (*trans == 'n' || *trans == 'N')
              for (j=0; j<andim; j++)
                idx += (src_idx_ptr[i*bndim+j] - blo[j]) *
                  factor_idx1[j];
            else
              /* if the patch needs to be transposed, reverse
               * the indices: (i, j, ...) -> (..., j, i)
               */
              for (j=(andim-1); j>=0; j--)
                idx += (src_idx_ptr[i*bndim+j] - blo[j]) *
                  factor_idx2[j];

            /* convert the one dimensional index to n-dimensional
             * indices of destination
             */
            for (j=0; j<andim; j++) {
              dst_idx_ptr[i*bndim+j] = idx % ald[j] + alo[j];
              idx /= ald[j];
            }

            /* move the data block to create a new block */
            /* linearize the data indices */
            idx = 0;
            for (j=0; j<bndim; j++)
              idx += (src_idx_ptr[i*bndim+j]) * factor_data[j];

            /* adjust the position
             * base: starting address of the first element */
            idx -= base;

            /* move the element to the temporary location */
            switch(atype) {
              case C_DBL: ((double*)tmp_ptr)[i] =
                ((double*)src_data_ptr)[idx];
                break;
              case C_INT:
                ((int *)tmp_ptr)[i] = ((int *)src_data_ptr)[idx];
                break;
              case C_DCPL:((DoubleComplex *)tmp_ptr)[i] =
                ((DoubleComplex *)src_data_ptr)[idx];
                break;
              case C_SCPL:((SingleComplex *)tmp_ptr)[i] =
                ((SingleComplex *)src_data_ptr)[idx];
                break;
              case C_FLOAT: ((float *)tmp_ptr)[i] =
                ((float *)src_data_ptr)[idx];
                break;
              case C_LONG: ((long *)tmp_ptr)[i] =
                ((long *)src_data_ptr)[idx];
                break;
              case C_LONGLONG: ((long long *)tmp_ptr)[i] =
                ((long long *)src_data_ptr)[idx];
            }
          }
          pnga_release(g_b, los, his);
          pnga_gather(g_a, tmp_ptr, dst_idx_ptr, 0, nelem);
          ga_free(dst_idx_ptr);
          ga_free(src_idx_ptr);
          ga_free(tmp_ptr);
        }
      }
    }
  /* ---- case 2: at least one array is block-cyclic (no transpose, no
     reshape: shapes were verified identical above) */
  } else {
    Integer offset, last, jtot;
    for (i=0; i<andim; i++) {
      ald[i] = ahi[i] - alo[i] + 1;
    }
    for (i=0; i<bndim; i++) {
      bld[i] = bhi[i] - blo[i] + 1;
    }
    if (use_put) {
      /* Array a is block-cyclic distributed */
      if (num_blocks_a >= 0) {
#if 1
        /* iterate over all blocks of g_a held locally and put each
           clipped block into g_b */
        _iterator_hdl hdl_a;
        pnga_local_iterator_init(g_a, &hdl_a);
        while (pnga_local_iterator_next(&hdl_a, los, his,
              &src_data_ptr, ld)) {
          /* Copy limits since patch intersect modifies los array */
          for (j=0; j < andim; j++) {
            lod[j] = los[j];
            hid[j] = his[j];
          }
          if (pnga_patch_intersect(alo,ahi,los,his,andim)) {
            /* column-major offset of the clipped corner within the block */
            offset = 0;
            last = andim - 1;
            jtot = 1;
            for (j=0; j<last; j++) {
              offset += (los[j]-lod[j])*jtot;
              jtot *= ld[j];
            }
            offset += (los[last]-lod[last])*jtot;
            /* advance src_data_ptr by `offset` elements of the element type */
            switch(atype) {
              case C_DBL:
                src_data_ptr = (void*)((double*)(src_data_ptr) + offset);
                break;
              case C_INT:
                src_data_ptr = (void*)((int*)(src_data_ptr) + offset);
                break;
              case C_DCPL:
                src_data_ptr = (void*)((DoubleComplex*)(src_data_ptr) + offset);
                break;
              case C_SCPL:
                src_data_ptr = (void*)((SingleComplex*)(src_data_ptr) + offset);
                break;
              case C_FLOAT:
                src_data_ptr = (void*)((float*)(src_data_ptr) + offset);
                break;
              case C_LONG:
                src_data_ptr = (void*)((long*)(src_data_ptr) + offset);
                break;
              case C_LONGLONG:
                src_data_ptr = (void*)((long long*)(src_data_ptr) + offset);
                break;
              default:
                break;
            }
            snga_dest_indices(andim, los, alo, ald, bndim, lod, blo, bld);
            snga_dest_indices(andim, his, alo, ald, bndim, hid, blo, bld);
            pnga_put(g_b, lod, hid, src_data_ptr, ld);
            /* NOTE(review): `i` is not set by this iterator loop, so the
               block index passed to release looks stale -- confirm whether
               a release is needed at all with the local-iterator API */
            pnga_release_block(g_a, i);
          }
        }
#else
        /* Uses simple block-cyclic data distribution */
        if (!pnga_uses_proc_grid(g_a)) {
          /* cyclic ownership: I own blocks me_a, me_a+anproc, ... */
          for (i = me_a; i < num_blocks_a; i += anproc) {
            pnga_distribution(g_a, i, los, his);
            /* make temporory copies of los, his since ngai_patch_intersection
               destroys original versions */
            for (j=0; j < andim; j++) {
              lod[j] = los[j];
              hid[j] = his[j];
            }
            if (pnga_patch_intersect(alo,ahi,los,his,andim)) {
              pnga_access_block_ptr(g_a, i, &src_data_ptr, ld);
              offset = 0;
              last = andim - 1;
              jtot = 1;
              for (j=0; j<last; j++) {
                offset += (los[j]-lod[j])*jtot;
                jtot *= ld[j];
              }
              offset += (los[last]-lod[last])*jtot;
              switch(atype) {
                case C_DBL:
                  src_data_ptr = (void*)((double*)(src_data_ptr) + offset);
                  break;
                case C_INT:
                  src_data_ptr = (void*)((int*)(src_data_ptr) + offset);
                  break;
                case C_DCPL:
                  src_data_ptr = (void*)((DoubleComplex*)(src_data_ptr) + offset);
                  break;
                case C_SCPL:
                  src_data_ptr = (void*)((SingleComplex*)(src_data_ptr) + offset);
                  break;
                case C_FLOAT:
                  src_data_ptr = (void*)((float*)(src_data_ptr) + offset);
                  break;
                case C_LONG:
                  src_data_ptr = (void*)((long*)(src_data_ptr) + offset);
                  break;
                case C_LONGLONG:
                  src_data_ptr = (void*)((long long*)(src_data_ptr) + offset);
                  break;
                default:
                  break;
              }
              snga_dest_indices(andim, los, alo, ald, bndim, lod, blo, bld);
              snga_dest_indices(andim, his, alo, ald, bndim, hid, blo, bld);
              pnga_put(g_b, lod, hid, src_data_ptr, ld);
              pnga_release_block(g_a, i);
            }
          }
        } else {
          /* Uses scalapack block-cyclic data distribution */
          Integer proc_index[MAXDIM], index[MAXDIM];
          Integer topology[MAXDIM];
          Integer blocks[MAXDIM], block_dims[MAXDIM];
          pnga_get_proc_index(g_a, me_a, proc_index);
          pnga_get_proc_index(g_a, me_a, index);
          pnga_get_block_info(g_a, blocks, block_dims);
          pnga_get_proc_grid(g_a, topology);
          /* walk my blocks in grid order until the slowest index runs out */
          while (index[andim-1] < blocks[andim-1]) {
            /* find bounding coordinates of block */
            /*chk = 1;*/
            for (i = 0; i < andim; i++) {
              los[i] = index[i]*block_dims[i]+1;
              his[i] = (index[i] + 1)*block_dims[i];
              if (his[i] > adims[i]) his[i] = adims[i];
              /*if (his[i] < los[i]) chk = 0;*/
            }
            /* make temperary copies of los, his since ngai_patch_intersection
               destroys original versions */
            for (j=0; j < andim; j++) {
              lod[j] = los[j];
              hid[j] = his[j];
            }
            if (pnga_patch_intersect(alo,ahi,los,his,andim)) {
              pnga_access_block_grid_ptr(g_a, index, &src_data_ptr, ld);
              offset = 0;
              last = andim - 1;
              jtot = 1;
              for (j=0; j<last; j++) {
                offset += (los[j]-lod[j])*jtot;
                jtot *= ld[j];
              }
              offset += (los[last]-lod[last])*jtot;
              switch(atype) {
                case C_DBL:
                  src_data_ptr = (void*)((double*)(src_data_ptr) + offset);
                  break;
                case C_INT:
                  src_data_ptr = (void*)((int*)(src_data_ptr) + offset);
                  break;
                case C_DCPL:
                  src_data_ptr = (void*)((DoubleComplex*)(src_data_ptr) + offset);
                  break;
                case C_SCPL:
                  src_data_ptr = (void*)((SingleComplex*)(src_data_ptr) + offset);
                  break;
                case C_FLOAT:
                  src_data_ptr = (void*)((float*)(src_data_ptr) + offset);
                  break;
                case C_LONG:
                  src_data_ptr = (void*)((long*)(src_data_ptr) + offset);
                  break;
                case C_LONGLONG:
                  src_data_ptr = (void*)((long long*)(src_data_ptr) + offset);
                  break;
                default:
                  break;
              }
              snga_dest_indices(andim, los, alo, ald, bndim, lod, blo, bld);
              snga_dest_indices(andim, his, alo, ald, bndim, hid, blo, bld);
              pnga_put(g_b, lod, hid, src_data_ptr, ld);
              pnga_release_block_grid(g_a, index);
            }

            /* increment index to get next block on processor */
            index[0] += topology[0];
            for (i = 0; i < andim; i++) {
              if (index[i] >= blocks[i] && i<andim-1) {
                index[i] = proc_index[i];
                index[i+1] += topology[i+1];
              }
            }
          }
        }
#endif
      } else {
        /* Only array b is block-cyclic distributed */
        pnga_distribution(g_a, me_a, los, his);
        if (pnga_patch_intersect(alo,ahi,los,his,andim)) {
          pnga_access_ptr(g_a, los, his, &src_data_ptr, ld);
          snga_dest_indices(andim, los, alo, ald, bndim, lod, blo, bld);
          snga_dest_indices(andim, his, alo, ald, bndim, hid, blo, bld);
          pnga_put(g_b, lod, hid, src_data_ptr, ld);
          pnga_release(g_a, los, his);
        }
      }
    } else {
      /* get direction: each process fills its local piece of g_b */
      /* Array b is block-cyclic distributed */
      if (num_blocks_b >= 0) {
#if 1
        /* iterate over all blocks of g_b held locally and get the matching
           region of g_a into each clipped block */
        _iterator_hdl hdl_b;
        pnga_local_iterator_init(g_b, &hdl_b);
        while (pnga_local_iterator_next(&hdl_b, los, his,
              &src_data_ptr, ld)) {
          /* Copy limits since patch intersect modifies los array */
          /* (andim == bndim here; shape equality was enforced above) */
          for (j=0; j < andim; j++) {
            lod[j] = los[j];
            hid[j] = his[j];
          }
          if (pnga_patch_intersect(blo,bhi,los,his,andim)) {
            offset = 0;
            last = andim - 1;
            jtot = 1;
            for (j=0; j<last; j++) {
              offset += (los[j]-lod[j])*jtot;
              jtot *= ld[j];
            }
            offset += (los[last]-lod[last])*jtot;
            switch(atype) {
              case C_DBL:
                src_data_ptr = (void*)((double*)(src_data_ptr) + offset);
                break;
              case C_INT:
                src_data_ptr = (void*)((int*)(src_data_ptr) + offset);
                break;
              case C_DCPL:
                src_data_ptr = (void*)((DoubleComplex*)(src_data_ptr) + offset);
                break;
              case C_SCPL:
                src_data_ptr = (void*)((SingleComplex*)(src_data_ptr) + offset);
                break;
              case C_FLOAT:
                src_data_ptr = (void*)((float*)(src_data_ptr) + offset);
                break;
              case C_LONG:
                src_data_ptr = (void*)((long*)(src_data_ptr) + offset);
                break;
              case C_LONGLONG:
                src_data_ptr = (void*)((long long*)(src_data_ptr) + offset);
                break;
              default:
                break;
            }
            snga_dest_indices(bndim, los, blo, bld, andim, lod, alo, ald);
            snga_dest_indices(bndim, his, blo, bld, andim, hid, alo, ald);
            pnga_get(g_a, lod, hid, src_data_ptr, ld);
            /* NOTE(review): `i` not set by this iterator loop -- see the
               matching note in the put path above */
            pnga_release_block(g_b, i);
          }
        }
#else
        /* Uses simple block-cyclic data distribution */
        if (!pnga_uses_proc_grid(g_b)) {
          for (i = me_b; i < num_blocks_b; i += bnproc) {
            pnga_distribution(g_b, i, los, his);
            /* make temporory copies of los, his since ngai_patch_intersection
               destroys original versions */
            for (j=0; j < andim; j++) {
              lod[j] = los[j];
              hid[j] = his[j];
            }
            if (pnga_patch_intersect(blo,bhi,los,his,andim)) {
              pnga_access_block_ptr(g_b, i, &src_data_ptr, ld);
              offset = 0;
              last = bndim - 1;
              jtot = 1;
              for (j=0; j<last; j++) {
                offset += (los[j]-lod[j])*jtot;
                jtot *= ld[j];
              }
              offset += (los[last]-lod[last])*jtot;
              switch(atype) {
                case C_DBL:
                  src_data_ptr = (void*)((double*)(src_data_ptr) + offset);
                  break;
                case C_INT:
                  src_data_ptr = (void*)((int*)(src_data_ptr) + offset);
                  break;
                case C_DCPL:
                  src_data_ptr = (void*)((DoubleComplex*)(src_data_ptr) + offset);
                  break;
                case C_SCPL:
                  src_data_ptr = (void*)((SingleComplex*)(src_data_ptr) + offset);
                  break;
                case C_FLOAT:
                  src_data_ptr = (void*)((float*)(src_data_ptr) + offset);
                  break;
                case C_LONG:
                  src_data_ptr = (void*)((long*)(src_data_ptr) + offset);
                  break;
                case C_LONGLONG:
                  src_data_ptr = (void*)((long long*)(src_data_ptr) + offset);
                  break;
                default:
                  break;
              }
              snga_dest_indices(bndim, los, blo, bld, andim, lod, alo, ald);
              snga_dest_indices(bndim, his, blo, bld, andim, hid, alo, ald);
              pnga_get(g_a, lod, hid, src_data_ptr, ld);
              pnga_release_block(g_b, i);
            }
          }
        } else {
          /* Uses scalapack block-cyclic data distribution */
          Integer proc_index[MAXDIM], index[MAXDIM];
          Integer topology[MAXDIM];
          Integer blocks[MAXDIM], block_dims[MAXDIM];
          pnga_get_proc_index(g_b, me_b, proc_index);
          pnga_get_proc_index(g_b, me_b, index);
          pnga_get_block_info(g_b, blocks, block_dims);
          pnga_get_proc_grid(g_b, topology);
          while (index[bndim-1] < blocks[bndim-1]) {
            /* find bounding coordinates of block */
            /*chk = 1;*/
            for (i = 0; i < bndim; i++) {
              los[i] = index[i]*block_dims[i]+1;
              his[i] = (index[i] + 1)*block_dims[i];
              if (his[i] > bdims[i]) his[i] = bdims[i];
              /*if (his[i] < los[i]) chk = 0;*/
            }
            /* make temporory copies of los, his since ngai_patch_intersection
               destroys original versions */
            for (j=0; j < andim; j++) {
              lod[j] = los[j];
              hid[j] = his[j];
            }
            if (pnga_patch_intersect(blo,bhi,los,his,andim)) {
              pnga_access_block_grid_ptr(g_b, index, &src_data_ptr, ld);
              offset = 0;
              last = bndim - 1;
              jtot = 1;
              for (j=0; j<last; j++) {
                offset += (los[j]-lod[j])*jtot;
                jtot *= ld[j];
              }
              offset += (los[last]-lod[last])*jtot;
              switch(atype) {
                case C_DBL:
                  src_data_ptr = (void*)((double*)(src_data_ptr) + offset);
                  break;
                case C_INT:
                  src_data_ptr = (void*)((int*)(src_data_ptr) + offset);
                  break;
                case C_DCPL:
                  src_data_ptr = (void*)((DoubleComplex*)(src_data_ptr) + offset);
                  break;
                case C_SCPL:
                  src_data_ptr = (void*)((SingleComplex*)(src_data_ptr) + offset);
                  break;
                case C_FLOAT:
                  src_data_ptr = (void*)((float*)(src_data_ptr) + offset);
                  break;
                case C_LONG:
                  src_data_ptr = (void*)((long*)(src_data_ptr) + offset);
                  break;
                case C_LONGLONG:
                  src_data_ptr = (void*)((long long*)(src_data_ptr) + offset);
                  break;
                default:
                  break;
              }
              snga_dest_indices(bndim, los, blo, bld, andim, lod, alo, ald);
              snga_dest_indices(bndim, his, blo, bld, andim, hid, alo, ald);
              pnga_get(g_a, lod, hid, src_data_ptr, ld);
              pnga_release_block_grid(g_b, index);
            }

            /* increment index to get next block on processor */
            index[0] += topology[0];
            for (i = 0; i < bndim; i++) {
              if (index[i] >= blocks[i] && i<bndim-1) {
                index[i] = proc_index[i];
                index[i+1] += topology[i+1];
              }
            }
          }
        }
#endif
      } else {
        /* Array a is block-cyclic distributed */
        pnga_distribution(g_b, me_b, los, his);
        if (pnga_patch_intersect(blo,bhi,los,his,bndim)) {
          pnga_access_ptr(g_b, los, his, &src_data_ptr, ld);
          snga_dest_indices(bndim, los, blo, bld, andim, lod, alo, ald);
          snga_dest_indices(bndim, his, blo, bld, andim, hid, alo, ald);
          pnga_get(g_a, lod, hid, src_data_ptr, ld);
          pnga_release(g_b, los, his);
        }
      }
    }
  }
  /* ARMCI_AllFence(); */
  /* exit synchronization mirrors the entry synchronization above */
  if(local_sync_end) {
    if (anproc <= bnproc) {
      pnga_pgroup_sync(a_grp);
    } else if (a_grp == pnga_pgroup_get_world() &&
               b_grp == pnga_pgroup_get_world()) {
      pnga_sync();
    } else {
      pnga_pgroup_sync(b_grp);
    }
  }
}
932
933
/**
 * Accumulate into *retval the element-wise dot product of two local patches
 * that share the same bounds and leading dimensions.
 *
 * atype : GA element type (C_INT, C_DBL, C_DCPL, C_SCPL, C_FLOAT, C_LONG,
 *         C_LONGLONG); anything else aborts via pnga_error
 * andim : rank of the patch
 * loA,hiA : inclusive patch bounds (only the extents hiA[i]-loA[i] are used)
 * ldA   : leading dimensions of the local buffers
 * A_ptr, B_ptr : pointers to the first patch element in each buffer
 * alen  : NOTE(review): not referenced in this body -- presumably kept for
 *         signature compatibility with callers; confirm
 * retval: accumulator, interpreted according to atype (two components for
 *         the complex types); this routine ADDS to it, it does not reset it
 *
 * Complex products are plain (unconjugated) multiplies:
 *   sum += a * b, with real/imag parts expanded explicitly.
 */
static void snga_dot_local_patch(Integer atype, Integer andim, Integer *loA,
                                 Integer *hiA, Integer *ldA, void *A_ptr, void *B_ptr,
                                 int *alen, void *retval)
{
  int isum;
  double dsum;
  DoubleComplex zsum;
  SingleComplex csum;
  float fsum;
  long lsum;
  long long llsum;
  Integer i, j, n1dim, idx;
  Integer bvalue[MAXDIM], bunit[MAXDIM], baseldA[MAXDIM];

  /* per-type partial sums, zeroed before the type dispatch below */
  isum = 0; dsum = 0.; zsum.real = 0.; zsum.imag = 0.; fsum = 0;lsum=0;llsum=0;
  csum.real = 0.; csum.imag = 0.;

  /* number of n-element of the first dimension */
  /* (the innermost dimension is handled by the inner j-loop; n1dim counts
     how many such contiguous runs there are) */
  n1dim = 1; for(i=1; i<andim; i++) n1dim *= (hiA[i] - loA[i] + 1);

  /* calculate the destination indices */
  bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
  /* baseldA[0] = ldA[0]
   * baseldA[1] = ldA[0] * ldA[1]
   * baseldA[2] = ldA[0] * ldA[1] * ldA[2] .....
   */
  baseldA[0] = ldA[0]; baseldA[1] = baseldA[0] *ldA[1];
  for(i=2; i<andim; i++) {
    bvalue[i] = 0;
    bunit[i] = bunit[i-1] * (hiA[i-1] - loA[i-1] + 1);
    baseldA[i] = baseldA[i-1] * ldA[i];
  }

  /* compute "local" contribution to the dot product; each case repeats the
     same odometer walk: bvalue[] tracks the current index offset in each
     outer dimension, idx is the linearized offset of run i's first element */
  switch (atype){
    case C_INT:
      for(i=0; i<n1dim; i++) {
        idx = 0;
        for(j=1; j<andim; j++) {
          idx += bvalue[j] * baseldA[j-1];
          /* advance dimension j every bunit[j] runs */
          if(((i+1) % bunit[j]) == 0) bvalue[j]++;
          /* wrap when the extent of dimension j is exhausted */
          if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
        }

        /* contiguous run along the first dimension */
        for(j=0; j<(hiA[0]-loA[0]+1); j++)
          isum += ((int *)A_ptr)[idx+j] *
            ((int *)B_ptr)[idx+j];
      }
      *(int*)retval += isum;
      break;
    case C_DCPL:
      for(i=0; i<n1dim; i++) {
        idx = 0;
        for(j=1; j<andim; j++) {
          idx += bvalue[j] * baseldA[j-1];
          if(((i+1) % bunit[j]) == 0) bvalue[j]++;
          if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
        }

        for(j=0; j<(hiA[0]-loA[0]+1); j++) {
          DoubleComplex a = ((DoubleComplex *)A_ptr)[idx+j];
          DoubleComplex b = ((DoubleComplex *)B_ptr)[idx+j];
          /* unconjugated complex multiply-accumulate */
          zsum.real += a.real*b.real - b.imag * a.imag;
          zsum.imag += a.imag*b.real + b.imag * a.real;
        }
      }
      ((double*)retval)[0] += zsum.real;
      ((double*)retval)[1] += zsum.imag;
      break;
    case C_SCPL:
      for(i=0; i<n1dim; i++) {
        idx = 0;
        for(j=1; j<andim; j++) {
          idx += bvalue[j] * baseldA[j-1];
          if(((i+1) % bunit[j]) == 0) bvalue[j]++;
          if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
        }

        for(j=0; j<(hiA[0]-loA[0]+1); j++) {
          SingleComplex a = ((SingleComplex *)A_ptr)[idx+j];
          SingleComplex b = ((SingleComplex *)B_ptr)[idx+j];
          csum.real += a.real*b.real - b.imag * a.imag;
          csum.imag += a.imag*b.real + b.imag * a.real;
        }
      }
      ((float*)retval)[0] += csum.real;
      ((float*)retval)[1] += csum.imag;
      break;
    case C_DBL:
      for(i=0; i<n1dim; i++) {
        idx = 0;
        for(j=1; j<andim; j++) {
          idx += bvalue[j] * baseldA[j-1];
          if(((i+1) % bunit[j]) == 0) bvalue[j]++;
          if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
        }

        for(j=0; j<(hiA[0]-loA[0]+1); j++)
          dsum += ((double*)A_ptr)[idx+j] *
            ((double*)B_ptr)[idx+j];
      }
      *(double*)retval += dsum;
      break;
    case C_FLOAT:
      for(i=0; i<n1dim; i++) {
        idx = 0;
        for(j=1; j<andim; j++) {
          idx += bvalue[j] * baseldA[j-1];
          if(((i+1) % bunit[j]) == 0) bvalue[j]++;
          if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
        }

        for(j=0; j<(hiA[0]-loA[0]+1); j++)
          fsum += ((float *)A_ptr)[idx+j] *
            ((float *)B_ptr)[idx+j];
      }
      *(float*)retval += fsum;
      break;
    case C_LONG:
      for(i=0; i<n1dim; i++) {
        idx = 0;
        for(j=1; j<andim; j++) {
          idx += bvalue[j] * baseldA[j-1];
          if(((i+1) % bunit[j]) == 0) bvalue[j]++;
          if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
        }

        for(j=0; j<(hiA[0]-loA[0]+1); j++)
          lsum += ((long *)A_ptr)[idx+j] *
            ((long *)B_ptr)[idx+j];
      }
      *(long*)retval += lsum;
      break;
    case C_LONGLONG:
      for(i=0; i<n1dim; i++) {
        idx = 0;
        for(j=1; j<andim; j++) {
          idx += bvalue[j] * baseldA[j-1];
          if(((i+1) % bunit[j]) == 0) bvalue[j]++;
          if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
        }

        for(j=0; j<(hiA[0]-loA[0]+1); j++)
          llsum += ((long long *)A_ptr)[idx+j] *
            ((long long *)B_ptr)[idx+j];
      }
      *(long long*)retval += llsum;
      break;
    default:
      pnga_error("snga_dot_local_patch: type not supported",atype);

  }
}
1087
1088
1089 /*\ generic dot product routine
1090 \*/
1091 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1092 # pragma weak wnga_dot_patch = pnga_dot_patch
1093 #endif
/**
 * Generic dot product of two global-array patches (collective).
 *
 * Computes sum_i A[i]*B[i] over the patch alo:ahi of g_a and blo:bhi of
 * g_b and stores the (globally reduced) result in *retval.  The two
 * patches may have different shapes but must contain the same number of
 * elements.  t_a / t_b select an optional transpose ('n'/'N' = none);
 * a relative transpose is applied by reshaping g_b through a temporary.
 *
 * @param g_a    handle of first array
 * @param t_a    transpose flag for g_a ("n"/"N" or transpose)
 * @param alo    lower patch bounds in g_a (1-based, per dimension)
 * @param ahi    upper patch bounds in g_a
 * @param g_b    handle of second array
 * @param t_b    transpose flag for g_b
 * @param blo    lower patch bounds in g_b
 * @param bhi    upper patch bounds in g_b
 * @param retval output scalar; interpreted according to the array type
 *               (int / long / long long / float / double / complex pair)
 *
 * Errors (via pnga_error): group mismatch, type mismatch, out-of-range
 * indices, unequal patch capacities, transpose on block-cyclic data.
 */
void pnga_dot_patch(Integer g_a, char *t_a, Integer *alo, Integer *ahi, Integer g_b, char *t_b, Integer *blo, Integer *bhi, void *retval)
{
  Integer i=0, j=0;
  Integer compatible=0;
  Integer atype=0, btype=0, andim=0, adims[MAXDIM], bndim=0, bdims[MAXDIM];
  Integer loA[MAXDIM], hiA[MAXDIM], ldA[MAXDIM];
  Integer loB[MAXDIM], hiB[MAXDIM], ldB[MAXDIM];
  Integer g_A = g_a, g_B = g_b;
  char *A_ptr=NULL, *B_ptr=NULL;
  Integer ctype=0;
  Integer atotal=0, btotal=0;
  int isum=0, alen=0;
  long lsum=0;
  long long llsum=0;
  double dsum=0;
  DoubleComplex zsum={0,0};
  /* NOTE(review): csum mirrors the single-precision complex accumulator
   * but is declared DoubleComplex; it is only ever used zero-initialized
   * in this routine (the actual accumulation happens in
   * snga_dot_local_patch), so no precision is lost here. */
  DoubleComplex csum={0,0};
  float fsum=0;
  Integer me= pnga_nodeid(), temp_created=0;
  Integer nproc = pnga_nnodes();
  Integer num_blocks_a=0, num_blocks_b=0;
  char *tempname = "temp", transp, transp_a, transp_b;
  int local_sync_begin=0;
  Integer a_grp=0, b_grp=0;
  _iterator_hdl hdl_a, hdl_b;

  local_sync_begin = _ga_sync_begin;
  _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
  if(local_sync_begin)pnga_sync();

  /* both arrays must live on the same processor group so the final
   * reduction spans the same set of ranks */
  a_grp = pnga_get_pgroup(g_a);
  b_grp = pnga_get_pgroup(g_b);
  if (a_grp != b_grp)
    pnga_error("Both arrays must be defined on same group",0L);
  me = pnga_pgroup_nodeid(a_grp);

  pnga_inquire(g_a, &atype, &andim, adims);
  pnga_inquire(g_b, &btype, &bndim, bdims);

  if(atype != btype ) pnga_error(" type mismatch ", 0L);

  /* check if patch indices and g_a dims match */
  for(i=0; i<andim; i++)
    if(alo[i] <= 0 || ahi[i] > adims[i])
      pnga_error("g_a indices out of range ", g_a);
  for(i=0; i<bndim; i++)
    if(blo[i] <= 0 || bhi[i] > bdims[i])
      pnga_error("g_b indices out of range ", g_b);

  /* check if numbers of elements in two patches match each other */
  atotal = 1; for(i=0; i<andim; i++) atotal *= (ahi[i] - alo[i] + 1);
  btotal = 1; for(i=0; i<bndim; i++) btotal *= (bhi[i] - blo[i] + 1);

  if(atotal != btotal)
    pnga_error(" capacities of patches do not match ", 0L);

  /* is transpose operation required ? */
  /* -- only if for one array transpose operation requested*/
  transp_a = (*t_a == 'n' || *t_a =='N')? 'n' : 't';
  transp_b = (*t_b == 'n' || *t_b =='N')? 'n' : 't';
  transp = (transp_a == transp_b)? 'n' : 't';

  /* Find out if distribution is block-cyclic */
  num_blocks_a = pnga_total_blocks(g_a);
  num_blocks_b = pnga_total_blocks(g_b);

  if (num_blocks_a >= 0 || num_blocks_b >= 0) {
    if (transp_a == 't' || transp_b == 't')
      pnga_error("transpose not supported for block-cyclic data ", 0);
  }

  /* reset all per-type accumulators; only the one matching atype is used */
  isum = 0; dsum = 0.; zsum.real = 0.; zsum.imag = 0.; fsum = 0;lsum=0;llsum=0;
  csum.real = 0.; csum.imag = 0.;

  /* zero *retval and record how many primitive values (alen) the final
   * message reduction must combine (2 for complex types) */
  switch (atype){
    case C_INT:
      *(int*)retval = isum;
      alen = 1;
      break;
    case C_DCPL:
      ((double*)retval)[0] = zsum.real;
      ((double*)retval)[1] = zsum.imag;
      alen = 2;
      break;
    case C_SCPL:
      ((float*)retval)[0] = csum.real;
      ((float*)retval)[1] = csum.imag;
      alen = 2;
      break;
    case C_DBL:
      *(double*)retval = dsum;
      alen = 1;
      break;
    case C_FLOAT:
      *(float*)retval = fsum;
      alen = 1;
      break;
    case C_LONG:
      *(long*)retval = lsum;
      alen = 1;
      break;
    case C_LONGLONG:
      *(long long*)retval = llsum;
      alen = 1;
      break;
    default:
      pnga_error("snga_dot_local_patch: type not supported",atype);
  }

  if (num_blocks_a < 0 && num_blocks_b < 0) {
    /* both arrays use the regular (non block-cyclic) distribution */
    /* find out coordinates of patches of g_A and g_B that I own */
    pnga_distribution(g_A, me, loA, hiA);
    pnga_distribution(g_B, me, loB, hiB);

    /* compatible only when every process owns matching pieces of both
     * arrays AND the requested patches coincide; the logical-AND gop
     * makes this a global decision so all ranks take the same branch */
    if(pnga_comp_patch(andim, loA, hiA, bndim, loB, hiB) &&
       pnga_comp_patch(andim, alo, ahi, bndim, blo, bhi)) compatible = 1;
    else compatible = 0;
    /* pnga_gop(pnga_type_f2c(MT_F_INT), &compatible, 1, "*"); */
    pnga_gop(pnga_type_f2c(MT_F_INT), &compatible, 1, "&&");
    if(!(compatible && (transp=='n'))) {
      /* either patches or distributions do not match:
       *  - create a temp array that matches distribution of g_a
       *  - copy & reshape patch of g_b into g_B
       */
      if (!pnga_duplicate(g_a, &g_B, tempname))
        pnga_error("duplicate failed",0L);

      pnga_copy_patch(&transp, g_b, blo, bhi, g_B, alo, ahi);
      bndim = andim;
      temp_created = 1;
      pnga_distribution(g_B, me, loB, hiB);
    }

    if(!pnga_comp_patch(andim, loA, hiA, bndim, loB, hiB))
      pnga_error(" patches mismatch ",0);

    /* A[83:125,1:1] <==> B[83:125] */
    if(andim > bndim) andim = bndim; /* need more work */

    /* determine subsets of my patches to access */
    if(pnga_patch_intersect(alo, ahi, loA, hiA, andim)){
      pnga_access_ptr(g_A, loA, hiA, &A_ptr, ldA);
      pnga_access_ptr(g_B, loA, hiA, &B_ptr, ldB);

      /* accumulate this process's contribution into *retval */
      snga_dot_local_patch(atype, andim, loA, hiA, ldA, A_ptr, B_ptr,
          &alen, retval);
      /* release access to the data */
      pnga_release(g_A, loA, hiA);
      pnga_release(g_B, loA, hiA);
    }
  } else {
    /* Create copy of g_b identical with identical distribution as g_a */
    if (!pnga_duplicate(g_a, &g_B, tempname))
      pnga_error("duplicate failed",0L);
    pnga_copy_patch(&transp, g_b, blo, bhi, g_B, alo, ahi);
    temp_created = 1;

#if 1
    /* walk the local blocks of g_a and g_B in lockstep; the duplicate
     * above guarantees both iterators visit identical blocks */
    pnga_local_iterator_init(g_a, &hdl_a);
    pnga_local_iterator_init(g_B, &hdl_b);
    while(pnga_local_iterator_next(&hdl_a, loA, hiA, &A_ptr, ldA)) {
      Integer lo[MAXDIM]/*, hi[MAXDIM]*/;
      Integer offset, jtot, last;
      pnga_local_iterator_next(&hdl_b, loB, hiB, &B_ptr, ldB);
      /* make copies of loA and hiA since pnga_patch_intersect destroys
         original versions */
      for (j=0; j<andim; j++) {
        lo[j] = loA[j];
        /*hi[j] = hiA[j];*/
      }
      if(pnga_patch_intersect(alo, ahi, loA, hiA, andim)){
        /* evaluate offsets for system */
        /* column-major linear offset of the clipped patch corner
         * relative to the block corner */
        offset = 0;
        last = andim-1;
        jtot = 1;
        for (j=0; j<last; j++) {
          offset += (loA[j] - lo[j])*jtot;
          jtot *= ldA[j];
        }
        offset += (loA[last]-lo[last])*jtot;

        /* offset pointers by correct amount */
        switch (atype){
          case C_INT:
            A_ptr = (void*)((int*)(A_ptr) + offset);
            B_ptr = (void*)((int*)(B_ptr) + offset);
            break;
          case C_DCPL:
            A_ptr = (void*)((DoubleComplex*)(A_ptr) + offset);
            B_ptr = (void*)((DoubleComplex*)(B_ptr) + offset);
            break;
          case C_SCPL:
            A_ptr = (void*)((SingleComplex*)(A_ptr) + offset);
            B_ptr = (void*)((SingleComplex*)(B_ptr) + offset);
            break;
          case C_DBL:
            A_ptr = (void*)((double*)(A_ptr) + offset);
            B_ptr = (void*)((double*)(B_ptr) + offset);
            break;
          case C_FLOAT:
            A_ptr = (void*)((float*)(A_ptr) + offset);
            B_ptr = (void*)((float*)(B_ptr) + offset);
            break;
          case C_LONG:
            A_ptr = (void*)((long*)(A_ptr) + offset);
            B_ptr = (void*)((long*)(B_ptr) + offset);
            break;
          case C_LONGLONG:
            A_ptr = (void*)((long long*)(A_ptr) + offset);
            B_ptr = (void*)((long long*)(B_ptr) + offset);
            break;
        }
        snga_dot_local_patch(atype, andim, loA, hiA, ldA, A_ptr, B_ptr,
            &alen, retval);
      }
    }
#else
    /* Older pre-iterator implementation, compiled out by the #if 1 above
     * and retained for reference. */
    /* If g_a regular distribution, then just use normal dot product on patch */
    if (num_blocks_a < 0) {
      /* find out coordinates of patches of g_A and g_B that I own */
      pnga_distribution(g_A, me, loA, hiA);
      pnga_distribution(g_B, me, loB, hiB);

      if(!pnga_comp_patch(andim, loA, hiA, bndim, loB, hiB))
        pnga_error(" patches mismatch ",0);

      /* A[83:125,1:1] <==> B[83:125] */
      if(andim > bndim) andim = bndim; /* need more work */
      if(pnga_patch_intersect(alo, ahi, loA, hiA, andim)){
        pnga_access_ptr(g_A, loA, hiA, &A_ptr, ldA);
        pnga_access_ptr(g_B, loA, hiA, &B_ptr, ldB);

        snga_dot_local_patch(atype, andim, loA, hiA, ldA, A_ptr, B_ptr,
            &alen, retval);
        /* release access to the data */
        pnga_release(g_A, loA, hiA);
        pnga_release(g_B, loA, hiA);
      }
    } else {
      Integer lo[MAXDIM]/*, hi[MAXDIM]*/;
      Integer offset, jtot, last;
      /* simple block cyclic data distribution */
      if (!pnga_uses_proc_grid(g_a)) {
        for (i=me; i<num_blocks_a; i += nproc) {
          pnga_distribution(g_A, i, loA, hiA);
          /* make copies of loA and hiA since pnga_patch_intersect destroys
             original versions */
          for (j=0; j<andim; j++) {
            lo[j] = loA[j];
            /*hi[j] = hiA[j];*/
          }
          if(pnga_patch_intersect(alo, ahi, loA, hiA, andim)){
            pnga_access_block_ptr(g_A, i, &A_ptr, ldA);
            pnga_access_block_ptr(g_B, i, &B_ptr, ldB);

            /* evaluate offsets for system */
            offset = 0;
            last = andim-1;
            jtot = 1;
            for (j=0; j<last; j++) {
              offset += (loA[j] - lo[j])*jtot;
              jtot *= ldA[j];
            }
            offset += (loA[last]-lo[last])*jtot;

            /* offset pointers by correct amount */
            switch (atype){
              case C_INT:
                A_ptr = (void*)((int*)(A_ptr) + offset);
                B_ptr = (void*)((int*)(B_ptr) + offset);
                break;
              case C_DCPL:
                A_ptr = (void*)((DoubleComplex*)(A_ptr) + offset);
                B_ptr = (void*)((DoubleComplex*)(B_ptr) + offset);
                break;
              case C_SCPL:
                A_ptr = (void*)((SingleComplex*)(A_ptr) + offset);
                B_ptr = (void*)((SingleComplex*)(B_ptr) + offset);
                break;
              case C_DBL:
                A_ptr = (void*)((double*)(A_ptr) + offset);
                B_ptr = (void*)((double*)(B_ptr) + offset);
                break;
              case C_FLOAT:
                A_ptr = (void*)((float*)(A_ptr) + offset);
                B_ptr = (void*)((float*)(B_ptr) + offset);
                break;
              case C_LONG:
                A_ptr = (void*)((long*)(A_ptr) + offset);
                B_ptr = (void*)((long*)(B_ptr) + offset);
                break;
              case C_LONGLONG:
                A_ptr = (void*)((long long*)(A_ptr) + offset);
                B_ptr = (void*)((long long*)(B_ptr) + offset);
                break;
            }
            snga_dot_local_patch(atype, andim, loA, hiA, ldA, A_ptr, B_ptr,
                &alen, retval);
            /* release access to the data */
            pnga_release_block(g_A, i);
            pnga_release_block(g_B, i);
          }
        }
      } else {
        /* Uses scalapack block-cyclic data distribution */
        Integer proc_index[MAXDIM], index[MAXDIM];
        Integer topology[MAXDIM]/*, chk*/;
        Integer blocks[MAXDIM], block_dims[MAXDIM];
        pnga_get_proc_index(g_a, me, proc_index);
        pnga_get_proc_index(g_a, me, index);
        pnga_get_block_info(g_a, blocks, block_dims);
        pnga_get_proc_grid(g_a, topology);
        while (index[andim-1] < blocks[andim-1]) {
          /* find bounding coordinates of block */
          /*chk = 1;*/
          for (i = 0; i < andim; i++) {
            loA[i] = index[i]*block_dims[i]+1;
            hiA[i] = (index[i] + 1)*block_dims[i];
            if (hiA[i] > adims[i]) hiA[i] = adims[i];
            /*if (hiA[i] < loA[i]) chk = 0;*/
          }
          /* make copies of loA and hiA since pnga_patch_intersect destroys
             original versions */
          for (j=0; j<andim; j++) {
            lo[j] = loA[j];
            /*hi[j] = hiA[j];*/
          }
          if(pnga_patch_intersect(alo, ahi, loA, hiA, andim)){
            pnga_access_block_grid_ptr(g_A, index, &A_ptr, ldA);
            pnga_access_block_grid_ptr(g_B, index, &B_ptr, ldB);

            /* evaluate offsets for system */
            offset = 0;
            last = andim-1;
            jtot = 1;
            for (j=0; j<last; j++) {
              offset += (loA[j] - lo[j])*jtot;
              jtot *= ldA[j];
            }
            offset += (loA[last]-lo[last])*jtot;

            /* offset pointers by correct amount */
            switch (atype){
              case C_INT:
                A_ptr = (void*)((int*)(A_ptr) + offset);
                B_ptr = (void*)((int*)(B_ptr) + offset);
                break;
              case C_DCPL:
                A_ptr = (void*)((DoubleComplex*)(A_ptr) + offset);
                B_ptr = (void*)((DoubleComplex*)(B_ptr) + offset);
                break;
              case C_SCPL:
                A_ptr = (void*)((SingleComplex*)(A_ptr) + offset);
                B_ptr = (void*)((SingleComplex*)(B_ptr) + offset);
                break;
              case C_DBL:
                A_ptr = (void*)((double*)(A_ptr) + offset);
                B_ptr = (void*)((double*)(B_ptr) + offset);
                break;
              case C_FLOAT:
                A_ptr = (void*)((float*)(A_ptr) + offset);
                B_ptr = (void*)((float*)(B_ptr) + offset);
                break;
              case C_LONG:
                A_ptr = (void*)((long*)(A_ptr) + offset);
                B_ptr = (void*)((long*)(B_ptr) + offset);
                break;
              case C_LONGLONG:
                A_ptr = (void*)((long long*)(A_ptr) + offset);
                B_ptr = (void*)((long long*)(B_ptr) + offset);
                break;
            }
            snga_dot_local_patch(atype, andim, loA, hiA, ldA, A_ptr, B_ptr,
                &alen, retval);
            /* release access to the data */
            pnga_release_block_grid(g_A, index);
            pnga_release_block_grid(g_B, index);
          }

          /* increment index to get next block on processor */
          index[0] += topology[0];
          for (i = 0; i < andim; i++) {
            if (index[i] >= blocks[i] && i<andim-1) {
              index[i] = proc_index[i];
              index[i+1] += topology[i+1];
            }
          }
        }
      }
    }
#endif
  }

  /*convert from C data type to ARMCI type */
  /* complex types reduce as alen=2 primitive reals of matching width */
  switch(atype) {
    case C_FLOAT: ctype=ARMCI_FLOAT; break;
    case C_DBL: ctype=ARMCI_DOUBLE; break;
    case C_INT: ctype=ARMCI_INT; break;
    case C_LONG: ctype=ARMCI_LONG; break;
    case C_LONGLONG: ctype=ARMCI_LONG_LONG; break;
    case C_DCPL: ctype=ARMCI_DOUBLE; break;
    case C_SCPL: ctype=ARMCI_FLOAT; break;
    default: pnga_error("pnga_dot_patch: type not supported",atype);
  }

  /* sum the partial dot products across the appropriate scope:
   * node-local for mirrored arrays, world or the arrays' process group
   * otherwise */
  if (pnga_is_mirrored(g_a) && pnga_is_mirrored(g_b)) {
    armci_msg_gop_scope(SCOPE_NODE,retval,alen,"+",ctype);
  } else {
    if (a_grp == -1) {
      armci_msg_gop_scope(SCOPE_ALL,retval,alen,"+",ctype);
#ifdef MSG_COMMS_MPI
    } else {
      armci_msg_group_gop_scope(SCOPE_ALL,retval,alen,"+",ctype,
          ga_get_armci_group_((int)a_grp));
#endif
    }
  }

  if(temp_created) pnga_destroy(g_B);
}
1514
1515
1516 /*****************************************************************************
1517 * ngai_Xdot_patch routines
1518 ****************************************************************************/
1519
1520
1521 /*\
1522 * Set all values in patch to value stored in *val
1523 \*/
/**
 * Assign the scalar pointed to by val to every element of a local
 * n-dimensional patch.
 *
 * The patch is stored column-major: dimension 0 is contiguous and each
 * higher dimension d advances the linear index by the product of the
 * leading dimensions ld[0..d-1].  The patch is traversed as a sequence
 * of contiguous "rows" along dimension 0; cnt[] holds the current
 * coordinate in each higher dimension and is stepped like an odometer.
 *
 * @param type     GA element type code (C_INT, C_DBL, ...)
 * @param ndim     number of dimensions in the patch
 * @param loA,hiA  inclusive patch bounds per dimension
 * @param ld       leading dimensions of the enclosing local block
 * @param data_ptr base address of the patch corner
 * @param val      pointer to the fill value, interpreted per type
 */
static void snga_set_patch_value(
    Integer type, Integer ndim, Integer *loA, Integer *hiA,
    Integer *ld, void *data_ptr, void *val)
{
    Integer nrows, row, d, col, base;
    Integer cnt[MAXDIM], period[MAXDIM], stride[MAXDIM];
    Integer rowlen = hiA[0] - loA[0] + 1;   /* contiguous run per row */

    /* number of rows = product of the extents of dimensions 1..ndim-1 */
    nrows = 1;
    for (d = 1; d < ndim; d++) nrows *= (hiA[d] - loA[d] + 1);

    /* stride[k] = ld[0]*...*ld[k]: linear distance of one step in
     * dimension k+1; period[d] tells after how many rows cnt[d] ticks */
    cnt[0] = 0; cnt[1] = 0;
    period[0] = 1; period[1] = 1;
    stride[0] = ld[0]; stride[1] = stride[0] * ld[1];
    for (d = 2; d < ndim; d++) {
        cnt[d] = 0;
        period[d] = period[d-1] * (hiA[d-1] - loA[d-1] + 1);
        stride[d] = stride[d-1] * ld[d];
    }

    switch (type) {
    case C_INT:
        for (row = 0; row < nrows; row++) {
            base = 0;
            for (d = 1; d < ndim; d++) {
                base += cnt[d] * stride[d-1];
                if ((row + 1) % period[d] == 0) cnt[d]++;
                if (cnt[d] > hiA[d] - loA[d]) cnt[d] = 0;
            }
            {
                int *dst = (int *)data_ptr + base;
                for (col = 0; col < rowlen; col++) dst[col] = *(int *)val;
            }
        }
        break;
    case C_DCPL:
        for (row = 0; row < nrows; row++) {
            base = 0;
            for (d = 1; d < ndim; d++) {
                base += cnt[d] * stride[d-1];
                if ((row + 1) % period[d] == 0) cnt[d]++;
                if (cnt[d] > hiA[d] - loA[d]) cnt[d] = 0;
            }
            {
                DoubleComplex *dst = (DoubleComplex *)data_ptr + base;
                for (col = 0; col < rowlen; col++) {
                    dst[col].real = ((DoubleComplex *)val)->real;
                    dst[col].imag = ((DoubleComplex *)val)->imag;
                }
            }
        }
        break;
    case C_SCPL:
        for (row = 0; row < nrows; row++) {
            base = 0;
            for (d = 1; d < ndim; d++) {
                base += cnt[d] * stride[d-1];
                if ((row + 1) % period[d] == 0) cnt[d]++;
                if (cnt[d] > hiA[d] - loA[d]) cnt[d] = 0;
            }
            {
                SingleComplex *dst = (SingleComplex *)data_ptr + base;
                for (col = 0; col < rowlen; col++) {
                    dst[col].real = ((SingleComplex *)val)->real;
                    dst[col].imag = ((SingleComplex *)val)->imag;
                }
            }
        }
        break;
    case C_DBL:
        for (row = 0; row < nrows; row++) {
            base = 0;
            for (d = 1; d < ndim; d++) {
                base += cnt[d] * stride[d-1];
                if ((row + 1) % period[d] == 0) cnt[d]++;
                if (cnt[d] > hiA[d] - loA[d]) cnt[d] = 0;
            }
            {
                double *dst = (double *)data_ptr + base;
                for (col = 0; col < rowlen; col++) dst[col] = *(double *)val;
            }
        }
        break;
    case C_FLOAT:
        for (row = 0; row < nrows; row++) {
            base = 0;
            for (d = 1; d < ndim; d++) {
                base += cnt[d] * stride[d-1];
                if ((row + 1) % period[d] == 0) cnt[d]++;
                if (cnt[d] > hiA[d] - loA[d]) cnt[d] = 0;
            }
            {
                float *dst = (float *)data_ptr + base;
                for (col = 0; col < rowlen; col++) dst[col] = *(float *)val;
            }
        }
        break;
    case C_LONG:
        for (row = 0; row < nrows; row++) {
            base = 0;
            for (d = 1; d < ndim; d++) {
                base += cnt[d] * stride[d-1];
                if ((row + 1) % period[d] == 0) cnt[d]++;
                if (cnt[d] > hiA[d] - loA[d]) cnt[d] = 0;
            }
            {
                long *dst = (long *)data_ptr + base;
                for (col = 0; col < rowlen; col++) dst[col] = *(long *)val;
            }
        }
        break;
    case C_LONGLONG:
        for (row = 0; row < nrows; row++) {
            base = 0;
            for (d = 1; d < ndim; d++) {
                base += cnt[d] * stride[d-1];
                if ((row + 1) % period[d] == 0) cnt[d]++;
                if (cnt[d] > hiA[d] - loA[d]) cnt[d] = 0;
            }
            {
                long long *dst = (long long *)data_ptr + base;
                for (col = 0; col < rowlen; col++) dst[col] = *(long long *)val;
            }
        }
        break;
    default: pnga_error(" wrong data type ",type);
    }

}
1645
1646
1647 /*\ FILL IN ARRAY WITH VALUE
1648 \*/
1649 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1650 # pragma weak wnga_fill_patch = pnga_fill_patch
1651 #endif
/**
 * Fill the patch lo:hi of global array g_a with the scalar *val
 * (collective).
 *
 * Each process walks its locally held pieces, clips them against the
 * requested patch, and delegates the element writes to
 * snga_set_patch_value.  Honors the _ga_sync_begin/_ga_sync_end masks.
 *
 * @param g_a handle of the array to fill
 * @param lo  lower patch bounds (1-based, per dimension)
 * @param hi  upper patch bounds
 * @param val pointer to the fill value, interpreted per the array type
 */
void pnga_fill_patch(Integer g_a, Integer *lo, Integer *hi, void* val)
{
  /* NOTE(review): i, me, nproc and num_blocks are referenced only by the
   * compiled-out #else branch below; the active iterator path leaves
   * them unused. */
  Integer i;
  Integer ndim, dims[MAXDIM], type;
  Integer loA[MAXDIM], hiA[MAXDIM], ld[MAXDIM];
  char *data_ptr;
  Integer num_blocks, nproc;
  Integer me= pnga_nodeid();
  int local_sync_begin,local_sync_end;
  _iterator_hdl hdl;

  local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
  _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
  if(local_sync_begin)pnga_sync();


  pnga_inquire(g_a, &type, &ndim, dims);
  num_blocks = pnga_total_blocks(g_a);

#if 1
  /* iterate over every block of g_a held locally; the iterator handles
   * regular, block-cyclic and scalapack distributions uniformly */
  pnga_local_iterator_init(g_a, &hdl);
  while (pnga_local_iterator_next(&hdl,loA,hiA,&data_ptr,ld)) {
    Integer offset, j, jtmp, chk;
    Integer loS[MAXDIM];

    /* loA is changed by pnga_patch_intersect, so
       save a copy */
    for (j=0; j<ndim; j++) {
      loS[j] = loA[j];
    }

    /* determine subset of my local patch to access */
    /* Output is in loA and hiA */
    if(pnga_patch_intersect(lo, hi, loA, hiA, ndim)){
      /* Check for partial overlap */
      /* chk stays 1 only when the clipped corner equals the block
       * corner, i.e. data_ptr already points at the right element */
      chk = 1;
      for (j=0; j<ndim; j++) {
        if (loS[j] < loA[j]) {
          chk=0;
          break;
        }
      }
      if (!chk) {
        /* Evaluate additional offset for pointer */
        /* column-major distance from the block corner to the clipped
         * patch corner, in elements */
        offset = 0;
        jtmp = 1;
        for (j=0; j<ndim-1; j++) {
          offset += (loA[j]-loS[j])*jtmp;
          jtmp *= ld[j];
        }
        offset += (loA[ndim-1]-loS[ndim-1])*jtmp;
        switch (type){
          case C_INT:
            data_ptr = (void*)((int*)data_ptr + offset);
            break;
          case C_DCPL:
            /* complex advances 2 reals per element */
            data_ptr = (void*)((double*)data_ptr + 2*offset);
            break;
          case C_SCPL:
            data_ptr = (void*)((float*)data_ptr + 2*offset);
            break;
          case C_DBL:
            data_ptr = (void*)((double*)data_ptr + offset);
            break;
          case C_FLOAT:
            data_ptr = (void*)((float*)data_ptr + offset);
            break;
          case C_LONG:
            data_ptr = (void*)((long*)data_ptr + offset);
            break;
          case C_LONGLONG:
            data_ptr = (void*)((long long*)data_ptr + offset);
            break;
          default: pnga_error(" wrong data type ",type);
        }
      }

      /* set all values in patch to *val */
      snga_set_patch_value(type, ndim, loA, hiA, ld, data_ptr, val);
    }
  }
#else
  /* Older pre-iterator implementation, compiled out by the #if 1 above
   * and retained for reference. */
  if (num_blocks < 0) {
    /* get limits of VISIBLE patch */
    pnga_distribution(g_a, me, loA, hiA);

    /* determine subset of my local patch to access */
    /* Output is in loA and hiA */
    if(pnga_patch_intersect(lo, hi, loA, hiA, ndim)){

      /* get data_ptr to corner of patch */
      /* ld are leading dimensions INCLUDING ghost cells */
      pnga_access_ptr(g_a, loA, hiA, &data_ptr, ld);

      /* set all values in patch to *val */
      snga_set_patch_value(type, ndim, loA, hiA, ld, data_ptr, val);

      /* release access to the data */
      pnga_release_update(g_a, loA, hiA);
    }
  } else {
    Integer offset, j, jtmp, chk;
    Integer loS[MAXDIM];
    nproc = pnga_nnodes();
    /* using simple block-cyclic data distribution */
    if (!pnga_uses_proc_grid(g_a)){
      for (i=me; i<num_blocks; i += nproc) {
        /* get limits of patch */
        pnga_distribution(g_a, i, loA, hiA);

        /* loA is changed by pnga_patch_intersect, so
           save a copy */
        for (j=0; j<ndim; j++) {
          loS[j] = loA[j];
        }

        /* determine subset of my local patch to access */
        /* Output is in loA and hiA */
        if(pnga_patch_intersect(lo, hi, loA, hiA, ndim)){

          /* get data_ptr to corner of patch */
          /* ld are leading dimensions for block */
          pnga_access_block_ptr(g_a, i, &data_ptr, ld);

          /* Check for partial overlap */
          chk = 1;
          for (j=0; j<ndim; j++) {
            if (loS[j] < loA[j]) {
              chk=0;
              break;
            }
          }
          if (!chk) {
            /* Evaluate additional offset for pointer */
            offset = 0;
            jtmp = 1;
            for (j=0; j<ndim-1; j++) {
              offset += (loA[j]-loS[j])*jtmp;
              jtmp *= ld[j];
            }
            offset += (loA[ndim-1]-loS[ndim-1])*jtmp;
            switch (type){
              case C_INT:
                data_ptr = (void*)((int*)data_ptr + offset);
                break;
              case C_DCPL:
                data_ptr = (void*)((double*)data_ptr + 2*offset);
                break;
              case C_SCPL:
                data_ptr = (void*)((float*)data_ptr + 2*offset);
                break;
              case C_DBL:
                data_ptr = (void*)((double*)data_ptr + offset);
                break;
              case C_FLOAT:
                data_ptr = (void*)((float*)data_ptr + offset);
                break;
              case C_LONG:
                data_ptr = (void*)((long*)data_ptr + offset);
                break;
              case C_LONGLONG:
                data_ptr = (void*)((long long*)data_ptr + offset);
                break;
              default: pnga_error(" wrong data type ",type);
            }
          }

          /* set all values in patch to *val */
          snga_set_patch_value(type, ndim, loA, hiA, ld, data_ptr, val);

          /* release access to the data */
          pnga_release_update_block(g_a, i);
        }
      }
    } else {
      /* using scalapack block-cyclic data distribution */
      Integer proc_index[MAXDIM], index[MAXDIM];
      Integer topology[MAXDIM];
      Integer blocks[MAXDIM], block_dims[MAXDIM];
      pnga_get_proc_index(g_a, me, proc_index);
      pnga_get_proc_index(g_a, me, index);
      pnga_get_block_info(g_a, blocks, block_dims);
      pnga_get_proc_grid(g_a, topology);
      while (index[ndim-1] < blocks[ndim-1]) {
        /* find bounding coordinates of block */
        chk = 1;
        for (i = 0; i < ndim; i++) {
          loA[i] = index[i]*block_dims[i]+1;
          hiA[i] = (index[i] + 1)*block_dims[i];
          if (hiA[i] > dims[i]) hiA[i] = dims[i];
          if (hiA[i] < loA[i]) chk = 0;
        }

        /* loA is changed by pnga_patch_intersect, so
           save a copy */
        for (j=0; j<ndim; j++) {
          loS[j] = loA[j];
        }

        /* determine subset of my local patch to access */
        /* Output is in loA and hiA */
        if(pnga_patch_intersect(lo, hi, loA, hiA, ndim)){

          /* get data_ptr to corner of patch */
          /* ld are leading dimensions for block */
          pnga_access_block_grid_ptr(g_a, index, &data_ptr, ld);

          /* Check for partial overlap */
          chk = 1;
          for (j=0; j<ndim; j++) {
            if (loS[j] < loA[j]) {
              chk=0;
              break;
            }
          }
          if (!chk) {
            /* Evaluate additional offset for pointer */
            offset = 0;
            jtmp = 1;
            for (j=0; j<ndim-1; j++) {
              offset += (loA[j]-loS[j])*jtmp;
              jtmp *= ld[j];
            }
            offset += (loA[ndim-1]-loS[ndim-1])*jtmp;
            switch (type){
              case C_INT:
                data_ptr = (void*)((int*)data_ptr + offset);
                break;
              case C_DCPL:
                data_ptr = (void*)((double*)data_ptr + 2*offset);
                break;
              case C_SCPL:
                data_ptr = (void*)((float*)data_ptr + 2*offset);
                break;
              case C_DBL:
                data_ptr = (void*)((double*)data_ptr + offset);
                break;
              case C_FLOAT:
                data_ptr = (void*)((float*)data_ptr + offset);
                break;
              case C_LONG:
                data_ptr = (void*)((long*)data_ptr + offset);
                break;
              case C_LONGLONG:
                data_ptr = (void*)((long long*)data_ptr + offset);
                break;
              default: pnga_error(" wrong data type ",type);
            }
          }

          /* set all values in patch to *val */
          snga_set_patch_value(type, ndim, loA, hiA, ld, data_ptr, val);

          /* release access to the data */
          pnga_release_update_block_grid(g_a, index);
        }
        /* increment index to get next block on processor */
        index[0] += topology[0];
        for (i = 0; i < ndim; i++) {
          if (index[i] >= blocks[i] && i<ndim-1) {
            index[i] = proc_index[i];
            index[i+1] += topology[i+1];
          }
        }
      }
    }
  }
#endif
  if(local_sync_end)pnga_sync();
}
1922
1923
/**
 * Multiply every element of a local n-dimensional patch in place by the
 * scalar pointed to by alpha.
 *
 * Traversal is identical to snga_set_patch_value: the column-major patch
 * is visited as contiguous rows along dimension 0, with cnt[] stepping
 * through the higher dimensions like an odometer and stride[] giving the
 * linear distance per step.  Complex types use explicit temporaries so
 * the in-place update reads each element fully before overwriting it.
 *
 * @param type         GA element type code (C_INT, C_DBL, ...)
 * @param ndim         number of dimensions in the patch
 * @param loA,hiA      inclusive patch bounds per dimension
 * @param ld           leading dimensions of the enclosing local block
 * @param src_data_ptr base address of the patch corner (modified)
 * @param alpha        pointer to the scale factor, interpreted per type
 */
static void snga_scale_patch_value(Integer type, Integer ndim, Integer *loA, Integer *hiA,
                                   Integer *ld, void *src_data_ptr, void *alpha)
{
    Integer nrows, row, d, col, base;
    Integer cnt[MAXDIM], period[MAXDIM], stride[MAXDIM];
    Integer rowlen = hiA[0] - loA[0] + 1;   /* contiguous run per row */

    /* number of rows = product of the extents of dimensions 1..ndim-1 */
    nrows = 1;
    for (d = 1; d < ndim; d++) nrows *= (hiA[d] - loA[d] + 1);

    /* stride[k] = ld[0]*...*ld[k]; period[d] tells after how many rows
     * the coordinate cnt[d] advances */
    cnt[0] = 0; cnt[1] = 0;
    period[0] = 1; period[1] = 1;
    stride[0] = ld[0]; stride[1] = stride[0] * ld[1];
    for (d = 2; d < ndim; d++) {
        cnt[d] = 0;
        period[d] = period[d-1] * (hiA[d-1] - loA[d-1] + 1);
        stride[d] = stride[d-1] * ld[d];
    }

    /* scale local part of the array, one type-specific loop per case */
    switch (type) {
    case C_DBL:
        for (row = 0; row < nrows; row++) {
            base = 0;
            for (d = 1; d < ndim; d++) {
                base += cnt[d] * stride[d-1];
                if ((row + 1) % period[d] == 0) cnt[d]++;
                if (cnt[d] > hiA[d] - loA[d]) cnt[d] = 0;
            }
            {
                double *p = (double *)src_data_ptr + base;
                for (col = 0; col < rowlen; col++) p[col] *= *(double *)alpha;
            }
        }
        break;
    case C_DCPL:
        for (row = 0; row < nrows; row++) {
            base = 0;
            for (d = 1; d < ndim; d++) {
                base += cnt[d] * stride[d-1];
                if ((row + 1) % period[d] == 0) cnt[d]++;
                if (cnt[d] > hiA[d] - loA[d]) cnt[d] = 0;
            }
            {
                DoubleComplex *p = (DoubleComplex *)src_data_ptr + base;
                DoublePrecision ar, ai, br, bi;
                for (col = 0; col < rowlen; col++) {
                    /* read element and factor into temporaries before
                     * writing, since the product is computed in place */
                    ar = p[col].real;
                    ai = p[col].imag;
                    br = ((DoubleComplex *)alpha)->real;
                    bi = ((DoubleComplex *)alpha)->imag;
                    p[col].real = ar*br - ai*bi;
                    p[col].imag = bi*ar + ai*br;
                }
            }
        }
        break;
    case C_SCPL:
        for (row = 0; row < nrows; row++) {
            base = 0;
            for (d = 1; d < ndim; d++) {
                base += cnt[d] * stride[d-1];
                if ((row + 1) % period[d] == 0) cnt[d]++;
                if (cnt[d] > hiA[d] - loA[d]) cnt[d] = 0;
            }
            {
                SingleComplex *p = (SingleComplex *)src_data_ptr + base;
                float ar, ai, br, bi;
                for (col = 0; col < rowlen; col++) {
                    ar = p[col].real;
                    ai = p[col].imag;
                    br = ((SingleComplex *)alpha)->real;
                    bi = ((SingleComplex *)alpha)->imag;
                    p[col].real = ar*br - ai*bi;
                    p[col].imag = bi*ar + ai*br;
                }
            }
        }
        break;
    case C_INT:
        for (row = 0; row < nrows; row++) {
            base = 0;
            for (d = 1; d < ndim; d++) {
                base += cnt[d] * stride[d-1];
                if ((row + 1) % period[d] == 0) cnt[d]++;
                if (cnt[d] > hiA[d] - loA[d]) cnt[d] = 0;
            }
            {
                int *p = (int *)src_data_ptr + base;
                for (col = 0; col < rowlen; col++) p[col] *= *(int *)alpha;
            }
        }
        break;
    case C_LONG:
        for (row = 0; row < nrows; row++) {
            base = 0;
            for (d = 1; d < ndim; d++) {
                base += cnt[d] * stride[d-1];
                if ((row + 1) % period[d] == 0) cnt[d]++;
                if (cnt[d] > hiA[d] - loA[d]) cnt[d] = 0;
            }
            {
                long *p = (long *)src_data_ptr + base;
                for (col = 0; col < rowlen; col++) p[col] *= *(long *)alpha;
            }
        }
        break;
    case C_LONGLONG:
        for (row = 0; row < nrows; row++) {
            base = 0;
            for (d = 1; d < ndim; d++) {
                base += cnt[d] * stride[d-1];
                if ((row + 1) % period[d] == 0) cnt[d]++;
                if (cnt[d] > hiA[d] - loA[d]) cnt[d] = 0;
            }
            {
                long long *p = (long long *)src_data_ptr + base;
                for (col = 0; col < rowlen; col++) p[col] *= *(long long *)alpha;
            }
        }
        break;
    case C_FLOAT:
        for (row = 0; row < nrows; row++) {
            base = 0;
            for (d = 1; d < ndim; d++) {
                base += cnt[d] * stride[d-1];
                if ((row + 1) % period[d] == 0) cnt[d]++;
                if (cnt[d] > hiA[d] - loA[d]) cnt[d] = 0;
            }
            {
                float *p = (float *)src_data_ptr + base;
                for (col = 0; col < rowlen; col++) p[col] *= *(float *)alpha;
            }
        }
        break;
    default: pnga_error(" wrong data type ",type);
    }
}
2062
2063
2064 /*\ SCALE ARRAY
2065 \*/
2066 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2067 # pragma weak wnga_scale_patch = pnga_scale_patch
2068 #endif
/**
 * Scale every element of a patch of a global array by a constant:
 *   g_a[lo:hi] *= *alpha
 *
 * @param g_a    handle of the global array
 * @param lo     lower corner of the patch (1-based global indices, ndim entries)
 * @param hi     upper corner of the patch (inclusive)
 * @param alpha  pointer to a scalar of the array's element type
 *
 * Collective when sync masking is enabled (honors _ga_sync_begin/_ga_sync_end
 * and resets the mask, matching the other patch operations in this file).
 *
 * Each process scales only the portion of the patch it holds locally, so no
 * communication is needed beyond the optional barriers.
 */
void pnga_scale_patch(Integer g_a, Integer *lo, Integer *hi, void *alpha)
{
    Integer ndim, dims[MAXDIM], type;
    Integer loA[MAXDIM], hiA[MAXDIM];
    Integer ld[MAXDIM];
    char *src_data_ptr;
    int local_sync_begin, local_sync_end;
    _iterator_hdl hdl;

    local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
    _ga_sync_begin = 1; _ga_sync_end = 1; /*remove any previous masking*/
    if(local_sync_begin)pnga_sync();

    pnga_inquire(g_a, &type, &ndim, dims);

    /* The local iterator visits every block of g_a held on this process and
       handles regular, simple block-cyclic and scalapack distributions
       uniformly, so no distribution-specific code paths are needed here. */
    pnga_local_iterator_init(g_a, &hdl);
    while (pnga_local_iterator_next(&hdl,loA,hiA,&src_data_ptr,ld)) {
      Integer offset, j, jtmp, chk;
      Integer loS[MAXDIM];
      /* loA is changed by pnga_patch_intersect, so
         save a copy */
      for (j=0; j<ndim; j++) {
        loS[j] = loA[j];
      }

      /* determine subset of my local patch to access */
      /* Output is in loA and hiA */
      if(pnga_patch_intersect(lo, hi, loA, hiA, ndim)){

        /* Check for partial overlap: if the requested patch starts inside
           the locally held block, the data pointer must be advanced to the
           first element of the intersection */
        chk = 1;
        for (j=0; j<ndim; j++) {
          if (loS[j] < loA[j]) {
            chk=0;
            break;
          }
        }
        if (!chk) {
          /* Evaluate additional element offset for pointer; ld[] are the
             leading dimensions of the local block (column-major layout) */
          offset = 0;
          jtmp = 1;
          for (j=0; j<ndim-1; j++) {
            offset += (loA[j]-loS[j])*jtmp;
            jtmp *= ld[j];
          }
          offset += (loA[ndim-1]-loS[ndim-1])*jtmp;
          switch (type){
            case C_INT:
              src_data_ptr = (void*)((int*)src_data_ptr + offset);
              break;
            case C_DCPL:
              /* complex elements occupy two scalars, hence 2*offset */
              src_data_ptr = (void*)((double*)src_data_ptr + 2*offset);
              break;
            case C_SCPL:
              src_data_ptr = (void*)((float*)src_data_ptr + 2*offset);
              break;
            case C_DBL:
              src_data_ptr = (void*)((double*)src_data_ptr + offset);
              break;
            case C_FLOAT:
              src_data_ptr = (void*)((float*)src_data_ptr + offset);
              break;
            case C_LONG:
              src_data_ptr = (void*)((long*)src_data_ptr + offset);
              break;
            case C_LONGLONG:
              src_data_ptr = (void*)((long long*)src_data_ptr + offset);
              break;
            default: pnga_error(" wrong data type ",type);
          }
        }

        /* scale all values in the local sub-patch */
        snga_scale_patch_value(type, ndim, loA, hiA, ld, src_data_ptr, alpha);
      }

    }
    if(local_sync_end)pnga_sync();
}
2334
2335 /*\ Utility function to accumulate patch values C = C + alpha*A
2336 \*/
/**
 * Accumulate patch values: C = C + alpha*A, element-wise over a local
 * n-dimensional patch.
 *
 * The patch is traversed as a sequence of contiguous 1-d "rows" along the
 * first dimension; cursor/period/stride below implement the usual GA
 * bvalue/bunit/baseld row bookkeeping for the higher dimensions.
 *
 * @param type   GA element type code (C_DBL, C_DCPL, ...)
 * @param alpha  pointer to the scaling scalar (same type as the arrays)
 * @param ndim   number of dimensions of the patch
 * @param loC    lower corner of the patch
 * @param hiC    upper corner of the patch (inclusive)
 * @param ldC    leading dimensions of the local block holding the patch
 * @param A_ptr  base of the local A data (read-only)
 * @param C_ptr  base of the local C data (updated in place)
 */
static void snga_acc_patch_values(Integer type, void* alpha, Integer ndim,
                                  Integer *loC, Integer *hiC, Integer *ldC,
                                  void *A_ptr, void *C_ptr)
{
    Integer cursor[MAXDIM];  /* current index along each higher dimension  */
    Integer period[MAXDIM];  /* #rows processed before cursor[d] advances  */
    Integer stride[MAXDIM];  /* cumulative products of leading dimensions  */
    Integer base;            /* element offset of the current row          */
    Integer nrows, rowlen;
    Integer r, d, k;

    /* number of first-dimension rows spanned by the higher dimensions */
    nrows = 1;
    for (d = 1; d < ndim; d++) nrows *= (hiC[d] - loC[d] + 1);
    rowlen = hiC[0] - loC[0] + 1;

    /* stride[0] = ldC[0], stride[1] = ldC[0]*ldC[1], ... */
    cursor[0] = 0; cursor[1] = 0;
    period[0] = 1; period[1] = 1;
    stride[0] = ldC[0];
    stride[1] = stride[0] * ldC[1];
    for (d = 2; d < ndim; d++) {
        cursor[d] = 0;
        period[d] = period[d-1] * (hiC[d-1] - loC[d-1] + 1);
        stride[d] = stride[d-1] * ldC[d];
    }

    switch (type) {
    case C_DBL: {
            double fac = *(double*)alpha;
            double *a = (double*)A_ptr;
            double *c = (double*)C_ptr;
            for (r = 0; r < nrows; r++) {
                base = 0;
                for (d = 1; d < ndim; d++) {
                    base += cursor[d] * stride[d-1];
                    if (((r+1) % period[d]) == 0) cursor[d]++;
                    if (cursor[d] > (hiC[d]-loC[d])) cursor[d] = 0;
                }
                for (k = 0; k < rowlen; k++)
                    c[base+k] += fac * a[base+k];
            }
        }
        break;
    case C_DCPL: {
            DoubleComplex fac = *(DoubleComplex*)alpha;
            DoubleComplex *a = (DoubleComplex*)A_ptr;
            DoubleComplex *c = (DoubleComplex*)C_ptr;
            for (r = 0; r < nrows; r++) {
                base = 0;
                for (d = 1; d < ndim; d++) {
                    base += cursor[d] * stride[d-1];
                    if (((r+1) % period[d]) == 0) cursor[d]++;
                    if (cursor[d] > (hiC[d]-loC[d])) cursor[d] = 0;
                }
                for (k = 0; k < rowlen; k++) {
                    /* complex multiply-accumulate: c += fac*a */
                    DoubleComplex av = a[base+k];
                    c[base+k].real = c[base+k].real
                        + fac.real*av.real - fac.imag*av.imag;
                    c[base+k].imag = c[base+k].imag
                        + fac.real*av.imag + fac.imag*av.real;
                }
            }
        }
        break;
    case C_SCPL: {
            SingleComplex fac = *(SingleComplex*)alpha;
            SingleComplex *a = (SingleComplex*)A_ptr;
            SingleComplex *c = (SingleComplex*)C_ptr;
            for (r = 0; r < nrows; r++) {
                base = 0;
                for (d = 1; d < ndim; d++) {
                    base += cursor[d] * stride[d-1];
                    if (((r+1) % period[d]) == 0) cursor[d]++;
                    if (cursor[d] > (hiC[d]-loC[d])) cursor[d] = 0;
                }
                for (k = 0; k < rowlen; k++) {
                    /* complex multiply-accumulate: c += fac*a */
                    SingleComplex av = a[base+k];
                    c[base+k].real = c[base+k].real
                        + fac.real*av.real - fac.imag*av.imag;
                    c[base+k].imag = c[base+k].imag
                        + fac.real*av.imag + fac.imag*av.real;
                }
            }
        }
        break;
    case C_INT: {
            int fac = *(int*)alpha;
            int *a = (int*)A_ptr;
            int *c = (int*)C_ptr;
            for (r = 0; r < nrows; r++) {
                base = 0;
                for (d = 1; d < ndim; d++) {
                    base += cursor[d] * stride[d-1];
                    if (((r+1) % period[d]) == 0) cursor[d]++;
                    if (cursor[d] > (hiC[d]-loC[d])) cursor[d] = 0;
                }
                for (k = 0; k < rowlen; k++)
                    c[base+k] += fac * a[base+k];
            }
        }
        break;
    case C_FLOAT: {
            float fac = *(float*)alpha;
            float *a = (float*)A_ptr;
            float *c = (float*)C_ptr;
            for (r = 0; r < nrows; r++) {
                base = 0;
                for (d = 1; d < ndim; d++) {
                    base += cursor[d] * stride[d-1];
                    if (((r+1) % period[d]) == 0) cursor[d]++;
                    if (cursor[d] > (hiC[d]-loC[d])) cursor[d] = 0;
                }
                for (k = 0; k < rowlen; k++)
                    c[base+k] += fac * a[base+k];
            }
        }
        break;
    case C_LONG: {
            long fac = *(long*)alpha;
            long *a = (long*)A_ptr;
            long *c = (long*)C_ptr;
            for (r = 0; r < nrows; r++) {
                base = 0;
                for (d = 1; d < ndim; d++) {
                    base += cursor[d] * stride[d-1];
                    if (((r+1) % period[d]) == 0) cursor[d]++;
                    if (cursor[d] > (hiC[d]-loC[d])) cursor[d] = 0;
                }
                for (k = 0; k < rowlen; k++)
                    c[base+k] += fac * a[base+k];
            }
        }
        break;
    case C_LONGLONG: {
            long long fac = *(long long*)alpha;
            long long *a = (long long*)A_ptr;
            long long *c = (long long*)C_ptr;
            for (r = 0; r < nrows; r++) {
                base = 0;
                for (d = 1; d < ndim; d++) {
                    base += cursor[d] * stride[d-1];
                    if (((r+1) % period[d]) == 0) cursor[d]++;
                    if (cursor[d] > (hiC[d]-loC[d])) cursor[d] = 0;
                }
                for (k = 0; k < rowlen; k++)
                    c[base+k] += fac * a[base+k];
            }
        }
        break;
    default: pnga_error(" wrong data type ",type);
    }
}
2478
2479 /*\ Utility function to add patch values together
2480 \*/
/**
 * Compute the element-wise scaled sum C = alpha*A + beta*B over a local
 * n-dimensional patch.
 *
 * All three data pointers are assumed to address identically shaped local
 * patches with the same leading dimensions ldC (the callers in this file
 * arrange for A and B to be copies distributed like C before calling).
 *
 * type   - GA element type code (C_DBL, C_DCPL, ...)
 * alpha  - pointer to scalar multiplying A (same type as the arrays)
 * beta   - pointer to scalar multiplying B
 * ndim   - number of patch dimensions
 * loC    - lower corner of the patch
 * hiC    - upper corner of the patch (inclusive)
 * ldC    - leading dimensions of the local block (column-major)
 * A_ptr  - base of local A data (read-only)
 * B_ptr  - base of local B data (read-only)
 * C_ptr  - base of local C data (written)
 */
static void snga_add_patch_values(Integer type, void* alpha, void *beta,
                                  Integer ndim, Integer *loC, Integer *hiC, Integer *ldC,
                                  void *A_ptr, void *B_ptr, void *C_ptr)
{
    /* bvalue[d]  - current index along dimension d (d >= 1)
       bunit[d]   - number of first-dimension rows before bvalue[d] advances
       baseldC[d] - cumulative product of leading dimensions up to d */
    Integer bvalue[MAXDIM], bunit[MAXDIM], baseldC[MAXDIM];
    Integer idx, n1dim;
    Integer i, j;
    /* compute "local" add */

    /* number of n-element of the first dimension */
    n1dim = 1; for(i=1; i<ndim; i++) n1dim *= (hiC[i] - loC[i] + 1);

    /* calculate the destination indices */
    bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
    /* baseld[0] = ld[0]
     * baseld[1] = ld[0] * ld[1]
     * baseld[2] = ld[0] * ld[1] * ld[2] .....
     */
    baseldC[0] = ldC[0]; baseldC[1] = baseldC[0] *ldC[1];
    for(i=2; i<ndim; i++) {
        bvalue[i] = 0;
        bunit[i] = bunit[i-1] * (hiC[i-1] - loC[i-1] + 1);
        baseldC[i] = baseldC[i-1] * ldC[i];
    }

    /* The patch is processed as n1dim contiguous runs along the first
       dimension; idx is the element offset of the start of run i within
       the local block, and the inner j-loop sweeps one run. */
    switch(type){
        case C_DBL:
            for(i=0; i<n1dim; i++) {
                idx = 0;
                for(j=1; j<ndim; j++) {
                    idx += bvalue[j] * baseldC[j-1];
                    if(((i+1) % bunit[j]) == 0) bvalue[j]++;
                    if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
                }

                for(j=0; j<(hiC[0]-loC[0]+1); j++)
                    ((double*)C_ptr)[idx+j] =
                        *(double*)alpha *
                        ((double*)A_ptr)[idx+j] +
                        *(double*)beta *
                        ((double*)B_ptr)[idx+j];
            }
            break;
        case C_DCPL:
            for(i=0; i<n1dim; i++) {
                idx = 0;
                for(j=1; j<ndim; j++) {
                    idx += bvalue[j] * baseldC[j-1];
                    if(((i+1) % bunit[j]) == 0) bvalue[j]++;
                    if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
                }

                for(j=0; j<(hiC[0]-loC[0]+1); j++) {
                    /* complex arithmetic: c = x*a + y*b, expanded into
                       real/imaginary parts */
                    DoubleComplex a = ((DoubleComplex *)A_ptr)[idx+j];
                    DoubleComplex b = ((DoubleComplex *)B_ptr)[idx+j];
                    DoubleComplex x= *(DoubleComplex*)alpha;
                    DoubleComplex y= *(DoubleComplex*)beta;
                    ((DoubleComplex *)C_ptr)[idx+j].real = x.real*a.real -
                        x.imag*a.imag + y.real*b.real - y.imag*b.imag;
                    ((DoubleComplex *)C_ptr)[idx+j].imag = x.real*a.imag +
                        x.imag*a.real + y.real*b.imag + y.imag*b.real;
                }
            }
            break;
        case C_SCPL:
            for(i=0; i<n1dim; i++) {
                idx = 0;
                for(j=1; j<ndim; j++) {
                    idx += bvalue[j] * baseldC[j-1];
                    if(((i+1) % bunit[j]) == 0) bvalue[j]++;
                    if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
                }

                for(j=0; j<(hiC[0]-loC[0]+1); j++) {
                    /* complex arithmetic: c = x*a + y*b, expanded into
                       real/imaginary parts */
                    SingleComplex a = ((SingleComplex *)A_ptr)[idx+j];
                    SingleComplex b = ((SingleComplex *)B_ptr)[idx+j];
                    SingleComplex x= *(SingleComplex*)alpha;
                    SingleComplex y= *(SingleComplex*)beta;
                    ((SingleComplex *)C_ptr)[idx+j].real = x.real*a.real -
                        x.imag*a.imag + y.real*b.real - y.imag*b.imag;
                    ((SingleComplex *)C_ptr)[idx+j].imag = x.real*a.imag +
                        x.imag*a.real + y.real*b.imag + y.imag*b.real;
                }
            }
            break;
        case C_INT:
            for(i=0; i<n1dim; i++) {
                idx = 0;
                for(j=1; j<ndim; j++) {
                    idx += bvalue[j] * baseldC[j-1];
                    if(((i+1) % bunit[j]) == 0) bvalue[j]++;
                    if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
                }

                for(j=0; j<(hiC[0]-loC[0]+1); j++)
                    ((int*)C_ptr)[idx+j] = *(int *)alpha *
                        ((int*)A_ptr)[idx+j] + *(int*)beta *
                        ((int*)B_ptr)[idx+j];
            }
            break;
        case C_FLOAT:
            for(i=0; i<n1dim; i++) {
                idx = 0;
                for(j=1; j<ndim; j++) {
                    idx += bvalue[j] * baseldC[j-1];
                    if(((i+1) % bunit[j]) == 0) bvalue[j]++;
                    if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
                }

                for(j=0; j<(hiC[0]-loC[0]+1); j++)
                    ((float *)C_ptr)[idx+j] = *(float *)alpha *
                        ((float *)A_ptr)[idx+j] + *(float *)beta *
                        ((float *)B_ptr)[idx+j];
            }
            break;
        case C_LONG:
            for(i=0; i<n1dim; i++) {
                idx = 0;
                for(j=1; j<ndim; j++) {
                    idx += bvalue[j] * baseldC[j-1];
                    if(((i+1) % bunit[j]) == 0) bvalue[j]++;
                    if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
                }

                for(j=0; j<(hiC[0]-loC[0]+1); j++)
                    ((long *)C_ptr)[idx+j] = *(long *)alpha *
                        ((long *)A_ptr)[idx+j] + *(long *)beta *
                        ((long *)B_ptr)[idx+j];
            }
            break;
        case C_LONGLONG:
            for(i=0; i<n1dim; i++) {
                idx = 0;
                for(j=1; j<ndim; j++) {
                    idx += bvalue[j] * baseldC[j-1];
                    if(((i+1) % bunit[j]) == 0) bvalue[j]++;
                    if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
                }

                for(j=0; j<(hiC[0]-loC[0]+1); j++)
                    ((long long*)C_ptr)[idx+j] = *(long long*)alpha *
                        ((long long*)A_ptr)[idx+j] + *(long long*)beta *
                        ((long long*)B_ptr)[idx+j];
            }
            break;
        default: pnga_error(" wrong data type ",type);
    }
}
2629
2630
2631 /*\ SCALED ADDITION of two patches
2632 \*/
2633 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2634 # pragma weak wnga_add_patch = pnga_add_patch
2635 #endif
pnga_add_patch(alpha,g_a,alo,ahi,beta,g_b,blo,bhi,g_c,clo,chi)2636 void pnga_add_patch(alpha, g_a, alo, ahi, beta, g_b, blo, bhi,
2637 g_c, clo, chi)
2638 Integer g_a, *alo, *ahi; /* patch of g_a */
2639 Integer g_b, *blo, *bhi; /* patch of g_b */
2640 Integer g_c, *clo, *chi; /* patch of g_c */
2641 void *alpha, *beta;
2642 {
2643 Integer i, j;
2644 Integer compatible_a, compatible_b;
2645 Integer atype, btype, ctype;
2646 Integer andim, adims[MAXDIM], bndim, bdims[MAXDIM], cndim, cdims[MAXDIM];
2647 Integer loA[MAXDIM], hiA[MAXDIM], ldA[MAXDIM];
2648 Integer loB[MAXDIM], hiB[MAXDIM], ldB[MAXDIM];
2649 Integer loC[MAXDIM], hiC[MAXDIM], ldC[MAXDIM];
2650 char *A_ptr, *B_ptr, *C_ptr;
2651 Integer n1dim;
2652 Integer atotal, btotal;
2653 Integer g_A = g_a, g_B = g_b;
2654 Integer me= pnga_nodeid(), A_created=0, B_created=0;
2655 Integer nproc = pnga_nnodes();
2656 Integer num_blocks_a, num_blocks_b, num_blocks_c;
2657 char *tempname = "temp", notrans='n';
2658 int local_sync_begin,local_sync_end;
2659
2660 local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2661 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2662 if(local_sync_begin)pnga_sync();
2663
2664
2665 pnga_inquire(g_a, &atype, &andim, adims);
2666 pnga_inquire(g_b, &btype, &bndim, bdims);
2667 pnga_inquire(g_c, &ctype, &cndim, cdims);
2668
2669 if(atype != btype || atype != ctype ) pnga_error(" types mismatch ", 0L);
2670
2671 /* check if patch indices and dims match */
2672 for(i=0; i<andim; i++)
2673 if(alo[i] <= 0 || ahi[i] > adims[i])
2674 pnga_error("g_a indices out of range ", g_a);
2675 for(i=0; i<bndim; i++)
2676 if(blo[i] <= 0 || bhi[i] > bdims[i])
2677 pnga_error("g_b indices out of range ", g_b);
2678 for(i=0; i<cndim; i++)
2679 if(clo[i] <= 0 || chi[i] > cdims[i])
2680 pnga_error("g_c indices out of range ", g_c);
2681
2682 /* check if numbers of elements in patches match each other */
2683 n1dim = 1; for(i=0; i<cndim; i++) n1dim *= (chi[i] - clo[i] + 1);
2684 atotal = 1; for(i=0; i<andim; i++) atotal *= (ahi[i] - alo[i] + 1);
2685 btotal = 1; for(i=0; i<bndim; i++) btotal *= (bhi[i] - blo[i] + 1);
2686
2687 if((atotal != n1dim) || (btotal != n1dim))
2688 pnga_error(" capacities of patches do not match ", 0L);
2689
2690 num_blocks_a = pnga_total_blocks(g_a);
2691 num_blocks_b = pnga_total_blocks(g_b);
2692 num_blocks_c = pnga_total_blocks(g_c);
2693
2694 if (num_blocks_a < 0 && num_blocks_b < 0 && num_blocks_c < 0) {
2695 /* find out coordinates of patches of g_a, g_b and g_c that I own */
2696 pnga_distribution( g_A, me, loA, hiA);
2697 pnga_distribution( g_B, me, loB, hiB);
2698 pnga_distribution( g_c, me, loC, hiC);
2699
2700 /* test if the local portion of patches matches */
2701 if(pnga_comp_patch(andim, loA, hiA, cndim, loC, hiC) &&
2702 pnga_comp_patch(andim, alo, ahi, cndim, clo, chi)) compatible_a = 1;
2703 else compatible_a = 0;
2704 /* pnga_gop(pnga_type_f2c(MT_F_INT), &compatible_a, 1, "*"); */
2705 pnga_gop(pnga_type_f2c(MT_F_INT), &compatible_a, 1, "&&");
2706 if(pnga_comp_patch(bndim, loB, hiB, cndim, loC, hiC) &&
2707 pnga_comp_patch(bndim, blo, bhi, cndim, clo, chi)) compatible_b = 1;
2708 else compatible_b = 0;
2709 /* pnga_gop(pnga_type_f2c(MT_F_INT), &compatible_b, 1, "*"); */
2710 pnga_gop(pnga_type_f2c(MT_F_INT), &compatible_b, 1, "&&");
2711 if (compatible_a && compatible_b) {
2712 if(andim > bndim) cndim = bndim;
2713 if(andim < bndim) cndim = andim;
2714
2715 if(!pnga_comp_patch(andim, loA, hiA, cndim, loC, hiC))
2716 pnga_error(" A patch mismatch ", g_A);
2717 if(!pnga_comp_patch(bndim, loB, hiB, cndim, loC, hiC))
2718 pnga_error(" B patch mismatch ", g_B);
2719
2720 /* determine subsets of my patches to access */
2721 if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)){
2722 pnga_access_ptr(g_A, loC, hiC, &A_ptr, ldA);
2723 pnga_access_ptr(g_B, loC, hiC, &B_ptr, ldB);
2724 pnga_access_ptr(g_c, loC, hiC, &C_ptr, ldC);
2725
2726 snga_add_patch_values(atype, alpha, beta, cndim,
2727 loC, hiC, ldC, A_ptr, B_ptr, C_ptr);
2728
2729 /* release access to the data */
2730 pnga_release (g_A, loC, hiC);
2731 pnga_release (g_B, loC, hiC);
2732 pnga_release_update(g_c, loC, hiC);
2733 }
2734 } else if (!compatible_a && compatible_b) {
2735 /* either patches or distributions do not match:
2736 * - create a temp array that matches distribution of g_c
2737 * - do C<= A
2738 */
2739 if(g_b != g_c) {
2740 pnga_copy_patch(¬rans, g_a, alo, ahi, g_c, clo, chi);
2741 andim = cndim;
2742 g_A = g_c;
2743 pnga_distribution(g_A, me, loA, hiA);
2744 } else {
2745 if (!pnga_duplicate(g_c, &g_A, tempname))
2746 pnga_error("ga_dadd_patch: dup failed", 0L);
2747 pnga_copy_patch(¬rans, g_a, alo, ahi, g_A, clo, chi);
2748 andim = cndim;
2749 A_created = 1;
2750 pnga_distribution(g_A, me, loA, hiA);
2751 }
2752 if(andim > bndim) cndim = bndim;
2753 if(andim < bndim) cndim = andim;
2754
2755 if(!pnga_comp_patch(andim, loA, hiA, cndim, loC, hiC))
2756 pnga_error(" A patch mismatch ", g_A);
2757 if(!pnga_comp_patch(bndim, loB, hiB, cndim, loC, hiC))
2758 pnga_error(" B patch mismatch ", g_B);
2759
2760 /* determine subsets of my patches to access */
2761 if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)){
2762 pnga_access_ptr(g_A, loC, hiC, &A_ptr, ldA);
2763 pnga_access_ptr(g_B, loC, hiC, &B_ptr, ldB);
2764 pnga_access_ptr(g_c, loC, hiC, &C_ptr, ldC);
2765
2766 snga_add_patch_values(atype, alpha, beta, cndim,
2767 loC, hiC, ldC, A_ptr, B_ptr, C_ptr);
2768
2769 /* release access to the data */
2770 pnga_release (g_A, loC, hiC);
2771 pnga_release (g_B, loC, hiC);
2772 pnga_release_update(g_c, loC, hiC);
2773 }
2774 } else if (compatible_a && !compatible_b) {
2775 /* either patches or distributions do not match:
2776 * - create a temp array that matches distribution of g_c
2777 * - copy & reshape patch of g_b into g_B
2778 */
2779 if (!pnga_duplicate(g_c, &g_B, tempname))
2780 pnga_error("ga_dadd_patch: dup failed", 0L);
2781 pnga_copy_patch(¬rans, g_b, blo, bhi, g_B, clo, chi);
2782 bndim = cndim;
2783 B_created = 1;
2784 pnga_distribution(g_B, me, loB, hiB);
2785
2786 if(andim > bndim) cndim = bndim;
2787 if(andim < bndim) cndim = andim;
2788
2789 if(!pnga_comp_patch(andim, loA, hiA, cndim, loC, hiC))
2790 pnga_error(" A patch mismatch ", g_A);
2791 if(!pnga_comp_patch(bndim, loB, hiB, cndim, loC, hiC))
2792 pnga_error(" B patch mismatch ", g_B);
2793
2794 /* determine subsets of my patches to access */
2795 if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)){
2796 pnga_access_ptr(g_A, loC, hiC, &A_ptr, ldA);
2797 pnga_access_ptr(g_B, loC, hiC, &B_ptr, ldB);
2798 pnga_access_ptr(g_c, loC, hiC, &C_ptr, ldC);
2799
2800 snga_add_patch_values(atype, alpha, beta, cndim,
2801 loC, hiC, ldC, A_ptr, B_ptr, C_ptr);
2802
2803 /* release access to the data */
2804 pnga_release (g_A, loC, hiC);
2805 pnga_release (g_B, loC, hiC);
2806 pnga_release_update(g_c, loC, hiC);
2807 }
2808 } else if (!compatible_a && !compatible_b) {
2809 /* there is no match between any of the global arrays */
2810 if (!pnga_duplicate(g_c, &g_B, tempname))
2811 pnga_error("ga_dadd_patch: dup failed", 0L);
2812 pnga_copy_patch(¬rans, g_b, blo, bhi, g_B, clo, chi);
2813 bndim = cndim;
2814 B_created = 1;
2815 if(andim > bndim) cndim = bndim;
2816 if(andim < bndim) cndim = andim;
2817 cndim = bndim;
2818 pnga_copy_patch(¬rans, g_a, alo, ahi, g_c, clo, chi);
2819 pnga_scale_patch(g_c, clo, chi, alpha);
2820 /* determine subsets of my patches to access */
2821 if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)){
2822 pnga_access_ptr(g_B, loC, hiC, &B_ptr, ldB);
2823 pnga_access_ptr(g_c, loC, hiC, &C_ptr, ldC);
2824
2825 snga_acc_patch_values(atype, beta, cndim, loC, hiC, ldC,
2826 B_ptr, C_ptr);
2827 /* release access to the data */
2828 pnga_release (g_B, loC, hiC);
2829 pnga_release_update(g_c, loC, hiC);
2830 }
2831 }
2832 } else {
2833 _iterator_hdl hdl_a, hdl_b, hdl_c;
2834 /* create copies of arrays A and B that are identically distributed
2835 as C*/
2836 if (!pnga_duplicate(g_c, &g_A, tempname))
2837 pnga_error("ga_dadd_patch: dup failed", 0L);
2838 pnga_copy_patch(¬rans, g_a, alo, ahi, g_A, clo, chi);
2839 andim = cndim;
2840 A_created = 1;
2841
2842 if (!pnga_duplicate(g_c, &g_B, tempname))
2843 pnga_error("ga_dadd_patch: dup failed", 0L);
2844 pnga_copy_patch(¬rans, g_b, blo, bhi, g_B, clo, chi);
2845 bndim = cndim;
2846 B_created = 1;
2847
2848 #if 1
2849 pnga_local_iterator_init(g_A, &hdl_a);
2850 pnga_local_iterator_init(g_B, &hdl_b);
2851 pnga_local_iterator_init(g_c, &hdl_c);
2852 while (pnga_local_iterator_next(&hdl_c,loC,hiC,&C_ptr,ldC)) {
2853 pnga_local_iterator_next(&hdl_a,loA,hiA,&A_ptr,ldA);
2854 pnga_local_iterator_next(&hdl_b,loB,hiB,&B_ptr,ldB);
2855 Integer idx, lod[MAXDIM]/*, hid[MAXDIM]*/;
2856 Integer offset, jtot, last;
2857 /* make temporary copies of loC and hiC since pnga_patch_intersect
2858 destroys original versions */
2859 for (j=0; j<cndim; j++) {
2860 lod[j] = loC[j];
2861 /*hid[j] = hiC[j];*/
2862 }
2863 if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)) {
2864
2865 /* evaluate offsets for system */
2866 offset = 0;
2867 last = cndim - 1;
2868 jtot = 1;
2869 for (j=0; j<last; j++) {
2870 offset += (loC[j] - lod[j])*jtot;
2871 jtot *= ldC[j];
2872 }
2873 offset += (loC[last]-lod[last])*jtot;
2874
2875 switch(ctype) {
2876 case C_DBL:
2877 A_ptr = (void*)((double*)(A_ptr) + offset);
2878 B_ptr = (void*)((double*)(B_ptr) + offset);
2879 C_ptr = (void*)((double*)(C_ptr) + offset);
2880 break;
2881 case C_INT:
2882 A_ptr = (void*)((int*)(A_ptr) + offset);
2883 B_ptr = (void*)((int*)(B_ptr) + offset);
2884 C_ptr = (void*)((int*)(C_ptr) + offset);
2885 break;
2886 case C_DCPL:
2887 A_ptr = (void*)((DoubleComplex*)(A_ptr) + offset);
2888 B_ptr = (void*)((DoubleComplex*)(B_ptr) + offset);
2889 C_ptr = (void*)((DoubleComplex*)(C_ptr) + offset);
2890 break;
2891 case C_SCPL:
2892 A_ptr = (void*)((SingleComplex*)(A_ptr) + offset);
2893 B_ptr = (void*)((SingleComplex*)(B_ptr) + offset);
2894 C_ptr = (void*)((SingleComplex*)(C_ptr) + offset);
2895 break;
2896 case C_FLOAT:
2897 A_ptr = (void*)((float*)(A_ptr) + offset);
2898 B_ptr = (void*)((float*)(B_ptr) + offset);
2899 C_ptr = (void*)((float*)(C_ptr) + offset);
2900 break;
2901 case C_LONG:
2902 A_ptr = (void*)((long*)(A_ptr) + offset);
2903 B_ptr = (void*)((long*)(B_ptr) + offset);
2904 C_ptr = (void*)((long*)(C_ptr) + offset);
2905 break;
2906 case C_LONGLONG:
2907 A_ptr = (void*)((long long*)(A_ptr) + offset);
2908 B_ptr = (void*)((long long*)(B_ptr) + offset);
2909 C_ptr = (void*)((long long*)(C_ptr) + offset);
2910 break;
2911 default:
2912 break;
2913 }
2914 snga_add_patch_values(atype, alpha, beta, cndim,
2915 loC, hiC, ldC, A_ptr, B_ptr, C_ptr);
2916 }
2917 }
2918 #else
2919 /* C is normally distributed so just add copies together for regular
2920 arrays */
2921 if (num_blocks_c < 0) {
2922 pnga_distribution(g_c, me, loC, hiC);
2923 if(andim > bndim) cndim = bndim;
2924 if(andim < bndim) cndim = andim;
2925 if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)){
2926 pnga_access_ptr(g_A, loC, hiC, &A_ptr, ldA);
2927 pnga_access_ptr(g_B, loC, hiC, &B_ptr, ldB);
2928 pnga_access_ptr(g_c, loC, hiC, &C_ptr, ldC);
2929
2930 snga_add_patch_values(atype, alpha, beta, cndim,
2931 loC, hiC, ldC, A_ptr, B_ptr, C_ptr);
2932
2933 /* release access to the data */
2934 pnga_release (g_A, loC, hiC);
2935 pnga_release (g_B, loC, hiC);
2936 pnga_release_update(g_c, loC, hiC);
2937 }
2938 } else {
2939 Integer idx, lod[MAXDIM]/*, hid[MAXDIM]*/;
2940 Integer offset, jtot, last;
2941 /* Simple block-cyclic data disribution */
2942 if (!pnga_uses_proc_grid(g_c)) {
2943 for (idx = me; idx < num_blocks_c; idx += nproc) {
2944 pnga_distribution(g_c, idx, loC, hiC);
2945 /* make temporary copies of loC and hiC since pnga_patch_intersect
2946 destroys original versions */
2947 for (j=0; j<cndim; j++) {
2948 lod[j] = loC[j];
2949 /*hid[j] = hiC[j];*/
2950 }
2951 if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)) {
2952 pnga_access_block_ptr(g_A, idx, &A_ptr, ldA);
2953 pnga_access_block_ptr(g_B, idx, &B_ptr, ldB);
2954 pnga_access_block_ptr(g_c, idx, &C_ptr, ldC);
2955
2956 /* evaluate offsets for system */
2957 offset = 0;
2958 last = cndim - 1;
2959 jtot = 1;
2960 for (j=0; j<last; j++) {
2961 offset += (loC[j] - lod[j])*jtot;
2962 jtot *= ldC[j];
2963 }
2964 offset += (loC[last]-lod[last])*jtot;
2965
2966 switch(ctype) {
2967 case C_DBL:
2968 A_ptr = (void*)((double*)(A_ptr) + offset);
2969 B_ptr = (void*)((double*)(B_ptr) + offset);
2970 C_ptr = (void*)((double*)(C_ptr) + offset);
2971 break;
2972 case C_INT:
2973 A_ptr = (void*)((int*)(A_ptr) + offset);
2974 B_ptr = (void*)((int*)(B_ptr) + offset);
2975 C_ptr = (void*)((int*)(C_ptr) + offset);
2976 break;
2977 case C_DCPL:
2978 A_ptr = (void*)((DoubleComplex*)(A_ptr) + offset);
2979 B_ptr = (void*)((DoubleComplex*)(B_ptr) + offset);
2980 C_ptr = (void*)((DoubleComplex*)(C_ptr) + offset);
2981 break;
2982 case C_SCPL:
2983 A_ptr = (void*)((SingleComplex*)(A_ptr) + offset);
2984 B_ptr = (void*)((SingleComplex*)(B_ptr) + offset);
2985 C_ptr = (void*)((SingleComplex*)(C_ptr) + offset);
2986 break;
2987 case C_FLOAT:
2988 A_ptr = (void*)((float*)(A_ptr) + offset);
2989 B_ptr = (void*)((float*)(B_ptr) + offset);
2990 C_ptr = (void*)((float*)(C_ptr) + offset);
2991 break;
2992 case C_LONG:
2993 A_ptr = (void*)((long*)(A_ptr) + offset);
2994 B_ptr = (void*)((long*)(B_ptr) + offset);
2995 C_ptr = (void*)((long*)(C_ptr) + offset);
2996 break;
2997 case C_LONGLONG:
2998 A_ptr = (void*)((long long*)(A_ptr) + offset);
2999 B_ptr = (void*)((long long*)(B_ptr) + offset);
3000 C_ptr = (void*)((long long*)(C_ptr) + offset);
3001 break;
3002 default:
3003 break;
3004 }
3005 snga_add_patch_values(atype, alpha, beta, cndim,
3006 loC, hiC, ldC, A_ptr, B_ptr, C_ptr);
3007
3008 /* release access to the data */
3009 pnga_release_block (g_A, idx);
3010 pnga_release_block (g_B, idx);
3011 pnga_release_update_block(g_c, idx);
3012 }
3013 }
3014 } else {
3015 /* Uses scalapack block-cyclic data distribution */
3016 Integer lod[MAXDIM]/*, hid[MAXDIM], chk*/;
3017 Integer proc_index[MAXDIM], index[MAXDIM];
3018 Integer topology[MAXDIM];
3019 Integer blocks[MAXDIM], block_dims[MAXDIM];
3020 pnga_get_proc_index(g_c, me, proc_index);
3021 pnga_get_proc_index(g_c, me, index);
3022 pnga_get_block_info(g_c, blocks, block_dims);
3023 pnga_get_proc_grid(g_c, topology);
3024 while (index[cndim-1] < blocks[cndim-1]) {
3025 /* find bounding coordinates of block */
3026 /*chk = 1;*/
3027 for (i = 0; i < cndim; i++) {
3028 loC[i] = index[i]*block_dims[i]+1;
3029 hiC[i] = (index[i] + 1)*block_dims[i];
3030 if (hiC[i] > cdims[i]) hiC[i] = cdims[i];
3031 /*if (hiC[i] < loC[i]) chk = 0;*/
3032 }
3033 /* make temporary copies of loC and hiC since pnga_patch_intersect
3034 destroys original versions */
3035 for (j=0; j<cndim; j++) {
3036 lod[j] = loC[j];
3037 /*hid[j] = hiC[j];*/
3038 }
3039 if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)) {
3040 pnga_access_block_grid_ptr(g_A, index, &A_ptr, ldA);
3041 pnga_access_block_grid_ptr(g_B, index, &B_ptr, ldB);
3042 pnga_access_block_grid_ptr(g_c, index, &C_ptr, ldC);
3043
3044 /* evaluate offsets for system */
3045 offset = 0;
3046 last = cndim - 1;
3047 jtot = 1;
3048 for (j=0; j<last; j++) {
3049 offset += (loC[j] - lod[j])*jtot;
3050 jtot *= ldC[j];
3051 }
3052 offset += (loC[last]-lod[last])*jtot;
3053
3054 switch(ctype) {
3055 case C_DBL:
3056 A_ptr = (void*)((double*)(A_ptr) + offset);
3057 B_ptr = (void*)((double*)(B_ptr) + offset);
3058 C_ptr = (void*)((double*)(C_ptr) + offset);
3059 break;
3060 case C_INT:
3061 A_ptr = (void*)((int*)(A_ptr) + offset);
3062 B_ptr = (void*)((int*)(B_ptr) + offset);
3063 C_ptr = (void*)((int*)(C_ptr) + offset);
3064 break;
3065 case C_DCPL:
3066 A_ptr = (void*)((DoubleComplex*)(A_ptr) + offset);
3067 B_ptr = (void*)((DoubleComplex*)(B_ptr) + offset);
3068 C_ptr = (void*)((DoubleComplex*)(C_ptr) + offset);
3069 break;
3070 case C_SCPL:
3071 A_ptr = (void*)((SingleComplex*)(A_ptr) + offset);
3072 B_ptr = (void*)((SingleComplex*)(B_ptr) + offset);
3073 C_ptr = (void*)((SingleComplex*)(C_ptr) + offset);
3074 break;
3075 case C_FLOAT:
3076 A_ptr = (void*)((float*)(A_ptr) + offset);
3077 B_ptr = (void*)((float*)(B_ptr) + offset);
3078 C_ptr = (void*)((float*)(C_ptr) + offset);
3079 break;
3080 case C_LONG:
3081 A_ptr = (void*)((long*)(A_ptr) + offset);
3082 B_ptr = (void*)((long*)(B_ptr) + offset);
3083 C_ptr = (void*)((long*)(C_ptr) + offset);
3084 break;
3085 case C_LONGLONG:
3086 A_ptr = (void*)((long long*)(A_ptr) + offset);
3087 B_ptr = (void*)((long long*)(B_ptr) + offset);
3088 C_ptr = (void*)((long long*)(C_ptr) + offset);
3089 break;
3090 default:
3091 break;
3092 }
3093 snga_add_patch_values(atype, alpha, beta, cndim,
3094 loC, hiC, ldC, A_ptr, B_ptr, C_ptr);
3095
3096 /* release access to the data */
3097 pnga_release_block_grid ( g_A, index);
3098 pnga_release_block_grid ( g_B, index);
3099 pnga_release_update_block_grid(g_c, index);
3100 }
3101
3102 /* increment index to get next block on processor */
3103 index[0] += topology[0];
3104 for (i = 0; i < cndim; i++) {
3105 if (index[i] >= blocks[i] && i<cndim-1) {
3106 index[i] = proc_index[i];
3107 index[i+1] += topology[i+1];
3108 }
3109 }
3110 }
3111 }
3112 }
3113 #endif
3114 }
3115
3116 if(A_created) pnga_destroy(g_A);
3117 if(B_created) pnga_destroy(g_B);
3118
3119 if(local_sync_end)pnga_sync();
3120 }
3121
3122
3123 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3124 # pragma weak wnga_zero_patch = pnga_zero_patch
3125 #endif
/**
 * Set every element of the patch [lo,hi] of global array g_a to zero.
 *
 * @param g_a  handle of the global array
 * @param lo   lower patch bounds, one entry per dimension
 * @param hi   upper patch bounds, one entry per dimension
 *
 * Selects a zero constant whose storage type matches the array's element
 * type, then delegates the actual fill to pnga_fill_patch.  Honors the
 * usual GA sync-masking protocol on entry and exit.
 */
void pnga_zero_patch(Integer g_a, Integer *lo, Integer *hi)
{
  Integer ndim, dims[MAXDIM], type;
  int ival = 0;
  long lval = 0;
  /* BUG FIX: was declared `long`.  For C_LONGLONG arrays pnga_fill_patch
   * reads sizeof(long long) bytes from this object; on LLP64 platforms
   * (e.g. 64-bit Windows, where long is 4 bytes and long long is 8) that
   * over-read is undefined behavior and can fill the patch with garbage
   * rather than zero.  A true long long matches the element type. */
  long long llval = 0;
  double dval = 0.0;
  DoubleComplex cval;
  SingleComplex cfval;
  float fval = 0.0;
  void *valptr = NULL;
  int local_sync_begin,local_sync_end;

  local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
  _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
  if(local_sync_begin)pnga_sync();


  pnga_inquire(g_a,  &type, &ndim, dims);

  /* pick the zero value whose in-memory representation matches `type` */
  switch (type){
    case C_INT:
      valptr = (void *)(&ival);
      break;
    case C_DBL:
      valptr = (void *)(&dval);
      break;
    case C_DCPL:
    {
      cval.real = 0.0; cval.imag = 0.0;
      valptr = (void *)(&cval);
      break;
    }
    case C_SCPL:
    {
      cfval.real = 0.0; cfval.imag = 0.0;
      valptr = (void *)(&cfval);
      break;
    }
    case C_FLOAT:
      valptr = (void *)(&fval);
      break;
    case C_LONG:
      valptr = (void *)(&lval);
      break;
    case C_LONGLONG:
      valptr = (void *)(&llval);
      break;
    default: pnga_error(" wrong data type ",type);
  }
  pnga_fill_patch(g_a, lo, hi, valptr);

  if(local_sync_end)pnga_sync();
}
3180