1 /* $Id: onesided.c,v 1.80.2.18 2007/12/18 22:22:27 d3g293 Exp $ */
2 /*
3 * module: onesided.c
4 * author: Jarek Nieplocha
5 * description: implements GA primitive communication operations --
6 * accumulate, scatter, gather, read&increment & synchronization
7 *
8 * DISCLAIMER
9 *
10 * This material was prepared as an account of work sponsored by an
11 * agency of the United States Government. Neither the United States
12 * Government nor the United States Department of Energy, nor Battelle,
13 * nor any of their employees, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
14 * ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
15 * COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
16 * SOFTWARE, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT
17 * INFRINGE PRIVATELY OWNED RIGHTS.
18 *
19 *
20 * ACKNOWLEDGMENT
21 *
22 * This software and its documentation were produced with United States
23 * Government support under Contract Number DE-AC06-76RLO-1830 awarded by
24 * the United States Department of Energy. The United States Government
25 * retains a paid-up non-exclusive, irrevocable worldwide license to
26 * reproduce, prepare derivative works, perform publicly and display
27 * publicly by or for the US Government, including the right to
28 * distribute to other US Government contractors.
29 */
30 #if HAVE_CONFIG_H
31 # include "config.h"
32 #endif
33
34 #define USE_GATSCAT_NEW
35
36 #if HAVE_STDIO_H
37 # include <stdio.h>
38 #endif
39 #if HAVE_STRING_H
40 # include <string.h>
41 #endif
42 #if HAVE_STDLIB_H
43 # include <stdlib.h>
44 #endif
45 #if HAVE_STDINT_H
46 # include <stdint.h>
47 #endif
48 #if HAVE_MATH_H
49 # include <math.h>
50 #endif
51 #if HAVE_ASSERT_H
52 # include <assert.h>
53 #endif
54 #if HAVE_STDDEF_H
55 #include <stddef.h>
56 #endif
57
58 #include "global.h"
59 #include "globalp.h"
60 #include "base.h"
61 #include "ga_iterator.h"
62 #include "armci.h"
63 #include "macdecls.h"
64 #include "ga-papi.h"
65 #include "ga-wapi.h"
66 #include "thread-safe.h"
67
#define DEBUG 0
#define USE_MALLOC 1
#define INVALID_MA_HANDLE -1
/* Round x to the nearest integer (result as a double), halves away from zero.
 * Fully parenthesized: the original expansion was a bare ternary, which
 * mis-bound when the macro was used as an operand (e.g. `1 - NEAR_INT(x)`
 * parsed as `(1 - x) < 0.0 ? ... : ...`). */
#define NEAR_INT(x) ((x) < 0.0 ? ceil((x) - 0.5) : floor((x) + 0.5))

#define BYTE_ADDRESSABLE_MEMORY
74
75 #ifdef PROFILE_OLD
76 #include "ga_profile.h"
77 #endif
78
79
80 #define DISABLE_NBOPT /* disables Non-Blocking OPTimization */
81
82 /*uncomment line below to verify consistency of MA in every sync */
83 /*#define CHECK_MA yes */
84
/* Per-process fence map: fence_array[p] is nonzero when a one-sided
 * operation targeting process p has been issued since the last fence and
 * must be completed by pnga_fence()/sync. Allocated in gai_init_onesided. */
char *fence_array;
static int GA_fence_set=0;   /* nesting count of pnga_init_fence() calls */

/* Preallocated scratch space used by the gather/scatter implementation */
static int GA_prealloc_gatscat = 0;
static Integer *GA_header;
static Integer *GA_list;
static Integer *GA_elems;

/* Strided copy helpers and non-blocking handle lookup supplied elsewhere */
extern void armci_read_strided(void*, int, int*, int*, char*);
extern void armci_write_strided(void*, int, int*, int*, char*);
extern armci_hdl_t* get_armci_nbhandle(Integer *);
96
97 /***************************************************************************/
98
/**
 * Collective: synchronize all processes in group grp_id.
 * Completes all outstanding one-sided operations (fence) and then performs
 * a message-based barrier on the group, so no member leaves before all
 * have arrived.  grp_id <= 0 selects the default/world group path.
 */
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
#   pragma weak wnga_pgroup_sync = pnga_pgroup_sync
#endif

void pnga_pgroup_sync(Integer grp_id)
{
#ifdef CHECK_MA
Integer status;
#endif

#ifdef USE_ARMCI_GROUP_FENCE
    /* ARMCI is group-aware: fence the whole group in one call */
    int grp = (int)grp_id;
    ARMCI_GroupFence(&grp);
    pnga_msg_pgroup_sync(grp_id);
#else
    if (grp_id > 0) {
#   ifdef MSG_COMMS_MPI
       /* fence on all processes in this group */
       { int p; for(p=0;p<PGRP_LIST[grp_id].map_nproc;p++)
          ARMCI_Fence(ARMCI_Absolute_id(&PGRP_LIST[grp_id].group, p)); }
       pnga_msg_pgroup_sync(grp_id);
       /* all operations complete: clear the per-process fence map */
       if(GA_fence_set)bzero(fence_array,(int)GAnproc);
       GA_fence_set=0;
#   else
       pnga_error("ga_pgroup_sync_(): MPI not defined. ga_msg_pgroup_sync_() can be called only if GA is built with MPI", 0);
#   endif
    } else {
       /* default/world group: fence everything */
       ARMCI_AllFence();
       pnga_msg_pgroup_sync(grp_id);
       if(GA_fence_set)bzero(fence_array,(int)GAnproc);
       GA_fence_set=0;
    }
#endif
#ifdef CHECK_MA
    /* optional MA heap verification on every sync (see CHECK_MA above) */
    status = MA_verify_allocator_stuff();
#endif
}
141
/**
 * Collective: synchronize all processes and complete all outstanding
 * communications.  Fences either the world group (default group == -1)
 * or delegates to pnga_pgroup_sync for the default processor group,
 * then performs a barrier.  Thread-safe via the internal GA lock.
 */
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
#   pragma weak wnga_sync = pnga_sync
#endif

void pnga_sync()
{
  GA_Internal_Threadsafe_Lock();
#ifdef CHECK_MA
  Integer status;
#endif

#ifdef USE_ARMCI_GROUP_FENCE
  if (GA_Default_Proc_Group == -1) {
    /* -1 denotes the world group */
    int grp_id = (int)GA_Default_Proc_Group;
    ARMCI_GroupFence(&grp_id);
    pnga_msg_sync();
  } else {
    /* NOTE(review): declared `int` but initialized with an (Integer) cast;
       pnga_pgroup_sync takes Integer.  Harmless while group ids fit in an
       int, but worth confirming on builds with 64-bit Integer. */
    int grp_id = (Integer)GA_Default_Proc_Group;
    pnga_pgroup_sync(grp_id);
  }
#else
  if (GA_Default_Proc_Group == -1) {
    ARMCI_AllFence();
    pnga_msg_sync();
    /* everything is complete: clear the per-process fence map */
    if(GA_fence_set)bzero(fence_array,(int)GAnproc);
    GA_fence_set=0;
  } else {
    Integer grp_id = (Integer)GA_Default_Proc_Group;
    pnga_pgroup_sync(grp_id);
  }
#endif
#ifdef CHECK_MA
  status = MA_verify_allocator_stuff();
#endif
  GA_Internal_Threadsafe_Unlock();
}
181
182
183 /**
184 * Wait until all requests initiated by calling process are completed
185 */
186 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
187 # pragma weak wnga_fence = pnga_fence
188 #endif
189
pnga_fence()190 void pnga_fence()
191 {
192 int proc;
193 if(GA_fence_set<1)pnga_error("ga_fence: fence not initialized",0);
194 GA_fence_set--;
195 for(proc=0;proc<GAnproc;proc++)if(fence_array[proc])ARMCI_Fence(proc);
196 bzero(fence_array,(int)GAnproc);
197 }
198
199 /**
200 * Initialize tracing of request completion
201 */
202 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
203 # pragma weak wnga_init_fence = pnga_init_fence
204 #endif
205
pnga_init_fence()206 void pnga_init_fence()
207 {
208 GA_fence_set++;
209 }
210
gai_init_onesided()211 void gai_init_onesided()
212 {
213 fence_array = calloc((size_t)GAnproc,1);
214 if(!fence_array) pnga_error("ga_init:calloc failed",0);
215 }
216
217
gai_finalize_onesided()218 void gai_finalize_onesided()
219 {
220 free(fence_array);
221 fence_array = NULL;
222 }
223
224
/*\ prepare permuted list of processes for remote ops
 *
 * Fills the in-scope array ProcListPerm[0..nproc-1] with a random
 * permutation of 0..nproc-1 so that remote operations are issued to
 * targets in a different order on each process, spreading contention.
 * rand() is reseeded with the caller's rank (GAme) on every expansion,
 * so each process gets its own deterministic sequence.
\*/
#define gaPermuteProcList(nproc)\
{\
  if((nproc) ==1) ProcListPerm[0]=0; \
  else{\
    int _i, iswap, temp;\
    if((nproc) > GAnproc) pnga_error("permute_proc: error ", (nproc));\
\
    /* every process generates different random sequence */\
    (void)srand((unsigned)GAme); \
\
    /* initialize list */\
    for(_i=0; _i< (nproc); _i++) ProcListPerm[_i]=_i;\
\
    /* list permutation generated by random swapping */\
    for(_i=0; _i< (nproc); _i++){ \
      iswap = (int)(rand() % (nproc)); \
      temp = ProcListPerm[iswap]; \
      ProcListPerm[iswap] = ProcListPerm[_i]; \
      ProcListPerm[_i] = temp; \
    } \
  }\
}
249
250
/*
 * gaShmemLocation(proc, g_a, _i, _j, ptr_loc, _pld)  -- 2-D arrays only.
 * Stores in *ptr_loc the address, within the memory of process `proc`, of
 * element (_i,_j) of global array g_a (ghost-cell widths included in the
 * offset), and in *_pld the physical leading dimension of that process's
 * local block.  Aborts via pnga_error if (_i,_j) is outside the block
 * owned by `proc`.
 * NOTE(review): when _p_handle != 0 the proc index is used directly
 * (optionally remapped through rstrctd_list), while _p_handle == 0 goes
 * through the group inverse map -- confirm this matches the processor
 * group conventions (the same pattern appears in gam_Location below).
 */
#define gaShmemLocation(proc, g_a, _i, _j, ptr_loc, _pld)                      \
{                                                                              \
Integer _ilo, _ihi, _jlo, _jhi, offset, proc_place, g_handle=(g_a)+GA_OFFSET;\
Integer _lo[2], _hi[2], _p_handle;                                           \
Integer _iw = GA[g_handle].width[0];                                         \
Integer _jw = GA[g_handle].width[1];                                         \
Integer _num_rstrctd = GA[g_handle].num_rstrctd;                             \
                                                                             \
ga_ownsM(g_handle, (proc),_lo,_hi);                                          \
_p_handle = GA[g_handle].p_handle;                                           \
if (_p_handle != 0) {                                                        \
proc_place = proc;                                                         \
if (_num_rstrctd != 0) {                                                   \
proc_place = GA[g_handle].rstrctd_list[proc_place];                      \
}                                                                          \
} else {                                                                     \
proc_place = PGRP_LIST[_p_handle].inv_map_proc_list[proc];                 \
}                                                                            \
_ilo = _lo[0]; _ihi=_hi[0];                                                  \
_jlo = _lo[1]; _jhi=_hi[1];                                                  \
                                                                             \
if((_i)<_ilo || (_i)>_ihi || (_j)<_jlo || (_j)>_jhi){                        \
char err_string[ERR_STR_LEN];                                              \
sprintf(err_string,"%s:p=%ld invalid i/j (%ld,%ld)><(%ld:%ld,%ld:%ld)",    \
"gaShmemLocation", (long)proc, (long)(_i),(long)(_j),              \
(long)_ilo, (long)_ihi, (long)_jlo, (long)_jhi);                   \
pnga_error(err_string, g_a );                                              \
}                                                                            \
offset = ((_i)-_ilo+_iw) + (_ihi-_ilo+1+2*_iw)*((_j)-_jlo+_jw);              \
                                                                             \
/* find location of the proc in current cluster pointer array */             \
*(ptr_loc) = GA[g_handle].ptr[proc_place] +                                  \
offset*GAsizeofM(GA[g_handle].type);                                         \
*(_pld) = _ihi-_ilo+1+2*_iw;                                                 \
}
286
/*\ Return pointer (ptr_loc) to location in memory of element with subscripts
 * (subscript). Also return physical dimensions of array in memory in ld.
 *
 * N-dimensional generalization of gaShmemLocation: computes the linear
 * offset of `subscript` inside the block of g_handle owned by `proc`
 * (Fortran/column-major order, ghost-cell widths included) after validating
 * the subscript with gaCheckSubscriptM.  ld[0..ndim-2] receives the
 * physical dimensions of the local block.
 * NOTE(review): the _p_handle != 0 vs == 0 mapping mirrors gaShmemLocation
 * above -- confirm against the processor-group conventions.
\*/
#define gam_Location(proc, g_handle,  subscript, ptr_loc, ld)                  \
{                                                                              \
Integer _offset=0, _d, _w, _factor=1, _last=GA[g_handle].ndim-1;             \
Integer _lo[MAXDIM], _hi[MAXDIM], _pinv, _p_handle;                          \
                                                                             \
ga_ownsM(g_handle, proc, _lo, _hi);                                          \
gaCheckSubscriptM(subscript, _lo, _hi, GA[g_handle].ndim);                   \
if(_last==0) ld[0]=_hi[0]- _lo[0]+1+2*(Integer)GA[g_handle].width[0];        \
__CRAYX1_PRAGMA("_CRI shortloop");                                           \
for(_d=0; _d < _last; _d++)            {                                     \
_w = (Integer)GA[g_handle].width[_d];                                      \
_offset += (subscript[_d]-_lo[_d]+_w) * _factor;                           \
ld[_d] = _hi[_d] - _lo[_d] + 1 + 2*_w;                                     \
_factor *= ld[_d];                                                         \
}                                                                            \
_offset += (subscript[_last]-_lo[_last]                                      \
+ (Integer)GA[g_handle].width[_last])                             \
* _factor;                                                        \
_p_handle = GA[g_handle].p_handle;                                           \
if (_p_handle != 0) {                                                        \
if (GA[g_handle].num_rstrctd == 0) {                                       \
_pinv = proc;                                                            \
} else {                                                                   \
_pinv = GA[g_handle].rstrctd_list[proc];                                 \
}                                                                          \
} else {                                                                     \
_pinv = PGRP_LIST[_p_handle].inv_map_proc_list[proc];                      \
}                                                                            \
*(ptr_loc) =  GA[g_handle].ptr[_pinv]+_offset*GA[g_handle].elemsize;         \
}
320
321
/* Point *plo/*phi at the lower/upper bounds of the p-th patch stored in
   the in-scope _ga_map scratch array (ndim Integers each, stored
   consecutively per patch).  No data is copied. */
#define gam_GetRangeFromMap(p, ndim, plo, phi){\
Integer   _mloc = p* ndim *2;\
*plo  = (Integer*)_ga_map + _mloc;\
*phi  = *plo + ndim;\
}
327
/* compute index of point subscripted by plo relative to point
   subscripted by lo, for a block with dimensions dims
   (Fortran/column-major order: dims[0] varies fastest); the resulting
   linear element index is stored in *pidx */
#define gam_ComputePatchIndex(ndim, lo, plo, dims, pidx){                      \
Integer _d, _factor;                                                         \
*pidx = plo[0] -lo[0];                                                       \
__CRAYX1_PRAGMA("_CRI shortloop");                                           \
for(_d= 0,_factor=1; _d< ndim -1; _d++){                                     \
_factor *= (dims[_d]);                                                     \
*pidx += _factor * (plo[_d+1]-lo[_d+1]);                                   \
}                                                                            \
}
339
/* Clip the requested patch [lo,hi] against the block [plo,phi] and store
   the overlap in [blo,bhi].  Assumes the two regions actually intersect
   in every dimension. */
#define gam_GetBlockPatch(plo,phi,lo,hi,blo,bhi,ndim) {                        \
Integer _d;                                                                  \
for (_d=0; _d<ndim; _d++) {                                                  \
if (lo[_d] <= phi[_d] && lo[_d] >= plo[_d]) blo[_d] = lo[_d];              \
else blo[_d] = plo[_d];                                                    \
if (hi[_d] <= phi[_d] && hi[_d] >= plo[_d]) bhi[_d] = hi[_d];              \
else bhi[_d] = phi[_d];                                                    \
}                                                                            \
}
349
350 /**
351 * A routine to test for a non-blocking call
352 */
353 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
354 # pragma weak wnga_nbtest = pnga_nbtest
355 #endif
356
pnga_nbtest(Integer * nbhandle)357 Integer pnga_nbtest(Integer *nbhandle)
358 {
359 return !nga_test_internal((Integer *)nbhandle);
360 }
361
362 /**
363 * A routine to wait for a non-blocking call to complete
364 */
365 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
366 # pragma weak wnga_nbwait = pnga_nbwait
367 #endif
368
pnga_nbwait(Integer * nbhandle)369 void pnga_nbwait(Integer *nbhandle)
370 {
371 GA_Internal_Threadsafe_Lock();
372 nga_wait_internal((Integer *)nbhandle);
373 GA_Internal_Threadsafe_Unlock();
374 }
375
/*
 * Blocking strided put, with optional extraction of a single field from
 * each local array element.
 *
 * loc_base_ptr: base of the local user buffer (first element)
 * pbuf:         address of the first element to send within that buffer
 * stride_loc/stride_rem:
 *               local/remote stride arrays.  Callers pass pointers to
 *               element 1 of (MAXDIM+1)-sized arrays whose element 0 holds
 *               the field-level stride (see ngai_put_common), so the
 *               pointer decrements below stay within the arrays.
 * count:        block counts per stride level, same off-by-one layout
 *               (count[-1] == field_size at entry; count[0] is in bytes)
 * nstrides:     number of stride levels for the whole-element case
 * field_off/field_size:
 *               byte offset/size of the field inside one element;
 *               field_size < 0 or == type_size means whole-element transfer
 * type_size:    size in bytes of one array element
 */
static void ngai_puts(char *loc_base_ptr, char *pbuf, int *stride_loc, char *prem, int *stride_rem,
		      int *count, int nstrides, int proc, int field_off,
		      int field_size, int type_size) {
  if(field_size<0 || field_size == type_size) {
    /* whole-element transfer: use the caller's strides unchanged */
    ARMCI_PutS(pbuf,stride_loc,prem,stride_rem,count,nstrides,proc);
  }
  else {
    int i;
    /* expose the hidden field-level stride stored at index 0 (see above) */
    count -= 1;
    stride_loc -= 1;
    stride_rem -= 1;
    nstrides += 1;

    /* local buffer holds the field packed contiguously: rescale the local
       element strides from whole-element spacing to field spacing */
    for(i=1; i<nstrides; i++) {
      stride_loc[i] /= type_size;
      stride_loc[i] *= field_size;
    }

    /* remap pbuf from element spacing to packed-field spacing, and point
       the remote side at the field inside its first element */
    pbuf = loc_base_ptr + (pbuf - loc_base_ptr)/type_size*field_size;
    prem += field_off;

    /* count[1] (the caller's count[0]) was scaled by the element size;
       convert it back to an element count for the field-wise transfer */
    count[1] /= type_size;
    ARMCI_PutS(pbuf,stride_loc,prem,stride_rem,count,nstrides,proc);
    count[1] *= type_size; /*restore*/
    /* undo the in-place stride rescaling (the caller's arrays are shared) */
    for(i=1; i<nstrides; i++) {
      stride_loc[i] /= field_size;
      stride_loc[i] *= type_size;
    }
  }
}
406
407
/*
 * Non-blocking strided put with optional field extraction; identical to
 * ngai_puts (see that function for the off-by-one stride/count layout the
 * callers supply) except that the transfer is issued via ARMCI_NbPutS and
 * completed later through nbhandle.
 */
static void ngai_nbputs(char *loc_base_ptr, char *pbuf,int *stride_loc, char *prem, int *stride_rem,
			int *count, int nstrides, int proc, int field_off,
			int field_size, int type_size, armci_hdl_t *nbhandle) {
  if(field_size<0 || field_size == type_size) {
    /* whole-element transfer */
    ARMCI_NbPutS(pbuf,stride_loc,prem,stride_rem,count,nstrides,proc,nbhandle);
  }
  else {
    int i;
    /* expose the hidden field-level stride stored at index 0 */
    count -= 1;
    stride_loc -= 1;
    stride_rem -= 1;
    nstrides += 1;

    /* rescale local element strides to packed-field spacing */
    for(i=1; i<nstrides; i++) {
      stride_loc[i] /= type_size;
      stride_loc[i] *= field_size;
    }

    /* remap pbuf to packed-field spacing; aim prem at the remote field */
    pbuf = loc_base_ptr + (pbuf - loc_base_ptr)/type_size*field_size;
    prem += field_off;

    /* convert the byte count back to an element count */
    count[1] /= type_size;
    ARMCI_NbPutS(pbuf,stride_loc,prem,stride_rem,count,nstrides,proc, nbhandle);
    count[1] *= type_size; /*restore*/
    /* undo the in-place stride rescaling (caller's arrays are shared) */
    for(i=1; i<nstrides; i++) {
      stride_loc[i] /= field_size;
      stride_loc[i] *= type_size;
    }
  }
}
438
/*
 * Non-blocking strided get with optional extraction of a single field from
 * each remote element -- the mirror image of ngai_nbputs (data flows
 * remote -> local).  See ngai_puts for the off-by-one stride/count layout
 * that callers supply.
 */
static void ngai_nbgets(char *loc_base_ptr, char *prem,int *stride_rem, char *pbuf, int *stride_loc,
			int *count, int nstrides, int proc, int field_off,
			int field_size, int type_size, armci_hdl_t *nbhandle) {
  if(field_size<0 || field_size == type_size) {
    /* whole-element transfer */
    ARMCI_NbGetS(prem,stride_rem,pbuf,stride_loc,count,nstrides,proc,nbhandle);
  }
  else {
    int i;
    /* expose the hidden field-level stride stored at index 0 */
    count -= 1;
    stride_loc -= 1;
    stride_rem -= 1;

    /* remap pbuf to packed-field spacing; aim prem at the remote field */
    pbuf = loc_base_ptr + (pbuf - loc_base_ptr)/type_size*field_size;
    prem += field_off;

    /* convert the byte count back to an element count */
    count[1] /= type_size;
    nstrides += 1;

    /* rescale local element strides to packed-field spacing */
    for(i=1; i<nstrides; i++) {
      stride_loc[i] /= type_size;
      stride_loc[i] *= field_size;
    }
    ARMCI_NbGetS(prem,stride_rem,pbuf,stride_loc,count,nstrides,proc,nbhandle);
    count[1] *= type_size; /*restore*/
    /* undo the in-place stride rescaling (caller's arrays are shared) */
    for(i=1; i<nstrides; i++) {
      stride_loc[i] /= field_size;
      stride_loc[i] *= type_size;
    }
  }
}
489
/*
 * Blocking strided get with optional field extraction.  The active (#if 1)
 * path implements the blocking get as a non-blocking get (ngai_nbgets)
 * immediately followed by a wait on an internal handle; the disabled path
 * is the direct ARMCI_GetS version kept for reference.
 */
static void ngai_gets(char *loc_base_ptr, char *prem,int *stride_rem, char *pbuf, int *stride_loc,
		      int *count, int nstrides, int proc, int field_off,
		      int field_size, int type_size) {
#if 1
  Integer handle;
  ga_init_nbhandle(&handle);
  ngai_nbgets(loc_base_ptr, prem, stride_rem, pbuf, stride_loc, count, nstrides, proc,
	      field_off, field_size, type_size, (armci_hdl_t*)get_armci_nbhandle(&handle));
  nga_wait_internal(&handle);
#else
  if(field_size<0 || field_size == type_size) {
    ARMCI_GetS(prem,stride_rem,pbuf,stride_loc,count,nstrides,proc);
  } else {
    int i;
    /* see ngai_nbgets for the meaning of these adjustments */
    count -= 1;
    stride_loc -= 1;
    stride_rem -= 1;

    pbuf = loc_base_ptr + (pbuf - loc_base_ptr)/type_size*field_size;
    prem += field_off;

    count[1] /= type_size;
    nstrides += 1;

    for(i=1; i<nstrides; i++) {
      stride_loc[i] /= type_size;
      stride_loc[i] *= field_size;
    }
    ARMCI_GetS(prem,stride_rem,pbuf,stride_loc,count,nstrides,proc);
    count[1] *= type_size; /*restore*/
    for(i=1; i<nstrides; i++) {
      stride_loc[i] /= field_size;
      stride_loc[i] *= type_size;
    }
  }
#endif
}
527
528 /**
529 * A common routine called by both non-blocking and blocking GA put calls.
530 */
531 #ifdef __crayx1
532 #pragma _CRI inline pnga_locate_region
533 #endif
ngai_put_common(Integer g_a,Integer * lo,Integer * hi,void * buf,Integer * ld,Integer field_off,Integer field_size,Integer * nbhandle)534 void ngai_put_common(Integer g_a,
535 Integer *lo,
536 Integer *hi,
537 void *buf,
538 Integer *ld,
539 Integer field_off,
540 Integer field_size,
541 Integer *nbhandle)
542 {
543 Integer p, np, handle=GA_OFFSET + g_a;
544 Integer idx, elems, size, p_handle;
545 int proc, ndim, loop, cond;
546 int num_loops=2; /* 1st loop for remote procs; 2nd loop for local procs */
547 Integer n_rstrctd;
548 Integer *rank_rstrctd;
549 #if defined(__crayx1) || defined(DISABLE_NBOPT)
550 #else
551 Integer ga_nbhandle;
552 int counter=0;
553 #endif
554
555 int _stride_rem[MAXDIM+1], _stride_loc[MAXDIM+1], _count[MAXDIM+1];
556 int *stride_rem=&_stride_rem[1], *stride_loc=&_stride_loc[1], *count=&_count[1];
557 _iterator_hdl it_hdl;
558
559
560
561 ga_check_handleM(g_a, "ngai_put_common");
562
563 size = GA[handle].elemsize;
564 ndim = GA[handle].ndim;
565 p_handle = GA[handle].p_handle;
566 n_rstrctd = (Integer)GA[handle].num_rstrctd;
567 rank_rstrctd = GA[handle].rank_rstrctd;
568
569 /*initial stride portion for field-wise operations*/
570 _stride_rem[0] = size;
571 _stride_loc[0] = field_size;
572 _count[0] = field_size;
573
574
575 gai_iterator_init(g_a, lo, hi, &it_hdl);
576
577 #ifndef NO_GA_STATS
578 gam_CountElems(ndim, lo, hi, &elems);
579 GAbytes.puttot += (double)size*elems;
580 GAstat.numput++;
581 GAstat.numput_procs += np;
582 #endif
583
584 if(nbhandle)ga_init_nbhandle(nbhandle);
585 #if !defined(__crayx1) && !defined(DISABLE_NBOPT)
586 else ga_init_nbhandle(&ga_nbhandle);
587 #endif
588
589 #ifdef PROFILE_OLD
590 ga_profile_start((int)handle, (long)size*elems, ndim, lo, hi,
591 ENABLE_PROFILE_PUT);
592 #endif
593
594 #if !defined(__crayx1) && !defined(DISABLE_NBOPT)
595 for(loop=0; loop<num_loops; loop++) {
596 __CRAYX1_PRAGMA("_CRI novector");
597 #endif
598 Integer ldrem[MAXDIM];
599 Integer idx_buf, *plo, *phi;
600 char *pbuf, *prem;
601 gai_iterator_reset(&it_hdl);
602 while (gai_iterator_next(&it_hdl, &proc, &plo, &phi, &prem, ldrem)) {
603
604 /* check if it is local to SMP */
605 #if !defined(__crayx1) && !defined(DISABLE_NBOPT)
606 cond = armci_domain_same_id(ARMCI_DOMAIN_SMP,(int)proc);
607 if(loop==0) cond = !cond;
608 if(cond) {
609 #endif
610 /* find the right spot in the user buffer */
611 gam_ComputePatchIndex(ndim, lo, plo, ld, &idx_buf);
612 pbuf = size*idx_buf + (char*)buf;
613
614 gam_ComputeCount(ndim, plo, phi, count);
615
616 /* scale number of rows by element size */
617 count[0] *= size;
618 gam_setstride(ndim, size, ld, ldrem, stride_rem, stride_loc);
619
620 /*
621 if (p_handle >= 0) {
622 proc = (int)GA_proclist[p];
623 proc = PGRP_LIST[p_handle].inv_map_proc_list[proc];
624 }
625 */
626 if(GA_fence_set)fence_array[proc]=1;
627
628 #ifndef NO_GA_STATS
629 if(proc == GAme){
630 gam_CountElems(ndim, plo, phi, &elems);
631 GAbytes.putloc += (double)size*elems;
632 }
633 #endif
634
635 /*casting what ganb_get_armci_handle function returns to armci_hdl is
636 very crucial here as on 64 bit platforms, pointer is 64 bits where
637 as temporary is only 32 bits*/
638 if(nbhandle) {
639 /* ARMCI_NbPutS(pbuf, stride_loc, prem, stride_rem, count, ndim -1, */
640 /* proc,(armci_hdl_t*)get_armci_nbhandle(nbhandle)); */
641 ngai_nbputs(buf,pbuf, stride_loc, prem, stride_rem, count, ndim -1,
642 proc,field_off, field_size, size,
643 (armci_hdl_t*)get_armci_nbhandle(nbhandle));
644 } else {
645 #if defined(__crayx1) || defined(DISABLE_NBOPT)
646 /* ARMCI_PutS(pbuf,stride_loc,prem,stride_rem,count,ndim-1,proc); */
647 ngai_puts(buf, pbuf,stride_loc,prem,stride_rem,count,ndim-1,proc,
648 field_off, field_size, size);
649 #else
650 /* do blocking put for local processes. If all processes
651 are remote processes then do blocking put for the last one */
652 if((loop==0 && gai_iterator_last(&it_hdl)) || loop==1) {
653 /* ARMCI_PutS(pbuf,stride_loc,prem,stride_rem,count,ndim-1,proc); */
654 ngai_puts(buf, pbuf,stride_loc,prem,stride_rem,count,ndim-1,
655 proc, field_off, field_size, size);
656 } else {
657 ++counter;
658 /* ARMCI_NbPutS(pbuf,stride_loc,prem,stride_rem,count, ndim-1, */
659 /* proc,(armci_hdl_t*)get_armci_nbhandle(&ga_nbhandle)); */
660 ngai_nbputs(buf,pbuf,stride_loc,prem,stride_rem,count, ndim-1,
661 proc, field_off, field_size, size,
662 (armci_hdl_t*)get_armci_nbhandle(&ga_nbhandle));
663 }
664 #endif
665 }
666 #if !defined(__crayx1) && !defined(DISABLE_NBOPT)
667 } /* end if(cond) */
668 #endif
669 }
670 #if !defined(__crayx1) && !defined(DISABLE_NBOPT)
671 }
672 if(!nbhandle) nga_wait_internal(&ga_nbhandle);
673 #endif
674
675 #ifdef PROFILE_OLD
676 ga_profile_stop();
677 #endif
678
679 gai_iterator_destroy(&it_hdl);
680 }
681
682
683 /**
684 * (Non-blocking) Put an N-dimensional patch of data into a Global Array
685 */
686 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
687 # pragma weak wnga_nbput = pnga_nbput
688 #endif
689
pnga_nbput(Integer g_a,Integer * lo,Integer * hi,void * buf,Integer * ld,Integer * nbhandle)690 void pnga_nbput(Integer g_a, Integer *lo, Integer *hi, void *buf, Integer *ld, Integer *nbhandle)
691 {
692 GA_Internal_Threadsafe_Lock();
693 ngai_put_common(g_a,lo,hi,buf,ld,0,-1,nbhandle);
694 GA_Internal_Threadsafe_Unlock();
695 }
696
697 /**
698 * (Non-blocking) Put an N-dimensional patch of data into a Global Array and notify the other
699 side with information on another Global Array
700 */
701
/* Maximum number of outstanding put/notify handles */
#define HANDLES_OUTSTANDING 100

/* Bookkeeping for one in-flight put/notify pair (see pnga_nbput_notify) */
typedef struct {
  Integer *orighdl;    /* caller's handle; lookup key in nbwait_notify, NULL = free slot */
  Integer firsthdl;    /* internal handle for the main patch put */
  Integer elementhdl;  /* internal handle for the notify-element put */
  void *elem_copy;     /* malloc'd copy of the notify element when it overlaps the patch */
} gai_putn_hdl_t;

static int putn_handles_initted = 0;  /* lazy one-time init flag */
static gai_putn_hdl_t putn_handles[HANDLES_OUTSTANDING];
714
/* Return 1 iff the patch [lon,hin] of g_a describes exactly one element
   (lower bound equals upper bound in every dimension), else 0. */
static int putn_check_single_elem(Integer g_a, Integer *lon, Integer *hin)
{
    int d;
    int ndims = pnga_ndim(g_a);

    for (d = 0; d < ndims; d++) {
        if (lon[d] != hin[d])
            return 0;
    }
    return 1;
} /* putn_check_single_elem */
726
putn_find_empty_slot(void)727 static int putn_find_empty_slot(void)
728 {
729 int i;
730
731 for (i = 0; i < HANDLES_OUTSTANDING; i++)
732 if (!putn_handles[i].orighdl)
733 return i;
734
735 return -1;
736 } /* putn_find_empty_slot */
737
/* Return 1 iff the element at ecoords lies inside the patch [lo,hi] of
   g_a in every dimension, else 0. */
static int putn_intersect_coords(Integer g_a, Integer *lo, Integer *hi, Integer *ecoords)
{
    int d;
    int ndims = pnga_ndim(g_a);

    for (d = 0; d < ndims; d++) {
        if (ecoords[d] < lo[d] || ecoords[d] > hi[d])
            return 0;
    }
    return 1;
} /* putn_intersect_coords */
750
/* Check that bufn (the notify element) points at the position inside buf
   that corresponds to coordinates ecoords, given patch origin lo and the
   local leading dimensions ld (Fortran ordering).  Returns 1 when the
   pointer and coordinates are consistent, 0 otherwise. */
static int putn_verify_element_in_buf(Integer g_a, Integer *lo, Integer *hi, void *buf,
                                      Integer *ld, Integer *ecoords, void *bufn,
                                      Integer elemSize)
{
    int d, ndims;
#if HAVE_STDDEF_H
    ptrdiff_t off = (char *)bufn - (char *)buf;
#else
    Integer off = (char *)bufn - (char *)buf;
#endif
    Integer expect;

    off /= elemSize; /* byte offset -> element offset */

    ndims = pnga_ndim(g_a);

    /* expected element offset of ecoords within the patch, Fortran order */
    expect = ecoords[0] - lo[0];
    for (d = 1; d < ndims; d++)
        expect += (ecoords[d] - lo[d]) * ld[d - 1];

    /* must match for a correct notify buffer */
    return expect == (Integer)off;
} /* putn_verify_element_in_buf */
774
775 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
776 # pragma weak wnga_nbput_notify = pnga_nbput_notify
777 #endif
778
pnga_nbput_notify(Integer g_a,Integer * lo,Integer * hi,void * buf,Integer * ld,Integer g_b,Integer * ecoords,void * bufn,Integer * nbhandle)779 void pnga_nbput_notify(Integer g_a, Integer *lo, Integer *hi, void *buf, Integer *ld, Integer g_b, Integer *ecoords, void *bufn, Integer *nbhandle)
780 {
781 Integer ldn[MAXDIM] = { 1 };
782 int pos, intersect;
783
784 /* Make sure everything has been initialized */
785 if (!putn_handles_initted) {
786 memset(putn_handles, 0, sizeof(putn_handles));
787 putn_handles_initted = 1;
788 }
789
790 pos = putn_find_empty_slot();
791 if (pos == -1) /* no empty handles available */
792 pnga_error("Too many outstanding put/notify operations!", 0);
793
794 putn_handles[pos].orighdl = nbhandle; /* Store original handle for nbwait_notify */
795
796 if (g_a == g_b)
797 intersect = putn_intersect_coords(g_a, lo, hi, ecoords);
798 else
799 intersect = 0;
800
801 if (!intersect) { /* Simpler case */
802 ngai_put_common(g_a, lo, hi, buf, ld, 0, -1, &putn_handles[pos].firsthdl);
803 ngai_put_common(g_b, ecoords, ecoords, bufn, ldn, 0, -1, &putn_handles[pos].elementhdl);
804
805 putn_handles[pos].elem_copy = NULL;
806 }
807 else {
808 int ret, i;
809 Integer handle = GA_OFFSET + g_a, size;
810 void *elem_copy;
811 char *elem;
812
813 size = GA[handle].elemsize;
814 ret = putn_verify_element_in_buf(g_a, lo, hi, buf, ld, ecoords, bufn, size);
815
816 if (!ret)
817 pnga_error("Intersecting buffers, but notify element is not in buffer!", 0);
818
819 elem_copy = malloc(size);
820 memcpy(elem_copy, bufn, size);
821
822 elem = bufn;
823 for (i = 0; i < size; i++)
824 elem[i] += 1; /* Increment each byte by one, safe? */
825
826 putn_handles[pos].elem_copy = elem_copy;
827
828 ngai_put_common(g_a, lo, hi, buf, ld, 0, -1, &putn_handles[pos].firsthdl);
829 ngai_put_common(g_a, ecoords, ecoords, elem_copy, ldn, 0, -1,
830 &putn_handles[pos].elementhdl);
831 }
832 } /* pnga_nbput_notify */
833
834 /**
835 * Wait for a non-blocking put/notify to complete
836 */
837 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
838 # pragma weak wnga_nbwait_notify = pnga_nbwait_notify
839 #endif
840
pnga_nbwait_notify(Integer * nbhandle)841 void pnga_nbwait_notify(Integer *nbhandle)
842 {
843 int i;
844
845 for (i = 0; i < HANDLES_OUTSTANDING; i++)
846 if (putn_handles[i].orighdl == nbhandle)
847 break;
848
849 if (i >= HANDLES_OUTSTANDING)
850 return; /* Incorrect handle used or maybe wait was called multiple times? */
851
852 nga_wait_internal(&putn_handles[i].firsthdl);
853 nga_wait_internal(&putn_handles[i].elementhdl);
854
855 if (putn_handles[i].elem_copy) {
856 free(putn_handles[i].elem_copy);
857 putn_handles[i].elem_copy = NULL;
858 }
859
860 putn_handles[i].orighdl = NULL;
861 } /* pnga_nbwait_notify */
862
863 /**
864 * Put an N-dimensional patch of data into a Global Array
865 */
866 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
867 # pragma weak wnga_put = pnga_put
868 #endif
869
pnga_put(Integer g_a,Integer * lo,Integer * hi,void * buf,Integer * ld)870 void pnga_put(Integer g_a, Integer *lo, Integer *hi, void *buf, Integer *ld)
871 {
872
873 GA_Internal_Threadsafe_Lock();
874 ngai_put_common(g_a,lo,hi,buf,ld,0,-1,NULL);
875 GA_Internal_Threadsafe_Unlock();
876 }
877
878 /**
879 * (Non-blocking) Put a field in a an N-dimensional patch of data into a Global Array
880 */
881 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
882 # pragma weak wnga_nbput_field = pnga_nbput_field
883 #endif
884
pnga_nbput_field(Integer g_a,Integer * lo,Integer * hi,Integer foff,Integer fsize,void * buf,Integer * ld,Integer * nbhandle)885 void pnga_nbput_field(Integer g_a, Integer *lo, Integer *hi, Integer foff, Integer fsize, void *buf, Integer *ld, Integer *nbhandle)
886 {
887 ngai_put_common(g_a,lo,hi,buf,ld,foff, fsize, nbhandle);
888 }
889
890
891 /**
892 * Put a field in a an N-dimensional patch of data into a Global Array
893 */
894 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
895 # pragma weak wnga_put_field = pnga_put_field
896 #endif
897
pnga_put_field(Integer g_a,Integer * lo,Integer * hi,Integer foff,Integer fsize,void * buf,Integer * ld)898 void pnga_put_field(Integer g_a, Integer *lo, Integer *hi, Integer foff, Integer fsize, void *buf, Integer *ld)
899 {
900 ngai_put_common(g_a,lo,hi,buf,ld,foff, fsize, NULL);
901 }
902
903
904
905 /*\ A common routine called by both non-blocking and blocking GA Get calls.
906 \*/
ngai_get_common(Integer g_a,Integer * lo,Integer * hi,void * buf,Integer * ld,Integer field_off,Integer field_size,Integer * nbhandle)907 void ngai_get_common(Integer g_a,
908 Integer *lo,
909 Integer *hi,
910 void *buf,
911 Integer *ld,
912 Integer field_off,
913 Integer field_size,
914 Integer *nbhandle)
915 {
916 /* g_a: Global array handle
917 lo[]: Array of lower indices of patch of global array
918 hi[]: Array of upper indices of patch of global array
919 buf[]: Local buffer that array patch will be copied into
920 ld[]: Array of physical ndim-1 dimensions of local buffer */
921
922 Integer p, np, handle=GA_OFFSET + g_a;
923 Integer idx, elems, size, p_handle;
924 int proc, ndim, loop, cond;
925 int num_loops=2; /* 1st loop for remote procs; 2nd loop for local procs */
926 Integer n_rstrctd;
927 Integer *rank_rstrctd;
928 #if defined(__crayx1) || defined(DISABLE_NBOPT)
929 #else
930 Integer ga_nbhandle;
931 int counter=0;
932 #endif
933 _iterator_hdl it_hdl;
934
935 int _stride_rem[MAXDIM+1], _stride_loc[MAXDIM+1], _count[MAXDIM+1];
936 int *stride_rem=&_stride_rem[1], *stride_loc=&_stride_loc[1], *count=&_count[1];
937
938 ga_check_handleM(g_a, "ngai_get_common");
939
940 size = GA[handle].elemsize;
941 ndim = GA[handle].ndim;
942 p_handle = (Integer)GA[handle].p_handle;
943 n_rstrctd = (Integer)GA[handle].num_rstrctd;
944 rank_rstrctd = GA[handle].rank_rstrctd;
945
946 /*initial stride portion for field-wise operations*/
947 _stride_rem[0] = size;
948 _stride_loc[0] = field_size;
949 _count[0] = field_size;
950
951 gai_iterator_init(g_a, lo, hi, &it_hdl);
952
953 /* get total size of patch */
954 #ifndef NO_GA_STATS
955 gam_CountElems(ndim, lo, hi, &elems);
956 GAbytes.gettot += (double)size*elems;
957 GAstat.numget++;
958 GAstat.numget_procs += np;
959 #endif
960
961 if(nbhandle)ga_init_nbhandle(nbhandle);
962 #if !defined(__crayx1) && !defined(DISABLE_NBOPT)
963 else ga_init_nbhandle(&ga_nbhandle);
964 #endif
965
966 #ifdef PROFILE_OLD
967 ga_profile_start((int)handle, (long)size*elems, ndim, lo, hi,
968 ENABLE_PROFILE_GET);
969 #endif
970
971 #if !defined(__crayx1) && !defined(DISABLE_NBOPT)
972 for(loop=0; loop<num_loops; loop++) {
973 __CRAYX1_PRAGMA("_CRI novector");
974 #endif
975 Integer ldrem[MAXDIM];
976 Integer idx_buf, *plo, *phi;
977 char *pbuf, *prem;
978 gai_iterator_reset(&it_hdl);
979 while (gai_iterator_next(&it_hdl, &proc, &plo, &phi, &prem, ldrem)) {
980
981 /* check if it is local to SMP */
982 #if !defined(__crayx1) && !defined(DISABLE_NBOPT)
983 cond = armci_domain_same_id(ARMCI_DOMAIN_SMP,(int)proc);
984 if(loop==0) cond = !cond;
985 if(cond) {
986 #endif
987
988 /* find the right spot in the user buffer for the point
989 subscripted by plo given that the corner of the user
990 buffer is subscripted by lo */
991 gam_ComputePatchIndex(ndim, lo, plo, ld, &idx_buf);
992 pbuf = size*idx_buf + (char*)buf;
993
994 /* compute number of elements in each dimension and store the
995 result in count */
996 gam_ComputeCount(ndim, plo, phi, count);
997
998 /* Scale first element in count by element size. The ARMCI_GetS
999 routine uses this convention to figure out memory sizes.*/
1000 count[0] *= size;
1001
1002 /* Return strides for memory containing global array on remote
1003 processor indexed by proc (stride_rem) and for local buffer
1004 buf (stride_loc) */
1005 gam_setstride(ndim, size, ld, ldrem, stride_rem, stride_loc);
1006
1007 #ifndef NO_GA_STATS
1008 if(proc == GAme){
1009 gam_CountElems(ndim, plo, phi, &elems);
1010 GAbytes.getloc += (double)size*elems;
1011 }
1012 #endif
1013 if(nbhandle) {
1014 /*ARMCI_NbGetS(prem, stride_rem, pbuf, stride_loc, count, ndim -1, */
1015 /* proc,(armci_hdl_t*)get_armci_nbhandle(nbhandle)); */
1016 ngai_nbgets(buf,prem, stride_rem, pbuf, stride_loc, count, ndim -1,
1017 proc,field_off, field_size, size,
1018 (armci_hdl_t*)get_armci_nbhandle(nbhandle));
1019 } else {
1020 #if defined(__crayx1) || defined(DISABLE_NBOPT)
1021 /*ARMCI_GetS(prem,stride_rem,pbuf,stride_loc,count,ndim-1,proc); */
1022 ngai_gets(buf,prem,stride_rem,pbuf,stride_loc,count,ndim-1,proc, field_off, field_size, size);
1023 #else
1024 if((loop==0 && gai_iterator_last(&it_hdl)) || loop==1) {
1025 /*ARMCI_GetS(prem,stride_rem,pbuf,stride_loc,count,ndim-1,proc); */
1026 ngai_gets(buf,prem,stride_rem,pbuf,stride_loc,count,ndim-1,proc, field_off, field_size, size);
1027 } else {
1028 ++counter;
1029 /*ARMCI_NbGetS(prem,stride_rem,pbuf,stride_loc,count,ndim-1, */
1030 /* proc,(armci_hdl_t*)get_armci_nbhandle(&ga_nbhandle)); */
1031 ngai_nbgets(buf,prem, stride_rem, pbuf, stride_loc, count, ndim -1,
1032 proc,field_off, field_size, size,
1033 (armci_hdl_t*)get_armci_nbhandle(nbhandle));
1034 }
1035 #endif
1036 }
1037 #if !defined(__crayx1) && !defined(DISABLE_NBOPT)
1038 } /* end if(cond) */
1039 #endif
1040 }
1041 #if !defined(__crayx1) && !defined(DISABLE_NBOPT)
1042 }
1043 if(!nbhandle) nga_wait_internal(&ga_nbhandle);
1044 #endif
1045
1046 #ifdef PROFILE_OLD
1047 ga_profile_stop();
1048 #endif
1049
1050 gai_iterator_destroy(&it_hdl);
1051 }
1052
1053 /**
1054 * Utility function to copy data from one buffer to another
1055 */
gai_mem_copy(int elemsize,int ndim,void * src_ptr,Integer * src_start,Integer * count,Integer * src_ld,void * dst_ptr,Integer * dst_start,Integer * dst_ld)1056 void gai_mem_copy(int elemsize, int ndim, void *src_ptr, Integer *src_start,
1057 Integer *count, Integer *src_ld, void *dst_ptr, Integer *dst_start,
1058 Integer *dst_ld)
1059 {
1060 Integer src_offset, dst_offset;
1061 Integer factor;
1062 void *sptr, *dptr;
1063 int i, j;
1064 long src_idx, dst_idx;
1065 int n1dim; /* total number of contiguous segments */
1066 int src_bvalue[MAXDIM], src_bunit[MAXDIM];
1067 int dst_bvalue[MAXDIM], dst_bunit[MAXDIM];
1068 int src_stride[MAXDIM], dst_stride[MAXDIM];
1069 int segsize;
1070 /* find offsets for both source and destination buffers and use these to
1071 * adjust pointers to the first element in each data set
1072 */
1073 factor = 1;
1074 src_offset = 0;
1075 for (i=0; i<ndim; i++) {
1076 src_offset += src_start[i]*factor;
1077 if (i<ndim-1) factor *= src_ld[i];
1078 }
1079 sptr = (void*)((char*)src_ptr+src_offset*elemsize);
1080
1081 factor = 1; //reset the factor
1082
1083 dst_offset = 0;
1084 for (i=0; i<ndim; i++) {
1085 dst_offset += dst_start[i]*factor;
1086 if (i<ndim-1) factor *= dst_ld[i];
1087 }
1088 dptr = (void*)((char*)dst_ptr+dst_offset*elemsize);
1089 segsize = count[0]*elemsize;
1090 /* perform the memcpy from source to destination buffer. Start by calculating
1091 * the total number of contiguous segments in the src array*/
1092 n1dim = 1;
1093
1094 src_stride[0] = dst_stride[0] = elemsize;
1095 for (i=0; i<ndim-1; i++) {
1096 src_stride[i+1] = src_stride[i];
1097 src_stride[i+1] *= (int)src_ld[i];
1098 dst_stride[i+1] = dst_stride[i];
1099 dst_stride[i+1] *= (int)dst_ld[i];
1100 }
1101
1102 for (i=1; i<=ndim-1; i++) {
1103 n1dim *= count[i];
1104 }
1105 /* initialize arrays for evaluating offset of each segment */
1106 src_bvalue[0] = 0;
1107 src_bvalue[1] = 0;
1108 src_bunit[0] = 1;
1109 src_bunit[1] = 1;
1110 dst_bvalue[0] = 0;
1111 dst_bvalue[1] = 0;
1112 dst_bunit[0] = 1;
1113 dst_bunit[1] = 1;
1114
1115 //this part does nothing?
1116 for (i=2; i<=ndim-1; i++) {
1117 src_bvalue[i] = 0;
1118 dst_bvalue[i] = 0;
1119 src_bunit[i] = src_bunit[i-1]*count[i-1];
1120 dst_bunit[i] = dst_bunit[i-1]*count[i-1];
1121 }
1122
1123 /* evaluate offset for each contiguous segment */
1124 for (i=0; i<n1dim; i++) {
1125 src_idx = 0;
1126 dst_idx = 0;
1127 for (j=1; j<=ndim-1; j++) {
1128 src_idx += src_bvalue[j]*src_stride[j];
1129 if ((i+1)%src_bunit[j] == 0) {
1130 src_bvalue[j]++;
1131 }
1132 if (src_bvalue[j] > (count[j]-1)) {
1133 src_bvalue[j] = 0;
1134 }
1135 }
1136 for (j=1; j<=ndim-1; j++) {
1137 dst_idx += dst_bvalue[j]*dst_stride[j];
1138 if ((i+1)%dst_bunit[j] == 0) {
1139 dst_bvalue[j]++;
1140 }
1141 if (dst_bvalue[j] > (count[j]-1)) {
1142 dst_bvalue[j] = 0;
1143 }
1144 }
1145 /* copy memory */
1146 memcpy((char*)dptr+dst_idx,(char*)sptr+src_idx,segsize);
1147 }
1148 }
1149
1150 /**
1151 * Get an N-dimensional patch of data from a Global Array
1152 */
1153 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1154 # pragma weak wnga_get = pnga_get
1155 #endif
1156
/**
 * Get an N-dimensional patch g_a[lo:hi] into the local buffer buf
 * (leading dimensions ld).  For arrays with the READ_CACHE property the
 * fetched patches are kept on a per-array linked list and later requests
 * that fall entirely inside a cached patch are served by a local copy.
 *
 * Fixes relative to the original:
 *  - the cache-trimming loop never advanced its cursor, so it spun on the
 *    head node and then truncated the whole list; it now walks the list.
 *  - the node-freeing loop read node->next after free(node) (use-after-free)
 *    and never advanced; it now saves next before freeing.
 *  - malloc results are checked before use.
 */
void pnga_get(Integer g_a, Integer *lo, Integer *hi,
    void *buf, Integer *ld)
{
  Integer handle = GA_OFFSET + g_a;
  enum property_type ga_property = GA[handle].property;

  if (ga_property != READ_CACHE) /* if array is not read only */
  {
    GA_Internal_Threadsafe_Lock();
    ngai_get_common(g_a,lo,hi,buf,ld,0,-1,(Integer *)NULL);
    GA_Internal_Threadsafe_Unlock();
  } else {
    int i;
    int nelem;
    int ndim = GA[handle].ndim;
    Integer nstart[MAXDIM], ncount[MAXDIM], nld[MAXDIM];
    Integer bstart[MAXDIM];

    /* cache is empty condition: fetch the patch and cache a copy of it */
    if (GA[handle].cache_head == NULL) {
      GA[handle].cache_head = malloc(sizeof(cache_struct_t));
      if (GA[handle].cache_head == NULL)
        pnga_error("pnga_get: malloc of cache node failed",0);
      nelem = 1;
      for (i=0; i<ndim; i++) {
        GA[handle].cache_head->lo[i] = lo[i];
        GA[handle].cache_head->hi[i] = hi[i];
        nelem *= (hi[i]-lo[i]+1);
      }
      GA[handle].cache_head->cache_buf = malloc(GA[handle].elemsize*nelem);
      if (GA[handle].cache_head->cache_buf == NULL)
        pnga_error("pnga_get: malloc of cache buffer failed",0);
      GA[handle].cache_head->next = NULL;

      void *new_buf = GA[handle].cache_head->cache_buf;

      /* place data in receive buffer */
      ngai_get_common(g_a,lo,hi,buf,ld,0,-1,(Integer*)NULL);

      /* copy the fetched patch into the cache buffer */
      for (i=0; i<ndim; i++) {
        nstart[i] = 0;
        bstart[i] = 0;
        nld[i] = (hi[i]-lo[i]+1);
        ncount[i] = nld[i];
      }
      gai_mem_copy(GA[handle].elemsize,ndim,buf,bstart,ncount,ld,new_buf,
          nstart, nld);
    } else {
      cache_struct_t *cache_temp_pointer = GA[handle].cache_head;
      int match = 0;
      /* scan the cache for a patch that fully contains the request */
      while (cache_temp_pointer != NULL) {
        int chk;
        int temp_lo[MAXDIM];
        int temp_hi[MAXDIM];
        void *temp_buf = cache_temp_pointer->cache_buf;
        for (i=0; i<ndim; i++) {
          temp_lo[i] = cache_temp_pointer->lo[i];
          temp_hi[i] = cache_temp_pointer->hi[i];
        }

        /* containment test: cached [temp_lo,temp_hi] must cover [lo,hi] */
        chk = 1;
        for (i=0; i<ndim; i++) {
          if (!(temp_lo[i] <= lo[i] && temp_hi[i] >= hi[i])) {
            chk = 0;
            break;
          }
        }
        if (chk) {
          nelem = 1;
          for (i=0; i<ndim; i++) {
            nelem *= (hi[i]-lo[i]+1);
          }
          for (i=0; i<ndim; i++) {
            nstart[i] = (lo[i]-temp_lo[i]);
            bstart[i] = 0;
            nld[i] = (temp_hi[i]-temp_lo[i]+1);
            ncount[i] = (hi[i]-lo[i]+1);
          }
          /* copy data to receive buffer.
           * NOTE(review): ncount (not ld) is deliberately kept as the
           * destination leading dimensions, matching the original code;
           * confirm buf is tightly packed when served from cache. */
          gai_mem_copy(GA[handle].elemsize,ndim,temp_buf,nstart,ncount,nld,buf,
              bstart, ncount /*ld*/);
          match = 1;
          break;
        }

        cache_temp_pointer = cache_temp_pointer->next;
      }

      /* no match condition: fetch the patch and prepend a copy to the cache */
      if (match == 0) {
        void *new_buf;
        /* create new node on cache list*/
        cache_struct_t *cache_new = malloc(sizeof(cache_struct_t));
        if (cache_new == NULL)
          pnga_error("pnga_get: malloc of cache node failed",0);
        nelem = 1;
        for (i=0; i<ndim; i++) {
          cache_new->lo[i] = lo[i];
          cache_new->hi[i] = hi[i];
          nelem *= (hi[i]-lo[i]+1);
        }
        cache_new->cache_buf = malloc(GA[handle].elemsize*nelem);
        if (cache_new->cache_buf == NULL)
          pnga_error("pnga_get: malloc of cache buffer failed",0);
        new_buf = cache_new->cache_buf;
        cache_new->next = GA[handle].cache_head;
        GA[handle].cache_head = cache_new;

        /* place data in receive buffer */
        ngai_get_common(g_a,lo,hi,buf,ld,0,-1,(Integer *)NULL);

        for (i=0; i<ndim; i++) {
          nstart[i] = 0;
          bstart[i] = 0;
          nld[i] = (hi[i]-lo[i]+1);
          ncount[i] = nld[i];
        }
        gai_mem_copy(GA[handle].elemsize,ndim,buf,bstart,ncount,ld,new_buf,
            nstart, nld);
      }
    }

    /* Limit the number of cached patches: walk the list and once more than
     * MAXCACHE entries have been seen, cut the list there and free the
     * remaining nodes (saving each node's successor before freeing it). */
    cache_struct_t *cache_temp_pointer = GA[handle].cache_head;
    int cache_size = 0;
    while (cache_temp_pointer != NULL) {
      if (cache_size > MAXCACHE) {
        cache_struct_t *last = cache_temp_pointer;
        cache_struct_t *next = last->next;
        last->next = NULL;
        cache_temp_pointer = next;
        while (cache_temp_pointer) {
          next = cache_temp_pointer->next;
          free(cache_temp_pointer->cache_buf);
          free(cache_temp_pointer);
          cache_temp_pointer = next;
        }
        break;
      }
      cache_size++;
      cache_temp_pointer = cache_temp_pointer->next;
    }
    /* end */
  }
}
1301
1302 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1303 # pragma weak wnga_nbget = pnga_nbget
1304 #endif
1305
/**
 * Non-blocking get of an N-dimensional patch g_a[lo:hi] into buf.
 * nbhandle receives the non-blocking request handle the caller must
 * later wait on.  Delegates to ngai_get_common with no field selection
 * (field_off=0, field_size=-1) under the thread-safety lock.
 */
void pnga_nbget(Integer g_a, Integer *lo, Integer *hi,
    void *buf, Integer *ld, Integer *nbhandle)
{
  GA_Internal_Threadsafe_Lock();
  ngai_get_common(g_a,lo,hi,buf,ld,0,-1,nbhandle);
  GA_Internal_Threadsafe_Unlock();
}
1313
1314 /**
1315 * Get a field in an N-dimensional patch of data from a Global Array
1316 */
1317 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1318 # pragma weak wnga_get_field = pnga_get_field
1319 #endif
1320
/**
 * Blocking get of a field within each element of patch g_a[lo:hi].
 * foff/fsize give the byte offset and byte length of the field inside
 * an element; delegates to ngai_get_common with no nb handle.
 */
void pnga_get_field(Integer g_a, Integer *lo, Integer *hi,Integer foff, Integer fsize,
    void *buf, Integer *ld)
{
  ngai_get_common(g_a,lo,hi,buf,ld,foff,fsize,(Integer *)NULL);
}
1326
1327 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1328 # pragma weak wnga_nbget_field = pnga_nbget_field
1329 #endif
1330
/**
 * Non-blocking variant of pnga_get_field: fetches the byte range
 * [foff, foff+fsize) of each element in patch g_a[lo:hi] into buf and
 * returns the request handle in nbhandle for a later wait.
 */
void pnga_nbget_field(Integer g_a, Integer *lo, Integer *hi,Integer foff, Integer fsize,
    void *buf, Integer *ld, Integer *nbhandle)
{
  ngai_get_common(g_a,lo,hi,buf,ld,foff,fsize,nbhandle);
}
1336
1337 #ifdef __crayx1
1338 # pragma _CRI inline ga_get_
1339 # pragma _CRI inline ngai_get_common
1340 #endif
1341
1342 /**
1343 * A common routine called by both non-blocking and blocking GA acc calls.
1344 */
/**
 * A common routine called by both non-blocking and blocking GA acc calls.
 *
 * Accumulates the local patch buf (leading dimensions ld) into the
 * distributed patch g_a[lo:hi], scaled by alpha:  g_a += alpha * buf.
 * If nbhandle is non-NULL the operation is issued non-blocking and the
 * handle is returned to the caller; otherwise an internal handle is used
 * and the routine waits for completion before returning.
 *
 * Fix relative to the original: `np` was declared but never assigned and
 * then read when updating GAstat.numacc_procs (uninitialized read).  The
 * iterator interface does not report the number of participating
 * processors, so np is now explicitly zero-initialized; unused locals
 * `p` and `idx` were removed.
 */
void ngai_acc_common(Integer g_a,
                     Integer *lo,
                     Integer *hi,
                     void *buf,
                     Integer *ld,
                     void *alpha,
                     Integer *nbhandle)
{
  Integer handle=GA_OFFSET + g_a;
  Integer elems, size, type, p_handle, ga_nbhandle;
  /* number of procs holding part of the patch; not available from the
   * iterator interface, so stats count zero rather than garbage */
  Integer np = 0;
  int optype=-1, loop, ndim, cond;
  int proc;
  int num_loops=2; /* 1st loop for remote procs; 2nd loop for local procs */
  Integer n_rstrctd;
  Integer *rank_rstrctd;
  _iterator_hdl it_hdl;

  GA_Internal_Threadsafe_Lock();

  ga_check_handleM(g_a, "ngai_acc_common");

  size = GA[handle].elemsize;
  type = GA[handle].type;
  ndim = GA[handle].ndim;
  p_handle = GA[handle].p_handle;
  n_rstrctd = (Integer)GA[handle].num_rstrctd;
  rank_rstrctd = GA[handle].rank_rstrctd;

  /* map the GA element type onto the ARMCI accumulate operation code */
  if(type==C_DBL) optype= ARMCI_ACC_DBL;
  else if(type==C_FLOAT) optype= ARMCI_ACC_FLT;
  else if(type==C_DCPL)optype= ARMCI_ACC_DCP;
  else if(type==C_SCPL)optype= ARMCI_ACC_CPL;
  else if(type==C_INT)optype= ARMCI_ACC_INT;
  else if(type==C_LONG)optype= ARMCI_ACC_LNG;
  else pnga_error("type not supported",type);

  gai_iterator_init(g_a, lo, hi, &it_hdl);

#ifndef NO_GA_STATS
  gam_CountElems(ndim, lo, hi, &elems);
  GAbytes.acctot += (double)size*elems;
  GAstat.numacc++;
  GAstat.numacc_procs += np;
#endif

  /* caller-supplied handle, or an internal one used to overlap the
   * remote transfers issued in the first loop */
  if(nbhandle)ga_init_nbhandle(nbhandle);
  else ga_init_nbhandle(&ga_nbhandle);

#ifdef PROFILE_OLD
  ga_profile_start((int)handle, (long)size*elems, ndim, lo, hi,
      ENABLE_PROFILE_ACC);
#endif

  /* loop 0 handles remote (off-SMP) destinations, loop 1 local ones */
  for(loop=0; loop<num_loops; loop++) {
    Integer ldrem[MAXDIM];
    int stride_rem[MAXDIM], stride_loc[MAXDIM], count[MAXDIM];
    Integer idx_buf, *plo, *phi;
    char *pbuf, *prem;
    gai_iterator_reset(&it_hdl);

    while (gai_iterator_next(&it_hdl, &proc, &plo, &phi, &prem, ldrem)) {

      /* check if it is local to SMP */
      cond = armci_domain_same_id(ARMCI_DOMAIN_SMP,(int)proc);
      if(loop==0) cond = !cond;

      if(cond) {

        /* find the right spot in the user buffer */
        gam_ComputePatchIndex(ndim,lo, plo, ld, &idx_buf);
        pbuf = size*idx_buf + (char*)buf;

        gam_ComputeCount(ndim, plo, phi, count);

        /* scale number of rows by element size */
        count[0] *= size;
        gam_setstride(ndim, size, ld, ldrem, stride_rem, stride_loc);

        if(GA_fence_set)fence_array[proc]=1;

#ifndef NO_GA_STATS
        if(proc == GAme){
          gam_CountElems(ndim, plo, phi, &elems);
          GAbytes.accloc += (double)size*elems;
        }
#endif

        if(nbhandle) {
          ARMCI_NbAccS(optype, alpha, pbuf, stride_loc, prem,
              stride_rem, count, ndim-1, proc,
              (armci_hdl_t*)get_armci_nbhandle(nbhandle));
        } else {
#  if !defined(DISABLE_NBOPT)
          /* last remote transfer and all local transfers are blocking;
           * earlier remote transfers are issued non-blocking */
          if((loop==0 && gai_iterator_last(&it_hdl)) || loop==1) {
            ARMCI_AccS(optype, alpha, pbuf, stride_loc, prem, stride_rem,
                count, ndim-1, proc);
          } else {
            ARMCI_NbAccS(optype, alpha, pbuf, stride_loc, prem,
                stride_rem, count, ndim-1, proc,
                (armci_hdl_t*)get_armci_nbhandle(&ga_nbhandle));
          }
#  else
          ARMCI_AccS(optype, alpha, pbuf, stride_loc, prem, stride_rem,
              count, ndim-1, proc);
#  endif
        }
      } /* end if(cond) */
    }
  }
#if !defined(DISABLE_NBOPT)
  if(!nbhandle) nga_wait_internal(&ga_nbhandle);
#endif

#ifdef PROFILE_OLD
  ga_profile_stop();
#endif
  GA_Internal_Threadsafe_Unlock();

  gai_iterator_destroy(&it_hdl);
}
1465
1466 /**
1467 * Accumulate operation for an N-dimensional patch of a Global Array
1468 * g_a += alpha * patch
1469 */
1470 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1471 # pragma weak wnga_acc = pnga_acc
1472 #endif
1473
/**
 * Blocking accumulate: g_a[lo:hi] += alpha * buf.
 * Thin wrapper that delegates to ngai_acc_common with a NULL
 * non-blocking handle (i.e. complete before returning).
 */
void pnga_acc(Integer g_a,
              Integer *lo,
              Integer *hi,
              void *buf,
              Integer *ld,
              void *alpha)
{
  ngai_acc_common(g_a,lo,hi,buf,ld,alpha,NULL);
}
1483
1484 /**
1485 * (Non-blocking) Accumulate operation for an N-dimensional patch of a Global Array
1486 * g_a += alpha * patch
1487 */
1488 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1489 # pragma weak wnga_nbacc = pnga_nbacc
1490 #endif
1491
/**
 * Non-blocking accumulate: g_a[lo:hi] += alpha * buf.
 * nbhndl receives the request handle the caller must later wait on.
 * Thin wrapper around ngai_acc_common.
 */
void pnga_nbacc(Integer g_a,
                Integer *lo,
                Integer *hi,
                void *buf,
                Integer *ld,
                void *alpha,
                Integer *nbhndl)
{
  ngai_acc_common(g_a,lo,hi,buf,ld,alpha,nbhndl);
}
1502
1503 /**
1504 * Return a pointer to local data in a Global Array
1505 */
1506 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1507 # pragma weak wnga_access_ptr = pnga_access_ptr
1508 #endif
1509
/* g_a:    array handle                                   [input]
 * lo, hi: corners of the requested patch                 [input]
 * ptr:    (treated as char**) receives address of patch  [output]
 * ld:     leading dimensions of the local block          [output]
 *
 * Errors out unless both corners of the patch are owned by this process
 * (or, for SMP-aware layouts, by a process on the same SMP node).
 */
void pnga_access_ptr(Integer g_a, Integer lo[], Integer hi[],
    void* ptr, Integer ld[])
{
  char *lptr;
  Integer handle = GA_OFFSET + g_a;
  Integer ow,i,p_handle;


  p_handle = GA[handle].p_handle;
  /* verify the low corner is locally accessible; for group arrays the
   * owner is first mapped back to the world-process numbering */
  if (!pnga_locate(g_a,lo,&ow)) pnga_error("locate top failed",0);
  if (p_handle != -1)
    ow = PGRP_LIST[p_handle].inv_map_proc_list[ow];
  if ((armci_domain_id(ARMCI_DOMAIN_SMP, ow) != armci_domain_my_id(ARMCI_DOMAIN_SMP)) && (ow != GAme))
    pnga_error("cannot access top of the patch",ow);
  /* same check for the high corner */
  if (!pnga_locate(g_a,hi, &ow)) pnga_error("locate bottom failed",0);
  if (p_handle != -1)
    ow = PGRP_LIST[p_handle].inv_map_proc_list[ow];
  if ((armci_domain_id(ARMCI_DOMAIN_SMP, ow) != armci_domain_my_id(ARMCI_DOMAIN_SMP)) && (ow != GAme))
    pnga_error("cannot access bottom of the patch",ow);

  /* reject degenerate patches */
  for (i=0; i<GA[handle].ndim; i++)
    if(lo[i]>hi[i]) {
      ga_RegionError(GA[handle].ndim, lo, hi, g_a);
    }

  /* map owner back into group-local numbering before locating the data */
  if (p_handle >= 0) {
    ow = PGRP_LIST[p_handle].map_proc_list[ow];
  }
  if (GA[handle].num_rstrctd > 0) {
    ow = GA[handle].rank_rstrctd[ow];
  }
  gam_Location(ow,handle, lo, &lptr, ld);
  *(char**)ptr = lptr;
}
1544
1545 /*\ RETURN A POINTER TO BEGINNING OF LOCAL DATA BLOCK
1546 \*/
1547 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1548 # pragma weak wnga_access_block_grid_ptr = pnga_access_block_grid_ptr
1549 #endif
1550
void pnga_access_block_grid_ptr(Integer g_a, Integer *index, void* ptr, Integer *ld)
/* g_a:   array handle                               [input]
 * index: subscript of a particular block            [input]
 * ptr:   pointer to data in block                   [output]
 * ld:    array of strides for block data            [output]
 *
 * Valid only for SCALAPACK, TILED and TILED_IRREG distributions; errors
 * out otherwise.  Computes the address of the requested block within the
 * owning process's local allocation.
 */
{
  char *lptr;
  Integer handle = GA_OFFSET + g_a;
  Integer i/*, p_handle*/, offset, factor, inode;
  Integer ndim;
  C_Integer *num_blocks, *block_dims;
  int *proc_grid;
  Integer *dims;
  Integer last;
  Integer proc_index[MAXDIM];
  Integer blk_num[MAXDIM], blk_dim[MAXDIM], blk_inc[MAXDIM];
  Integer blk_ld[MAXDIM],hlf_blk[MAXDIM],blk_jinc;
  Integer j, lo, hi;
  Integer lld[MAXDIM], block_idx[MAXDIM], block_count[MAXDIM];
  Integer ldims[MAXDIM], ldidx[MAXDIM];
  Integer *mapc;


  /*p_handle = GA[handle].p_handle;*/
  if (GA[handle].distr_type != SCALAPACK && GA[handle].distr_type != TILED &&
      GA[handle].distr_type != TILED_IRREG) {
    pnga_error("Array is not using ScaLAPACK or tiled data distribution",0);
  }
  /* dimensions of processor grid */
  proc_grid = GA[handle].nblock;
  /* number of blocks along each dimension */
  num_blocks = GA[handle].num_blocks;
  /* size of individual blocks */
  block_dims = GA[handle].block_dims;
  dims = GA[handle].dims;
  ndim = GA[handle].ndim;
  for (i=0; i<ndim; i++) {
    if (index[i] < 0 || index[i] >= num_blocks[i])
      pnga_error("block index outside allowed values",index[i]);
  }

  /* Find strides of requested block */
  if (GA[handle].distr_type == TILED) {
    /* find out what processor block is located on */
    gam_find_tile_proc_from_indices(handle, inode, index)

    /* get proc indices of processor that owns block */
    gam_find_tile_proc_indices(handle,inode,proc_index)
    last = ndim-1;

    /* block extent, clipped against the array boundary for edge blocks */
    for (i=0; i<ndim; i++) {
      lo = index[i]*block_dims[i]+1;
      hi = (index[i]+1)*block_dims[i];
      if (hi > dims[i]) hi = dims[i];
      ldims[i] = (hi - lo + 1);
      if (i<last) ld[i] = ldims[i];
    }
  } else if (GA[handle].distr_type == TILED_IRREG) {
    /* find out what processor block is located on */
    gam_find_tile_proc_from_indices(handle, inode, index);

    /* get proc indices of processor that owns block */
    gam_find_tile_proc_indices(handle,inode,proc_index)
    last = ndim-1;

    /* block boundaries come from the irregular map array mapc; also
     * accumulate, per dimension, the total local extent (lld) and the
     * extent preceding the requested block (ldidx) over all blocks this
     * processor owns along that dimension */
    mapc = GA[handle].mapc;
    offset = 0;
    for (i=0; i<ndim; i++) {
      lld[i] = 0;
      ldidx[i] = 0;
      lo = mapc[offset+index[i]];
      if (index[i] < num_blocks[i]-1) {
        hi = mapc[offset+index[i]+1]-1;
      } else {
        hi = dims[i];
      }
      ldims[i] = hi - lo + 1;
      for (j = proc_index[i]; j<num_blocks[i]; j += proc_grid[i]) {
        lo = mapc[offset+j];
        if (j < num_blocks[i]-1) {
          hi = mapc[offset+j+1]-1;
        } else {
          hi = dims[i];
        }
        lld[i] += (hi-lo+1);
        if (j < index[i]) {
          ldidx[i] += (hi-lo+1);
        }
      }
      if (i<last) ld[i] = ldims[i];
      offset += num_blocks[i];
    }
  } else if (GA[handle].distr_type == SCALAPACK) {
    /* find out what processor block is located on */
    gam_find_proc_from_sl_indices(handle, inode, index);

    /* get proc indices of processor that owns block */
    gam_find_proc_indices(handle,inode,proc_index)
    last = ndim-1;

    /* strides of the full local panel held by the owner, including the
     * partial block at the array edge (blk_jinc) when present */
    for (i=0; i<last; i++) {
      blk_dim[i] = block_dims[i]*proc_grid[i];
      blk_num[i] = GA[handle].dims[i]/blk_dim[i];
      blk_inc[i] = GA[handle].dims[i] - blk_num[i]*blk_dim[i];
      blk_ld[i] = blk_num[i]*block_dims[i];
      hlf_blk[i] = blk_inc[i]/block_dims[i];
      ld[i] = blk_ld[i];
      blk_jinc = dims[i]%block_dims[i];
      if (blk_inc[i] > 0) {
        if (proc_index[i]<hlf_blk[i]) {
          blk_jinc = block_dims[i];
        } else if (proc_index[i] == hlf_blk[i]) {
          blk_jinc = blk_inc[i]%block_dims[i];
          /*
          if (blk_jinc == 0) {
            blk_jinc = block_dims[i];
          }
          */
        } else {
          blk_jinc = 0;
        }
      }
      ld[i] += blk_jinc;
    }
  }

  /* Find the local grid index of block relative to local block grid and
     store result in block_idx.
     Find physical dimensions of locally held data and store in
     lld and set values in ldim, which is used to evaluate the
     offset for the requested block. */
  if (GA[handle].distr_type == TILED) {
    for (i=0; i<ndim; i++) {
      block_idx[i] = 0;
      block_count[i] = 0;
      lld[i] = 0;
      lo = 0;
      hi = -1;
      for (j=proc_index[i]; j<num_blocks[i]; j += proc_grid[i]) {
        lo = j*block_dims[i] + 1;
        hi = (j+1)*block_dims[i];
        if (hi > dims[i]) hi = dims[i];
        lld[i] += (hi - lo + 1);
        if (j<index[i]) block_idx[i]++;
        block_count[i]++;
      }
    }

    /* Evaluate offset for requested block. The algorithm used goes like this:
     *    The contribution from the fastest dimension is
     *    block_idx[0]*block_dims[0]*...*block_dims[ndim-1];
     *    The contribution from the second fastest dimension is
     *    block_idx[1]*lld[0]*block_dims[1]*...*block_dims[ndim-1];
     *    The contribution from the third fastest dimension is
     *    block_idx[2]*lld[0]*lld[1]*block_dims[2]*...*block_dims[ndim-1];
     *    etc.
     *    If block_idx[i] is equal to the total number of blocks contained on that
     *    processor minus 1 (the index is at the edge of the array) and the index
     *    i is greater than the index of the dimension, then instead of using the
     *    block dimension, use the fractional dimension of the edge block (which
     *    may be smaller than the block dimension)
     */
    offset = 0;
    for (i=0; i<ndim; i++) {
      factor = 1;
      for (j=0; j<i; j++) {
        factor *= lld[j];
      }
      for (j=i; j<ndim; j++) {
        if (j > i && block_idx[j] > block_count[j]-1) {
          factor *= ldims[j];
        } else {
          factor *= block_dims[j];
        }
      }
      offset += block_idx[i]*factor;
    }
  } else if (GA[handle].distr_type == TILED_IRREG) {
    /* Evaluate offset for requested block. This algorithm is similar to the
     * algorithm for reqularly tiled data layouts.
     *    The contribution from the fastest dimension is
     *    ldidx[0]*ldims[2]*...*ldims[ndim-1];
     *    The contribution from the second fastest dimension is
     *    lld[0]*ldidx[1]*ldims[2]*...*ldims[ndim-1];
     *    The contribution from the third fastest dimension is
     *    lld[0]*lld[1]*ldidx[2]*ldims[3]*...*ldims[ndim-1];
     *    etc.
     */
    offset = 0;
    for (i=0; i<ndim; i++) {
      factor = 1;
      for (j=0; j<i; j++) {
        factor *= lld[j];
      }
      factor *= ldidx[i];
      for (j=i+1; j<ndim; j++) {
        factor *= ldims[j];
      }
      offset += factor;
    }
  } else if (GA[handle].distr_type == SCALAPACK) {
    /* Evalauate offset for block */
    offset = 0;
    factor = 1;
    for (i = 0; i<ndim; i++) {
      offset += ((index[i]-proc_index[i])/proc_grid[i])*block_dims[i]*factor;
      if (i<ndim-1) factor *= ld[i];
    }
  }

  /* NOTE(review): inode is set by the gam_find_* macros above; offset is
   * only assigned on the three handled distribution branches, which the
   * initial distr_type check guarantees */
  lptr = GA[handle].ptr[inode]+offset*GA[handle].elemsize;

  *(char**)ptr = lptr;
}
1766
1767 /**
1768 * Return a pointer to the beginning of a local data block in a block-cyclic
1769 * data distribution
1770 */
1771 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1772 # pragma weak wnga_access_block_ptr = pnga_access_block_ptr
1773 #endif
1774
void pnga_access_block_ptr(Integer g_a, Integer idx, void* ptr, Integer *ld)
/* g_a: array handle [input]
 * idx: block index [input]
 * ptr: pointer to data in block [output]
 * ld:  array of strides for block data [output]
 */
{
  char *lptr;
  Integer handle = GA_OFFSET + g_a;
  Integer i, j/*, p_handle*/, nblocks, offset, tsum, inode;
  Integer ndim, lo[MAXDIM], hi[MAXDIM], index;


  /*p_handle = GA[handle].p_handle;*/
  nblocks = GA[handle].block_total;
  ndim = GA[handle].ndim;
  index = idx;
  if (index < 0 || index >= nblocks)
    pnga_error("block index outside allowed values",index);

  if (GA[handle].distr_type == BLOCK_CYCLIC) {
    /* simple block-cyclic: blocks are dealt round-robin over processes;
     * sum the sizes of the owner's earlier blocks to get the offset */
    offset = 0;
    inode = index%GAnproc;
    for (i=inode; i<index; i += GAnproc) {
      ga_ownsM(handle,i,lo,hi);
      tsum = 1;
      for (j=0; j<ndim; j++) {
        tsum *= (hi[j]-lo[j]+1);
      }
      offset += tsum;
    }
    lptr = GA[handle].ptr[inode]+offset*GA[handle].elemsize;

    /* leading dimensions are the extents of the requested block */
    ga_ownsM(handle,index,lo,hi);
    for (i=0; i<ndim-1; i++) {
      ld[i] = hi[i]-lo[i]+1;
    }
  } else if (GA[handle].distr_type == SCALAPACK ||
      GA[handle].distr_type == TILED ||
      GA[handle].distr_type == TILED_IRREG) {
    Integer indices[MAXDIM];
    /* find block indices */
    gam_find_block_indices(handle,index,indices);
    /* find pointer */
    pnga_access_block_grid_ptr(g_a, indices, &lptr, ld);
  }
  /* NOTE(review): lptr is only assigned on the branches above; other
   * distr_type values would leave it uninitialized -- presumably such
   * arrays never reach this routine; confirm with callers */
  *(char**)ptr = lptr;

}
1824
1825 /**
1826 * Return a pointer to the beginning of local data on a processor containing
1827 * block-cyclic data
1828 */
1829 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1830 # pragma weak wnga_access_block_segment_ptr = pnga_access_block_segment_ptr
1831 #endif
1832
void pnga_access_block_segment_ptr(Integer g_a, Integer proc, void* ptr, Integer *len)
/* g_a:  array handle [input]
 * proc: processor for data [input]
 * ptr:  pointer to data start of data on processor [output]
 * len:  length of data contained on processor [output]
 */
{
  Integer ga_handle = GA_OFFSET + g_a;
  int grp = GA[ga_handle].p_handle;
  Integer iproc = proc;

  /* the requested rank must be a valid member of the array's group,
   * and only the calling process itself may be queried */
  if (iproc < 0 || iproc >= pnga_pgroup_nnodes(grp))
    pnga_error("processor index outside allowed values",iproc);
  if (iproc != pnga_pgroup_nodeid(grp))
    pnga_error("Only get accurate number of elements for processor making request",0);

  /* total element count of the local allocation, and its base address */
  *len = GA[ga_handle].size/GA[ga_handle].elemsize;
  *(char**)ptr = GA[ga_handle].ptr[iproc];
}
1861
1862 /**
1863 * Provide access to a patch of a Global Array using an index
1864 */
1865 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1866 # pragma weak wnga_access_idx = pnga_access_idx
1867 #endif
1868
/* g_a:    array handle                                        [input]
 * lo, hi: corners of the requested patch                      [input]
 * index:  MA-style index of the patch in the base array for
 *         the array's type (dbl_mb/int_mb/...), Fortran-based [output]
 * ld:     leading dimensions of the local block               [output]
 */
void pnga_access_idx(Integer g_a, Integer lo[], Integer hi[],
    AccessIndex* index, Integer ld[])
{
  char *ptr;
  Integer handle = GA_OFFSET + g_a;
  Integer ow,i,p_handle;
  unsigned long elemsize;
  unsigned long lref=0, lptr;


  p_handle = GA[handle].p_handle;
  /* both corners of the patch must be accessible from this process */
  if(!pnga_locate(g_a,lo,&ow))pnga_error("locate top failed",0);
  if (p_handle != -1)
    ow = PGRP_LIST[p_handle].inv_map_proc_list[ow];
  if ((armci_domain_id(ARMCI_DOMAIN_SMP, ow) != armci_domain_my_id(ARMCI_DOMAIN_SMP)) && (ow != GAme))
    pnga_error("cannot access top of the patch",ow);
  if(!pnga_locate(g_a,hi, &ow))pnga_error("locate bottom failed",0);
  if (p_handle != -1)
    ow = PGRP_LIST[p_handle].inv_map_proc_list[ow];
  if ((armci_domain_id(ARMCI_DOMAIN_SMP, ow) != armci_domain_my_id(ARMCI_DOMAIN_SMP)) && (ow != GAme))
    pnga_error("cannot access bottom of the patch",ow);

  for (i=0; i<GA[handle].ndim; i++)
    if(lo[i]>hi[i]) {
      ga_RegionError(GA[handle].ndim, lo, hi, g_a);
    }


  if (p_handle != -1)
    ow = PGRP_LIST[p_handle].map_proc_list[ow];

  gam_Location(ow,handle, lo, &ptr, ld);

  /*
   * return patch address as the distance elements from the reference address
   *
   * .in Fortran we need only the index to the type array: dbl_mb or int_mb
   *  that are elements of COMMON in the the mafdecls.h include file
   * .in C we need both the index and the pointer
   */

  elemsize = (unsigned long)GA[handle].elemsize;

  /* compute index and check if it is correct */
  switch (pnga_type_c2f(GA[handle].type)){
    case MT_F_DBL:
      *index = (AccessIndex) ((DoublePrecision*)ptr - DBL_MB);
      lref = (unsigned long)DBL_MB;
      break;

    case MT_F_DCPL:
      *index = (AccessIndex) ((DoubleComplex*)ptr - DCPL_MB);
      lref = (unsigned long)DCPL_MB;
      break;

    case MT_F_SCPL:
      *index = (AccessIndex) ((SingleComplex*)ptr - SCPL_MB);
      lref = (unsigned long)SCPL_MB;
      break;

    case MT_F_INT:
      *index = (AccessIndex) ((Integer*)ptr - INT_MB);
      lref = (unsigned long)INT_MB;
      break;

    case MT_F_REAL:
      *index = (AccessIndex) ((float*)ptr - FLT_MB);
      lref = (unsigned long)FLT_MB;
      break;
  }

#ifdef BYTE_ADDRESSABLE_MEMORY
  /* check the allignment */
  lptr = (unsigned long)ptr;
  if( lptr%elemsize != lref%elemsize ){
    printf("%d: lptr=%lu(%lu) lref=%lu(%lu)\n",(int)GAme,lptr,lptr%elemsize,
        lref,lref%elemsize);
    pnga_error("nga_access: MA addressing problem: base address misallignment",
        handle);
  }
#endif

  /* adjust index for Fortran addressing */
  (*index) ++ ;
  FLUSH_CACHE;

}
1956
1957 /*\ PROVIDE ACCESS TO AN INDIVIDUAL DATA BLOCK OF A GLOBAL ARRAY
1958 \*/
1959 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1960 # pragma weak wnga_access_block_idx = pnga_access_block_idx
1961 #endif
1962
/* g_a:   array handle                                        [input]
 * idx:   block index                                         [input]
 * index: MA-style index of the block in the base array for
 *        the array's type (dbl_mb/int_mb/...), Fortran-based [output]
 * ld:    array of strides for block data                     [output]
 */
void pnga_access_block_idx(Integer g_a, Integer idx, AccessIndex* index, Integer *ld)
{
  char *ptr;
  Integer handle = GA_OFFSET + g_a;
  Integer /*p_handle,*/ iblock;
  unsigned long elemsize;
  unsigned long lref=0, lptr;


  /*p_handle = GA[handle].p_handle;*/
  iblock = idx;
  if (iblock < 0 || iblock >= GA[handle].block_total)
    pnga_error("block index outside allowed values",iblock);

  /* obtain the raw pointer, then convert it to an MA base-array index */
  pnga_access_block_ptr(g_a,iblock,&ptr,ld);
  /*
   * return patch address as the distance elements from the reference address
   *
   * .in Fortran we need only the index to the type array: dbl_mb or int_mb
   *  that are elements of COMMON in the the mafdecls.h include file
   * .in C we need both the index and the pointer
   */

  elemsize = (unsigned long)GA[handle].elemsize;

  /* compute index and check if it is correct */
  switch (pnga_type_c2f(GA[handle].type)){
    case MT_F_DBL:
      *index = (AccessIndex) ((DoublePrecision*)ptr - DBL_MB);
      lref = (unsigned long)DBL_MB;
      break;

    case MT_F_DCPL:
      *index = (AccessIndex) ((DoubleComplex*)ptr - DCPL_MB);
      lref = (unsigned long)DCPL_MB;
      break;

    case MT_F_SCPL:
      *index = (AccessIndex) ((SingleComplex*)ptr - SCPL_MB);
      lref = (unsigned long)SCPL_MB;
      break;

    case MT_F_INT:
      *index = (AccessIndex) ((Integer*)ptr - INT_MB);
      lref = (unsigned long)INT_MB;
      break;

    case MT_F_REAL:
      *index = (AccessIndex) ((float*)ptr - FLT_MB);
      lref = (unsigned long)FLT_MB;
      break;
  }

#ifdef BYTE_ADDRESSABLE_MEMORY
  /* check the allignment */
  lptr = (unsigned long)ptr;
  if( lptr%elemsize != lref%elemsize ){
    printf("%d: lptr=%lu(%lu) lref=%lu(%lu)\n",(int)GAme,lptr,lptr%elemsize,
        lref,lref%elemsize);
    pnga_error("nga_access: MA addressing problem: base address misallignment",
        handle);
  }
#endif

  /* adjust index for Fortran addressing */
  (*index) ++ ;
  FLUSH_CACHE;

}
2032
2033 /**
2034 * Provide access to an individual data block of a Global Array
2035 * with a SCALAPACK type block-cyclic data distribution
2036 */
2037 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2038 # pragma weak wnga_access_block_grid_idx = pnga_access_block_grid_idx
2039 #endif
2040
void pnga_access_block_grid_idx(Integer g_a, Integer* subscript,
                                AccessIndex *index, Integer *ld)
{
  char *blk_ptr;
  Integer handle = GA_OFFSET + g_a;
  Integer d, ndim;
  unsigned long elemsize;
  unsigned long base_addr = 0, blk_addr;

  /* validate the block subscript against the block grid in each dimension */
  ndim = GA[handle].ndim;
  for (d = 0; d < ndim; d++) {
    if (subscript[d] < 0 || subscript[d] >= GA[handle].num_blocks[d])
      pnga_error("index outside allowed values", subscript[d]);
  }

  pnga_access_block_grid_ptr(g_a, subscript, &blk_ptr, ld);

  /* Express the block address as an element offset from the reference
   * (MA base) array of the matching type.  Fortran callers need only the
   * index into dbl_mb/int_mb etc. (COMMON members declared in the
   * mafdecls.h include file); C callers use both the index and the
   * pointer. */
  elemsize = (unsigned long)GA[handle].elemsize;

  switch (pnga_type_c2f(GA[handle].type)) {
    case MT_F_DBL:
      *index = (AccessIndex)((DoublePrecision*)blk_ptr - DBL_MB);
      base_addr = (unsigned long)DBL_MB;
      break;
    case MT_F_DCPL:
      *index = (AccessIndex)((DoubleComplex*)blk_ptr - DCPL_MB);
      base_addr = (unsigned long)DCPL_MB;
      break;
    case MT_F_SCPL:
      *index = (AccessIndex)((SingleComplex*)blk_ptr - SCPL_MB);
      base_addr = (unsigned long)SCPL_MB;
      break;
    case MT_F_INT:
      *index = (AccessIndex)((Integer*)blk_ptr - INT_MB);
      base_addr = (unsigned long)INT_MB;
      break;
    case MT_F_REAL:
      *index = (AccessIndex)((float*)blk_ptr - FLT_MB);
      base_addr = (unsigned long)FLT_MB;
      break;
  }

#ifdef BYTE_ADDRESSABLE_MEMORY
  /* verify the block pointer and the MA base array are equally aligned */
  blk_addr = (unsigned long)blk_ptr;
  if (blk_addr%elemsize != base_addr%elemsize) {
    printf("%d: lptr=%lu(%lu) lref=%lu(%lu)\n", (int)GAme, blk_addr,
           blk_addr%elemsize, base_addr, base_addr%elemsize);
    pnga_error("nga_access: MA addressing problem: base address misallignment",
               handle);
  }
#endif

  /* adjust index for Fortran addressing */
  (*index)++;
  FLUSH_CACHE;
}
2112
2113 /*\ PROVIDE ACCESS TO A PATCH OF A GLOBAL ARRAY
2114 \*/
2115 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2116 # pragma weak wnga_access_block_segment_idx = pnga_access_block_segment_idx
2117 #endif
2118
void pnga_access_block_segment_idx(Integer g_a, Integer proc,
                                   AccessIndex* index, Integer *len)
{
  char *seg_ptr;
  Integer handle = GA_OFFSET + g_a;
  unsigned long elemsize;
  unsigned long base_addr = 0, seg_addr;

  /* Obtain the pointer to (and length of) the data segment held by
   * process proc, then express it as an element offset from the
   * reference (MA base) array of the matching type.  Fortran callers
   * need only the index into dbl_mb/int_mb etc. (COMMON members
   * declared in the mafdecls.h include file); C callers use both the
   * index and the pointer. */
  pnga_access_block_segment_ptr(g_a, proc, &seg_ptr, len);

  elemsize = (unsigned long)GA[handle].elemsize;

  switch (pnga_type_c2f(GA[handle].type)) {
    case MT_F_DBL:
      *index = (AccessIndex)((DoublePrecision*)seg_ptr - DBL_MB);
      base_addr = (unsigned long)DBL_MB;
      break;
    case MT_F_DCPL:
      *index = (AccessIndex)((DoubleComplex*)seg_ptr - DCPL_MB);
      base_addr = (unsigned long)DCPL_MB;
      break;
    case MT_F_SCPL:
      *index = (AccessIndex)((SingleComplex*)seg_ptr - SCPL_MB);
      base_addr = (unsigned long)SCPL_MB;
      break;
    case MT_F_INT:
      *index = (AccessIndex)((Integer*)seg_ptr - INT_MB);
      base_addr = (unsigned long)INT_MB;
      break;
    case MT_F_REAL:
      *index = (AccessIndex)((float*)seg_ptr - FLT_MB);
      base_addr = (unsigned long)FLT_MB;
      break;
  }

#ifdef BYTE_ADDRESSABLE_MEMORY
  /* verify the segment pointer and the MA base array are equally aligned */
  seg_addr = (unsigned long)seg_ptr;
  if (seg_addr%elemsize != base_addr%elemsize) {
    printf("%d: lptr=%lu(%lu) lref=%lu(%lu)\n", (int)GAme, seg_addr,
           seg_addr%elemsize, base_addr, base_addr%elemsize);
    pnga_error("nga_access_block_segment: MA addressing problem: base address misallignment",
               handle);
  }
#endif

  /* adjust index for Fortran addressing */
  (*index)++;
  FLUSH_CACHE;
}
2186
2187 /**
2188 * Release access to a patch of a Global Array
2189 */
2190 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2191 # pragma weak wnga_release = pnga_release
2192 #endif
2193
/* Intentionally empty: this implementation keeps no per-access state,
 * so releasing read-only access to a patch requires no action.
 * Parameters are retained to satisfy the public API. */
void pnga_release(Integer g_a, Integer *lo, Integer *hi)
{}
2196
2197 /**
2198 * Release access and update a patch of a Global Array
2199 */
2200 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2201 # pragma weak wnga_release_update = pnga_release_update
2202 #endif
2203
/* Intentionally empty: this implementation keeps no per-access state,
 * so releasing (with update) access to a patch requires no action.
 * Parameters are retained to satisfy the public API. */
void pnga_release_update(Integer g_a, Integer *lo, Integer *hi)
{}
2206
2207 /**
2208 * Release access to a block in a block-cyclic Global Array
2209 */
2210 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2211 # pragma weak wnga_release_block = pnga_release_block
2212 #endif
2213
/* Intentionally a no-op: no per-access state is tracked for block access.
 * The commented-out loop below is leftover debug code that printed the
 * restricted-process list; kept for reference. */
void pnga_release_block(Integer g_a, Integer iblock)
{
  /*
  Integer i;
  Integer handle = GA_OFFSET + g_a;
  for (i=0; i<GA[handle].num_rstrctd; i++) {
    printf("p[%d] location: %d rstrctd_list[%d]: %d\n",GAme,*iblock,
           i,GA[handle].rstrctd_list[i]);
  }
  */
}
2225
2226 /**
2227 * Release access and update a block in a block-cyclic Global Array
2228 */
2229 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2230 # pragma weak wnga_release_update_block = pnga_release_update_block
2231 #endif
2232
/* Intentionally empty: no per-access state is tracked, so releasing
 * (with update) a block requires no action. */
void pnga_release_update_block(Integer g_a, Integer iblock)
{}
2235
2236 /**
2237 * Release access to a block in a block-cyclic Global Array with proc grid
2238 * (SCALAPack) layout
2239 */
2240 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2241 # pragma weak wnga_release_block_grid = pnga_release_block_grid
2242 #endif
2243
/* Intentionally empty: no per-access state is tracked, so releasing a
 * block addressed by grid subscript requires no action. */
void pnga_release_block_grid(Integer g_a, Integer *index)
{}
2246
2247 /**
2248 * Release access and update a block in a block-cyclic Global Array with
2249 * proc grid (SCALAPack) layout
2250 */
2251 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2252 # pragma weak wnga_release_update_block_grid = pnga_release_update_block_grid
2253 #endif
2254
/* Intentionally empty: no per-access state is tracked, so releasing
 * (with update) a block addressed by grid subscript requires no action. */
void pnga_release_update_block_grid(Integer g_a, Integer *index)
{}
2257
2258 /**
2259 * Release access to data segment in a block-cyclic Global Array
2260 */
2261 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2262 # pragma weak wnga_release_block_segment = pnga_release_block_segment
2263 #endif
2264
/* Intentionally empty: no per-access state is tracked, so releasing a
 * locally held data segment requires no action. */
void pnga_release_block_segment(Integer g_a, Integer iproc)
{}
2267
2268 /**
2269 * Release access and update a block in a block-cyclic Global Array
2270 */
2271 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2272 # pragma weak wnga_release_update_block_segment = pnga_release_update_block_segment
2273 #endif
2274
/* Intentionally empty: no per-access state is tracked, so releasing
 * (with update) a locally held data segment requires no action. */
void pnga_release_update_block_segment(Integer g_a, Integer iproc)
{}
2277
/**
 * Accumulate nv values into elements (i[k],j[k]) of g_a, all of which
 * must be owned by process proc: element (i[k],j[k]) += *alpha * v[k],
 * performed via a single ARMCI vector accumulate.
 *
 * @param g_a   global array handle
 * @param v     source values (nv elements of the array's type)
 * @param i     row subscripts (nv entries), must fall in proc's patch
 * @param j     column subscripts (nv entries), must fall in proc's patch
 * @param nv    number of values; returns immediately if < 1
 * @param alpha scale factor (same type as the array); if NULL no ARMCI
 *              operation is issued
 * @param proc  owner of all (i[k],j[k]) elements (block index for
 *              block-cyclic distributions; remapped to a rank below)
 */
void gai_scatter_acc_local(Integer g_a, void *v, Integer *i, Integer *j,
                           Integer nv, void* alpha, Integer proc)
{
  void **ptr_src, **ptr_dst;
  char *ptr_ref;
  Integer ldp, item_size, ilo, ihi, jlo, jhi, type;
  Integer lo[2], hi[2];
  Integer handle,p_handle,iproc;
  armci_giov_t desc;
  register Integer k, offset;
  int rc=0;

  if (nv < 1) return;

  handle = GA_OFFSET + g_a;
  p_handle = GA[handle].p_handle;

  /* bounds of the patch owned by proc (block, for block-cyclic layouts) */
  pnga_distribution(g_a, proc, lo, hi);
  ilo = lo[0];
  jlo = lo[1];
  ihi = hi[0];
  jhi = hi[1];

  /* locate the first element of the patch and, for non-regular layouts,
   * remap the block index 'proc' to the rank that actually holds it */
  iproc = proc;
  if (GA[handle].distr_type == REGULAR) {
    if (GA[handle].num_rstrctd > 0)
      iproc = GA[handle].rank_rstrctd[iproc];
    gaShmemLocation(iproc, g_a, ilo, jlo, &ptr_ref, &ldp);
  } else {
    /*
    Integer lo[2];
    lo[0] = ilo;
    lo[1] = jlo;
    */
    pnga_access_block_ptr(g_a, iproc, &ptr_ref, &ldp);
    pnga_release_block(g_a, iproc);
    if (GA[handle].distr_type == BLOCK_CYCLIC) {
      /* simple block-cyclic: blocks are dealt round-robin over ranks */
      proc = proc%pnga_nnodes();
    } else if (GA[handle].distr_type == SCALAPACK) {
      /* map block index -> block grid indices -> owning rank */
      Integer index[2];
      gam_find_block_indices(handle, proc, index);
      index[0] = index[0]%GA[handle].nblock[0];
      index[1] = index[1]%GA[handle].nblock[1];
      gam_find_proc_from_sl_indices(handle,proc,index);
    } else if (GA[handle].distr_type == TILED ||
               GA[handle].distr_type == TILED_IRREG) {
      Integer index[2];
      gam_find_block_indices(handle, proc, index);
      index[0] = index[0]%GA[handle].nblock[0];
      index[1] = index[1]%GA[handle].nblock[1];
      gam_find_tile_proc_from_indices(handle,proc,index);
    }

  }

  type = GA[handle].type;
  item_size = GAsizeofM(type);

  /* one buffer holds both pointer arrays: [0,nv) src, [nv,2nv) dst */
  ptr_src = malloc((int)nv*2*sizeof(void*));
  if(ptr_src==NULL)pnga_error("malloc failed",nv);
  ptr_dst=ptr_src+ nv;

  /* build the vector descriptor: one src/dst pointer pair per element */
  for(k=0; k< nv; k++){
    if(i[k] < ilo || i[k] > ihi || j[k] < jlo || j[k] > jhi){
      char err_string[ERR_STR_LEN];
      sprintf(err_string,"proc=%d invalid i/j=(%ld,%ld)>< [%ld:%ld,%ld:%ld]",
              (int)proc, (long)i[k], (long)j[k], (long)ilo,
              (long)ihi, (long)jlo, (long)jhi);
      pnga_error(err_string,g_a);
    }

    /* column-major offset within proc's patch */
    offset = (j[k] - jlo)* ldp + i[k] - ilo;
    ptr_dst[k] = ptr_ref + item_size * offset;
    ptr_src[k] = ((char*)v) + k*item_size;
  }
  desc.bytes = (int)item_size;
  desc.src_ptr_array = ptr_src;
  desc.dst_ptr_array = ptr_dst;
  desc.ptr_array_len = (int)nv;

  if(GA_fence_set)fence_array[proc]=1;

  /* translate group-relative rank to world rank for ARMCI */
  if (p_handle >= 0) {
    proc = PGRP_LIST[p_handle].inv_map_proc_list[proc];
  }
  /* NOTE(review): when alpha == NULL nothing is transferred although the
   * fence flag may already be set above -- confirm callers always pass a
   * non-NULL alpha. */
  if(alpha != NULL) {
    int optype=-1;
    if(type==C_DBL) optype= ARMCI_ACC_DBL;
    else if(type==C_DCPL)optype= ARMCI_ACC_DCP;
    else if(type==C_SCPL)optype= ARMCI_ACC_CPL;
    else if(type==C_INT)optype= ARMCI_ACC_INT;
    else if(type==C_LONG)optype= ARMCI_ACC_LNG;
    else if(type==C_FLOAT)optype= ARMCI_ACC_FLT;
    else pnga_error("type not supported",type);
    rc= ARMCI_AccV(optype, alpha, &desc, 1, (int)proc);
  }

  if(rc) pnga_error("scatter/_acc failed in armci",rc);

  free(ptr_src);

}
2381
2382 /**
2383 * Scatter nv elements of v into a Global Array at locations specified
2384 * by arrays i and j
2385 */
2386 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2387 # pragma weak wnga_scatter2d = pnga_scatter2d
2388 #endif
2389
/**
 * Scatter nv values into the 2D global array g_a: element (i[k],j[k])
 * is set to v[k].  Elements are grouped per owning process (or per
 * block for block-cyclic layouts) and delivered with one ARMCI vector
 * put per active target.
 *
 * @param g_a global array handle
 * @param v   source values (nv elements of the array's type)
 * @param i   row subscripts (nv entries)
 * @param j   column subscripts (nv entries)
 * @param nv  number of values; returns immediately if < 1
 */
void pnga_scatter2d(Integer g_a, void *v, Integer *i, Integer *j, Integer nv)
{
    register Integer k;
    Integer kk;
    Integer item_size;
    Integer proc, type=GA[GA_OFFSET + g_a].type;
    Integer nproc, p_handle, iproc;
    Integer subscrpt[2];

    Integer *aproc, naproc; /* active processes and numbers */
    Integer *map;           /* map the active processes to allocated space */
    char *buf1, *buf2;
    Integer handle = g_a + GA_OFFSET;

    Integer *count;   /* counters for each process */
    Integer *nelem;   /* number of elements for each process */
    /* source and destination pointers for each process */
    void ***ptr_src, ***ptr_dst;
    void **ptr_org;   /* the entire pointer array */
    armci_giov_t desc;
    Integer *ilo, *ihi, *jlo, *jhi, *ldp, *owner;
    Integer lo[2], hi[2];
    char **ptr_ref;
    Integer num_blocks=0;

    if (nv < 1) return;

    ga_check_handleM(g_a, "ga_scatter");

    GAstat.numsca++;
    /* determine how many processors are associated with array */
    p_handle = GA[handle].p_handle;
    if (p_handle < 0) {
      nproc = GAnproc;
      if (GA[handle].num_rstrctd > 0) {
        nproc = GA[handle].num_rstrctd;
      }
    } else {
      nproc = PGRP_LIST[p_handle].map_nproc;
    }

    /* allocate temp memory: owner[nv] plus four per-target arrays
     * (count/nelem/aproc/map), sized by ranks for REGULAR layouts and
     * by total block count otherwise */
    if (GA[handle].distr_type == REGULAR) {
      buf1 = malloc((int) (nproc *4 +nv)* (sizeof(Integer)));
      if(buf1 == NULL) pnga_error("malloc failed", 3*nproc);
    } else {
      num_blocks = GA[handle].block_total;
      buf1 = malloc((int) (num_blocks *4 +nv)* (sizeof(Integer)));
      if(buf1 == NULL) pnga_error("malloc failed", 3*num_blocks);
    }

    owner = (Integer *)buf1;
    count = owner+ nv;
    if (GA[handle].distr_type == REGULAR) {
      nelem = count + nproc;
      aproc = count + 2 * nproc;
      map = count + 3 * nproc;
    } else {
      nelem = count + num_blocks;
      aproc = count + 2 * num_blocks;
      map = count + 3 * num_blocks;
    }

    /* initialize the counters and nelem */
    if (GA[handle].distr_type == REGULAR) {
      for(kk=0; kk<nproc; kk++) {
        count[kk] = 0; nelem[kk] = 0;
      }
    } else {
      for(kk=0; kk<num_blocks; kk++) {
        count[kk] = 0; nelem[kk] = 0;
      }
    }

    /* find proc that owns the (i,j) element; store it in temp: */
    if (GA[handle].num_rstrctd == 0) {
      for(k=0; k< nv; k++) {
        subscrpt[0] = *(i+k);
        subscrpt[1] = *(j+k);
        if(! pnga_locate(g_a, subscrpt, owner+k)){
          char err_string[ERR_STR_LEN];
          sprintf(err_string,"invalid i/j=(%ld,%ld)", (long)i[k], (long)j[k]);
          pnga_error(err_string,g_a);
        }
        iproc = owner[k];
        nelem[iproc]++;
      }
    } else {
      /* restricted arrays: translate world rank to restricted rank */
      for(k=0; k< nv; k++) {
        subscrpt[0] = *(i+k);
        subscrpt[1] = *(j+k);
        if(! pnga_locate(g_a, subscrpt, owner+k)){
          char err_string[ERR_STR_LEN];
          sprintf(err_string,"invalid i/j=(%ld,%ld)", (long)i[k], (long)j[k]);
          pnga_error(err_string,g_a);
        }
        iproc = GA[handle].rank_rstrctd[owner[k]];
        nelem[iproc]++;
      }
    }

    /* compact the targets with at least one element into aproc[];
     * map[] translates target id -> position in the compacted arrays */
    naproc = 0;
    if (GA[handle].distr_type == REGULAR) {
      for(k=0; k<nproc; k++) if(nelem[k] > 0) {
        aproc[naproc] = k;
        map[k] = naproc;
        naproc ++;
      }
    } else {
      for(k=0; k<num_blocks; k++) if(nelem[k] > 0) {
        aproc[naproc] = k;
        map[k] = naproc;
        naproc ++;
      }
    }

    GAstat.numsca_procs += naproc;

    /* second buffer: per-target src/dst pointer-array headers, the 2*nv
     * pointer pool, per-target patch bounds/ld, and base pointers */
    buf2 = malloc((int)(2*naproc*sizeof(void **) + 2*nv*sizeof(void *) +
                        5*naproc*sizeof(Integer) + naproc*sizeof(char*)));
    if(buf2 == NULL) pnga_error("malloc failed", naproc);

    ptr_src = (void ***)buf2;
    ptr_dst = (void ***)(buf2 + naproc*sizeof(void **));
    ptr_org = (void **)(buf2 + 2*naproc*sizeof(void **));
    ptr_ref = (char **)(buf2+2*naproc*sizeof(void **)+2*nv*sizeof(void *));
    ilo = (Integer *)(((char*)ptr_ref) + naproc*sizeof(char*));
    ihi = ilo + naproc;
    jlo = ihi + naproc;
    jhi = jlo + naproc;
    ldp = jhi + naproc;

    /* record, for every active target, its patch bounds and the address
     * of the first element it owns */
    if (GA[handle].distr_type == REGULAR) {
      for(kk=0; kk<naproc; kk++) {
        iproc = aproc[kk];
        if (GA[handle].num_rstrctd > 0)
          iproc = GA[handle].rstrctd_list[iproc];
        pnga_distribution(g_a, iproc, lo, hi);
        ilo[kk] = lo[0];
        jlo[kk] = lo[1];
        ihi[kk] = hi[0];
        jhi[kk] = hi[1];

        /* get address of the first element owned by proc */
        gaShmemLocation(aproc[kk], g_a, ilo[kk], jlo[kk], &(ptr_ref[kk]),
                        &(ldp[kk]));
      }
    } else {
      for(kk=0; kk<naproc; kk++) {
        iproc = aproc[kk];
        pnga_distribution(g_a, iproc, lo, hi);
        ilo[kk] = lo[0];
        jlo[kk] = lo[1];
        ihi[kk] = hi[0];
        jhi[kk] = hi[1];

        /* get address of the first element owned by proc */
        pnga_access_block_ptr(g_a, iproc, &(ptr_ref[kk]), &(ldp[kk]));
        pnga_release_block(g_a, iproc);
      }
    }

    /* determine limit for message size -- v,i, & j will travel together */
    item_size = GAsizeofM(type);
    GAbytes.scatot += (double)item_size*nv ;
    /* NOTE(review): owner[] has nv entries but is indexed by GAme here;
     * this reads past the array whenever GAme >= nv (statistics only,
     * but compare gai_gatscat which uses nelem[GAme]) -- confirm. */
    iproc = owner[GAme];
    GAbytes.scaloc += (double)item_size* nelem[iproc];
    /* carve the pointer pool into per-target src/dst slices */
    ptr_src[0] = ptr_org; ptr_dst[0] = ptr_org + nv;
    for(k=1; k<naproc; k++) {
      ptr_src[k] = ptr_src[k-1] + nelem[aproc[k-1]];
      ptr_dst[k] = ptr_dst[k-1] + nelem[aproc[k-1]];
    }

    /* fill the pointer slices: for each value, its host-side source
     * address and its remote destination address within the patch */
    for(k=0; k<nv; k++){
      Integer this_count;
      proc = owner[k];
      if (GA[handle].num_rstrctd > 0)
        proc = GA[handle].rank_rstrctd[owner[k]];
      this_count = count[proc];
      count[proc]++;
      proc = map[proc];
      ptr_src[proc][this_count] = ((char*)v) + k * item_size;

      if(i[k] < ilo[proc] || i[k] > ihi[proc] ||
         j[k] < jlo[proc] || j[k] > jhi[proc]){
        char err_string[ERR_STR_LEN];
        sprintf(err_string,"proc=%d invalid i/j=(%ld,%ld)><[%ld:%ld,%ld:%ld]",
                (int)proc, (long)i[k], (long)j[k], (long)ilo[proc],
                (long)ihi[proc], (long)jlo[proc], (long)jhi[proc]);
        pnga_error(err_string, g_a);
      }
      ptr_dst[proc][this_count] = ptr_ref[proc] + item_size *
        ((j[k] - jlo[proc])* ldp[proc] + i[k] - ilo[proc]);
    }

    /* source and destination pointers are ready for all processes */
    if (GA[handle].distr_type == REGULAR) {
      for(k=0; k<naproc; k++) {
        int rc;

        desc.bytes = (int)item_size;
        desc.src_ptr_array = ptr_src[k];
        desc.dst_ptr_array = ptr_dst[k];
        desc.ptr_array_len = (int)nelem[aproc[k]];
        if (p_handle < 0) {
          iproc = aproc[k];
          if (GA[handle].num_rstrctd > 0)
            iproc = GA[handle].rstrctd_list[iproc];
        } else {
          iproc = PGRP_LIST[p_handle].inv_map_proc_list[aproc[k]];
        }

        rc = ARMCI_PutV(&desc, 1, (int)iproc);
        if(rc) pnga_error("scatter failed in armci",rc);
      }
    } else if (GA[handle].distr_type == BLOCK_CYCLIC) {
      /* block index -> owning rank is a simple modulo for this layout */
      for(k=0; k<naproc; k++) {
        int rc;

        desc.bytes = (int)item_size;
        desc.src_ptr_array = ptr_src[k];
        desc.dst_ptr_array = ptr_dst[k];
        desc.ptr_array_len = (int)nelem[aproc[k]];
        iproc = aproc[k];
        iproc = iproc%nproc;
        if (p_handle >= 0) {
          iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
        }

        rc = ARMCI_PutV(&desc, 1, (int)iproc);
        if(rc) pnga_error("scatter failed in armci",rc);
      }
    } else if (GA[handle].distr_type == SCALAPACK) {
      /* block index -> block grid indices -> owning rank */
      Integer index[MAXDIM];
      for(k=0; k<naproc; k++) {
        int rc;

        desc.bytes = (int)item_size;
        desc.src_ptr_array = ptr_src[k];
        desc.dst_ptr_array = ptr_dst[k];
        desc.ptr_array_len = (int)nelem[aproc[k]];
        iproc = aproc[k];
        gam_find_block_indices(handle, iproc, index);
        index[0] = index[0]%GA[handle].nblock[0];
        index[1] = index[1]%GA[handle].nblock[1];
        gam_find_proc_from_sl_indices(handle,iproc,index);
        if (p_handle >= 0) {
          iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
        }

        rc = ARMCI_PutV(&desc, 1, (int)iproc);
        if(rc) pnga_error("scatter failed in armci",rc);
      }
    } else if (GA[handle].distr_type == TILED ||
               GA[handle].distr_type == TILED_IRREG) {
      Integer index[MAXDIM];
      for(k=0; k<naproc; k++) {
        int rc;

        desc.bytes = (int)item_size;
        desc.src_ptr_array = ptr_src[k];
        desc.dst_ptr_array = ptr_dst[k];
        desc.ptr_array_len = (int)nelem[aproc[k]];
        iproc = aproc[k];
        gam_find_block_indices(handle, iproc, index);
        index[0] = index[0]%GA[handle].nblock[0];
        index[1] = index[1]%GA[handle].nblock[1];
        gam_find_tile_proc_from_indices(handle,iproc,index);
        if (p_handle >= 0) {
          iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
        }

        rc = ARMCI_PutV(&desc, 1, (int)iproc);
        if(rc) pnga_error("scatter failed in armci",rc);
      }
    }

    free(buf2);
    free(buf1);

}
2671
2672 /**
2673 * Add elements of v to contents at random locations in Global Array
2674 * specified by vectors i and j
2675 */
2676 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2677 # pragma weak wnga_scatter_acc2d = pnga_scatter_acc2d
2678 #endif
2679
pnga_scatter_acc2d(g_a,v,i,j,nv,alpha)2680 void pnga_scatter_acc2d(g_a, v, i, j, nv, alpha)
2681 Integer g_a, nv, *i, *j;
2682 void *v, *alpha;
2683 {
2684 register Integer k;
2685 Integer item_size;
2686 Integer first, nelem, proc, type=GA[GA_OFFSET + g_a].type;
2687 Integer *int_ptr;
2688 Integer subscrpt[2];
2689
2690 if (nv < 1) return;
2691
2692 ga_check_handleM(g_a, "ga_scatter_acc");
2693
2694 GAstat.numsca++;
2695
2696 int_ptr = (Integer*) ga_malloc(nv, MT_F_INT, "ga_scatter_acc--p");
2697
2698 /* find proc that owns the (i,j) element; store it in temp: int_ptr */
2699 for(k=0; k< nv; k++) {
2700 subscrpt[0] = *(i+k);
2701 subscrpt[1] = *(j+k);
2702 if(! pnga_locate(g_a, subscrpt, int_ptr+k)){
2703 char err_string[ERR_STR_LEN];
2704 sprintf(err_string,"invalid i/j=(%ld,%ld)", (long)i[k], (long)j[k]);
2705 pnga_error(err_string,g_a);
2706 }
2707 }
2708
2709 /* determine limit for message size -- v,i, & j will travel together */
2710 item_size = GAsizeofM(type);
2711 GAbytes.scatot += (double)item_size*nv ;
2712
2713 /* Sort the entries by processor */
2714 ga_sort_scat(&nv, v, i, j, int_ptr, type );
2715
2716 /* go through the list again executing scatter for each processor */
2717
2718 first = 0;
2719 do {
2720 proc = int_ptr[first];
2721 nelem = 0;
2722
2723 /* count entries for proc from "first" to last */
2724 for(k=first; k< nv; k++){
2725 if(proc == int_ptr[k]) nelem++;
2726 else break;
2727 }
2728
2729 if(proc == GAme){
2730 GAbytes.scaloc += (double)item_size* nelem ;
2731 }
2732
2733 gai_scatter_acc_local(g_a, ((char*)v)+item_size*first, i+first,
2734 j+first, nelem, alpha, proc);
2735 first += nelem;
2736
2737 }while (first< nv);
2738
2739 ga_free(int_ptr);
2740
2741 }
2742
2743 #define SCATTER -99
2744 #define GATHER -98
2745 #define SCATTER_ACC -97
2746
2747 /*\ GATHER OPERATION elements from the global array into v
2748 \*/
gai_gatscat(int op,Integer g_a,void * v,Integer subscript[],Integer nv,double * locbytes,double * totbytes,void * alpha)2749 void gai_gatscat(int op, Integer g_a, void* v, Integer subscript[],
2750 Integer nv, double *locbytes, double* totbytes, void *alpha)
2751 {
2752 Integer k, handle=g_a+GA_OFFSET;
2753 int ndim, item_size, type;
2754 Integer *proc;
2755 Integer nproc, p_handle, iproc;
2756
2757 Integer *aproc, naproc; /* active processes and numbers */
2758 Integer *map; /* map the active processes to allocated space */
2759 char *buf1, *buf2;
2760
2761 Integer *count; /* counters for each process */
2762 Integer *nelem; /* number of elements for each process */
2763 /* source and destination pointers for each process */
2764 void ***ptr_src, ***ptr_dst;
2765 void **ptr_org; /* the entire pointer array */
2766 armci_giov_t desc;
2767 Integer num_blocks=0;
2768
2769
2770
2771 proc=(Integer *)ga_malloc(nv, MT_F_INT, "ga_gat-p");
2772
2773 ndim = GA[handle].ndim;
2774 type = GA[handle].type;
2775 item_size = GA[handle].elemsize;
2776 *totbytes += (double)item_size*nv;
2777
2778 /* determine how many processors are associated with array */
2779 p_handle = GA[handle].p_handle;
2780 if (p_handle < 0) {
2781 nproc = GAnproc;
2782 if (GA[handle].num_rstrctd > 0) nproc = GA[handle].num_rstrctd;
2783 } else {
2784 nproc = PGRP_LIST[p_handle].map_nproc;
2785 }
2786
2787 /* allocate temp memory */
2788 if (GA[handle].distr_type == REGULAR) {
2789 buf1 = malloc((int) nproc * 4 * (sizeof(Integer)));
2790 if(buf1 == NULL) pnga_error("malloc failed", 3*nproc);
2791 } else {
2792 num_blocks = GA[handle].block_total;
2793 buf1 = malloc((int) num_blocks * 4 * (sizeof(Integer)));
2794 if(buf1 == NULL) pnga_error("malloc failed", 3*num_blocks);
2795 }
2796
2797 count = (Integer *)buf1;
2798 if (GA[handle].distr_type == REGULAR) {
2799 nelem = (Integer *)(buf1 + nproc * sizeof(Integer));
2800 aproc = (Integer *)(buf1 + 2 * nproc * sizeof(Integer));
2801 map = (Integer *)(buf1 + 3 * nproc * sizeof(Integer));
2802 } else {
2803 num_blocks = GA[handle].block_total;
2804 nelem = (Integer *)(buf1 + num_blocks * sizeof(Integer));
2805 aproc = (Integer *)(buf1 + 2 * num_blocks * sizeof(Integer));
2806 map = (Integer *)(buf1 + 3 * num_blocks * sizeof(Integer));
2807 }
2808
2809 /* initialize the counters and nelem */
2810 if (GA[handle].distr_type == REGULAR) {
2811 for(k=0; k<nproc; k++) count[k] = 0;
2812 for(k=0; k<nproc; k++) nelem[k] = 0;
2813 } else {
2814 for(k=0; k<num_blocks; k++) count[k] = 0;
2815 for(k=0; k<num_blocks; k++) nelem[k] = 0;
2816 }
2817
2818 /* get the process id that the element should go and count the
2819 * number of elements for each process
2820 */
2821 if (GA[handle].num_rstrctd == 0) {
2822 for(k=0; k<nv; k++) {
2823 if(!pnga_locate(g_a, subscript+k*ndim, proc+k)) {
2824 gai_print_subscript("invalid subscript",ndim, subscript+k*ndim,"\n");
2825 pnga_error("failed -element:",k);
2826 }
2827 iproc = proc[k];
2828 nelem[iproc]++;
2829 }
2830 } else {
2831 for(k=0; k<nv; k++) {
2832 if(!pnga_locate(g_a, subscript+k*ndim, proc+k)) {
2833 gai_print_subscript("invalid subscript",ndim, subscript+k*ndim,"\n");
2834 pnga_error("failed -element:",k);
2835 }
2836 iproc = GA[handle].rank_rstrctd[proc[k]];
2837 nelem[iproc]++;
2838 }
2839 }
2840
2841 /* find the active processes (with which transfer data) */
2842 naproc = 0;
2843 if (GA[handle].distr_type == REGULAR) {
2844 for(k=0; k<nproc; k++) if(nelem[k] > 0) {
2845 aproc[naproc] = k;
2846 map[k] = naproc;
2847 naproc ++;
2848 }
2849 } else {
2850 for(k=0; k<num_blocks; k++) if(nelem[k] > 0) {
2851 aproc[naproc] = k;
2852 map[k] = naproc;
2853 naproc ++;
2854 }
2855 }
2856
2857 buf2 = malloc((int)(2*naproc*sizeof(void **) + 2*nv*sizeof(void *)));
2858 if(buf2 == NULL) pnga_error("malloc failed", 2*naproc);
2859
2860 ptr_src = (void ***)buf2;
2861 ptr_dst = (void ***)(buf2 + naproc * sizeof(void **));
2862 ptr_org = (void **)(buf2 + 2 * naproc * sizeof(void **));
2863
2864 /* set the pointers as
2865 * P0 P1 P0 P1
2866 * ptr_src[0] ptr_src[1] ... ptr_dst[0] ptr_dst[1] ...
2867 * \ \ \ \
2868 * ptr_org |-------------------------------|---------------------------|
2869 * | nv | nv |
2870 * | nelem[0] | nelem[1] |... | nelem[0] | nelem[1] |...
2871 */
2872 ptr_src[0] = ptr_org; ptr_dst[0] = ptr_org + nv;
2873 for(k=1; k<naproc; k++) {
2874 ptr_src[k] = ptr_src[k-1] + nelem[aproc[k-1]];
2875 ptr_dst[k] = ptr_dst[k-1] + nelem[aproc[k-1]];
2876 }
2877
2878 *locbytes += (double)item_size* nelem[GAme];
2879
2880 switch(op) {
2881 case GATHER:
2882 /* go through all the elements
2883 * for process 0: ptr_src[0][0, 1, ...] = subscript + offset0...
2884 * ptr_dst[0][0, 1, ...] = v + offset0...
2885 * for process 1: ptr_src[1][...] ...
2886 * ptr_dst[1][...] ...
2887 */
2888 if (GA[handle].distr_type == REGULAR) {
2889 for(k=0; k<nv; k++){
2890 iproc = proc[k];
2891 if (GA[handle].num_rstrctd > 0)
2892 iproc = GA[handle].rank_rstrctd[iproc];
2893 ptr_dst[map[iproc]][count[iproc]] = ((char*)v) + k * item_size;
2894 if (p_handle != 0) {
2895 gam_Loc_ptr(iproc, handle, (subscript+k*ndim),
2896 ptr_src[map[iproc]]+count[iproc]);
2897 } else {
2898 gam_Loc_ptr(proc[k], handle, (subscript+k*ndim),
2899 ptr_src[map[iproc]]+count[iproc]);
2900 }
2901 count[iproc]++;
2902 }
2903 } else {
2904 Integer lo[MAXDIM], hi[MAXDIM], ld[MAXDIM];
2905 Integer j, jtot, last, offset;
2906 for(k=0; k<nv; k++){
2907 iproc = proc[k];
2908 ptr_dst[map[iproc]][count[iproc]] = ((char*)v) + k * item_size;
2909 pnga_distribution(g_a, iproc, lo, hi);
2910 pnga_access_block_ptr(g_a, iproc, &(ptr_src[map[iproc]][count[iproc]]), ld);
2911 pnga_release_block(g_a, iproc);
2912 /* calculate remaining offset */
2913 offset = 0;
2914 last = ndim - 1;
2915 jtot = 1;
2916 for (j=0; j<last; j++) {
2917 offset += ((subscript+k*ndim)[j]-lo[j])*jtot;
2918 jtot *= ld[j];
2919 }
2920 offset += ((subscript+k*ndim)[last]-lo[last])*jtot;
2921 ptr_src[map[iproc]][count[iproc]]=((char *)ptr_src[map[iproc]][count[iproc]])+offset*GA[handle].elemsize;
2922 count[iproc]++;
2923 }
2924 }
2925
2926 /* source and destination pointers are ready for all processes */
2927 if (GA[handle].distr_type == REGULAR) {
2928 for(k=0; k<naproc; k++) {
2929 int rc;
2930
2931 desc.bytes = (int)item_size;
2932 desc.src_ptr_array = ptr_src[k];
2933 desc.dst_ptr_array = ptr_dst[k];
2934 desc.ptr_array_len = (int)nelem[aproc[k]];
2935
2936 if (p_handle < 0) {
2937 iproc = aproc[k];
2938 if (GA[handle].num_rstrctd > 0)
2939 iproc = GA[handle].rstrctd_list[iproc];
2940 } else {
2941 iproc = PGRP_LIST[p_handle].inv_map_proc_list[aproc[k]];
2942 }
2943 rc=ARMCI_GetV(&desc, 1, (int)iproc);
2944 if(rc) pnga_error("gather failed in armci",rc);
2945 }
2946 } else if (GA[handle].distr_type == BLOCK_CYCLIC) {
2947 for(k=0; k<naproc; k++) {
2948 int rc;
2949
2950 desc.bytes = (int)item_size;
2951 desc.src_ptr_array = ptr_src[k];
2952 desc.dst_ptr_array = ptr_dst[k];
2953 desc.ptr_array_len = (int)nelem[aproc[k]];
2954 iproc = aproc[k];
2955 iproc = iproc%nproc;
2956 if (p_handle >= 0) {
2957 iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
2958 }
2959 rc=ARMCI_GetV(&desc, 1, (int)iproc);
2960 if(rc) pnga_error("gather failed in armci",rc);
2961 }
2962 } else if (GA[handle].distr_type == SCALAPACK) {
2963 Integer j, index[MAXDIM];
2964 for(k=0; k<naproc; k++) {
2965 int rc;
2966
2967 desc.bytes = (int)item_size;
2968 desc.src_ptr_array = ptr_src[k];
2969 desc.dst_ptr_array = ptr_dst[k];
2970 desc.ptr_array_len = (int)nelem[aproc[k]];
2971 iproc = aproc[k];
2972 gam_find_block_indices(handle, iproc, index);
2973 for (j=0; j<ndim; j++) {
2974 index[j] = index[j]%GA[handle].nblock[j];
2975 }
2976 gam_find_proc_from_sl_indices(handle,iproc,index);
2977 if (p_handle >= 0) {
2978 iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
2979 }
2980 rc=ARMCI_GetV(&desc, 1, (int)iproc);
2981 if(rc) pnga_error("gather failed in armci",rc);
2982 }
2983 } else if (GA[handle].distr_type == TILED ||
2984 GA[handle].distr_type == TILED_IRREG) {
2985 Integer j, index[MAXDIM];
2986 for(k=0; k<naproc; k++) {
2987 int rc;
2988
2989 desc.bytes = (int)item_size;
2990 desc.src_ptr_array = ptr_src[k];
2991 desc.dst_ptr_array = ptr_dst[k];
2992 desc.ptr_array_len = (int)nelem[aproc[k]];
2993 iproc = aproc[k];
2994 gam_find_block_indices(handle, iproc, index);
2995 for (j=0; j<ndim; j++) {
2996 index[j] = index[j]%GA[handle].nblock[j];
2997 }
2998 gam_find_tile_proc_from_indices(handle,iproc,index);
2999 if (p_handle >= 0) {
3000 iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
3001 }
3002 rc=ARMCI_GetV(&desc, 1, (int)iproc);
3003 if(rc) pnga_error("gather failed in armci",rc);
3004 }
3005 }
3006 GAstat.numgat_procs += naproc;
3007 break;
3008 case SCATTER:
3009 /* go through all the elements
3010 * for process 0: ptr_src[0][0, 1, ...] = v + offset0...
3011 * ptr_dst[0][0, 1, ...] = subscript + offset0...
3012 * for process 1: ptr_src[1][...] ...
3013 * ptr_dst[1][...] ...
3014 */
3015 if (GA[handle].distr_type == REGULAR) {
3016 for(k=0; k<nv; k++){
3017 iproc = proc[k];
3018 if (GA[handle].num_rstrctd > 0)
3019 iproc = GA[handle].rank_rstrctd[iproc];
3020 ptr_src[map[iproc]][count[iproc]] = ((char*)v) + k * item_size;
3021 if (p_handle != 0) {
3022 gam_Loc_ptr(iproc, handle, (subscript+k*ndim),
3023 ptr_dst[map[iproc]]+count[iproc]);
3024 } else {
3025 gam_Loc_ptr(proc[k], handle, (subscript+k*ndim),
3026 ptr_dst[map[iproc]]+count[iproc]);
3027 }
3028 count[iproc]++;
3029 }
3030 } else {
3031 Integer lo[MAXDIM], hi[MAXDIM], ld[MAXDIM];
3032 Integer j, jtot, last, offset;
3033 for(k=0; k<nv; k++){
3034 iproc = proc[k];
3035 ptr_src[map[iproc]][count[iproc]] = ((char*)v) + k * item_size;
3036 pnga_distribution(g_a, iproc, lo, hi);
3037 pnga_access_block_ptr(g_a, iproc, &(ptr_dst[map[iproc]][count[iproc]]), ld);
3038 pnga_release_block(g_a, iproc);
3039 /* calculate remaining offset */
3040 offset = 0;
3041 last = ndim - 1;
3042 jtot = 1;
3043 for (j=0; j<last; j++) {
3044 offset += ((subscript+k*ndim)[j]-lo[j])*jtot;
3045 jtot *= ld[j];
3046 }
3047 offset += ((subscript+k*ndim)[last]-lo[last])*jtot;
3048 ptr_dst[map[iproc]][count[iproc]]=((char*)ptr_dst[map[iproc]][count[iproc]])+offset*GA[handle].elemsize;
3049 count[iproc]++;
3050 }
3051 }
3052
3053 /* source and destination pointers are ready for all processes */
3054 if (GA[handle].distr_type == REGULAR) {
3055 for(k=0; k<naproc; k++) {
3056 int rc;
3057
3058 desc.bytes = (int)item_size;
3059 desc.src_ptr_array = ptr_src[k];
3060 desc.dst_ptr_array = ptr_dst[k];
3061 desc.ptr_array_len = (int)nelem[aproc[k]];
3062
3063
3064 if (p_handle < 0) {
3065 iproc = aproc[k];
3066 if (GA[handle].num_rstrctd > 0)
3067 iproc = GA[handle].rstrctd_list[iproc];
3068 } else {
3069 iproc = PGRP_LIST[p_handle].inv_map_proc_list[aproc[k]];
3070 }
3071 if(GA_fence_set) fence_array[iproc]=1;
3072
3073 rc=ARMCI_PutV(&desc, 1, (int)iproc);
3074 if(rc) pnga_error("scatter failed in armci",rc);
3075 }
3076 } else if (GA[handle].distr_type == BLOCK_CYCLIC) {
3077 for(k=0; k<naproc; k++) {
3078 int rc;
3079
3080 desc.bytes = (int)item_size;
3081 desc.src_ptr_array = ptr_src[k];
3082 desc.dst_ptr_array = ptr_dst[k];
3083 desc.ptr_array_len = (int)nelem[aproc[k]];
3084
3085 iproc = aproc[k];
3086 iproc = iproc%nproc;
3087 if (p_handle >= 0) {
3088 iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
3089 }
3090 if(GA_fence_set) fence_array[iproc]=1;
3091
3092 rc=ARMCI_PutV(&desc, 1, (int)iproc);
3093 if(rc) pnga_error("scatter failed in armci",rc);
3094 }
3095 } else if (GA[handle].distr_type == SCALAPACK) {
3096 Integer j, index[MAXDIM];
3097 for(k=0; k<naproc; k++) {
3098 int rc;
3099
3100 desc.bytes = (int)item_size;
3101 desc.src_ptr_array = ptr_src[k];
3102 desc.dst_ptr_array = ptr_dst[k];
3103 desc.ptr_array_len = (int)nelem[aproc[k]];
3104
3105 iproc = aproc[k];
3106 gam_find_block_indices(handle, iproc, index);
3107 for (j=0; j<ndim; j++) {
3108 index[j] = index[j]%GA[handle].nblock[j];
3109 }
3110 gam_find_proc_from_sl_indices(handle,iproc,index);
3111 if (p_handle >= 0) {
3112 iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
3113 }
3114 if(GA_fence_set) fence_array[iproc]=1;
3115
3116 rc=ARMCI_PutV(&desc, 1, (int)iproc);
3117 if(rc) pnga_error("scatter failed in armci",rc);
3118 }
3119 } else if (GA[handle].distr_type == TILED ||
3120 GA[handle].distr_type == TILED_IRREG) {
3121 Integer j, index[MAXDIM];
3122 for(k=0; k<naproc; k++) {
3123 int rc;
3124
3125 desc.bytes = (int)item_size;
3126 desc.src_ptr_array = ptr_src[k];
3127 desc.dst_ptr_array = ptr_dst[k];
3128 desc.ptr_array_len = (int)nelem[aproc[k]];
3129
3130 iproc = aproc[k];
3131 gam_find_block_indices(handle, iproc, index);
3132 for (j=0; j<ndim; j++) {
3133 index[j] = index[j]%GA[handle].nblock[j];
3134 }
3135 gam_find_tile_proc_from_indices(handle,iproc,index);
3136 if (p_handle >= 0) {
3137 iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
3138 }
3139 if(GA_fence_set) fence_array[iproc]=1;
3140
3141 rc=ARMCI_PutV(&desc, 1, (int)iproc);
3142 if(rc) pnga_error("scatter failed in armci",rc);
3143 }
3144 }
3145 GAstat.numsca_procs += naproc;
3146 break;
3147 case SCATTER_ACC:
3148 /* go through all the elements
3149 * for process 0: ptr_src[0][0, 1, ...] = v + offset0...
3150 * ptr_dst[0][0, 1, ...] = subscript + offset0...
3151 * for process 1: ptr_src[1][...] ...
3152 * ptr_dst[1][...] ...
3153 */
3154 if (GA[handle].distr_type == REGULAR) {
3155 for(k=0; k<nv; k++){
3156 iproc = proc[k];
3157 if (GA[handle].num_rstrctd > 0)
3158 iproc = GA[handle].rank_rstrctd[iproc];
3159 ptr_src[map[iproc]][count[iproc]] = ((char*)v) + k * item_size;
3160 if (p_handle != 0) {
3161 gam_Loc_ptr(iproc, handle, (subscript+k*ndim),
3162 ptr_dst[map[iproc]]+count[iproc]);
3163 } else {
3164 gam_Loc_ptr(proc[k], handle, (subscript+k*ndim),
3165 ptr_dst[map[iproc]]+count[iproc]);
3166 }
3167 count[iproc]++;
3168 }
3169 } else {
3170 Integer lo[MAXDIM], hi[MAXDIM], ld[MAXDIM];
3171 Integer j, jtot, last, offset;
3172 for(k=0; k<nv; k++){
3173 iproc = proc[k];
3174 ptr_src[map[iproc]][count[iproc]] = ((char*)v) + k * item_size;
3175 pnga_distribution(g_a, iproc, lo, hi);
3176 pnga_access_block_ptr(g_a, iproc, &(ptr_dst[map[iproc]][count[iproc]]), ld);
3177 pnga_release_block(g_a, iproc);
3178 /* calculate remaining offset */
3179 offset = 0;
3180 last = ndim - 1;
3181 jtot = 1;
3182 for (j=0; j<last; j++) {
3183 offset += ((subscript+k*ndim)[j]-lo[j])*jtot;
3184 jtot *= ld[j];
3185 }
3186 offset += ((subscript+k*ndim)[last]-lo[last])*jtot;
3187 ptr_dst[map[iproc]][count[iproc]]=((char*)ptr_dst[map[iproc]][count[iproc]])+offset*GA[handle].elemsize;
3188 count[iproc]++;
3189 }
3190 }
3191
3192 /* source and destination pointers are ready for all processes */
3193 if (GA[handle].distr_type == REGULAR) {
3194 for(k=0; k<naproc; k++) {
3195 int rc=0;
3196
3197 desc.bytes = (int)item_size;
3198 desc.src_ptr_array = ptr_src[k];
3199 desc.dst_ptr_array = ptr_dst[k];
3200 desc.ptr_array_len = (int)nelem[aproc[k]];
3201
3202 if (p_handle < 0) {
3203 iproc = aproc[k];
3204 if (GA[handle].num_rstrctd > 0)
3205 iproc = GA[handle].rstrctd_list[iproc];
3206 } else {
3207 iproc = PGRP_LIST[p_handle].inv_map_proc_list[aproc[k]];
3208 }
3209 if(GA_fence_set) fence_array[iproc]=1;
3210
3211 if(alpha != NULL) {
3212 int optype=-1;
3213 if(type==C_DBL) optype= ARMCI_ACC_DBL;
3214 else if(type==C_DCPL)optype= ARMCI_ACC_DCP;
3215 else if(type==C_SCPL)optype= ARMCI_ACC_CPL;
3216 else if(type==C_INT)optype= ARMCI_ACC_INT;
3217 else if(type==C_LONG)optype= ARMCI_ACC_LNG;
3218 else if(type==C_FLOAT)optype= ARMCI_ACC_FLT;
3219 else pnga_error("type not supported",type);
3220 rc= ARMCI_AccV(optype, alpha, &desc, 1, (int)iproc);
3221 }
3222 if(rc) pnga_error("scatter_acc failed in armci",rc);
3223 }
3224 } else if (GA[handle].distr_type == BLOCK_CYCLIC) {
3225 for(k=0; k<naproc; k++) {
3226 int rc=0;
3227
3228 desc.bytes = (int)item_size;
3229 desc.src_ptr_array = ptr_src[k];
3230 desc.dst_ptr_array = ptr_dst[k];
3231 desc.ptr_array_len = (int)nelem[aproc[k]];
3232
3233 iproc = aproc[k];
3234 iproc = iproc%nproc;
3235 if (p_handle >= 0) {
3236 iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
3237 }
3238 if(GA_fence_set) fence_array[iproc]=1;
3239
3240 if(alpha != NULL) {
3241 int optype=-1;
3242 if(type==C_DBL) optype= ARMCI_ACC_DBL;
3243 else if(type==C_DCPL)optype= ARMCI_ACC_DCP;
3244 else if(type==C_SCPL)optype= ARMCI_ACC_CPL;
3245 else if(type==C_INT)optype= ARMCI_ACC_INT;
3246 else if(type==C_LONG)optype= ARMCI_ACC_LNG;
3247 else if(type==C_FLOAT)optype= ARMCI_ACC_FLT;
3248 else pnga_error("type not supported",type);
3249 rc= ARMCI_AccV(optype, alpha, &desc, 1, (int)iproc);
3250 }
3251 if(rc) pnga_error("scatter_acc failed in armci",rc);
3252 }
3253 } else if (GA[handle].distr_type == SCALAPACK) {
3254 Integer j, index[MAXDIM];
3255 for(k=0; k<naproc; k++) {
3256 int rc=0;
3257
3258 desc.bytes = (int)item_size;
3259 desc.src_ptr_array = ptr_src[k];
3260 desc.dst_ptr_array = ptr_dst[k];
3261 desc.ptr_array_len = (int)nelem[aproc[k]];
3262
3263 iproc = aproc[k];
3264 gam_find_block_indices(handle, iproc, index);
3265 for (j=0; j<ndim; j++) {
3266 index[j] = index[j]%GA[handle].nblock[j];
3267 }
3268 gam_find_proc_from_sl_indices(handle,iproc,index);
3269 if (p_handle >= 0) {
3270 iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
3271 }
3272 if(GA_fence_set) fence_array[iproc]=1;
3273
3274 if(alpha != NULL) {
3275 int optype=-1;
3276 if(type==C_DBL) optype= ARMCI_ACC_DBL;
3277 else if(type==C_DCPL)optype= ARMCI_ACC_DCP;
3278 else if(type==C_SCPL)optype= ARMCI_ACC_CPL;
3279 else if(type==C_INT)optype= ARMCI_ACC_INT;
3280 else if(type==C_LONG)optype= ARMCI_ACC_LNG;
3281 else if(type==C_FLOAT)optype= ARMCI_ACC_FLT;
3282 else pnga_error("type not supported",type);
3283 rc= ARMCI_AccV(optype, alpha, &desc, 1, (int)iproc);
3284 }
3285 if(rc) pnga_error("scatter_acc failed in armci",rc);
3286 }
3287 } else if (GA[handle].distr_type == TILED ||
3288 GA[handle].distr_type == TILED_IRREG) {
3289 Integer j, index[MAXDIM];
3290 for(k=0; k<naproc; k++) {
3291 int rc=0;
3292
3293 desc.bytes = (int)item_size;
3294 desc.src_ptr_array = ptr_src[k];
3295 desc.dst_ptr_array = ptr_dst[k];
3296 desc.ptr_array_len = (int)nelem[aproc[k]];
3297
3298 iproc = aproc[k];
3299 gam_find_block_indices(handle, iproc, index);
3300 for (j=0; j<ndim; j++) {
3301 index[j] = index[j]%GA[handle].nblock[j];
3302 }
3303 gam_find_tile_proc_from_indices(handle,iproc,index);
3304 if (p_handle >= 0) {
3305 iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
3306 }
3307 if(GA_fence_set) fence_array[iproc]=1;
3308
3309 if(alpha != NULL) {
3310 int optype=-1;
3311 if(type==C_DBL) optype= ARMCI_ACC_DBL;
3312 else if(type==C_DCPL)optype= ARMCI_ACC_DCP;
3313 else if(type==C_SCPL)optype= ARMCI_ACC_CPL;
3314 else if(type==C_INT)optype= ARMCI_ACC_INT;
3315 else if(type==C_LONG)optype= ARMCI_ACC_LNG;
3316 else if(type==C_FLOAT)optype= ARMCI_ACC_FLT;
3317 else pnga_error("type not supported",type);
3318 rc= ARMCI_AccV(optype, alpha, &desc, 1, (int)iproc);
3319 }
3320 if(rc) pnga_error("scatter_acc failed in armci",rc);
3321 }
3322 }
3323 break;
3324 default: pnga_error("operation not supported",op);
3325 }
3326
3327 free(buf2); free(buf1);
3328
3329 ga_free(proc);
3330 }
3331
3332 /**
3333 * Preallocate data for internal linked-list arrays for gather/scatter calls.
3334 * @param nelems: the maximum number of elements that will be moved in
3335 * gather/scatter call using these arrays.
3336 */
3337 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3338 # pragma weak wnga_alloc_gatscat_buf = pnga_alloc_gatscat_buf
3339 #endif
void pnga_alloc_gatscat_buf(Integer nelems)
{
  Integer nproc_tot = pnga_nnodes();
  /* a second allocation without an intervening free is a usage error */
  if (GA_prealloc_gatscat)
    pnga_error("Gather/scatter buffers already allocated",nelems);
  GA_prealloc_gatscat = nelems;
  /* linked-list workspace: one header and one element-count slot per
   * process, one list slot per data element */
  GA_header = (Integer*)ga_malloc(nproc_tot, MT_F_INT, "ga_gat_header");
  GA_list   = (Integer*)ga_malloc(nelems,    MT_F_INT, "ga_gat_list");
  GA_elems  = (Integer*)ga_malloc(nproc_tot, MT_F_INT, "ga_gat_nelems");
}
3350
3351 /**
3352 * Free up preallocate data for internal linked-list arrays for gather/scatter calls.
3353 */
3354 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3355 # pragma weak wnga_free_gatscat_buf = pnga_free_gatscat_buf
3356 #endif
pnga_free_gatscat_buf()3357 void pnga_free_gatscat_buf()
3358 {
3359 if (!GA_prealloc_gatscat)
3360 pnga_error("Gather/scatter buffers not allocated",0);
3361 GA_prealloc_gatscat = 0;
3362 ga_free(GA_elems);
3363 ga_free(GA_list);
3364 ga_free(GA_header);
3365 }
3366
/* Convert a C-style subscript (0-based, dimensions in C order) held in
 * index_c into a Fortran-style subscript (1-based, dimension order
 * reversed) stored in index_f.  Used to normalize caller subscripts
 * before calling the Fortran-convention internals. */
#define gam_c2f_index(index_c, index_f, ndim) \
{ \
  Integer _i; \
  for (_i = 0; _i < (ndim); _i++) { \
    index_f[_i] = (Integer)(index_c[(ndim)-1-_i]+1); \
  } \
}
3374
3375 /*\ GATHER OPERATION elements from the global array into v
3376 \*/
/**
 * Internal kernel for GATHER, SCATTER and SCATTER_ACC.  Partitions the
 * requested elements into per-owner groups using a linked list, then
 * issues one ARMCI vector call per remote process.
 *
 * NOTE(fix): earlier revisions declared the two statistics parameters as
 * (locbytes, totbytes), opposite to the order every caller passes them
 * (&GAbytes.*tot, &GAbytes.*loc), and overwrote them with '=' instead of
 * accumulating.  The names now match the call sites and the counters are
 * accumulated, consistent with pnga_gather2d.
 *
 * @param op        operation: GATHER, SCATTER or SCATTER_ACC
 * @param g_a       global array handle
 * @param v         local buffer holding the gathered/scattered values
 * @param subscript element subscripts: if c_flag is nonzero, an array of
 *                  pointers to C (0-based) index vectors; otherwise a flat
 *                  array of Fortran (1-based) indices, ndim per element
 * @param c_flag    selects C vs Fortran subscript convention
 * @param nv        number of elements to move
 * @param totbytes  cumulative total bytes moved (incremented)
 * @param locbytes  cumulative bytes resident on this process (incremented)
 * @param alpha     scale factor for SCATTER_ACC; ignored otherwise.  A
 *                  NULL alpha makes SCATTER_ACC a no-op for each group.
 */
void gai_gatscat_new(int op, Integer g_a, void* v, void *subscript,
                     Integer c_flag, Integer nv, double *totbytes,
                     double* locbytes, void *alpha)
{
  Integer handle=g_a+GA_OFFSET;
  int ndim, i, j, k, type, item_size;
  Integer idx, p_handle, num_rstrctd;
  Integer nprocs, me, iproc, tproc, index[MAXDIM];
  Integer lo[MAXDIM], hi[MAXDIM], ld[MAXDIM-1];
  Integer jtot, last, offset;
  Integer *subscript_ptr;

  Integer *header, *list, *nelems;
  char *buf;
  void **ptr_rem, **ptr_loc;
  /* rc must be initialized: the SCATTER_ACC branch skips the ARMCI call
   * entirely when alpha is NULL, and would otherwise test an
   * indeterminate value */
  int rc=0, maxlen;
  int *nblock;
  armci_giov_t desc;

  me = pnga_nodeid();
  num_rstrctd = GA[handle].num_rstrctd;
  nblock = GA[handle].nblock;

  /* determine how many processors are associated with array */
  p_handle = GA[handle].p_handle;
  if (p_handle < 0) {
    nprocs = GAnproc;
    if (num_rstrctd > 0) nprocs = num_rstrctd;
  } else {
    nprocs = PGRP_LIST[p_handle].map_nproc;
  }

  /* use the preallocated linked-list workspace when available */
  if (!GA_prealloc_gatscat) {
    header =(Integer *)ga_malloc(nprocs, MT_F_INT, "ga_gat_header");
    list =(Integer *)ga_malloc(nv, MT_F_INT, "ga_gat_list");
    nelems =(Integer *)ga_malloc(nprocs, MT_F_INT, "ga_gat_nelems");
  } else {
    if (GA_prealloc_gatscat < nv)
      pnga_error("Gather/scatter vector exceeds allocation length ",
                 GA_prealloc_gatscat);
    header = (Integer*)GA_header;
    list = (Integer*)GA_list;
    nelems = (Integer*)GA_elems;
  }

  ndim = GA[handle].ndim;
  type = GA[handle].type;
  item_size = GA[handle].elemsize;
  *totbytes += item_size * nv;

  /* Initialize linked list data structures */
  for (i=0; i<nprocs; i++) {
    nelems[i] = 0;
    header[i] = -1;
  }
  for (i=0; i<nv; i++) {
    list[i] = 0;
  }

  /* Set up linked list that partitions elements defined in subscripts array
   * into groups corresponding to the elements home processor
   */
  maxlen = 1;
  for (i=0; i<nv; i++) {
    if (c_flag) {
      gam_c2f_index(((int**)subscript)[i], index, ndim);
      subscript_ptr = index;
    } else {
      subscript_ptr = ((Integer*)subscript)+i*ndim;
    }
    if(!pnga_locate(g_a, subscript_ptr, &idx)) {
      gai_print_subscript("invalid subscript",ndim, subscript_ptr,"\n");
      pnga_error("failed -element:",i);
    }
    if (num_rstrctd > 0) idx = GA[handle].rank_rstrctd[idx];
    /* map from block to processor, if necessary */
    if (GA[handle].distr_type == BLOCK_CYCLIC) {
      idx = idx%nprocs;
    } else if (GA[handle].distr_type == SCALAPACK) {
      gam_find_block_indices(handle,idx,index);
      for (j=0; j<ndim; j++) {
        index[j] = index[j]%nblock[j];
      }
      gam_find_proc_from_sl_indices(handle,idx,index);
    } else if (GA[handle].distr_type == TILED ||
        GA[handle].distr_type == TILED_IRREG) {
      gam_find_block_indices(handle,idx,index);
      for (j=0; j<ndim; j++) {
        index[j] = index[j]%nblock[j];
      }
      gam_find_tile_proc_from_indices(handle,idx,index);
    }
    nelems[idx]++;
    if (maxlen<nelems[idx]) maxlen=nelems[idx];
    j = header[idx];
    header[idx] = i;
    list[i] = j;
  }
  *locbytes += item_size * nelems[me];

  /* allocate buffers for individual vector calls */
  buf = (char*)malloc(2*maxlen*sizeof(void*));
  if (buf == NULL) pnga_error("gai_gatscat_new: malloc failed", maxlen);
  ptr_loc = (void**)buf;
  ptr_rem = (void**)(buf+maxlen*sizeof(void*));

  for (iproc=0; iproc<nprocs; iproc++) {
    if (nelems[iproc] > 0) {
      /* loop through linked list to find all data elements associated with
       * the remote processor iproc
       */
      idx = header[iproc];
      j = 0;
      while (idx > -1) {
        if (c_flag) {
          gam_c2f_index(((int**)subscript)[idx], index, ndim);
          subscript_ptr = index;
        } else {
          subscript_ptr = ((Integer*)subscript)+idx*ndim;
        }
        if (GA[handle].distr_type == REGULAR) {
          /* gam_Loc_ptr modifies the value of the processor variable for
           * restricted arrays or processor groups, so make a temporary copy
           */
          tproc = iproc;
          gam_Loc_ptr(tproc, handle, (subscript_ptr),
                      ptr_rem+j);
        } else {
          /* TODO: Figure out how to get correct pointers for block cyclic
           * distributions */
          /* find block index from subscript */
          if(!pnga_locate(g_a, subscript_ptr, &tproc)) {
            gai_print_subscript("invalid subscript for block-cyclic array",
                                ndim, subscript_ptr,"\n");
            pnga_error("failed -element:",idx);
          }
          pnga_distribution(g_a, tproc, lo, hi);
          pnga_access_block_ptr(g_a, tproc, ptr_rem+j, ld);
          pnga_release_block(g_a, tproc);
          /* add the remaining offset of the element within its block */
          offset = 0;
          last = ndim -1;
          jtot = 1;
          for (k=0; k<last; k++) {
            offset += ((subscript_ptr)[k]-lo[k])*jtot;
            jtot *= ld[k];
          }
          offset += ((subscript_ptr)[last]-lo[last])*jtot;
          ptr_rem[j] = (void*)(((char*)ptr_rem[j])+offset*item_size);
        }
        ptr_loc[j] = (void*)(((char*)v) + idx * item_size);
        idx = list[idx];
        j++;
      }
      /* correct remote proc if restricted arrays or processor groups are
       * being used
       */
      if (num_rstrctd > 0) {
        tproc = GA[handle].rstrctd_list[iproc];
      } else {
        if (p_handle < 0) {
          tproc = iproc;
        } else {
          tproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
        }
      }
      /* perform vector operation */
      switch(op) {
        case GATHER:
          desc.bytes = (int)item_size;
          desc.src_ptr_array = ptr_rem;
          desc.dst_ptr_array = ptr_loc;
          desc.ptr_array_len = (int)nelems[iproc];
          rc=ARMCI_GetV(&desc, 1, (int)tproc);
          if(rc) pnga_error("gather failed in armci",rc);
          break;
        case SCATTER:
          desc.bytes = (int)item_size;
          desc.src_ptr_array = ptr_loc;
          desc.dst_ptr_array = ptr_rem;
          desc.ptr_array_len = (int)nelems[iproc];
          rc=ARMCI_PutV(&desc, 1, (int)tproc);
          if(rc) pnga_error("scatter failed in armci",rc);
          break;
        case SCATTER_ACC:
          desc.bytes = (int)item_size;
          desc.src_ptr_array = ptr_loc;
          desc.dst_ptr_array = ptr_rem;
          desc.ptr_array_len = (int)nelems[iproc];
          if(alpha != NULL) {
            int optype=-1;
            if(type==C_DBL) optype= ARMCI_ACC_DBL;
            else if(type==C_DCPL)optype= ARMCI_ACC_DCP;
            else if(type==C_SCPL)optype= ARMCI_ACC_CPL;
            else if(type==C_INT)optype= ARMCI_ACC_INT;
            else if(type==C_LONG)optype= ARMCI_ACC_LNG;
            else if(type==C_FLOAT)optype= ARMCI_ACC_FLT;
            else pnga_error("type not supported",type);
            rc= ARMCI_AccV(optype, alpha, &desc, 1, (int)tproc);
          }
          if(rc) pnga_error("scatter_acc failed in armci",rc);
          break;
        default: pnga_error("operation not supported",op);
      }
    }
  }
  free(buf);
  if (!GA_prealloc_gatscat) {
    /* release only workspace allocated by this call, in reverse order */
    ga_free(nelems);
    ga_free(list);
    ga_free(header);
  }

}
3592
3593 /**
3594 * Gather random elements from a global array into local buffer v
3595 \*/
3596 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3597 # pragma weak wnga_gather = pnga_gather
3598 #endif
3599
/* Gather nv elements, addressed by the subscript array, from global
 * array g_a into the local buffer v.  c_flag selects the subscript
 * convention: nonzero means an array of pointers to C (0-based) index
 * vectors, zero means a flat Fortran (1-based) index array.  Returns
 * immediately when nv < 1. */
void pnga_gather(Integer g_a, void* v, void *subscript, Integer c_flag, Integer nv)
{

  if (nv < 1) return;
  ga_check_handleM(g_a, "nga_gather");

  /* count this call in the gather statistics */
  GAstat.numgat++;

#ifdef USE_GATSCAT_NEW
  gai_gatscat_new(GATHER,g_a,v,subscript,c_flag,nv,&GAbytes.gattot,&GAbytes.gatloc, NULL);
#else
  /* legacy path ignores c_flag; subscripts must be Fortran convention */
  gai_gatscat(GATHER,g_a,v,subscript,nv,&GAbytes.gattot,&GAbytes.gatloc, NULL);
#endif

}
3615
3616 /**
3617 * Scatter elements from a local buffer v into random locations in a Global
3618 * Array
3619 */
3620 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3621 # pragma weak wnga_scatter = pnga_scatter
3622 #endif
3623
/* Scatter nv elements from the local buffer v into global array g_a at
 * the locations given by the subscript array.  c_flag selects the
 * subscript convention (see pnga_gather).  Returns immediately when
 * nv < 1. */
void pnga_scatter(Integer g_a, void* v, void *subscript, Integer c_flag, Integer nv)
{

  if (nv < 1) return;
  ga_check_handleM(g_a, "nga_scatter");

  /* count this call in the scatter statistics */
  GAstat.numsca++;

#ifdef USE_GATSCAT_NEW
  gai_gatscat_new(SCATTER,g_a,v,subscript,c_flag,nv,&GAbytes.scatot,&GAbytes.scaloc, NULL);
#else
  /* legacy path ignores c_flag; subscripts must be Fortran convention */
  gai_gatscat(SCATTER,g_a,v,subscript,nv,&GAbytes.scatot,&GAbytes.scaloc, NULL);
#endif

}
3639
3640 /**
3641 * Add elements of v to contents at random locations in Global Array
3642 * specified by array subscript
3643 */
3644 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3645 # pragma weak wnga_scatter_acc = pnga_scatter_acc
3646 #endif
3647
/* Scatter-accumulate: add alpha-scaled elements of the local buffer v
 * into global array g_a at the locations given by the subscript array.
 * c_flag selects the subscript convention (see pnga_gather).  Note that
 * scatter_acc calls are counted in the scatter statistics (numsca). */
void pnga_scatter_acc(Integer g_a, void* v, void *subscript, Integer c_flag,
                      Integer nv, void *alpha)
{

  if (nv < 1) return;
  ga_check_handleM(g_a, "nga_scatter_acc");

  GAstat.numsca++;

#ifdef USE_GATSCAT_NEW
  gai_gatscat_new(SCATTER_ACC, g_a, v, subscript, c_flag, nv, &GAbytes.scatot,
                  &GAbytes.scaloc, alpha);
#else
  /* legacy path ignores c_flag; subscripts must be Fortran convention */
  gai_gatscat(SCATTER_ACC, g_a, v, subscript, nv, &GAbytes.scatot,
              &GAbytes.scaloc, alpha);
#endif

}
3666
3667 /**
3668 * Gather random elements from 2D global array into local buffer v
3669 */
3670 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3671 # pragma weak wnga_gather2d = pnga_gather2d
3672 #endif
3673
/* Gather nv elements (i[k], j[k]) of the 2-dimensional global array g_a
 * into the local buffer v.  Subscripts use the Fortran (1-based)
 * convention.  The elements are grouped by owning process (or by owning
 * block for block-cyclic style distributions) and fetched with one
 * ARMCI vector get per group. */
void pnga_gather2d(Integer g_a, void *v, Integer *i, Integer *j,
                   Integer nv)
{
    Integer k, kk, proc, item_size;
    Integer *aproc, naproc; /* active processes and numbers */
    Integer *map;           /* map the active processes to allocated space */
    char *buf1, *buf2;
    Integer handle = g_a + GA_OFFSET;
    Integer nproc, p_handle, iproc;

    Integer *count;   /* counters for each process */
    Integer *nelem;   /* number of elements for each process */
    /* source and destination pointers for each process */
    void ***ptr_src, ***ptr_dst;
    void **ptr_org; /* the entire pointer array */
    armci_giov_t desc;
    Integer *ilo, *ihi, *jlo, *jhi, *ldp, *owner;
    Integer lo[2], hi[2];
    char **ptr_ref;
    Integer num_blocks=0;
    Integer subscrpt[2];

    if (nv < 1) return;

    ga_check_handleM(g_a, "ga_gather");

    GAstat.numgat++;

    /* determine how many processors are associated with array */
    p_handle = GA[handle].p_handle;
    if (p_handle < 0) {
      nproc = GAnproc;
      if (GA[handle].num_rstrctd > 0) {
        nproc = GA[handle].num_rstrctd;
      }
    } else {
      nproc = PGRP_LIST[p_handle].map_nproc;
    }

    /* allocate temp memory: owner[nv] plus count/nelem/aproc/map, each
     * sized by the number of processes (regular distribution) or the
     * total number of blocks (block-cyclic style distributions) */
    if (GA[handle].distr_type == REGULAR) {
      buf1 = malloc((int)(nproc *4 + nv)* (sizeof(Integer)));
      if(buf1 == NULL) pnga_error("malloc failed", 3*nproc);
    } else {
      num_blocks = GA[handle].block_total;
      buf1 = malloc((int)(num_blocks *4 + nv)* (sizeof(Integer)));
      if(buf1 == NULL) pnga_error("malloc failed", 3*num_blocks);
    }

    owner = (Integer *)buf1;
    count = owner+ nv;
    if (GA[handle].distr_type == REGULAR) {
      nelem = count + nproc;
      aproc = count + 2 * nproc;
      map = count + 3 * nproc;
    } else {
      nelem = count + num_blocks;
      aproc = count + 2 * num_blocks;
      map = count + 3 * num_blocks;
    }

    if (GA[handle].distr_type == REGULAR) {
      /* initialize the counters and nelem */
      for(kk=0; kk<nproc; kk++) {
        count[kk] = 0; nelem[kk] = 0;
      }
    } else {
      for(kk=0; kk<num_blocks; kk++) {
        count[kk] = 0; nelem[kk] = 0;
      }
    }

    /* find proc or block that owns the (i,j) element; store it in temp: */
    if (GA[handle].num_rstrctd == 0) {
      for(k=0; k< nv; k++) {
        subscrpt[0] = *(i+k);
        subscrpt[1] = *(j+k);
        if(! pnga_locate(g_a, subscrpt, owner+k)){
          char err_string[ERR_STR_LEN];
          sprintf(err_string,"invalid i/j=(%ld,%ld)", (long)i[k], (long)j[k]);
          pnga_error(err_string, g_a);
        }
        iproc = owner[k];
        nelem[iproc]++;
      }
    } else {
      /* restricted arrays: translate the owner to its restricted rank */
      for(k=0; k< nv; k++) {
        subscrpt[0] = *(i+k);
        subscrpt[1] = *(j+k);
        if(! pnga_locate(g_a, subscrpt, owner+k)){
          char err_string[ERR_STR_LEN];
          sprintf(err_string,"invalid i/j=(%ld,%ld)", (long)i[k], (long)j[k]);
          pnga_error(err_string, g_a);
        }
        iproc = GA[handle].rank_rstrctd[owner[k]];
        nelem[iproc]++;
      }
    }

    /* compact the list of owners down to only those that hold data */
    naproc = 0;
    if (GA[handle].distr_type == REGULAR) {
      for(k=0; k<nproc; k++) if(nelem[k] > 0) {
        aproc[naproc] = k;
        map[k] = naproc;
        naproc ++;
      }
    } else {
      for(k=0; k<num_blocks; k++) if(nelem[k] > 0) {
        aproc[naproc] = k;
        map[k] = naproc;
        naproc ++;
      }
    }
    GAstat.numgat_procs += naproc;

    buf2 = malloc((int)(2*naproc*sizeof(void **) + 2*nv*sizeof(void *) +
                      5*naproc*sizeof(Integer) + naproc*sizeof(char*)));
    if(buf2 == NULL) pnga_error("malloc failed", naproc);

    /* carve buf2 into per-owner pointer arrays and patch-extent arrays */
    ptr_src = (void ***)buf2;
    ptr_dst = (void ***)(buf2 + naproc*sizeof(void **));
    ptr_org = (void **)(buf2 + 2*naproc*sizeof(void **));
    ptr_ref = (char **)(buf2+2*naproc*sizeof(void **)+2*nv*sizeof(void *));
    ilo = (Integer *)(((char*)ptr_ref) + naproc*sizeof(char*));
    ihi = ilo + naproc;
    jlo = ihi + naproc;
    jhi = jlo + naproc;
    ldp = jhi + naproc;

    if (GA[handle].distr_type == REGULAR) {
      for(kk=0; kk<naproc; kk++) {
        iproc = aproc[kk];
        if (GA[handle].num_rstrctd > 0)
          iproc = GA[handle].rstrctd_list[iproc];
        pnga_distribution(g_a, iproc, lo, hi);
        ilo[kk] = lo[0];
        jlo[kk] = lo[1];
        ihi[kk] = hi[0];
        jhi[kk] = hi[1];

        /* get address of the first element owned by proc */
        gaShmemLocation(aproc[kk], g_a, ilo[kk], jlo[kk], &(ptr_ref[kk]),
                        &(ldp[kk]));
      }
    } else {
      for(kk=0; kk<naproc; kk++) {
        iproc = aproc[kk];
        pnga_distribution(g_a, iproc, lo, hi);
        ilo[kk] = lo[0];
        jlo[kk] = lo[1];
        ihi[kk] = hi[0];
        jhi[kk] = hi[1];

        /* get address of the first element owned by proc */
        pnga_access_block_ptr(g_a, iproc, &(ptr_ref[kk]), &(ldp[kk]));
        pnga_release_block(g_a, iproc);
      }
    }

    item_size = GA[GA_OFFSET + g_a].elemsize;
    GAbytes.gattot += (double)item_size*nv;
    /*This stuff is probably completely wrong. Doesn't affect performance,
     * just statistics. */
    iproc = owner[GAme];
    GAbytes.gatloc += (double)item_size * nelem[iproc];

    /* partition the flat pointer array into per-owner src/dst slices */
    ptr_src[0] = ptr_org; ptr_dst[0] = ptr_org + nv;
    for(k=1; k<naproc; k++) {
      ptr_src[k] = ptr_src[k-1] + nelem[aproc[k-1]];
      ptr_dst[k] = ptr_dst[k-1] + nelem[aproc[k-1]];
    }

    /* fill in one src (remote) and dst (local) pointer per element */
    for(k=0; k<nv; k++){
      Integer this_count;
      proc = owner[k];
      if (GA[handle].num_rstrctd > 0)
        proc = GA[handle].rank_rstrctd[owner[k]];
      this_count = count[proc];
      count[proc]++;
      proc = map[proc];
      ptr_dst[proc][this_count] = ((char*)v) + k * item_size;

      if(i[k] < ilo[proc] || i[k] > ihi[proc] ||
         j[k] < jlo[proc] || j[k] > jhi[proc]){
        char err_string[ERR_STR_LEN];
        sprintf(err_string,"proc=%d invalid i/j=(%ld,%ld)><[%ld:%ld,%ld:%ld]",
                (int)proc,(long)i[k],(long)j[k],
                (long)ilo[proc],(long)ihi[proc],
                (long)jlo[proc], (long)jhi[proc]);
        pnga_error(err_string, g_a);
      }
      /* column-major offset within the owner's patch */
      ptr_src[proc][this_count] = ptr_ref[proc] + item_size *
        ((j[k] - jlo[proc])* ldp[proc] + i[k] - ilo[proc]);
    }
    /* source and destination pointers are ready for all processes */
    if (GA[handle].distr_type == REGULAR) {
      for(k=0; k<naproc; k++) {
        int rc;

        desc.bytes = (int)item_size;
        desc.src_ptr_array = ptr_src[k];
        desc.dst_ptr_array = ptr_dst[k];
        desc.ptr_array_len = (int)nelem[aproc[k]];
        if (p_handle < 0) {
          iproc = aproc[k];
          if (GA[handle].num_rstrctd > 0)
            iproc = GA[handle].rstrctd_list[iproc];
        } else {
          iproc = PGRP_LIST[p_handle].inv_map_proc_list[aproc[k]];
        }
        rc=ARMCI_GetV(&desc, 1, (int)iproc);
        if(rc) pnga_error("gather failed in armci",rc);
      }
    } else if (GA[handle].distr_type == BLOCK_CYCLIC) {
      for(k=0; k<naproc; k++) {
        int rc;

        desc.bytes = (int)item_size;
        desc.src_ptr_array = ptr_src[k];
        desc.dst_ptr_array = ptr_dst[k];
        desc.ptr_array_len = (int)nelem[aproc[k]];
        /* map block index to the process that holds it */
        iproc = aproc[k];
        iproc = iproc%nproc;
        if (p_handle >= 0) {
          iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
        }
        rc=ARMCI_GetV(&desc, 1, (int)iproc);
        if(rc) pnga_error("gather failed in armci",rc);
      }
    } else if (GA[handle].distr_type == SCALAPACK) {
      Integer index[MAXDIM];
      for(k=0; k<naproc; k++) {
        int rc;

        desc.bytes = (int)item_size;
        desc.src_ptr_array = ptr_src[k];
        desc.dst_ptr_array = ptr_dst[k];
        desc.ptr_array_len = (int)nelem[aproc[k]];
        /* map block index to the process that holds it */
        iproc = aproc[k];
        gam_find_block_indices(handle, iproc, index);
        index[0] = index[0]%GA[handle].nblock[0];
        index[1] = index[1]%GA[handle].nblock[1];
        gam_find_proc_from_sl_indices(handle,iproc,index);
        if (p_handle >= 0) {
          iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
        }
        rc=ARMCI_GetV(&desc, 1, (int)iproc);
        if(rc) pnga_error("gather failed in armci",rc);
      }
    } else if (GA[handle].distr_type == TILED ||
        GA[handle].distr_type == TILED_IRREG) {
      Integer index[MAXDIM];
      for(k=0; k<naproc; k++) {
        int rc;

        desc.bytes = (int)item_size;
        desc.src_ptr_array = ptr_src[k];
        desc.dst_ptr_array = ptr_dst[k];
        desc.ptr_array_len = (int)nelem[aproc[k]];
        /* map block index to the process that holds it */
        iproc = aproc[k];
        gam_find_block_indices(handle, iproc, index);
        index[0] = index[0]%GA[handle].nblock[0];
        index[1] = index[1]%GA[handle].nblock[1];
        gam_find_tile_proc_from_indices(handle,iproc,index);
        if (p_handle >= 0) {
          iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
        }
        rc=ARMCI_GetV(&desc, 1, (int)iproc);
        if(rc) pnga_error("gather failed in armci",rc);
      }
    }

    free(buf2);
    free(buf1);
}
3949
3950 /**
3951 * Read and increment an element of a Global Array
3952 */
3953 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3954 # pragma weak wnga_read_inc = pnga_read_inc
3955 #endif
3956
/* Atomically read and increment (by inc) the element of g_a addressed
 * by subscript, using ARMCI_Rmw fetch-and-add on the owning process.
 * Returns the value the element held before the increment.  The array
 * must have integer type (C_INT, C_LONG or C_LONGLONG). */
Integer pnga_read_inc(Integer g_a, Integer* subscript, Integer inc)
{
  GA_Internal_Threadsafe_Lock();
  char *ptr;
  Integer ldp[MAXDIM], proc, handle=GA_OFFSET+g_a, p_handle, ndim;
  int optype,ivalue;
  long lvalue;
  void *pval;

  ga_check_handleM(g_a, "nga_read_inc");

  /* BJP printf("p[%d] g_a: %d subscript: %d inc: %d\n",GAme, g_a, subscript[0], inc); */

  if(GA[handle].type!=C_INT && GA[handle].type!=C_LONG &&
     GA[handle].type!=C_LONGLONG)
    pnga_error("type must be integer",GA[handle].type);

  GAstat.numrdi++;
  GAbytes.rditot += (double)sizeof(Integer);
  p_handle = GA[handle].p_handle;
  ndim = GA[handle].ndim;

  /* find out who owns it */
  pnga_locate(g_a, subscript, &proc);

  /* get an address of the g_a(subscript) element */
  if (GA[handle].distr_type == REGULAR) {
    if (GA[handle].num_rstrctd == 0) {
      gam_Location(proc, handle, subscript, (char**)&ptr, ldp);
    } else {
      gam_Location(GA[handle].rank_rstrctd[proc], handle, subscript, (char**)&ptr, ldp);
    }
  } else {
    /* block-cyclic style distributions: locate the block, then add the
     * element's offset within that block */
    Integer lo[MAXDIM], hi[MAXDIM];
    Integer j, jtot, last, offset;
    pnga_distribution(g_a, proc, lo, hi);
    pnga_access_block_ptr(g_a, proc, (char**)&ptr, ldp);
    pnga_release_block(g_a, proc);
    offset = 0;
    last = ndim - 1;
    jtot = 1;
    for (j=0; j<last; j++) {
      offset += (subscript[j]-lo[j])*jtot;
      jtot *= ldp[j];
    }
    offset += (subscript[last]-lo[last])*jtot;
    ptr += offset*GA[handle].elemsize;
  }

  /* NOTE(review): C_LONGLONG is funneled through the long fetch-and-add
   * path; assumes long is wide enough — confirm on LLP64 platforms */
  if(GA[handle].type==C_INT){
    optype = ARMCI_FETCH_AND_ADD;
    pval = &ivalue;
  }else{
    optype = ARMCI_FETCH_AND_ADD_LONG;
    pval = &lvalue;
  }

  if(GAme == proc)GAbytes.rdiloc += (double)sizeof(Integer);

  /* translate the owning block to the process that actually holds it */
  if (GA[handle].distr_type == BLOCK_CYCLIC) {
    proc = proc%pnga_nnodes();
  } else if (GA[handle].distr_type == SCALAPACK) {
    Integer j, index[MAXDIM];
    gam_find_block_indices(handle, proc, index);
    for (j=0; j<ndim; j++) {
      index[j] = index[j]%GA[handle].nblock[j];
    }
    gam_find_proc_from_sl_indices(handle,proc,index);
  } else if (GA[handle].distr_type == TILED ||
      GA[handle].distr_type == TILED_IRREG) {
    Integer j, index[MAXDIM];
    gam_find_block_indices(handle, proc, index);
    for (j=0; j<ndim; j++) {
      index[j] = index[j]%GA[handle].nblock[j];
    }
    gam_find_tile_proc_from_indices(handle,proc,index);
  }
  /* map group rank to world rank for processor groups */
  if (p_handle != -1) {
    proc=PGRP_LIST[p_handle].inv_map_proc_list[proc];
    /*printf("\n%d:proc=%d",GAme,proc);fflush(stdout);*/
  }

  ARMCI_Rmw(optype, pval, (int*)ptr, (int)inc, (int)proc);


  GA_Internal_Threadsafe_Unlock();
  if(GA[handle].type==C_INT)
    return (Integer) ivalue;
  else
    return (Integer) lvalue;
}
4048
4049 /**
4050 * Utility function to correct patch indices for presence of
4051 * skips. If no data is left on patch it returns false.
4052 */
/**
 * Snap the patch bounds [plo,phi] onto the strided lattice anchored at lo
 * with spacing skip: plo is rounded up to the next lattice point along each
 * dimension, phi is rounded down to the previous one.
 * Returns 0 if no lattice point remains inside the patch, 1 otherwise.
 */
Integer gai_correct_strided_patch(Integer ndim,
                                  Integer *lo,
                                  Integer *skip,
                                  Integer *plo,
                                  Integer *phi)
{
  Integer dim;
  for (dim=0; dim<ndim; dim++) {
    Integer lo_rem = (plo[dim]-lo[dim])%skip[dim];
    Integer hi_rem = (phi[dim]-lo[dim])%skip[dim];
    /* lower bound: move up to the next point on the lattice */
    if (lo_rem != 0) plo[dim] += skip[dim] - lo_rem;
    /* upper bound: move down to the previous lattice point */
    if (hi_rem != 0) phi[dim] -= hi_rem;
    /* bounds crossed => nothing left in the patch */
    if (phi[dim] < plo[dim]) return 0;
  }
  return 1;
}
4073
4074 /**
4075 * Utility function to calculate the number of elements in each
4076 * stride direction
4077 */
4078
/**
 * Compute the number of elements transferred along each stride direction for
 * the patch [lo,hi] traversed with the given skips, plus the total number of
 * stride levels.
 *
 * count[0] is set to a unit entry that the caller later scales by the
 * element size (ARMCI strided-call convention); the per-dimension counts
 * follow in count[1..ndim].
 *
 * ndim:    number of array dimensions
 * lo,hi:   bounds of the (already skip-corrected) patch
 * skip:    stride along each dimension (>= 1)
 * count:   [out] element counts; must hold at least ndim+1 entries
 * nstride: [out] number of entries written to count
 *
 * Returns 0 if the patch is empty (hi < lo along some dimension), 1 otherwise.
 *
 * Cleanup: removed the dead "#if 1/#else" alternative implementation and the
 * commented-out debug printf; merged the duplicated istride++ in the two
 * branches.  Behavior of the live (#if 1) branch is unchanged.
 */
int gai_ComputeCountWithSkip(Integer ndim, Integer *lo, Integer *hi,
                             Integer *skip, int *count, Integer *nstride)
{
  Integer idx;
  int i, istride = 0;
  /* leading unit entry; caller multiplies it by the element size */
  count[0]=1;
  istride++;
  for (i=0; i<(int)ndim; i++) {
    idx = hi[i] - lo[i];
    if (idx < 0) return 0;   /* empty patch along this dimension */
    if (skip[i] > 1) {
      /* number of lattice points hit when stepping by skip[i] */
      count[istride] = (int)(idx/skip[i]+1);
    } else {
      count[istride] = (int)idx+1;
    }
    istride++;
  }
  *nstride = istride;
  return 1;
}
4132
4133 /**
4134 * Utility function to calculate stride lengths on local and
4135 * remote processors, taking into account the presence of skips
4136 */
/**
 * Compute byte strides for the remote (global array) and local (user buffer)
 * sides of a strided transfer.  On the remote side the innermost stride is
 * stretched by skip[i] so that only every skip[i]-th element is touched; the
 * local buffer is assumed dense, so its strides use ld[] directly.
 *
 * ndim:       number of array dimensions
 * size:       element size in bytes
 * ld:         ndim-1 physical leading dimensions of the local buffer
 * ldrem:      leading dimensions of the remote data block
 * stride_rem: [out] remote byte strides; must hold at least ndim+1 entries
 * stride_loc: [out] local byte strides; must hold at least ndim+1 entries
 * skip:       stride along each dimension (>= 1)
 *
 * Cleanup: removed the dead "#else" branch that carried an abandoned
 * alternative algorithm; the live (#if 1) branch is unchanged.
 */
void gai_SetStrideWithSkip(Integer ndim, Integer size, Integer *ld,
                           Integer *ldrem, int *stride_rem,
                           int *stride_loc, Integer *skip)
{
  int i;
  /* level 0 is one element on both sides */
  stride_rem[0] = stride_loc[0] = (int)size;
  for (i=0; i<(int)ndim; i++) {
    /* next level spans the full row *before* the skip is applied ... */
    stride_rem[i+1] = stride_rem[i];
    /* ... then stretch this level's remote stride by the skip */
    stride_rem[i] *= skip[i];
    stride_rem[i+1] *= (int)ldrem[i];
    stride_loc[i+1] = stride_loc[i];
    stride_loc[i+1] *= (int)ld[i];
  }
}
4179
/**
 * Compute the linear element index into the local buffer of the point plo,
 * given that the buffer origin corresponds to the point lo and the buffer
 * holds only every skip[i]-th element along each dimension.
 *
 * ndim:    number of dimensions
 * lo:      lower corner of the patch the buffer represents
 * plo:     point within the patch
 * skip:    stride along each dimension (>= 1)
 * ld:      ndim-1 physical leading dimensions of the local buffer
 * idx_buf: [out] element offset of plo from the start of the buffer
 *
 * BUGFIX: the scale factor for dimension i+1 must be the cumulative product
 * ld[0]*...*ld[i] (cf. gai_FindOffset below), not ld[i] alone.  The old code
 * set factor = ld[i] each iteration, which produced wrong offsets for
 * ndim > 2.
 */
void gai_ComputePatchIndexWithSkip(Integer ndim, Integer *lo, Integer *plo,
                                   Integer *skip, Integer *ld, Integer *idx_buf)
{
  Integer i, delta, inc, factor;
  /* first dimension: number of skip-steps between lo[0] and plo[0] */
  delta = plo[0] - lo[0];
  inc = delta%skip[0];
  delta -= inc;
  delta /= skip[0];
  *idx_buf = delta;
  factor = 1;
  for (i=0; i<ndim-1; i++) {
    factor *= ld[i];   /* cumulative product of leading dimensions */
    delta = plo[i+1]-lo[i+1];
    inc = delta%skip[i+1];
    delta -= inc;
    delta /= skip[i+1];
    *idx_buf += factor*delta;
  }
}
4198
4199 /* Utility function to find offset from start of data segment
4200 * ndim: dimension of block
4201 * lo: Lower corner of block
4202 * plo: Point within the block
4203 * ld: Physical dimensions of data block
4204 * offset: Number of elements plo is located from start of block
4205 */
/**
 * Find the offset of a point from the start of a data block.
 * ndim:   dimension of block
 * lo:     lower corner of block
 * plo:    point within the block
 * ld:     physical dimensions of data block
 * offset: [out] number of elements plo is located from start of block
 */
void gai_FindOffset(Integer ndim,Integer *lo, Integer *plo,
                    Integer *ld, Integer *offset)
{
  Integer dim, scale, total;
  scale = 1;
  total = 0;
  /* row-major accumulation: each dimension's displacement is weighted by
     the product of all lower leading dimensions */
  for (dim=0; dim<ndim; dim++) {
    total += (plo[dim]-lo[dim])*scale;
    if (dim < ndim-1) scale *= ld[dim];
  }
  *offset = total;
}
4217
4218 /**
4219 * Put an N-dimensional patch of strided data into a Global Array. This routine
4220 * can by used with user-defined data types to modify a single element instead
4221 * of copying over the entire data
4222 */
4223 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4224 # pragma weak wnga_strided_put = pnga_strided_put
4225 #endif
4226
/**
 * Put an N-dimensional strided patch of local data into a Global Array.
 *
 * g_a:    Global Array handle
 * lo[]:   array of lower indices of patch of global array
 * hi[]:   array of upper indices of patch of global array
 * skip[]: array of skips for each dimension (each >= 1)
 * buf[]:  local buffer that patch will be copied from
 * ld[]:   ndim-1 physical dimensions of local buffer
 *
 * Cleanup: removed unused locals (p, np, idx, nproc) and the dead
 * pnga_nnodes() query that fed nproc.
 */
void pnga_strided_put(Integer g_a, Integer *lo, Integer *hi, Integer *skip,
                      void *buf, Integer *ld)
{
  Integer handle = GA_OFFSET + g_a;
  Integer size, nstride, p_handle;
  Integer ldrem[MAXDIM];
  Integer idx_buf, *blo, *bhi;
  Integer plo[MAXDIM],phi[MAXDIM];
  char *pbuf, *prem;
  int count[2*MAXDIM], stride_rem[2*MAXDIM], stride_loc[2*MAXDIM];
  int i, proc, ndim;
  _iterator_hdl it_hdl;

  size = GA[handle].elemsize;
  ndim = GA[handle].ndim;
  p_handle = GA[handle].p_handle;

  /* check values of skips to make sure they are legitimate */
  for (i = 0; i<ndim; i++) {
    if (skip[i]<1) {
      pnga_error("nga_strided_put: Invalid value of skip along coordinate ",i);
    }
  }

  /* visit every processor that owns a piece of the requested patch */
  gai_iterator_init(g_a, lo, hi, &it_hdl);
  while (gai_iterator_next(&it_hdl, &proc, &blo, &bhi, &prem, ldrem)) {
    /* Correct ranges to account for skips in original patch. If no
       data is left in patch jump to next processor in loop. */
    for (i=0; i<ndim; i++) {
      plo[i] = blo[i];
      phi[i] = bhi[i];
    }
    if (!gai_correct_strided_patch((Integer)ndim, lo, skip, plo, phi))
      continue;
    /* May need to correct location of remote buffer */
    gai_FindOffset(ndim,blo,plo,ldrem,&idx_buf);
    prem += size*idx_buf;

    /* get pointer in local buffer to point indexed by plo given that
       the corner of the buffer corresponds to the point indexed by lo */
    gai_ComputePatchIndexWithSkip(ndim, lo, plo, skip, ld, &idx_buf);
    pbuf = size*idx_buf + (char*)buf;

    /* Compute number of elements in each stride region and compute the
       number of stride regions. Store the results in count and nstride */
    if (!gai_ComputeCountWithSkip(ndim, plo, phi, skip, count, &nstride))
      continue;

    /* Scale first element in count by element size. The ARMCI_PutS routine
       uses this convention to figure out memory sizes. */
    count[0] *= size;

    /* Calculate strides in memory for remote processor indexed by proc and
       local buffer */
    gai_SetStrideWithSkip(ndim, size, ld, ldrem, stride_rem, stride_loc,
                          skip);

    /* processor-group arrays: map group rank to world rank for ARMCI */
    if (p_handle != -1) {
      proc = PGRP_LIST[p_handle].inv_map_proc_list[proc];
    }
    ARMCI_PutS(pbuf, stride_loc, prem, stride_rem, count, nstride-1, proc);
  }
  gai_iterator_destroy(&it_hdl);
}
4299
4300 /**
4301 * Get an N-dimensional patch of strided data from a Global Array
4302 */
4303 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4304 # pragma weak wnga_strided_get = pnga_strided_get
4305 #endif
4306
/**
 * Get an N-dimensional strided patch of data from a Global Array into a
 * local buffer.
 *
 * g_a:    Global Array handle
 * lo[]:   array of lower indices of patch of global array
 * hi[]:   array of upper indices of patch of global array
 * skip[]: array of skips for each dimension (each >= 1)
 * buf[]:  local buffer that patch will be copied to
 * ld[]:   ndim-1 physical dimensions of local buffer
 *
 * Cleanup: removed unused locals (p, np, idx, nproc) and the dead
 * pnga_nnodes() query; corrected copy-paste comments that referred to the
 * put operation.
 */
void pnga_strided_get(Integer g_a, Integer *lo, Integer *hi, Integer *skip,
                      void *buf, Integer *ld)
{
  Integer handle = GA_OFFSET + g_a;
  Integer size, nstride, p_handle;
  int i, proc, ndim;
  Integer ldrem[MAXDIM];
  Integer idx_buf, *blo, *bhi;
  Integer plo[MAXDIM],phi[MAXDIM];
  char *pbuf, *prem;
  int count[2*MAXDIM], stride_rem[2*MAXDIM], stride_loc[2*MAXDIM];
  _iterator_hdl it_hdl;

  size = GA[handle].elemsize;
  ndim = GA[handle].ndim;
  p_handle = GA[handle].p_handle;

  /* check values of skips to make sure they are legitimate */
  for (i = 0; i<ndim; i++) {
    if (skip[i]<1) {
      pnga_error("nga_strided_get: Invalid value of skip along coordinate ",i);
    }
  }

  /* visit every processor that owns a piece of the requested patch */
  gai_iterator_init(g_a, lo, hi, &it_hdl);
  while (gai_iterator_next(&it_hdl, &proc, &blo, &bhi, &prem, ldrem)) {
    /* Correct ranges to account for skips in original patch. If no
       data is left in patch jump to next processor in loop. */
    for (i=0; i<ndim; i++) {
      plo[i] = blo[i];
      phi[i] = bhi[i];
    }
    if (!gai_correct_strided_patch((Integer)ndim, lo, skip, plo, phi))
      continue;
    /* May need to correct location of remote buffer */
    gai_FindOffset(ndim,blo,plo,ldrem,&idx_buf);
    prem += size*idx_buf;

    /* get pointer in local buffer to point indexed by plo given that
       the corner of the buffer corresponds to the point indexed by lo */
    gai_ComputePatchIndexWithSkip(ndim, lo, plo, skip, ld, &idx_buf);
    pbuf = size*idx_buf + (char*)buf;

    /* Compute number of elements in each stride region and compute the
       number of stride regions. Store the results in count and nstride */
    if (!gai_ComputeCountWithSkip(ndim, plo, phi, skip, count, &nstride))
      continue;

    /* Scale first element in count by element size. The ARMCI_GetS routine
       uses this convention to figure out memory sizes. */
    count[0] *= size;

    /* Calculate strides in memory for remote processor indexed by proc and
       local buffer */
    gai_SetStrideWithSkip(ndim, size, ld, ldrem, stride_rem, stride_loc,
                          skip);

    /* processor-group arrays: map group rank to world rank for ARMCI */
    if (p_handle != -1) {
      proc = PGRP_LIST[p_handle].inv_map_proc_list[proc];
    }
    ARMCI_GetS(prem, stride_rem, pbuf, stride_loc, count, nstride-1, proc);
  }
  gai_iterator_destroy(&it_hdl);
}
4379
4380 /**
4381 * Accumulate operation for an N-dimensional patch of strided data of
4382 * a Global Array
4383 * g_a += alpha * patch
4384 \*/
4385 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4386 # pragma weak wnga_strided_acc = pnga_strided_acc
4387 #endif
4388
/**
 * Accumulate an N-dimensional strided patch of local data into a Global
 * Array: g_a += alpha * patch.
 *
 * g_a:    Global Array handle
 * lo[]:   array of lower indices of patch of global array
 * hi[]:   array of upper indices of patch of global array
 * skip[]: array of skips for each dimension (each >= 1)
 * buf[]:  local buffer that patch will be copied from
 * ld[]:   ndim-1 physical dimensions of local buffer
 * alpha:  multiplicative scale factor (must point at a value of the
 *         array's element type)
 *
 * Cleanup: removed unused locals (p, np, idx, nproc) and the dead
 * pnga_nnodes() query that fed nproc.
 */
void pnga_strided_acc(Integer g_a, Integer *lo, Integer *hi, Integer *skip,
                      void *buf, Integer *ld, void *alpha)
{
  Integer handle = GA_OFFSET + g_a;
  Integer size, nstride, type, p_handle;
  int i, optype=-1, proc, ndim;
  Integer ldrem[MAXDIM];
  Integer idx_buf, *blo, *bhi;
  Integer plo[MAXDIM],phi[MAXDIM];
  char *pbuf, *prem;
  int count[2*MAXDIM], stride_rem[2*MAXDIM], stride_loc[2*MAXDIM];
  _iterator_hdl it_hdl;

  size = GA[handle].elemsize;
  ndim = GA[handle].ndim;
  type = GA[handle].type;
  p_handle = GA[handle].p_handle;

  /* select the ARMCI accumulate flavor matching the array's element type */
  if (type == C_DBL) optype = ARMCI_ACC_DBL;
  else if (type == C_FLOAT) optype = ARMCI_ACC_FLT;
  else if (type == C_DCPL) optype = ARMCI_ACC_DCP;
  else if (type == C_SCPL) optype = ARMCI_ACC_CPL;
  else if (type == C_INT) optype = ARMCI_ACC_INT;
  else if (type == C_LONG) optype = ARMCI_ACC_LNG;
  else pnga_error("nga_strided_acc: type not supported",type);

  /* check values of skips to make sure they are legitimate */
  for (i = 0; i<ndim; i++) {
    if (skip[i]<1) {
      pnga_error("nga_strided_acc: Invalid value of skip along coordinate ",i);
    }
  }

  /* visit every processor that owns a piece of the requested patch */
  gai_iterator_init(g_a, lo, hi, &it_hdl);
  while (gai_iterator_next(&it_hdl, &proc, &blo, &bhi, &prem, ldrem)) {
    /* Correct ranges to account for skips in original patch. If no
       data is left in patch jump to next processor in loop. */
    for (i=0; i<ndim; i++) {
      plo[i] = blo[i];
      phi[i] = bhi[i];
    }
    if (!gai_correct_strided_patch((Integer)ndim, lo, skip, plo, phi))
      continue;
    /* May need to correct location of remote buffer */
    gai_FindOffset(ndim,blo,plo,ldrem,&idx_buf);
    prem += size*idx_buf;

    /* get pointer in local buffer to point indexed by plo given that
       the corner of the buffer corresponds to the point indexed by lo */
    gai_ComputePatchIndexWithSkip(ndim, lo, plo, skip, ld, &idx_buf);
    pbuf = size*idx_buf + (char*)buf;

    /* Compute number of elements in each stride region and compute the
       number of stride regions. Store the results in count and nstride */
    if (!gai_ComputeCountWithSkip(ndim, plo, phi, skip, count, &nstride))
      continue;

    /* Scale first element in count by element size. The ARMCI_AccS routine
       uses this convention to figure out memory sizes. */
    count[0] *= size;

    /* Calculate strides in memory for remote processor indexed by proc and
       local buffer */
    gai_SetStrideWithSkip(ndim, size, ld, ldrem, stride_rem, stride_loc,
                          skip);

    /* processor-group arrays: map group rank to world rank for ARMCI */
    if (p_handle != -1) {
      proc = PGRP_LIST[p_handle].inv_map_proc_list[proc];
    }
    ARMCI_AccS(optype, alpha, pbuf, stride_loc, prem, stride_rem, count,
               nstride-1, proc);
  }
  gai_iterator_destroy(&it_hdl);
}
4472