1 /* $Id: onesided.c,v 1.80.2.18 2007/12/18 22:22:27 d3g293 Exp $ */
2 /*
3  * module: onesided.c
4  * author: Jarek Nieplocha
5  * description: implements GA primitive communication operations --
6  *              accumulate, scatter, gather, read&increment & synchronization
7  *
8  * DISCLAIMER
9  *
10  * This material was prepared as an account of work sponsored by an
11  * agency of the United States Government.  Neither the United States
12  * Government nor the United States Department of Energy, nor Battelle,
13  * nor any of their employees, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
14  * ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
15  * COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
16  * SOFTWARE, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT
17  * INFRINGE PRIVATELY OWNED RIGHTS.
18  *
19  *
20  * ACKNOWLEDGMENT
21  *
22  * This software and its documentation were produced with United States
23  * Government support under Contract Number DE-AC06-76RLO-1830 awarded by
24  * the United States Department of Energy.  The United States Government
25  * retains a paid-up non-exclusive, irrevocable worldwide license to
26  * reproduce, prepare derivative works, perform publicly and display
27  * publicly by or for the US Government, including the right to
28  * distribute to other US Government contractors.
29  */
30 #if HAVE_CONFIG_H
31 #   include "config.h"
32 #endif
33 
34 #define USE_GATSCAT_NEW
35 
36 #if HAVE_STDIO_H
37 #   include <stdio.h>
38 #endif
39 #if HAVE_STRING_H
40 #   include <string.h>
41 #endif
42 #if HAVE_STDLIB_H
43 #   include <stdlib.h>
44 #endif
45 #if HAVE_STDINT_H
46 #   include <stdint.h>
47 #endif
48 #if HAVE_MATH_H
49 #   include <math.h>
50 #endif
51 #if HAVE_ASSERT_H
52 #   include <assert.h>
53 #endif
54 #if HAVE_STDDEF_H
55 #include <stddef.h>
56 #endif
57 
58 #include "global.h"
59 #include "globalp.h"
60 #include "base.h"
61 #include "ga_iterator.h"
62 #include "armci.h"
63 #include "macdecls.h"
64 #include "ga-papi.h"
65 #include "ga-wapi.h"
66 #include "thread-safe.h"
67 
68 #define DEBUG 0
69 #define USE_MALLOC 1
70 #define INVALID_MA_HANDLE -1
71 #define NEAR_INT(x) (x)< 0.0 ? ceil( (x) - 0.5) : floor((x) + 0.5)
72 
73 #define BYTE_ADDRESSABLE_MEMORY
74 
75 #ifdef PROFILE_OLD
76 #include "ga_profile.h"
77 #endif
78 
79 
80 #define DISABLE_NBOPT /* disables Non-Blocking OPTimization */
81 
82 /*uncomment line below to verify consistency of MA in every sync */
83 /*#define CHECK_MA yes */
84 
85 char *fence_array;
86 static int GA_fence_set=0;
87 
88 static int GA_prealloc_gatscat = 0;
89 static Integer *GA_header;
90 static Integer *GA_list;
91 static Integer *GA_elems;
92 
93 extern void armci_read_strided(void*, int, int*, int*, char*);
94 extern void armci_write_strided(void*, int, int*, int*, char*);
95 extern armci_hdl_t* get_armci_nbhandle(Integer *);
96 
97 /***************************************************************************/
98 
99 /**
100  *  Synchronize all processes on group
101  */
102 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
103 #   pragma weak wnga_pgroup_sync = pnga_pgroup_sync
104 #endif
105 
pnga_pgroup_sync(Integer grp_id)106 void pnga_pgroup_sync(Integer grp_id)
107 {
108 #ifdef CHECK_MA
109   Integer status;
110 #endif
111 
112   /*    printf("p[%d] calling ga_pgroup_sync on group: %d\n",GAme,*grp_id); */
113 #ifdef USE_ARMCI_GROUP_FENCE
114     int grp = (int)grp_id;
115     ARMCI_GroupFence(&grp);
116     pnga_msg_pgroup_sync(grp_id);
117 #else
118   if (grp_id > 0) {
119 #   ifdef MSG_COMMS_MPI
120     /* fence on all processes in this group */
121     { int p; for(p=0;p<PGRP_LIST[grp_id].map_nproc;p++)
122       ARMCI_Fence(ARMCI_Absolute_id(&PGRP_LIST[grp_id].group, p)); }
123     pnga_msg_pgroup_sync(grp_id);
124     if(GA_fence_set)bzero(fence_array,(int)GAnproc);
125     GA_fence_set=0;
126 #   else
127     pnga_error("ga_pgroup_sync_(): MPI not defined. ga_msg_pgroup_sync_()  can be called only if GA is built with MPI", 0);
128 #   endif
129   } else {
130     /* printf("p[%d] calling regular sync in ga_pgroup_sync\n",GAme); */
131     ARMCI_AllFence();
132     pnga_msg_pgroup_sync(grp_id);
133     if(GA_fence_set)bzero(fence_array,(int)GAnproc);
134     GA_fence_set=0;
135   }
136 #endif
137 #ifdef CHECK_MA
138   status = MA_verify_allocator_stuff();
139 #endif
140 }
141 
142 /**
143  *  Synchronize all processes and complete all outstanding communications
144  */
145 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
146 #   pragma weak wnga_sync = pnga_sync
147 #endif
148 
pnga_sync()149 void pnga_sync()
150 {
151   GA_Internal_Threadsafe_Lock();
152 #ifdef CHECK_MA
153   Integer status;
154 #endif
155 
156 #ifdef USE_ARMCI_GROUP_FENCE
157   if (GA_Default_Proc_Group == -1) {
158     int grp_id = (int)GA_Default_Proc_Group;
159     ARMCI_GroupFence(&grp_id);
160     pnga_msg_sync();
161   } else {
162     int grp_id = (Integer)GA_Default_Proc_Group;
163     pnga_pgroup_sync(grp_id);
164   }
165 #else
166   if (GA_Default_Proc_Group == -1) {
167     ARMCI_AllFence();
168     pnga_msg_sync();
169     if(GA_fence_set)bzero(fence_array,(int)GAnproc);
170     GA_fence_set=0;
171   } else {
172     Integer grp_id = (Integer)GA_Default_Proc_Group;
173     pnga_pgroup_sync(grp_id);
174   }
175 #endif
176 #ifdef CHECK_MA
177   status = MA_verify_allocator_stuff();
178 #endif
179   GA_Internal_Threadsafe_Unlock();
180 }
181 
182 
183 /**
184  *  Wait until all requests initiated by calling process are completed
185  */
186 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
187 #   pragma weak wnga_fence = pnga_fence
188 #endif
189 
pnga_fence()190 void pnga_fence()
191 {
192     int proc;
193     if(GA_fence_set<1)pnga_error("ga_fence: fence not initialized",0);
194     GA_fence_set--;
195     for(proc=0;proc<GAnproc;proc++)if(fence_array[proc])ARMCI_Fence(proc);
196     bzero(fence_array,(int)GAnproc);
197 }
198 
199 /**
200  *  Initialize tracing of request completion
201  */
202 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
203 #   pragma weak wnga_init_fence = pnga_init_fence
204 #endif
205 
pnga_init_fence()206 void pnga_init_fence()
207 {
208     GA_fence_set++;
209 }
210 
gai_init_onesided()211 void gai_init_onesided()
212 {
213     fence_array = calloc((size_t)GAnproc,1);
214     if(!fence_array) pnga_error("ga_init:calloc failed",0);
215 }
216 
217 
gai_finalize_onesided()218 void gai_finalize_onesided()
219 {
220     free(fence_array);
221     fence_array = NULL;
222 }
223 
224 
225 /*\ prepare permuted list of processes for remote ops
226 \*/
227 #define gaPermuteProcList(nproc)\
228 {\
229   if((nproc) ==1) ProcListPerm[0]=0; \
230   else{\
231     int _i, iswap, temp;\
232     if((nproc) > GAnproc) pnga_error("permute_proc: error ", (nproc));\
233 \
234     /* every process generates different random sequence */\
235     (void)srand((unsigned)GAme); \
236 \
237     /* initialize list */\
238     for(_i=0; _i< (nproc); _i++) ProcListPerm[_i]=_i;\
239 \
240     /* list permutation generated by random swapping */\
241     for(_i=0; _i< (nproc); _i++){ \
242       iswap = (int)(rand() % (nproc));  \
243       temp = ProcListPerm[iswap]; \
244       ProcListPerm[iswap] = ProcListPerm[_i]; \
245       ProcListPerm[_i] = temp; \
246     } \
247   }\
248 }
249 
250 
251 #define gaShmemLocation(proc, g_a, _i, _j, ptr_loc, _pld)                      \
252 {                                                                              \
253   Integer _ilo, _ihi, _jlo, _jhi, offset, proc_place, g_handle=(g_a)+GA_OFFSET;\
254   Integer _lo[2], _hi[2], _p_handle;                                           \
255   Integer _iw = GA[g_handle].width[0];                                         \
256   Integer _jw = GA[g_handle].width[1];                                         \
257   Integer _num_rstrctd = GA[g_handle].num_rstrctd;                             \
258                                                                                \
259   ga_ownsM(g_handle, (proc),_lo,_hi);                                          \
260   _p_handle = GA[g_handle].p_handle;                                           \
261   if (_p_handle != 0) {                                                        \
262     proc_place =  proc;                                                        \
263     if (_num_rstrctd != 0) {                                                   \
264       proc_place = GA[g_handle].rstrctd_list[proc_place];                      \
265     }                                                                          \
266   } else {                                                                     \
267     proc_place = PGRP_LIST[_p_handle].inv_map_proc_list[proc];                 \
268   }                                                                            \
269   _ilo = _lo[0]; _ihi=_hi[0];                                                  \
270   _jlo = _lo[1]; _jhi=_hi[1];                                                  \
271                                                                                \
272   if((_i)<_ilo || (_i)>_ihi || (_j)<_jlo || (_j)>_jhi){                        \
273     char err_string[ERR_STR_LEN];                                              \
274     sprintf(err_string,"%s:p=%ld invalid i/j (%ld,%ld)><(%ld:%ld,%ld:%ld)",    \
275         "gaShmemLocation", (long)proc, (long)(_i),(long)(_j),                  \
276         (long)_ilo, (long)_ihi, (long)_jlo, (long)_jhi);                       \
277     pnga_error(err_string, g_a );                                                \
278   }                                                                            \
279   offset = ((_i)-_ilo+_iw) + (_ihi-_ilo+1+2*_iw)*((_j)-_jlo+_jw);              \
280                                                                                \
281   /* find location of the proc in current cluster pointer array */             \
282   *(ptr_loc) = GA[g_handle].ptr[proc_place] +                                  \
283   offset*GAsizeofM(GA[g_handle].type);                                         \
284   *(_pld) = _ihi-_ilo+1+2*_iw;                                                 \
285 }
286 
287 /*\ Return pointer (ptr_loc) to location in memory of element with subscripts
288  *  (subscript). Also return physical dimensions of array in memory in ld.
289 \*/
290 #define gam_Location(proc, g_handle,  subscript, ptr_loc, ld)                  \
291 {                                                                              \
292 Integer _offset=0, _d, _w, _factor=1, _last=GA[g_handle].ndim-1;               \
293 Integer _lo[MAXDIM], _hi[MAXDIM], _pinv, _p_handle;                            \
294                                                                                \
295       ga_ownsM(g_handle, proc, _lo, _hi);                                      \
296       gaCheckSubscriptM(subscript, _lo, _hi, GA[g_handle].ndim);               \
297       if(_last==0) ld[0]=_hi[0]- _lo[0]+1+2*(Integer)GA[g_handle].width[0];    \
298       __CRAYX1_PRAGMA("_CRI shortloop");                                       \
299       for(_d=0; _d < _last; _d++)            {                                 \
300           _w = (Integer)GA[g_handle].width[_d];                                \
301           _offset += (subscript[_d]-_lo[_d]+_w) * _factor;                     \
302           ld[_d] = _hi[_d] - _lo[_d] + 1 + 2*_w;                               \
303           _factor *= ld[_d];                                                   \
304       }                                                                        \
305       _offset += (subscript[_last]-_lo[_last]                                  \
306                + (Integer)GA[g_handle].width[_last])                           \
307                * _factor;                                                      \
308       _p_handle = GA[g_handle].p_handle;                                       \
309       if (_p_handle != 0) {                                                    \
310         if (GA[g_handle].num_rstrctd == 0) {                                   \
311           _pinv = proc;                                                        \
312         } else {                                                               \
313           _pinv = GA[g_handle].rstrctd_list[proc];                             \
314         }                                                                      \
315       } else {                                                                 \
316         _pinv = PGRP_LIST[_p_handle].inv_map_proc_list[proc];                  \
317       }                                                                        \
318       *(ptr_loc) =  GA[g_handle].ptr[_pinv]+_offset*GA[g_handle].elemsize;     \
319 }
320 
321 
322 #define gam_GetRangeFromMap(p, ndim, plo, phi){\
323 Integer   _mloc = p* ndim *2;\
324           *plo  = (Integer*)_ga_map + _mloc;\
325           *phi  = *plo + ndim;\
326 }
327 
328 /* compute index of point subscripted by plo relative to point
329    subscripted by lo, for a block with dimensions dims */
330 #define gam_ComputePatchIndex(ndim, lo, plo, dims, pidx){                      \
331 Integer _d, _factor;                                                           \
332           *pidx = plo[0] -lo[0];                                               \
333           __CRAYX1_PRAGMA("_CRI shortloop");                                   \
334           for(_d= 0,_factor=1; _d< ndim -1; _d++){                             \
335              _factor *= (dims[_d]);                                            \
336              *pidx += _factor * (plo[_d+1]-lo[_d+1]);                          \
337           }                                                                    \
338 }
339 
340 #define gam_GetBlockPatch(plo,phi,lo,hi,blo,bhi,ndim) {                        \
341   Integer _d;                                                                  \
342   for (_d=0; _d<ndim; _d++) {                                                  \
343     if (lo[_d] <= phi[_d] && lo[_d] >= plo[_d]) blo[_d] = lo[_d];              \
344     else blo[_d] = plo[_d];                                                    \
345     if (hi[_d] <= phi[_d] && hi[_d] >= plo[_d]) bhi[_d] = hi[_d];              \
346     else bhi[_d] = phi[_d];                                                    \
347   }                                                                            \
348 }
349 
350 /**
351  *  A routine to test for a non-blocking call
352  */
353 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
354 #   pragma weak wnga_nbtest = pnga_nbtest
355 #endif
356 
pnga_nbtest(Integer * nbhandle)357 Integer pnga_nbtest(Integer *nbhandle)
358 {
359     return !nga_test_internal((Integer *)nbhandle);
360 }
361 
362 /**
363  *  A routine to wait for a non-blocking call to complete
364  */
365 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
366 #   pragma weak wnga_nbwait = pnga_nbwait
367 #endif
368 
pnga_nbwait(Integer * nbhandle)369 void pnga_nbwait(Integer *nbhandle)
370 {
371   GA_Internal_Threadsafe_Lock();
372   nga_wait_internal((Integer *)nbhandle);
373   GA_Internal_Threadsafe_Unlock();
374 }
375 
ngai_puts(char * loc_base_ptr,char * pbuf,int * stride_loc,char * prem,int * stride_rem,int * count,int nstrides,int proc,int field_off,int field_size,int type_size)376 static void ngai_puts(char *loc_base_ptr, char *pbuf, int *stride_loc, char *prem, int *stride_rem,
377 		      int *count, int nstrides, int proc, int field_off,
378 		      int field_size, int type_size) {
379   if(field_size<0 || field_size == type_size) {
380     ARMCI_PutS(pbuf,stride_loc,prem,stride_rem,count,nstrides,proc);
381   }
382   else {
383     int i;
384     count -= 1;
385     stride_loc -= 1;
386     stride_rem -= 1;
387     nstrides += 1;
388 
389     for(i=1; i<nstrides; i++) {
390       stride_loc[i] /= type_size;
391       stride_loc[i] *= field_size;
392     }
393 
394     pbuf = loc_base_ptr + (pbuf - loc_base_ptr)/type_size*field_size;
395     prem += field_off;
396 
397     count[1] /= type_size;
398     ARMCI_PutS(pbuf,stride_loc,prem,stride_rem,count,nstrides,proc);
399     count[1] *= type_size; /*restore*/
400     for(i=1; i<nstrides; i++) {
401       stride_loc[i] /= field_size;
402       stride_loc[i] *= type_size;
403     }
404   }
405 }
406 
407 
ngai_nbputs(char * loc_base_ptr,char * pbuf,int * stride_loc,char * prem,int * stride_rem,int * count,int nstrides,int proc,int field_off,int field_size,int type_size,armci_hdl_t * nbhandle)408 static void ngai_nbputs(char *loc_base_ptr, char *pbuf,int *stride_loc, char *prem, int *stride_rem,
409 		      int *count, int nstrides, int proc, int field_off,
410 			int field_size, int type_size, armci_hdl_t *nbhandle) {
411   if(field_size<0 || field_size == type_size) {
412     ARMCI_NbPutS(pbuf,stride_loc,prem,stride_rem,count,nstrides,proc,nbhandle);
413   }
414   else {
415     int i;
416     count -= 1;
417     stride_loc -= 1;
418     stride_rem -= 1;
419     nstrides += 1;
420 
421     for(i=1; i<nstrides; i++) {
422       stride_loc[i] /= type_size;
423       stride_loc[i] *= field_size;
424     }
425 
426     pbuf = loc_base_ptr + (pbuf - loc_base_ptr)/type_size*field_size;
427     prem += field_off;
428 
429     count[1] /= type_size;
430     ARMCI_NbPutS(pbuf,stride_loc,prem,stride_rem,count,nstrides,proc, nbhandle);
431     count[1] *= type_size; /*restore*/
432     for(i=1; i<nstrides; i++) {
433       stride_loc[i] /= field_size;
434       stride_loc[i] *= type_size;
435     }
436   }
437 }
438 
ngai_nbgets(char * loc_base_ptr,char * prem,int * stride_rem,char * pbuf,int * stride_loc,int * count,int nstrides,int proc,int field_off,int field_size,int type_size,armci_hdl_t * nbhandle)439 static void ngai_nbgets(char *loc_base_ptr, char *prem,int *stride_rem, char *pbuf, int *stride_loc,
440 			int *count, int nstrides, int proc, int field_off,
441 			int field_size, int type_size, armci_hdl_t *nbhandle) {
442   if(field_size<0 || field_size == type_size) {
443     ARMCI_NbGetS(prem,stride_rem,pbuf,stride_loc,count,nstrides,proc,nbhandle);
444   }
445   else {
446     int i;
447     count -= 1;
448     stride_loc -= 1;
449     stride_rem -= 1;
450 
451     pbuf = loc_base_ptr + (pbuf - loc_base_ptr)/type_size*field_size;
452     prem += field_off;
453 
454     count[1] /= type_size;
455     nstrides += 1;
456 
457     for(i=1; i<nstrides; i++) {
458       stride_loc[i] /= type_size;
459       stride_loc[i] *= field_size;
460     }
461 /*     { */
462 /*       int i; */
463 /*       printf("%s: calling armci_nbgets: field_off=%d field_size=%d type_size=%d proc=%d nstrides=%d\n",  */
464 /* 	     __FUNCTION__, (int)field_off, (int)field_size, (int)type_size, proc, nstrides); */
465 /*       printf("me=%d loc_ptr=%p rem_ptr=%p count=[", pnga_nodeid(), pbuf, prem); */
466 /*       for(i=0; i<=nstrides; i++) { */
467 /* 	printf("%d ",count[i]); */
468 /*       } */
469 /*       printf("]\n"); */
470 /*       printf("src_stride_arr=["); */
471 /*       for(i=0; i<nstrides; i++) { */
472 /* 	printf("%d ",stride_rem[i]); */
473 /*       } */
474 /*       printf("]\n"); */
475 /*       printf("dst_stride_arr=["); */
476 /*       for(i=0; i<nstrides; i++) { */
477 /* 	printf("%d ",stride_loc[i]); */
478 /*       } */
479 /*       printf("]\n"); */
480 /*     } */
481     ARMCI_NbGetS(prem,stride_rem,pbuf,stride_loc,count,nstrides,proc,nbhandle);
482     count[1] *= type_size; /*restore*/
483     for(i=1; i<nstrides; i++) {
484       stride_loc[i] /= field_size;
485       stride_loc[i] *= type_size;
486     }
487   }
488 }
489 
ngai_gets(char * loc_base_ptr,char * prem,int * stride_rem,char * pbuf,int * stride_loc,int * count,int nstrides,int proc,int field_off,int field_size,int type_size)490 static void ngai_gets(char *loc_base_ptr, char *prem,int *stride_rem, char *pbuf, int *stride_loc,
491 		      int *count, int nstrides, int proc, int field_off,
492 		      int field_size, int type_size) {
493 #if 1
494   Integer handle;
495   ga_init_nbhandle(&handle);
496   ngai_nbgets(loc_base_ptr, prem, stride_rem, pbuf, stride_loc, count, nstrides, proc,
497 	      field_off, field_size, type_size, (armci_hdl_t*)get_armci_nbhandle(&handle));
498   nga_wait_internal(&handle);
499 #else
500   if(field_size<0 || field_size == type_size) {
501     ARMCI_GetS(prem,stride_rem,pbuf,stride_loc,count,nstrides,proc);
502   } else {
503     int i;
504     count -= 1;
505     stride_loc -= 1;
506     stride_rem -= 1;
507 
508     pbuf = loc_base_ptr + (pbuf - loc_base_ptr)/type_size*field_size;
509     prem += field_off;
510 
511     count[1] /= type_size;
512     nstrides += 1;
513 
514     for(i=1; i<nstrides; i++) {
515       stride_loc[i] /= type_size;
516       stride_loc[i] *= field_size;
517     }
518     ARMCI_GetS(prem,stride_rem,pbuf,stride_loc,count,nstrides,proc);
519     count[1] *= type_size; /*restore*/
520     for(i=1; i<nstrides; i++) {
521       stride_loc[i] /= field_size;
522       stride_loc[i] *= type_size;
523     }
524   }
525 #endif
526 }
527 
528 /**
529  *  A common routine called by both non-blocking and blocking GA put calls.
530  */
531 #ifdef __crayx1
532 #pragma _CRI inline pnga_locate_region
533 #endif
ngai_put_common(Integer g_a,Integer * lo,Integer * hi,void * buf,Integer * ld,Integer field_off,Integer field_size,Integer * nbhandle)534 void ngai_put_common(Integer g_a,
535                    Integer *lo,
536                    Integer *hi,
537                    void    *buf,
538                    Integer *ld,
539 		     Integer field_off,
540 		     Integer field_size,
541 		     Integer *nbhandle)
542 {
543   Integer  p, np, handle=GA_OFFSET + g_a;
544   Integer  idx, elems, size, p_handle;
545   int proc, ndim, loop, cond;
546   int num_loops=2; /* 1st loop for remote procs; 2nd loop for local procs */
547   Integer n_rstrctd;
548   Integer *rank_rstrctd;
549 #if defined(__crayx1) || defined(DISABLE_NBOPT)
550 #else
551   Integer ga_nbhandle;
552   int counter=0;
553 #endif
554 
555   int _stride_rem[MAXDIM+1], _stride_loc[MAXDIM+1], _count[MAXDIM+1];
556   int *stride_rem=&_stride_rem[1], *stride_loc=&_stride_loc[1], *count=&_count[1];
557   _iterator_hdl it_hdl;
558 
559 
560 
561   ga_check_handleM(g_a, "ngai_put_common");
562 
563   size = GA[handle].elemsize;
564   ndim = GA[handle].ndim;
565   p_handle = GA[handle].p_handle;
566   n_rstrctd = (Integer)GA[handle].num_rstrctd;
567   rank_rstrctd = GA[handle].rank_rstrctd;
568 
569   /*initial  stride portion for field-wise operations*/
570   _stride_rem[0] = size;
571   _stride_loc[0] = field_size;
572   _count[0] = field_size;
573 
574 
575   gai_iterator_init(g_a, lo, hi, &it_hdl);
576 
577 #ifndef NO_GA_STATS
578   gam_CountElems(ndim, lo, hi, &elems);
579   GAbytes.puttot += (double)size*elems;
580   GAstat.numput++;
581   GAstat.numput_procs += np;
582 #endif
583 
584   if(nbhandle)ga_init_nbhandle(nbhandle);
585 #if !defined(__crayx1) && !defined(DISABLE_NBOPT)
586   else ga_init_nbhandle(&ga_nbhandle);
587 #endif
588 
589 #ifdef PROFILE_OLD
590   ga_profile_start((int)handle, (long)size*elems, ndim, lo, hi,
591       ENABLE_PROFILE_PUT);
592 #endif
593 
594 #if !defined(__crayx1) && !defined(DISABLE_NBOPT)
595   for(loop=0; loop<num_loops; loop++) {
596     __CRAYX1_PRAGMA("_CRI novector");
597 #endif
598     Integer ldrem[MAXDIM];
599     Integer idx_buf, *plo, *phi;
600     char *pbuf, *prem;
601     gai_iterator_reset(&it_hdl);
602     while (gai_iterator_next(&it_hdl, &proc, &plo, &phi, &prem, ldrem)) {
603 
604       /* check if it is local to SMP */
605 #if !defined(__crayx1) && !defined(DISABLE_NBOPT)
606       cond = armci_domain_same_id(ARMCI_DOMAIN_SMP,(int)proc);
607       if(loop==0) cond = !cond;
608       if(cond) {
609 #endif
610         /* find the right spot in the user buffer */
611         gam_ComputePatchIndex(ndim, lo, plo, ld, &idx_buf);
612         pbuf = size*idx_buf + (char*)buf;
613 
614         gam_ComputeCount(ndim, plo, phi, count);
615 
616         /* scale number of rows by element size */
617         count[0] *= size;
618         gam_setstride(ndim, size, ld, ldrem, stride_rem, stride_loc);
619 
620         /*
621         if (p_handle >= 0) {
622         proc = (int)GA_proclist[p];
623         proc = PGRP_LIST[p_handle].inv_map_proc_list[proc];
624         }
625         */
626         if(GA_fence_set)fence_array[proc]=1;
627 
628 #ifndef NO_GA_STATS
629         if(proc == GAme){
630           gam_CountElems(ndim, plo, phi, &elems);
631           GAbytes.putloc += (double)size*elems;
632         }
633 #endif
634 
635         /*casting what ganb_get_armci_handle function returns to armci_hdl is
636           very crucial here as on 64 bit platforms, pointer is 64 bits where
637           as temporary is only 32 bits*/
638         if(nbhandle)  {
639           /* ARMCI_NbPutS(pbuf, stride_loc, prem, stride_rem, count, ndim -1, */
640           /*              proc,(armci_hdl_t*)get_armci_nbhandle(nbhandle)); */
641           ngai_nbputs(buf,pbuf, stride_loc, prem, stride_rem, count, ndim -1,
642               proc,field_off, field_size, size,
643               (armci_hdl_t*)get_armci_nbhandle(nbhandle));
644         } else {
645 #if defined(__crayx1) || defined(DISABLE_NBOPT)
646           /* ARMCI_PutS(pbuf,stride_loc,prem,stride_rem,count,ndim-1,proc); */
647           ngai_puts(buf, pbuf,stride_loc,prem,stride_rem,count,ndim-1,proc,
648               field_off, field_size, size);
649 #else
650           /* do blocking put for local processes. If all processes
651              are remote processes then do blocking put for the last one */
652           if((loop==0 && gai_iterator_last(&it_hdl)) || loop==1) {
653             /* ARMCI_PutS(pbuf,stride_loc,prem,stride_rem,count,ndim-1,proc); */
654             ngai_puts(buf, pbuf,stride_loc,prem,stride_rem,count,ndim-1,
655                 proc, field_off, field_size, size);
656           } else {
657             ++counter;
658             /* ARMCI_NbPutS(pbuf,stride_loc,prem,stride_rem,count, ndim-1, */
659             /*              proc,(armci_hdl_t*)get_armci_nbhandle(&ga_nbhandle)); */
660             ngai_nbputs(buf,pbuf,stride_loc,prem,stride_rem,count, ndim-1,
661                 proc, field_off, field_size, size,
662                 (armci_hdl_t*)get_armci_nbhandle(&ga_nbhandle));
663           }
664 #endif
665         }
666 #if !defined(__crayx1) && !defined(DISABLE_NBOPT)
667       } /* end if(cond) */
668 #endif
669     }
670 #if !defined(__crayx1) && !defined(DISABLE_NBOPT)
671   }
672   if(!nbhandle) nga_wait_internal(&ga_nbhandle);
673 #endif
674 
675 #ifdef PROFILE_OLD
676   ga_profile_stop();
677 #endif
678 
679   gai_iterator_destroy(&it_hdl);
680 }
681 
682 
683 /**
684  * (Non-blocking) Put an N-dimensional patch of data into a Global Array
685  */
686 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
687 #   pragma weak wnga_nbput = pnga_nbput
688 #endif
689 
pnga_nbput(Integer g_a,Integer * lo,Integer * hi,void * buf,Integer * ld,Integer * nbhandle)690 void pnga_nbput(Integer g_a, Integer *lo, Integer *hi, void *buf, Integer *ld, Integer *nbhandle)
691 {
692   GA_Internal_Threadsafe_Lock();
693   ngai_put_common(g_a,lo,hi,buf,ld,0,-1,nbhandle);
694   GA_Internal_Threadsafe_Unlock();
695 }
696 
697 /**
698  * (Non-blocking) Put an N-dimensional patch of data into a Global Array and notify the other
699                   side with information on another Global Array
700  */
701 
702 #define HANDLES_OUTSTANDING 100
703 /* Maximum number of outstanding put/notify handles */
704 
705 typedef struct {
706   Integer *orighdl;
707   Integer firsthdl;
708   Integer elementhdl;
709   void *elem_copy;
710 } gai_putn_hdl_t;
711 
712 static int putn_handles_initted = 0;
713 static gai_putn_hdl_t putn_handles[HANDLES_OUTSTANDING];
714 
putn_check_single_elem(Integer g_a,Integer * lon,Integer * hin)715 static int putn_check_single_elem(Integer g_a, Integer *lon, Integer *hin)
716 {
717   int ndims, i;
718 
719   ndims = pnga_ndim(g_a);
720   for (i = 0; i < ndims; i++)
721     if (lon[i] != hin[i])
722       return 0;
723 
724   return 1;
725 } /* putn_check_single_elem */
726 
putn_find_empty_slot(void)727 static int putn_find_empty_slot(void)
728 {
729   int i;
730 
731   for (i = 0; i < HANDLES_OUTSTANDING; i++)
732     if (!putn_handles[i].orighdl)
733       return i;
734 
735   return -1;
736 } /* putn_find_empty_slot */
737 
putn_intersect_coords(Integer g_a,Integer * lo,Integer * hi,Integer * ecoords)738 static int putn_intersect_coords(Integer g_a, Integer *lo, Integer *hi, Integer *ecoords)
739 {
740   int ndims, i;
741 
742   ndims = pnga_ndim(g_a);
743 
744   for (i = 0; i < ndims; i++)
745     if ((ecoords[i] < lo[i]) || (ecoords[i] > hi[i]))
746       return 0;
747 
748   return 1;
749 } /* putn_intersect_coords */
750 
putn_verify_element_in_buf(Integer g_a,Integer * lo,Integer * hi,void * buf,Integer * ld,Integer * ecoords,void * bufn,Integer elemSize)751 static int putn_verify_element_in_buf(Integer g_a, Integer *lo, Integer *hi, void *buf,
752 				      Integer *ld, Integer *ecoords, void *bufn,
753 				      Integer elemSize)
754 {
755   int i, ndims;
756 #if HAVE_STDDEF_H
757   ptrdiff_t off = (char *)bufn - (char *)buf;
758 #else
759   Integer off = (char *)bufn - (char *)buf;
760 #endif
761   Integer eoff = 0;
762 
763   off /= elemSize; /* Offset in terms of elements */
764 
765   ndims = pnga_ndim(g_a);
766   eoff = ecoords[0] - lo[0];
767 
768   /* Check in Fortran ordering */
769   for (i = 1; i < ndims; i++)
770     eoff += (ecoords[i] - lo[i]) * ld[i - 1];
771 
772   return (eoff == (Integer)off); /* Must be the same for a correct notify buffer */
773 } /* putn_verify_element_in_buf */
774 
775 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
776 #   pragma weak wnga_nbput_notify = pnga_nbput_notify
777 #endif
778 
pnga_nbput_notify(Integer g_a,Integer * lo,Integer * hi,void * buf,Integer * ld,Integer g_b,Integer * ecoords,void * bufn,Integer * nbhandle)779 void pnga_nbput_notify(Integer g_a, Integer *lo, Integer *hi, void *buf, Integer *ld, Integer g_b, Integer *ecoords, void *bufn, Integer *nbhandle)
780 {
781   Integer ldn[MAXDIM] = { 1 };
782   int pos, intersect;
783 
784   /* Make sure everything has been initialized */
785   if (!putn_handles_initted) {
786     memset(putn_handles, 0, sizeof(putn_handles));
787     putn_handles_initted = 1;
788   }
789 
790   pos = putn_find_empty_slot();
791   if (pos == -1) /* no empty handles available */
792     pnga_error("Too many outstanding put/notify operations!", 0);
793 
794   putn_handles[pos].orighdl = nbhandle; /* Store original handle for nbwait_notify */
795 
796   if (g_a == g_b)
797     intersect = putn_intersect_coords(g_a, lo, hi, ecoords);
798   else
799     intersect = 0;
800 
801   if (!intersect) { /* Simpler case */
802     ngai_put_common(g_a, lo, hi, buf, ld, 0, -1, &putn_handles[pos].firsthdl);
803     ngai_put_common(g_b, ecoords, ecoords, bufn, ldn, 0, -1, &putn_handles[pos].elementhdl);
804 
805     putn_handles[pos].elem_copy = NULL;
806   }
807   else {
808     int ret, i;
809     Integer handle = GA_OFFSET + g_a, size;
810     void *elem_copy;
811     char *elem;
812 
813     size = GA[handle].elemsize;
814     ret = putn_verify_element_in_buf(g_a, lo, hi, buf, ld, ecoords, bufn, size);
815 
816     if (!ret)
817       pnga_error("Intersecting buffers, but notify element is not in buffer!", 0);
818 
819     elem_copy = malloc(size);
820     memcpy(elem_copy, bufn, size);
821 
822     elem = bufn;
823     for (i = 0; i < size; i++)
824       elem[i] += 1; /* Increment each byte by one, safe? */
825 
826     putn_handles[pos].elem_copy = elem_copy;
827 
828     ngai_put_common(g_a, lo, hi, buf, ld, 0, -1, &putn_handles[pos].firsthdl);
829     ngai_put_common(g_a, ecoords, ecoords, elem_copy, ldn, 0, -1,
830 		    &putn_handles[pos].elementhdl);
831   }
832 } /* pnga_nbput_notify */
833 
834 /**
835  *  Wait for a non-blocking put/notify to complete
836  */
837 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
838 #   pragma weak wnga_nbwait_notify = pnga_nbwait_notify
839 #endif
840 
pnga_nbwait_notify(Integer * nbhandle)841 void pnga_nbwait_notify(Integer *nbhandle)
842 {
843   int i;
844 
845   for (i = 0; i < HANDLES_OUTSTANDING; i++)
846     if (putn_handles[i].orighdl == nbhandle)
847       break;
848 
849   if (i >= HANDLES_OUTSTANDING)
850     return; /* Incorrect handle used or maybe wait was called multiple times? */
851 
852   nga_wait_internal(&putn_handles[i].firsthdl);
853   nga_wait_internal(&putn_handles[i].elementhdl);
854 
855   if (putn_handles[i].elem_copy) {
856     free(putn_handles[i].elem_copy);
857     putn_handles[i].elem_copy = NULL;
858   }
859 
860   putn_handles[i].orighdl = NULL;
861 } /* pnga_nbwait_notify */
862 
863 /**
864  * Put an N-dimensional patch of data into a Global Array
865  */
866 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
867 #   pragma weak wnga_put = pnga_put
868 #endif
869 
pnga_put(Integer g_a,Integer * lo,Integer * hi,void * buf,Integer * ld)870 void pnga_put(Integer g_a, Integer *lo, Integer *hi, void *buf, Integer *ld)
871 {
872 
873   GA_Internal_Threadsafe_Lock();
874   ngai_put_common(g_a,lo,hi,buf,ld,0,-1,NULL);
875   GA_Internal_Threadsafe_Unlock();
876 }
877 
878 /**
879  * (Non-blocking) Put a field in a an N-dimensional patch of data into a Global Array
880  */
881 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
882 #   pragma weak wnga_nbput_field = pnga_nbput_field
883 #endif
884 
pnga_nbput_field(Integer g_a,Integer * lo,Integer * hi,Integer foff,Integer fsize,void * buf,Integer * ld,Integer * nbhandle)885 void pnga_nbput_field(Integer g_a, Integer *lo, Integer *hi, Integer foff, Integer fsize, void *buf, Integer *ld, Integer *nbhandle)
886 {
887   ngai_put_common(g_a,lo,hi,buf,ld,foff, fsize, nbhandle);
888 }
889 
890 
891 /**
892  * Put a field in a an N-dimensional patch of data into a Global Array
893  */
894 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
895 #   pragma weak wnga_put_field = pnga_put_field
896 #endif
897 
pnga_put_field(Integer g_a,Integer * lo,Integer * hi,Integer foff,Integer fsize,void * buf,Integer * ld)898 void pnga_put_field(Integer g_a, Integer *lo, Integer *hi, Integer foff, Integer fsize, void *buf, Integer *ld)
899 {
900   ngai_put_common(g_a,lo,hi,buf,ld,foff, fsize, NULL);
901 }
902 
903 
904 
905 /*\ A common routine called by both non-blocking and blocking GA Get calls.
906 \*/
ngai_get_common(Integer g_a,Integer * lo,Integer * hi,void * buf,Integer * ld,Integer field_off,Integer field_size,Integer * nbhandle)907 void ngai_get_common(Integer g_a,
908                    Integer *lo,
909                    Integer *hi,
910                    void    *buf,
911                    Integer *ld,
912 		     Integer field_off,
913 		     Integer field_size,
914                    Integer *nbhandle)
915 {
916                    /* g_a:   Global array handle
917                       lo[]:  Array of lower indices of patch of global array
918                       hi[]:  Array of upper indices of patch of global array
919                       buf[]: Local buffer that array patch will be copied into
920                       ld[]:  Array of physical ndim-1 dimensions of local buffer */
921 
922   Integer  p, np, handle=GA_OFFSET + g_a;
923   Integer  idx, elems, size, p_handle;
924   int proc, ndim, loop, cond;
925   int num_loops=2; /* 1st loop for remote procs; 2nd loop for local procs */
926   Integer n_rstrctd;
927   Integer *rank_rstrctd;
928 #if defined(__crayx1) || defined(DISABLE_NBOPT)
929 #else
930   Integer ga_nbhandle;
931   int counter=0;
932 #endif
933   _iterator_hdl it_hdl;
934 
935   int _stride_rem[MAXDIM+1], _stride_loc[MAXDIM+1], _count[MAXDIM+1];
936   int *stride_rem=&_stride_rem[1], *stride_loc=&_stride_loc[1], *count=&_count[1];
937 
938   ga_check_handleM(g_a, "ngai_get_common");
939 
940   size = GA[handle].elemsize;
941   ndim = GA[handle].ndim;
942   p_handle = (Integer)GA[handle].p_handle;
943   n_rstrctd = (Integer)GA[handle].num_rstrctd;
944   rank_rstrctd = GA[handle].rank_rstrctd;
945 
946   /*initial  stride portion for field-wise operations*/
947   _stride_rem[0] = size;
948   _stride_loc[0] = field_size;
949   _count[0] = field_size;
950 
951   gai_iterator_init(g_a, lo, hi, &it_hdl);
952 
953   /* get total size of patch */
954 #ifndef NO_GA_STATS
955   gam_CountElems(ndim, lo, hi, &elems);
956   GAbytes.gettot += (double)size*elems;
957   GAstat.numget++;
958   GAstat.numget_procs += np;
959 #endif
960 
961   if(nbhandle)ga_init_nbhandle(nbhandle);
962 #if !defined(__crayx1) && !defined(DISABLE_NBOPT)
963   else ga_init_nbhandle(&ga_nbhandle);
964 #endif
965 
966 #ifdef PROFILE_OLD
967   ga_profile_start((int)handle, (long)size*elems, ndim, lo, hi,
968       ENABLE_PROFILE_GET);
969 #endif
970 
971 #if !defined(__crayx1) && !defined(DISABLE_NBOPT)
972   for(loop=0; loop<num_loops; loop++) {
973     __CRAYX1_PRAGMA("_CRI novector");
974 #endif
975     Integer ldrem[MAXDIM];
976     Integer idx_buf, *plo, *phi;
977     char *pbuf, *prem;
978     gai_iterator_reset(&it_hdl);
979     while (gai_iterator_next(&it_hdl, &proc, &plo, &phi, &prem, ldrem)) {
980 
981       /* check if it is local to SMP */
982 #if !defined(__crayx1) && !defined(DISABLE_NBOPT)
983       cond = armci_domain_same_id(ARMCI_DOMAIN_SMP,(int)proc);
984       if(loop==0) cond = !cond;
985       if(cond) {
986 #endif
987 
988         /* find the right spot in the user buffer for the point
989            subscripted by plo given that the corner of the user
990            buffer is subscripted by lo */
991         gam_ComputePatchIndex(ndim, lo, plo, ld, &idx_buf);
992         pbuf = size*idx_buf + (char*)buf;
993 
994         /* compute number of elements in each dimension and store the
995            result in count */
996         gam_ComputeCount(ndim, plo, phi, count);
997 
998         /* Scale first element in count by element size. The ARMCI_GetS
999            routine uses this convention to figure out memory sizes.*/
1000         count[0] *= size;
1001 
1002         /* Return strides for memory containing global array on remote
1003            processor indexed by proc (stride_rem) and for local buffer
1004            buf (stride_loc) */
1005         gam_setstride(ndim, size, ld, ldrem, stride_rem, stride_loc);
1006 
1007 #ifndef NO_GA_STATS
1008         if(proc == GAme){
1009           gam_CountElems(ndim, plo, phi, &elems);
1010           GAbytes.getloc += (double)size*elems;
1011         }
1012 #endif
1013         if(nbhandle)  {
1014           /*ARMCI_NbGetS(prem, stride_rem, pbuf, stride_loc, count, ndim -1, */
1015           /*                 proc,(armci_hdl_t*)get_armci_nbhandle(nbhandle)); */
1016           ngai_nbgets(buf,prem, stride_rem, pbuf, stride_loc, count, ndim -1,
1017               proc,field_off, field_size, size,
1018               (armci_hdl_t*)get_armci_nbhandle(nbhandle));
1019         } else {
1020 #if defined(__crayx1) || defined(DISABLE_NBOPT)
1021           /*ARMCI_GetS(prem,stride_rem,pbuf,stride_loc,count,ndim-1,proc); */
1022           ngai_gets(buf,prem,stride_rem,pbuf,stride_loc,count,ndim-1,proc, field_off, field_size, size);
1023 #else
1024           if((loop==0 && gai_iterator_last(&it_hdl)) || loop==1) {
1025             /*ARMCI_GetS(prem,stride_rem,pbuf,stride_loc,count,ndim-1,proc); */
1026             ngai_gets(buf,prem,stride_rem,pbuf,stride_loc,count,ndim-1,proc, field_off, field_size, size);
1027           } else {
1028             ++counter;
1029             /*ARMCI_NbGetS(prem,stride_rem,pbuf,stride_loc,count,ndim-1, */
1030             /*             proc,(armci_hdl_t*)get_armci_nbhandle(&ga_nbhandle)); */
1031             ngai_nbgets(buf,prem, stride_rem, pbuf, stride_loc, count, ndim -1,
1032                 proc,field_off, field_size, size,
1033                 (armci_hdl_t*)get_armci_nbhandle(nbhandle));
1034           }
1035 #endif
1036         }
1037 #if !defined(__crayx1) && !defined(DISABLE_NBOPT)
1038       } /* end if(cond) */
1039 #endif
1040     }
1041 #if !defined(__crayx1) && !defined(DISABLE_NBOPT)
1042   }
1043   if(!nbhandle) nga_wait_internal(&ga_nbhandle);
1044 #endif
1045 
1046 #ifdef PROFILE_OLD
1047   ga_profile_stop();
1048 #endif
1049 
1050   gai_iterator_destroy(&it_hdl);
1051 }
1052 
1053 /**
1054  * Utility function to copy data from one buffer to another
1055  */
gai_mem_copy(int elemsize,int ndim,void * src_ptr,Integer * src_start,Integer * count,Integer * src_ld,void * dst_ptr,Integer * dst_start,Integer * dst_ld)1056 void gai_mem_copy(int elemsize, int ndim, void *src_ptr, Integer *src_start,
1057     Integer *count,  Integer *src_ld, void *dst_ptr, Integer *dst_start,
1058     Integer *dst_ld)
1059 {
1060   Integer src_offset, dst_offset;
1061   Integer factor;
1062   void *sptr, *dptr;
1063   int i, j;
1064   long src_idx, dst_idx;
1065   int n1dim; /* total number of contiguous segments */
1066   int src_bvalue[MAXDIM], src_bunit[MAXDIM];
1067   int dst_bvalue[MAXDIM], dst_bunit[MAXDIM];
1068   int src_stride[MAXDIM], dst_stride[MAXDIM];
1069   int segsize;
1070   /* find offsets for both source and destination buffers and use these to
1071    * adjust pointers to the first element in each data set
1072    */
1073   factor = 1;
1074   src_offset = 0;
1075   for (i=0; i<ndim; i++) {
1076     src_offset += src_start[i]*factor;
1077     if (i<ndim-1) factor *= src_ld[i];
1078   }
1079   sptr = (void*)((char*)src_ptr+src_offset*elemsize);
1080 
1081   factor = 1; //reset the factor
1082 
1083   dst_offset = 0;
1084   for (i=0; i<ndim; i++) {
1085     dst_offset += dst_start[i]*factor;
1086     if (i<ndim-1) factor *= dst_ld[i];
1087   }
1088   dptr = (void*)((char*)dst_ptr+dst_offset*elemsize);
1089   segsize = count[0]*elemsize;
1090   /* perform the memcpy from source to destination buffer. Start by calculating
1091    * the total number of contiguous segments in the src array*/
1092   n1dim = 1;
1093 
1094   src_stride[0] = dst_stride[0] = elemsize;
1095   for (i=0; i<ndim-1; i++) {
1096     src_stride[i+1] = src_stride[i];
1097     src_stride[i+1] *= (int)src_ld[i];
1098     dst_stride[i+1] = dst_stride[i];
1099     dst_stride[i+1] *= (int)dst_ld[i];
1100   }
1101 
1102   for (i=1; i<=ndim-1; i++) {
1103     n1dim *= count[i];
1104   }
1105   /* initialize arrays for evaluating offset of each segment */
1106   src_bvalue[0] = 0;
1107   src_bvalue[1] = 0;
1108   src_bunit[0] = 1;
1109   src_bunit[1] = 1;
1110   dst_bvalue[0] = 0;
1111   dst_bvalue[1] = 0;
1112   dst_bunit[0] = 1;
1113   dst_bunit[1] = 1;
1114 
1115   //this part does nothing?
1116   for (i=2; i<=ndim-1; i++) {
1117     src_bvalue[i] = 0;
1118     dst_bvalue[i] = 0;
1119     src_bunit[i] = src_bunit[i-1]*count[i-1];
1120     dst_bunit[i] = dst_bunit[i-1]*count[i-1];
1121   }
1122 
1123   /* evaluate offset for each contiguous segment */
1124   for (i=0; i<n1dim; i++) {
1125     src_idx = 0;
1126     dst_idx = 0;
1127     for (j=1; j<=ndim-1; j++) {
1128       src_idx += src_bvalue[j]*src_stride[j];
1129       if ((i+1)%src_bunit[j] == 0) {
1130         src_bvalue[j]++;
1131       }
1132       if (src_bvalue[j] > (count[j]-1)) {
1133         src_bvalue[j] = 0;
1134       }
1135     }
1136     for (j=1; j<=ndim-1; j++) {
1137       dst_idx += dst_bvalue[j]*dst_stride[j];
1138       if ((i+1)%dst_bunit[j] == 0) {
1139         dst_bvalue[j]++;
1140       }
1141       if (dst_bvalue[j] > (count[j]-1)) {
1142         dst_bvalue[j] = 0;
1143       }
1144     }
1145     /* copy memory */
1146     memcpy((char*)dptr+dst_idx,(char*)sptr+src_idx,segsize);
1147   }
1148 }
1149 
1150 /**
1151  * Get an N-dimensional patch of data from a Global Array
1152  */
1153 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1154 #   pragma weak wnga_get = pnga_get
1155 #endif
1156 
pnga_get(Integer g_a,Integer * lo,Integer * hi,void * buf,Integer * ld)1157 void pnga_get(Integer g_a, Integer *lo, Integer *hi,
1158               void *buf, Integer *ld)
1159 {
1160   Integer handle = GA_OFFSET + g_a;
1161   enum property_type ga_property = GA[handle].property;
1162 
1163   if (ga_property != READ_CACHE) /* if array is not read only */
1164   {
1165     GA_Internal_Threadsafe_Lock();
1166     ngai_get_common(g_a,lo,hi,buf,ld,0,-1,(Integer *)NULL);
1167     GA_Internal_Threadsafe_Unlock();
1168   } else {
1169     int i;
1170     int nelem;
1171     int ndim = GA[handle].ndim;
1172     Integer nstart[MAXDIM], ncount[MAXDIM], nld[MAXDIM];
1173     Integer bstart[MAXDIM];
1174     /* ngai_get_common(g_a,lo,hi,buf,ld,0,-1,(Integer *)NULL); */
1175 
1176     /* cache is empty condition */
1177     if (GA[handle].cache_head == NULL) {
1178       GA[handle].cache_head = malloc(sizeof(cache_struct_t));
1179       nelem = 1;
1180       for (i=0; i<ndim; i++) {
1181         GA[handle].cache_head -> lo[i] = lo[i];
1182         GA[handle].cache_head -> hi[i] = hi[i];
1183         nelem *= (hi[i]-lo[i]+1);
1184       }
1185       GA[handle].cache_head -> cache_buf = malloc(GA[handle].elemsize*nelem);
1186       GA[handle].cache_head -> next = NULL;
1187 
1188         void *new_buf = GA[handle].cache_head->cache_buf;
1189 
1190       /* place data to receive buffer */
1191       ngai_get_common(g_a,lo,hi,buf,ld,0,-1,(Integer*)NULL);
1192 
1193       for (i=0; i<ndim; i++) {
1194         nstart[i] = 0;
1195         bstart[i] = 0;
1196         nld[i] = (hi[i]-lo[i]+1);
1197         ncount[i] = nld[i];
1198       }
1199       gai_mem_copy(GA[handle].elemsize,ndim,buf,bstart,ncount,ld,new_buf,
1200           nstart, nld);
1201     } else {
1202 
1203       cache_struct_t * cache_temp_pointer = GA[handle].cache_head;
1204       int match = 0;
1205       while (cache_temp_pointer != NULL) {
1206         int chk;
1207         int sub_chk;
1208         int temp_lo[MAXDIM];
1209         int temp_hi[MAXDIM];
1210         void *temp_buf = cache_temp_pointer->cache_buf;
1211         for (i=0; i<ndim; i++) {
1212           temp_lo[i] = cache_temp_pointer->lo[i];
1213           temp_hi[i] = cache_temp_pointer->hi[i];
1214         }
1215 
1216         //match
1217         chk = 1;
1218         for (i=0; i<ndim; i++) {
1219           if (!(temp_lo[i] <= lo[i] && temp_hi[i] >= hi[i])) {
1220             chk = 0;
1221             break;
1222           }
1223         }
1224         //match
1225         if (chk) {
1226           nelem = 1;
1227           for (i=0; i<ndim; i++) {
1228             nelem *= (hi[i]-lo[i]+1);
1229           }
1230           for (i=0; i<ndim; i++) {
1231             nstart[i] = (lo[i]-temp_lo[i]);
1232             bstart[i] = 0;
1233             nld[i] = (temp_hi[i]-temp_lo[i]+1);
1234             ncount[i] = (hi[i]-lo[i]+1);
1235           }
1236           /* copy data to recieve buffer */
1237           gai_mem_copy(GA[handle].elemsize,ndim,temp_buf,nstart,ncount,nld,buf,
1238               bstart, ncount /*ld*/);
1239           match = 1;
1240           break;
1241         }
1242 
1243         cache_temp_pointer = cache_temp_pointer->next;
1244 
1245       }
1246 
1247       //no match condition
1248       if (match == 0) {
1249         void *new_buf;
1250         /* create new node on cache list*/
1251         cache_struct_t *cache_new = malloc(sizeof(cache_struct_t));
1252         nelem = 1;
1253         for (i=0; i<ndim; i++) {
1254           cache_new -> lo[i] = lo[i];
1255           cache_new -> hi[i] = hi[i];
1256           nelem *= (hi[i]-lo[i]+1);
1257         }
1258         cache_new -> cache_buf = malloc(GA[handle].elemsize*nelem);
1259         new_buf = cache_new->cache_buf;
1260         cache_new -> next = GA[handle].cache_head;
1261         GA[handle].cache_head = cache_new;
1262 
1263 
1264         //place data to recieve buffer
1265         ngai_get_common(g_a,lo,hi,buf,ld,0,-1,(Integer *)NULL);
1266 
1267         for (i=0; i<ndim; i++) {
1268           nstart[i] = 0;
1269           bstart[i] = 0;
1270           nld[i] = (hi[i]-lo[i]+1);
1271           ncount[i] = nld[i];
1272         }
1273         gai_mem_copy(GA[handle].elemsize,ndim,buf,bstart,ncount,ld,new_buf,
1274             nstart, nld);
1275       }
1276     }
1277 
1278     /* check number of items cached */
1279     cache_struct_t * cache_temp_pointer = GA[handle].cache_head;
1280     int cache_size = 0;
1281 
1282     /* check number of items cached */
1283     while (cache_temp_pointer != NULL) {
1284       if (cache_size > MAXCACHE) {
1285         cache_struct_t *last = cache_temp_pointer;
1286         cache_struct_t *next = last->next;
1287         last->next = NULL;
1288         cache_temp_pointer = next;
1289         while (next) {
1290           next = cache_temp_pointer->next;
1291           free(cache_temp_pointer->cache_buf);
1292           free(cache_temp_pointer);
1293         }
1294         break;
1295       }
1296       cache_size++;
1297     }
1298     /* end */
1299   }
1300 }
1301 
1302 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1303 #   pragma weak wnga_nbget = pnga_nbget
1304 #endif
1305 
pnga_nbget(Integer g_a,Integer * lo,Integer * hi,void * buf,Integer * ld,Integer * nbhandle)1306 void pnga_nbget(Integer g_a, Integer *lo, Integer *hi,
1307                void *buf, Integer *ld, Integer *nbhandle)
1308 {
1309   GA_Internal_Threadsafe_Lock();
1310   ngai_get_common(g_a,lo,hi,buf,ld,0,-1,nbhandle);
1311   GA_Internal_Threadsafe_Unlock();
1312 }
1313 
1314 /**
1315  * Get a field in an N-dimensional patch of data from a Global Array
1316  */
1317 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1318 #   pragma weak wnga_get_field = pnga_get_field
1319 #endif
1320 
pnga_get_field(Integer g_a,Integer * lo,Integer * hi,Integer foff,Integer fsize,void * buf,Integer * ld)1321 void pnga_get_field(Integer g_a, Integer *lo, Integer *hi,Integer foff, Integer fsize,
1322               void *buf, Integer *ld)
1323 {
1324   ngai_get_common(g_a,lo,hi,buf,ld,foff,fsize,(Integer *)NULL);
1325 }
1326 
1327 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1328 #   pragma weak wnga_nbget_field = pnga_nbget_field
1329 #endif
1330 
pnga_nbget_field(Integer g_a,Integer * lo,Integer * hi,Integer foff,Integer fsize,void * buf,Integer * ld,Integer * nbhandle)1331 void pnga_nbget_field(Integer g_a, Integer *lo, Integer *hi,Integer foff, Integer fsize,
1332                void *buf, Integer *ld, Integer *nbhandle)
1333 {
1334   ngai_get_common(g_a,lo,hi,buf,ld,foff,fsize,nbhandle);
1335 }
1336 
1337 #ifdef __crayx1
1338 #  pragma _CRI inline ga_get_
1339 #  pragma _CRI inline ngai_get_common
1340 #endif
1341 
1342 /**
1343  *  A common routine called by both non-blocking and blocking GA acc calls.
1344  */
ngai_acc_common(Integer g_a,Integer * lo,Integer * hi,void * buf,Integer * ld,void * alpha,Integer * nbhandle)1345 void ngai_acc_common(Integer g_a,
1346                    Integer *lo,
1347                    Integer *hi,
1348                    void    *buf,
1349                    Integer *ld,
1350                    void    *alpha,
1351                    Integer *nbhandle)
1352 {
1353   Integer  p, np, handle=GA_OFFSET + g_a;
1354   Integer  idx, elems, size, type, p_handle, ga_nbhandle;
1355   int optype=-1, loop, ndim, cond;
1356   int proc;
1357   int num_loops=2; /* 1st loop for remote procs; 2nd loop for local procs */
1358   Integer n_rstrctd;
1359   Integer *rank_rstrctd;
1360   _iterator_hdl it_hdl;
1361 
1362   GA_Internal_Threadsafe_Lock();
1363 
1364   ga_check_handleM(g_a, "ngai_acc_common");
1365 
1366   size = GA[handle].elemsize;
1367   type = GA[handle].type;
1368   ndim = GA[handle].ndim;
1369   p_handle = GA[handle].p_handle;
1370   n_rstrctd = (Integer)GA[handle].num_rstrctd;
1371   rank_rstrctd = GA[handle].rank_rstrctd;
1372 
1373   if(type==C_DBL) optype= ARMCI_ACC_DBL;
1374   else if(type==C_FLOAT) optype= ARMCI_ACC_FLT;
1375   else if(type==C_DCPL)optype= ARMCI_ACC_DCP;
1376   else if(type==C_SCPL)optype= ARMCI_ACC_CPL;
1377   else if(type==C_INT)optype= ARMCI_ACC_INT;
1378   else if(type==C_LONG)optype= ARMCI_ACC_LNG;
1379   else pnga_error("type not supported",type);
1380 
1381   gai_iterator_init(g_a, lo, hi, &it_hdl);
1382 
1383 #ifndef NO_GA_STATS
1384   gam_CountElems(ndim, lo, hi, &elems);
1385   GAbytes.acctot += (double)size*elems;
1386   GAstat.numacc++;
1387   GAstat.numacc_procs += np;
1388 #endif
1389 
1390   if(nbhandle)ga_init_nbhandle(nbhandle);
1391   else ga_init_nbhandle(&ga_nbhandle);
1392 
1393 #ifdef PROFILE_OLD
1394   ga_profile_start((int)handle, (long)size*elems, ndim, lo, hi,
1395       ENABLE_PROFILE_ACC);
1396 #endif
1397 
1398   for(loop=0; loop<num_loops; loop++) {
1399     Integer ldrem[MAXDIM];
1400     int stride_rem[MAXDIM], stride_loc[MAXDIM], count[MAXDIM];
1401     Integer idx_buf, *plo, *phi;
1402     char *pbuf, *prem;
1403     gai_iterator_reset(&it_hdl);
1404 
1405     while (gai_iterator_next(&it_hdl, &proc, &plo, &phi, &prem, ldrem)) {
1406 
1407       /* check if it is local to SMP */
1408       cond = armci_domain_same_id(ARMCI_DOMAIN_SMP,(int)proc);
1409       if(loop==0) cond = !cond;
1410 
1411       if(cond) {
1412 
1413         /* find the right spot in the user buffer */
1414         gam_ComputePatchIndex(ndim,lo, plo, ld, &idx_buf);
1415         pbuf = size*idx_buf + (char*)buf;
1416 
1417         gam_ComputeCount(ndim, plo, phi, count);
1418 
1419         /* scale number of rows by element size */
1420         count[0] *= size;
1421         gam_setstride(ndim, size, ld, ldrem, stride_rem, stride_loc);
1422 
1423         if(GA_fence_set)fence_array[proc]=1;
1424 
1425 #ifndef NO_GA_STATS
1426         if(proc == GAme){
1427           gam_CountElems(ndim, plo, phi, &elems);
1428           GAbytes.accloc += (double)size*elems;
1429         }
1430 #endif
1431 
1432         if(nbhandle) {
1433           ARMCI_NbAccS(optype, alpha, pbuf, stride_loc, prem,
1434               stride_rem, count, ndim-1, proc,
1435               (armci_hdl_t*)get_armci_nbhandle(nbhandle));
1436         } else {
1437 #  if !defined(DISABLE_NBOPT)
1438           if((loop==0 && gai_iterator_last(&it_hdl)) || loop==1) {
1439             ARMCI_AccS(optype, alpha, pbuf, stride_loc, prem, stride_rem,
1440                 count, ndim-1, proc);
1441           } else {
1442             ARMCI_NbAccS(optype, alpha, pbuf, stride_loc, prem,
1443                 stride_rem, count, ndim-1, proc,
1444                 (armci_hdl_t*)get_armci_nbhandle(&ga_nbhandle));
1445           }
1446 #  else
1447           ARMCI_AccS(optype, alpha, pbuf, stride_loc, prem, stride_rem,
1448               count, ndim-1, proc);
1449 #  endif
1450         }
1451       } /* end if(cond) */
1452     }
1453   }
1454 #if !defined(DISABLE_NBOPT)
1455   if(!nbhandle) nga_wait_internal(&ga_nbhandle);
1456 #endif
1457 
1458 #ifdef PROFILE_OLD
1459   ga_profile_stop();
1460 #endif
1461   GA_Internal_Threadsafe_Unlock();
1462 
1463   gai_iterator_destroy(&it_hdl);
1464 }
1465 
1466 /**
1467  *  Accumulate operation for an N-dimensional patch of a Global Array
1468  *       g_a += alpha * patch
1469  */
1470 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1471 #   pragma weak wnga_acc = pnga_acc
1472 #endif
1473 
pnga_acc(Integer g_a,Integer * lo,Integer * hi,void * buf,Integer * ld,void * alpha)1474 void pnga_acc(Integer g_a,
1475               Integer *lo,
1476               Integer *hi,
1477               void    *buf,
1478               Integer *ld,
1479               void    *alpha)
1480 {
1481     ngai_acc_common(g_a,lo,hi,buf,ld,alpha,NULL);
1482 }
1483 
1484 /**
1485  *  (Non-blocking) Accumulate operation for an N-dimensional patch of a Global Array
1486  *       g_a += alpha * patch
1487  */
1488 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1489 #   pragma weak wnga_nbacc = pnga_nbacc
1490 #endif
1491 
pnga_nbacc(Integer g_a,Integer * lo,Integer * hi,void * buf,Integer * ld,void * alpha,Integer * nbhndl)1492 void pnga_nbacc(Integer g_a,
1493                 Integer *lo,
1494                 Integer *hi,
1495                 void    *buf,
1496                 Integer *ld,
1497                 void    *alpha,
1498                 Integer *nbhndl)
1499 {
1500     ngai_acc_common(g_a,lo,hi,buf,ld,alpha,nbhndl);
1501 }
1502 
1503 /**
1504  * Return a pointer to local data in a Global Array
1505  */
1506 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1507 #   pragma weak wnga_access_ptr = pnga_access_ptr
1508 #endif
1509 
pnga_access_ptr(Integer g_a,Integer lo[],Integer hi[],void * ptr,Integer ld[])1510 void pnga_access_ptr(Integer g_a, Integer lo[], Integer hi[],
1511                       void* ptr, Integer ld[])
1512 {
1513 char *lptr;
1514 Integer  handle = GA_OFFSET + g_a;
1515 Integer  ow,i,p_handle;
1516 
1517 
1518    p_handle = GA[handle].p_handle;
1519    if (!pnga_locate(g_a,lo,&ow)) pnga_error("locate top failed",0);
1520    if (p_handle != -1)
1521       ow = PGRP_LIST[p_handle].inv_map_proc_list[ow];
1522    if ((armci_domain_id(ARMCI_DOMAIN_SMP, ow) != armci_domain_my_id(ARMCI_DOMAIN_SMP)) && (ow != GAme))
1523       pnga_error("cannot access top of the patch",ow);
1524    if (!pnga_locate(g_a,hi, &ow)) pnga_error("locate bottom failed",0);
1525    if (p_handle != -1)
1526       ow = PGRP_LIST[p_handle].inv_map_proc_list[ow];
1527    if ((armci_domain_id(ARMCI_DOMAIN_SMP, ow) != armci_domain_my_id(ARMCI_DOMAIN_SMP)) && (ow != GAme))
1528       pnga_error("cannot access bottom of the patch",ow);
1529 
1530    for (i=0; i<GA[handle].ndim; i++)
1531        if(lo[i]>hi[i]) {
1532            ga_RegionError(GA[handle].ndim, lo, hi, g_a);
1533        }
1534 
1535    if (p_handle >= 0) {
1536      ow = PGRP_LIST[p_handle].map_proc_list[ow];
1537    }
1538    if (GA[handle].num_rstrctd > 0) {
1539      ow = GA[handle].rank_rstrctd[ow];
1540    }
1541    gam_Location(ow,handle, lo, &lptr, ld);
1542    *(char**)ptr = lptr;
1543 }
1544 
1545 /*\ RETURN A POINTER TO BEGINNING OF LOCAL DATA BLOCK
1546 \*/
1547 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1548 #   pragma weak wnga_access_block_grid_ptr = pnga_access_block_grid_ptr
1549 #endif
1550 
pnga_access_block_grid_ptr(Integer g_a,Integer * index,void * ptr,Integer * ld)1551 void pnga_access_block_grid_ptr(Integer g_a, Integer *index, void* ptr, Integer *ld)
1552             /* g_a: array handle [input]
1553              * index: subscript of a particular block  [input]
1554              * ptr: pointer to data in block [output]
1555              * ld:  array of strides for block data [output]
1556              */
1557 {
1558   char *lptr;
1559   Integer  handle = GA_OFFSET + g_a;
1560   Integer  i/*, p_handle*/, offset, factor, inode;
1561   Integer ndim;
1562   C_Integer *num_blocks, *block_dims;
1563   int *proc_grid;
1564   Integer *dims;
1565   Integer last;
1566   Integer proc_index[MAXDIM];
1567   Integer blk_num[MAXDIM], blk_dim[MAXDIM], blk_inc[MAXDIM];
1568   Integer blk_ld[MAXDIM],hlf_blk[MAXDIM],blk_jinc;
1569   Integer j, lo, hi;
1570   Integer lld[MAXDIM], block_idx[MAXDIM], block_count[MAXDIM];
1571   Integer ldims[MAXDIM], ldidx[MAXDIM];
1572   Integer *mapc;
1573 
1574 
1575   /*p_handle = GA[handle].p_handle;*/
1576   if (GA[handle].distr_type != SCALAPACK && GA[handle].distr_type != TILED &&
1577       GA[handle].distr_type != TILED_IRREG) {
1578     pnga_error("Array is not using ScaLAPACK or tiled data distribution",0);
1579   }
1580   /* dimensions of processor grid */
1581   proc_grid = GA[handle].nblock;
1582   /* number of blocks along each dimension */
1583   num_blocks = GA[handle].num_blocks;
1584   /* size of individual blocks */
1585   block_dims = GA[handle].block_dims;
1586   dims = GA[handle].dims;
1587   ndim = GA[handle].ndim;
1588   for (i=0; i<ndim; i++) {
1589     if (index[i] < 0 || index[i] >= num_blocks[i])
1590       pnga_error("block index outside allowed values",index[i]);
1591   }
1592 
1593   /* Find strides of requested block */
1594   if (GA[handle].distr_type == TILED) {
1595     /* find out what processor block is located on */
1596     gam_find_tile_proc_from_indices(handle, inode, index)
1597 
1598     /* get proc indices of processor that owns block */
1599     gam_find_tile_proc_indices(handle,inode,proc_index)
1600       last = ndim-1;
1601 
1602     for (i=0; i<ndim; i++)  {
1603       lo = index[i]*block_dims[i]+1;
1604       hi = (index[i]+1)*block_dims[i];
1605       if (hi > dims[i]) hi = dims[i];
1606       ldims[i] = (hi - lo + 1);
1607       if (i<last) ld[i] = ldims[i];
1608     }
1609   } else if (GA[handle].distr_type == TILED_IRREG) {
1610     /* find out what processor block is located on */
1611     gam_find_tile_proc_from_indices(handle, inode, index);
1612 
1613     /* get proc indices of processor that owns block */
1614     gam_find_tile_proc_indices(handle,inode,proc_index)
1615       last = ndim-1;
1616 
1617     mapc = GA[handle].mapc;
1618     offset = 0;
1619     for (i=0; i<ndim; i++) {
1620       lld[i] = 0;
1621       ldidx[i] = 0;
1622       lo = mapc[offset+index[i]];
1623       if (index[i] < num_blocks[i]-1) {
1624         hi = mapc[offset+index[i]+1]-1;
1625       } else {
1626         hi = dims[i];
1627       }
1628       ldims[i] = hi - lo + 1;
1629       for (j = proc_index[i]; j<num_blocks[i]; j += proc_grid[i]) {
1630         lo = mapc[offset+j];
1631         if (j < num_blocks[i]-1) {
1632           hi = mapc[offset+j+1]-1;
1633         } else {
1634           hi = dims[i];
1635         }
1636         lld[i] += (hi-lo+1);
1637         if (j < index[i]) {
1638           ldidx[i] += (hi-lo+1);
1639         }
1640       }
1641       if (i<last) ld[i] = ldims[i];
1642       offset += num_blocks[i];
1643     }
1644   } else if (GA[handle].distr_type == SCALAPACK) {
1645     /* find out what processor block is located on */
1646     gam_find_proc_from_sl_indices(handle, inode, index);
1647 
1648     /* get proc indices of processor that owns block */
1649     gam_find_proc_indices(handle,inode,proc_index)
1650       last = ndim-1;
1651 
1652     for (i=0; i<last; i++)  {
1653       blk_dim[i] = block_dims[i]*proc_grid[i];
1654       blk_num[i] = GA[handle].dims[i]/blk_dim[i];
1655       blk_inc[i] = GA[handle].dims[i] - blk_num[i]*blk_dim[i];
1656       blk_ld[i] = blk_num[i]*block_dims[i];
1657       hlf_blk[i] = blk_inc[i]/block_dims[i];
1658       ld[i] = blk_ld[i];
1659       blk_jinc = dims[i]%block_dims[i];
1660       if (blk_inc[i] > 0) {
1661         if (proc_index[i]<hlf_blk[i]) {
1662           blk_jinc = block_dims[i];
1663         } else if (proc_index[i] == hlf_blk[i]) {
1664           blk_jinc = blk_inc[i]%block_dims[i];
1665           /*
1666           if (blk_jinc == 0) {
1667           blk_jinc = block_dims[i];
1668           }
1669           */
1670         } else {
1671           blk_jinc = 0;
1672         }
1673       }
1674       ld[i] += blk_jinc;
1675     }
1676   }
1677 
1678   /* Find the local grid index of block relative to local block grid and
1679      store result in block_idx.
1680      Find physical dimensions of locally held data and store in
1681      lld and set values in ldim, which is used to evaluate the
1682      offset for the requested block. */
1683   if (GA[handle].distr_type == TILED) {
1684     for (i=0; i<ndim; i++) {
1685       block_idx[i] = 0;
1686       block_count[i] = 0;
1687       lld[i] = 0;
1688       lo = 0;
1689       hi = -1;
1690       for (j=proc_index[i]; j<num_blocks[i]; j += proc_grid[i]) {
1691         lo = j*block_dims[i] + 1;
1692         hi = (j+1)*block_dims[i];
1693         if (hi > dims[i]) hi = dims[i];
1694         lld[i] += (hi - lo + 1);
1695         if (j<index[i]) block_idx[i]++;
1696         block_count[i]++;
1697       }
1698     }
1699 
1700     /* Evaluate offset for requested block. The algorithm used goes like this:
1701      *    The contribution from the fastest dimension is
1702      *      block_idx[0]*block_dims[0]*...*block_dims[ndim-1];
1703      *    The contribution from the second fastest dimension is
1704      *      block_idx[1]*lld[0]*block_dims[1]*...*block_dims[ndim-1];
1705      *    The contribution from the third fastest dimension is
1706      *      block_idx[2]*lld[0]*lld[1]*block_dims[2]*...*block_dims[ndim-1];
1707      *    etc.
1708      *    If block_idx[i] is equal to the total number of blocks contained on that
1709      *    processor minus 1 (the index is at the edge of the array) and the index
1710      *    i is greater than the index of the dimension, then instead of using the
1711      *    block dimension, use the fractional dimension of the edge block (which
1712      *    may be smaller than the block dimension)
1713      */
1714     offset = 0;
1715     for (i=0; i<ndim; i++) {
1716       factor = 1;
1717       for (j=0; j<i; j++) {
1718         factor *= lld[j];
1719       }
1720       for (j=i; j<ndim; j++) {
1721         if (j > i && block_idx[j] > block_count[j]-1) {
1722           factor *= ldims[j];
1723         } else {
1724           factor *= block_dims[j];
1725         }
1726       }
1727       offset += block_idx[i]*factor;
1728     }
1729   } else if (GA[handle].distr_type == TILED_IRREG) {
1730     /* Evaluate offset for requested block. This algorithm is similar to the
1731      * algorithm for reqularly tiled data layouts.
1732      *    The contribution from the fastest dimension is
1733      *      ldidx[0]*ldims[2]*...*ldims[ndim-1];
1734      *    The contribution from the second fastest dimension is
1735      *      lld[0]*ldidx[1]*ldims[2]*...*ldims[ndim-1];
1736      *    The contribution from the third fastest dimension is
1737      *      lld[0]*lld[1]*ldidx[2]*ldims[3]*...*ldims[ndim-1];
1738      *    etc.
1739      */
1740     offset = 0;
1741     for (i=0; i<ndim; i++) {
1742       factor = 1;
1743       for (j=0; j<i; j++) {
1744         factor *= lld[j];
1745       }
1746       factor *= ldidx[i];
1747       for (j=i+1; j<ndim; j++) {
1748         factor *= ldims[j];
1749       }
1750       offset += factor;
1751     }
1752   } else if (GA[handle].distr_type == SCALAPACK) {
1753     /* Evalauate offset for block */
1754     offset = 0;
1755     factor = 1;
1756     for (i = 0; i<ndim; i++) {
1757       offset += ((index[i]-proc_index[i])/proc_grid[i])*block_dims[i]*factor;
1758       if (i<ndim-1) factor *= ld[i];
1759     }
1760   }
1761 
1762   lptr = GA[handle].ptr[inode]+offset*GA[handle].elemsize;
1763 
1764   *(char**)ptr = lptr;
1765 }
1766 
1767 /**
1768  *  Return a pointer to the beginning of a local data block in a block-cyclic
1769  *  data distribution
1770  */
1771 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1772 #   pragma weak wnga_access_block_ptr = pnga_access_block_ptr
1773 #endif
1774 
pnga_access_block_ptr(Integer g_a,Integer idx,void * ptr,Integer * ld)1775 void pnga_access_block_ptr(Integer g_a, Integer idx, void* ptr, Integer *ld)
1776             /* g_a: array handle [input]
1777              * idx: block index  [input]
1778              * ptr: pointer to data in block [output]
1779              * ld:  array of strides for block data [output]
1780              */
1781 {
1782   char *lptr;
1783   Integer  handle = GA_OFFSET + g_a;
1784   Integer  i, j/*, p_handle*/, nblocks, offset, tsum, inode;
1785   Integer ndim, lo[MAXDIM], hi[MAXDIM], index;
1786 
1787 
1788   /*p_handle = GA[handle].p_handle;*/
1789   nblocks = GA[handle].block_total;
1790   ndim = GA[handle].ndim;
1791   index = idx;
1792   if (index < 0 || index >= nblocks)
1793     pnga_error("block index outside allowed values",index);
1794 
1795   if (GA[handle].distr_type == BLOCK_CYCLIC) {
1796     offset = 0;
1797     inode = index%GAnproc;
1798     for (i=inode; i<index; i += GAnproc) {
1799       ga_ownsM(handle,i,lo,hi);
1800       tsum = 1;
1801       for (j=0; j<ndim; j++) {
1802         tsum *= (hi[j]-lo[j]+1);
1803       }
1804       offset += tsum;
1805     }
1806     lptr = GA[handle].ptr[inode]+offset*GA[handle].elemsize;
1807 
1808     ga_ownsM(handle,index,lo,hi);
1809     for (i=0; i<ndim-1; i++) {
1810       ld[i] = hi[i]-lo[i]+1;
1811     }
1812   } else if (GA[handle].distr_type == SCALAPACK ||
1813       GA[handle].distr_type == TILED ||
1814       GA[handle].distr_type == TILED_IRREG) {
1815     Integer indices[MAXDIM];
1816     /* find block indices */
1817     gam_find_block_indices(handle,index,indices);
1818     /* find pointer */
1819     pnga_access_block_grid_ptr(g_a, indices, &lptr, ld);
1820   }
1821   *(char**)ptr = lptr;
1822 
1823 }
1824 
1825 /**
1826  *  Return a pointer to the beginning of local data on a processor containing
1827  *  block-cyclic data
1828  */
1829 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1830 #   pragma weak wnga_access_block_segment_ptr = pnga_access_block_segment_ptr
1831 #endif
1832 
pnga_access_block_segment_ptr(Integer g_a,Integer proc,void * ptr,Integer * len)1833 void pnga_access_block_segment_ptr(Integer g_a, Integer proc, void* ptr, Integer *len)
1834             /* g_a:  array handle [input]
1835              * proc: processor for data [input]
1836              * ptr:  pointer to data start of data on processor [output]
1837              * len:  length of data contained on processor [output]
1838              */
1839 {
1840   char *lptr;
1841   Integer  handle = GA_OFFSET + g_a;
1842   /*Integer  p_handle, nblocks;*/
1843   Integer /*ndim,*/ index;
1844   int grp = GA[handle].p_handle;
1845 
1846 
1847   /*p_handle = GA[handle].p_handle;*/
1848   /*nblocks = GA[handle].block_total;*/
1849   /*ndim = GA[handle].ndim;*/
1850   index = proc;
1851   if (index < 0 || index >= pnga_pgroup_nnodes(grp))
1852     pnga_error("processor index outside allowed values",index);
1853 
1854   if (index != pnga_pgroup_nodeid(grp))
1855     pnga_error("Only get accurate number of elements for processor making request",0);
1856   lptr = GA[handle].ptr[index];
1857 
1858   *len = GA[handle].size/GA[handle].elemsize;
1859   *(char**)ptr = lptr;
1860 }
1861 
1862 /**
1863  * Provide access to a patch of a Global Array using an index
1864  */
1865 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1866 #   pragma weak wnga_access_idx = pnga_access_idx
1867 #endif
1868 
pnga_access_idx(Integer g_a,Integer lo[],Integer hi[],AccessIndex * index,Integer ld[])1869 void pnga_access_idx(Integer g_a, Integer lo[], Integer hi[],
1870                      AccessIndex* index, Integer ld[])
1871 {
1872 char     *ptr;
1873 Integer  handle = GA_OFFSET + g_a;
1874 Integer  ow,i,p_handle;
1875 unsigned long    elemsize;
1876 unsigned long    lref=0, lptr;
1877 
1878 
1879    p_handle = GA[handle].p_handle;
1880    if(!pnga_locate(g_a,lo,&ow))pnga_error("locate top failed",0);
1881    if (p_handle != -1)
1882       ow = PGRP_LIST[p_handle].inv_map_proc_list[ow];
1883    if ((armci_domain_id(ARMCI_DOMAIN_SMP, ow) != armci_domain_my_id(ARMCI_DOMAIN_SMP)) && (ow != GAme))
1884       pnga_error("cannot access top of the patch",ow);
1885    if(!pnga_locate(g_a,hi, &ow))pnga_error("locate bottom failed",0);
1886    if (p_handle != -1)
1887       ow = PGRP_LIST[p_handle].inv_map_proc_list[ow];
1888    if ((armci_domain_id(ARMCI_DOMAIN_SMP, ow) != armci_domain_my_id(ARMCI_DOMAIN_SMP)) && (ow != GAme))
1889       pnga_error("cannot access bottom of the patch",ow);
1890 
1891    for (i=0; i<GA[handle].ndim; i++)
1892        if(lo[i]>hi[i]) {
1893            ga_RegionError(GA[handle].ndim, lo, hi, g_a);
1894        }
1895 
1896 
1897    if (p_handle != -1)
1898       ow = PGRP_LIST[p_handle].map_proc_list[ow];
1899 
1900    gam_Location(ow,handle, lo, &ptr, ld);
1901 
1902    /*
1903     * return patch address as the distance elements from the reference address
1904     *
1905     * .in Fortran we need only the index to the type array: dbl_mb or int_mb
1906     *  that are elements of COMMON in the the mafdecls.h include file
1907     * .in C we need both the index and the pointer
1908     */
1909 
1910    elemsize = (unsigned long)GA[handle].elemsize;
1911 
1912    /* compute index and check if it is correct */
1913    switch (pnga_type_c2f(GA[handle].type)){
1914      case MT_F_DBL:
1915         *index = (AccessIndex) ((DoublePrecision*)ptr - DBL_MB);
1916         lref = (unsigned long)DBL_MB;
1917         break;
1918 
1919      case MT_F_DCPL:
1920         *index = (AccessIndex) ((DoubleComplex*)ptr - DCPL_MB);
1921         lref = (unsigned long)DCPL_MB;
1922         break;
1923 
1924      case MT_F_SCPL:
1925         *index = (AccessIndex) ((SingleComplex*)ptr - SCPL_MB);
1926         lref = (unsigned long)SCPL_MB;
1927         break;
1928 
1929      case MT_F_INT:
1930         *index = (AccessIndex) ((Integer*)ptr - INT_MB);
1931         lref = (unsigned long)INT_MB;
1932         break;
1933 
1934      case MT_F_REAL:
1935         *index = (AccessIndex) ((float*)ptr - FLT_MB);
1936         lref = (unsigned long)FLT_MB;
1937         break;
1938    }
1939 
1940 #ifdef BYTE_ADDRESSABLE_MEMORY
1941    /* check the allignment */
1942    lptr = (unsigned long)ptr;
1943    if( lptr%elemsize != lref%elemsize ){
1944        printf("%d: lptr=%lu(%lu) lref=%lu(%lu)\n",(int)GAme,lptr,lptr%elemsize,
1945                                                     lref,lref%elemsize);
1946        pnga_error("nga_access: MA addressing problem: base address misallignment",
1947                  handle);
1948    }
1949 #endif
1950 
1951    /* adjust index for Fortran addressing */
1952    (*index) ++ ;
1953    FLUSH_CACHE;
1954 
1955 }
1956 
1957 /*\ PROVIDE ACCESS TO AN INDIVIDUAL DATA BLOCK OF A GLOBAL ARRAY
1958 \*/
1959 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1960 #   pragma weak wnga_access_block_idx = pnga_access_block_idx
1961 #endif
1962 
pnga_access_block_idx(Integer g_a,Integer idx,AccessIndex * index,Integer * ld)1963 void pnga_access_block_idx(Integer g_a, Integer idx, AccessIndex* index, Integer *ld)
1964 {
1965 char     *ptr;
1966 Integer  handle = GA_OFFSET + g_a;
1967 Integer  /*p_handle,*/ iblock;
1968 unsigned long    elemsize;
1969 unsigned long    lref=0, lptr;
1970 
1971 
1972    /*p_handle = GA[handle].p_handle;*/
1973    iblock = idx;
1974    if (iblock < 0 || iblock >= GA[handle].block_total)
1975      pnga_error("block index outside allowed values",iblock);
1976 
1977    pnga_access_block_ptr(g_a,iblock,&ptr,ld);
1978    /*
1979     * return patch address as the distance elements from the reference address
1980     *
1981     * .in Fortran we need only the index to the type array: dbl_mb or int_mb
1982     *  that are elements of COMMON in the the mafdecls.h include file
1983     * .in C we need both the index and the pointer
1984     */
1985 
1986    elemsize = (unsigned long)GA[handle].elemsize;
1987 
1988    /* compute index and check if it is correct */
1989    switch (pnga_type_c2f(GA[handle].type)){
1990      case MT_F_DBL:
1991         *index = (AccessIndex) ((DoublePrecision*)ptr - DBL_MB);
1992         lref = (unsigned long)DBL_MB;
1993         break;
1994 
1995      case MT_F_DCPL:
1996         *index = (AccessIndex) ((DoubleComplex*)ptr - DCPL_MB);
1997         lref = (unsigned long)DCPL_MB;
1998         break;
1999 
2000      case MT_F_SCPL:
2001         *index = (AccessIndex) ((SingleComplex*)ptr - SCPL_MB);
2002         lref = (unsigned long)SCPL_MB;
2003         break;
2004 
2005      case MT_F_INT:
2006         *index = (AccessIndex) ((Integer*)ptr - INT_MB);
2007         lref = (unsigned long)INT_MB;
2008         break;
2009 
2010      case MT_F_REAL:
2011         *index = (AccessIndex) ((float*)ptr - FLT_MB);
2012         lref = (unsigned long)FLT_MB;
2013         break;
2014    }
2015 
2016 #ifdef BYTE_ADDRESSABLE_MEMORY
2017    /* check the allignment */
2018    lptr = (unsigned long)ptr;
2019    if( lptr%elemsize != lref%elemsize ){
2020        printf("%d: lptr=%lu(%lu) lref=%lu(%lu)\n",(int)GAme,lptr,lptr%elemsize,
2021                                                     lref,lref%elemsize);
2022        pnga_error("nga_access: MA addressing problem: base address misallignment",
2023                  handle);
2024    }
2025 #endif
2026 
2027    /* adjust index for Fortran addressing */
2028    (*index) ++ ;
2029    FLUSH_CACHE;
2030 
2031 }
2032 
2033 /**
2034  *  Provide access to an individual data block of a Global Array
2035  *  with a SCALAPACK type block-cyclic data distribution
2036  */
2037 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2038 #   pragma weak wnga_access_block_grid_idx = pnga_access_block_grid_idx
2039 #endif
2040 
pnga_access_block_grid_idx(Integer g_a,Integer * subscript,AccessIndex * index,Integer * ld)2041 void pnga_access_block_grid_idx(Integer g_a, Integer* subscript,
2042                                 AccessIndex *index, Integer *ld)
2043 {
2044 char     *ptr;
2045 Integer  handle = GA_OFFSET + g_a;
2046 Integer  i,ndim/*,p_handle*/;
2047 unsigned long    elemsize;
2048 unsigned long    lref=0, lptr;
2049 
2050 
2051    /*p_handle = GA[handle].p_handle;*/
2052    ndim = GA[handle].ndim;
2053    for (i=0; i<ndim; i++)
2054      if (subscript[i]<0 || subscript[i] >= GA[handle].num_blocks[i])
2055        pnga_error("index outside allowed values",subscript[i]);
2056 
2057    pnga_access_block_grid_ptr(g_a,subscript,&ptr,ld);
2058    /*
2059     * return patch address as the distance elements from the reference address
2060     *
2061     * .in Fortran we need only the index to the type array: dbl_mb or int_mb
2062     *  that are elements of COMMON in the the mafdecls.h include file
2063     * .in C we need both the index and the pointer
2064     */
2065 
2066    elemsize = (unsigned long)GA[handle].elemsize;
2067 
2068    /* compute index and check if it is correct */
2069    switch (pnga_type_c2f(GA[handle].type)){
2070      case MT_F_DBL:
2071         *index = (AccessIndex) ((DoublePrecision*)ptr - DBL_MB);
2072         lref = (unsigned long)DBL_MB;
2073         break;
2074 
2075      case MT_F_DCPL:
2076         *index = (AccessIndex) ((DoubleComplex*)ptr - DCPL_MB);
2077         lref = (unsigned long)DCPL_MB;
2078         break;
2079 
2080      case MT_F_SCPL:
2081         *index = (AccessIndex) ((SingleComplex*)ptr - SCPL_MB);
2082         lref = (unsigned long)SCPL_MB;
2083         break;
2084 
2085      case MT_F_INT:
2086         *index = (AccessIndex) ((Integer*)ptr - INT_MB);
2087         lref = (unsigned long)INT_MB;
2088         break;
2089 
2090      case MT_F_REAL:
2091         *index = (AccessIndex) ((float*)ptr - FLT_MB);
2092         lref = (unsigned long)FLT_MB;
2093         break;
2094    }
2095 
2096 #ifdef BYTE_ADDRESSABLE_MEMORY
2097    /* check the allignment */
2098    lptr = (unsigned long)ptr;
2099    if( lptr%elemsize != lref%elemsize ){
2100        printf("%d: lptr=%lu(%lu) lref=%lu(%lu)\n",(int)GAme,lptr,lptr%elemsize,
2101                                                     lref,lref%elemsize);
2102        pnga_error("nga_access: MA addressing problem: base address misallignment",
2103                  handle);
2104    }
2105 #endif
2106 
2107    /* adjust index for Fortran addressing */
2108    (*index) ++ ;
2109    FLUSH_CACHE;
2110 
2111 }
2112 
2113 /*\ PROVIDE ACCESS TO A PATCH OF A GLOBAL ARRAY
2114 \*/
2115 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2116 #   pragma weak wnga_access_block_segment_idx = pnga_access_block_segment_idx
2117 #endif
2118 
pnga_access_block_segment_idx(Integer g_a,Integer proc,AccessIndex * index,Integer * len)2119 void pnga_access_block_segment_idx(Integer g_a, Integer proc,
2120                                         AccessIndex* index, Integer *len)
2121 {
2122 char     *ptr;
2123 Integer  handle = GA_OFFSET + g_a;
2124 /*Integer  p_handle;*/
2125 unsigned long    elemsize;
2126 unsigned long    lref=0, lptr;
2127 
2128 
2129    /*p_handle = GA[handle].p_handle;*/
2130 
2131    /*
2132     * return patch address as the distance elements from the reference address
2133     *
2134     * .in Fortran we need only the index to the type array: dbl_mb or int_mb
2135     *  that are elements of COMMON in the the mafdecls.h include file
2136     * .in C we need both the index and the pointer
2137     */
2138    pnga_access_block_segment_ptr(g_a, proc, &ptr, len);
2139 
2140    elemsize = (unsigned long)GA[handle].elemsize;
2141 
2142    /* compute index and check if it is correct */
2143    switch (pnga_type_c2f(GA[handle].type)){
2144      case MT_F_DBL:
2145         *index = (AccessIndex) ((DoublePrecision*)ptr - DBL_MB);
2146         lref = (unsigned long)DBL_MB;
2147         break;
2148 
2149      case MT_F_DCPL:
2150         *index = (AccessIndex) ((DoubleComplex*)ptr - DCPL_MB);
2151         lref = (unsigned long)DCPL_MB;
2152         break;
2153 
2154      case MT_F_SCPL:
2155         *index = (AccessIndex) ((SingleComplex*)ptr - SCPL_MB);
2156         lref = (unsigned long)SCPL_MB;
2157         break;
2158 
2159      case MT_F_INT:
2160         *index = (AccessIndex) ((Integer*)ptr - INT_MB);
2161         lref = (unsigned long)INT_MB;
2162         break;
2163 
2164      case MT_F_REAL:
2165         *index = (AccessIndex) ((float*)ptr - FLT_MB);
2166         lref = (unsigned long)FLT_MB;
2167         break;
2168    }
2169 
2170 #ifdef BYTE_ADDRESSABLE_MEMORY
2171    /* check the allignment */
2172    lptr = (unsigned long)ptr;
2173    if( lptr%elemsize != lref%elemsize ){
2174        printf("%d: lptr=%lu(%lu) lref=%lu(%lu)\n",(int)GAme,lptr,lptr%elemsize,
2175                                                     lref,lref%elemsize);
2176        pnga_error("nga_access_block_segment: MA addressing problem: base address misallignment",
2177                  handle);
2178    }
2179 #endif
2180 
2181    /* adjust index for Fortran addressing */
2182    (*index) ++ ;
2183    FLUSH_CACHE;
2184 
2185 }
2186 
2187 /**
2188  *  Release access to a patch of a Global Array
2189  */
2190 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2191 #   pragma weak wnga_release = pnga_release
2192 #endif
2193 
pnga_release(Integer g_a,Integer * lo,Integer * hi)2194 void pnga_release(Integer g_a, Integer *lo, Integer *hi)
2195 {}
2196 
2197 /**
2198  *  Release access and update a patch of a Global Array
2199  */
2200 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2201 #   pragma weak wnga_release_update = pnga_release_update
2202 #endif
2203 
pnga_release_update(Integer g_a,Integer * lo,Integer * hi)2204 void pnga_release_update(Integer g_a, Integer *lo, Integer *hi)
2205 {}
2206 
2207 /**
2208  *  Release access to a block in a block-cyclic Global Array
2209  */
2210 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2211 #   pragma weak wnga_release_block = pnga_release_block
2212 #endif
2213 
pnga_release_block(Integer g_a,Integer iblock)2214 void pnga_release_block(Integer g_a, Integer iblock)
2215 {
2216   /*
2217   Integer i;
2218   Integer handle = GA_OFFSET + g_a;
2219   for (i=0; i<GA[handle].num_rstrctd; i++) {
2220     printf("p[%d] location: %d rstrctd_list[%d]: %d\n",GAme,*iblock,
2221         i,GA[handle].rstrctd_list[i]);
2222   }
2223         */
2224 }
2225 
2226 /**
2227  *  Release access and update a block in a block-cyclic Global Array
2228  */
2229 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2230 #   pragma weak wnga_release_update_block = pnga_release_update_block
2231 #endif
2232 
pnga_release_update_block(Integer g_a,Integer iblock)2233 void pnga_release_update_block(Integer g_a, Integer iblock)
2234 {}
2235 
2236 /**
2237  *  Release access to a block in a block-cyclic Global Array with proc grid
2238  *  (SCALAPack) layout
2239  */
2240 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2241 #   pragma weak wnga_release_block_grid = pnga_release_block_grid
2242 #endif
2243 
pnga_release_block_grid(Integer g_a,Integer * index)2244 void pnga_release_block_grid(Integer g_a, Integer *index)
2245 {}
2246 
2247 /**
2248  *  Release access and update a block in a block-cyclic Global Array with
2249  *  proc grid (SCALAPack) layout
2250  */
2251 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2252 #   pragma weak wnga_release_update_block_grid = pnga_release_update_block_grid
2253 #endif
2254 
pnga_release_update_block_grid(Integer g_a,Integer * index)2255 void pnga_release_update_block_grid(Integer g_a, Integer *index)
2256 {}
2257 
2258 /**
2259  *  Release access to data segment in a block-cyclic Global Array
2260  */
2261 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2262 #   pragma weak wnga_release_block_segment = pnga_release_block_segment
2263 #endif
2264 
pnga_release_block_segment(Integer g_a,Integer iproc)2265 void pnga_release_block_segment(Integer g_a, Integer iproc)
2266 {}
2267 
2268 /**
2269  *  Release access and update a block in a block-cyclic Global Array
2270  */
2271 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2272 #   pragma weak wnga_release_update_block_segment = pnga_release_update_block_segment
2273 #endif
2274 
pnga_release_update_block_segment(Integer g_a,Integer iproc)2275 void pnga_release_update_block_segment(Integer g_a, Integer iproc)
2276 {}
2277 
gai_scatter_acc_local(Integer g_a,void * v,Integer * i,Integer * j,Integer nv,void * alpha,Integer proc)2278 void gai_scatter_acc_local(Integer g_a, void *v,Integer *i,Integer *j,
2279                           Integer nv, void* alpha, Integer proc)
2280 {
2281 void **ptr_src, **ptr_dst;
2282 char *ptr_ref;
2283 Integer ldp, item_size, ilo, ihi, jlo, jhi, type;
2284 Integer lo[2], hi[2];
2285 Integer handle,p_handle,iproc;
2286 armci_giov_t desc;
2287 register Integer k, offset;
2288 int rc=0;
2289 
2290   if (nv < 1) return;
2291 
2292 
2293   handle = GA_OFFSET + g_a;
2294   p_handle = GA[handle].p_handle;
2295 
2296   pnga_distribution(g_a, proc, lo, hi);
2297   ilo = lo[0];
2298   jlo = lo[1];
2299   ihi = hi[0];
2300   jhi = hi[1];
2301 
2302   iproc = proc;
2303   if (GA[handle].distr_type == REGULAR) {
2304     if (GA[handle].num_rstrctd > 0)
2305       iproc = GA[handle].rank_rstrctd[iproc];
2306     gaShmemLocation(iproc, g_a, ilo, jlo, &ptr_ref, &ldp);
2307   } else {
2308     /*
2309     Integer lo[2];
2310     lo[0] = ilo;
2311     lo[1] = jlo;
2312     */
2313     pnga_access_block_ptr(g_a, iproc, &ptr_ref, &ldp);
2314     pnga_release_block(g_a, iproc);
2315     if (GA[handle].distr_type == BLOCK_CYCLIC) {
2316       proc = proc%pnga_nnodes();
2317     } else if (GA[handle].distr_type == SCALAPACK) {
2318       Integer index[2];
2319       gam_find_block_indices(handle, proc, index);
2320       index[0] = index[0]%GA[handle].nblock[0];
2321       index[1] = index[1]%GA[handle].nblock[1];
2322       gam_find_proc_from_sl_indices(handle,proc,index);
2323     } else if (GA[handle].distr_type == TILED ||
2324         GA[handle].distr_type == TILED_IRREG) {
2325       Integer index[2];
2326       gam_find_block_indices(handle, proc, index);
2327       index[0] = index[0]%GA[handle].nblock[0];
2328       index[1] = index[1]%GA[handle].nblock[1];
2329       gam_find_tile_proc_from_indices(handle,proc,index);
2330     }
2331 
2332   }
2333 
2334   type = GA[handle].type;
2335   item_size = GAsizeofM(type);
2336 
2337   ptr_src = malloc((int)nv*2*sizeof(void*));
2338   if(ptr_src==NULL)pnga_error("malloc failed",nv);
2339   ptr_dst=ptr_src+ nv;
2340 
2341   for(k=0; k< nv; k++){
2342      if(i[k] < ilo || i[k] > ihi  || j[k] < jlo || j[k] > jhi){
2343        char err_string[ERR_STR_LEN];
2344        sprintf(err_string,"proc=%d invalid i/j=(%ld,%ld)>< [%ld:%ld,%ld:%ld]",
2345                (int)proc, (long)i[k], (long)j[k], (long)ilo,
2346                (long)ihi, (long)jlo, (long)jhi);
2347        pnga_error(err_string,g_a);
2348      }
2349 
2350      offset  = (j[k] - jlo)* ldp + i[k] - ilo;
2351      ptr_dst[k] = ptr_ref + item_size * offset;
2352      ptr_src[k] = ((char*)v) + k*item_size;
2353   }
2354   desc.bytes = (int)item_size;
2355   desc.src_ptr_array = ptr_src;
2356   desc.dst_ptr_array = ptr_dst;
2357   desc.ptr_array_len = (int)nv;
2358 
2359   if(GA_fence_set)fence_array[proc]=1;
2360 
2361   if (p_handle >= 0) {
2362     proc = PGRP_LIST[p_handle].inv_map_proc_list[proc];
2363   }
2364   if(alpha != NULL) {
2365     int optype=-1;
2366     if(type==C_DBL) optype= ARMCI_ACC_DBL;
2367     else if(type==C_DCPL)optype= ARMCI_ACC_DCP;
2368     else if(type==C_SCPL)optype= ARMCI_ACC_CPL;
2369     else if(type==C_INT)optype= ARMCI_ACC_INT;
2370     else if(type==C_LONG)optype= ARMCI_ACC_LNG;
2371     else if(type==C_FLOAT)optype= ARMCI_ACC_FLT;
2372     else pnga_error("type not supported",type);
2373     rc= ARMCI_AccV(optype, alpha, &desc, 1, (int)proc);
2374   }
2375 
2376   if(rc) pnga_error("scatter/_acc failed in armci",rc);
2377 
2378   free(ptr_src);
2379 
2380 }
2381 
2382 /**
2383  *  Scatter nv elements of v into a Global Array at locations specified
2384  *  by arrays i and j
2385  */
2386 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2387 #   pragma weak wnga_scatter2d = pnga_scatter2d
2388 #endif
2389 
pnga_scatter2d(Integer g_a,void * v,Integer * i,Integer * j,Integer nv)2390 void pnga_scatter2d(Integer g_a, void *v, Integer *i, Integer *j, Integer nv)
2391 {
2392     register Integer k;
2393     Integer kk;
2394     Integer item_size;
2395     Integer proc, type=GA[GA_OFFSET + g_a].type;
2396     Integer nproc, p_handle, iproc;
2397     Integer subscrpt[2];
2398 
2399     Integer *aproc, naproc; /* active processes and numbers */
2400     Integer *map;           /* map the active processes to allocated space */
2401     char *buf1, *buf2;
2402     Integer handle = g_a + GA_OFFSET;
2403 
2404     Integer *count;   /* counters for each process */
2405     Integer *nelem;   /* number of elements for each process */
2406     /* source and destination pointers for each process */
2407     void ***ptr_src, ***ptr_dst;
2408     void **ptr_org; /* the entire pointer array */
2409     armci_giov_t desc;
2410     Integer *ilo, *ihi, *jlo, *jhi, *ldp, *owner;
2411     Integer lo[2], hi[2];
2412     char **ptr_ref;
2413     Integer num_blocks=0;
2414 
2415     if (nv < 1) return;
2416 
2417     ga_check_handleM(g_a, "ga_scatter");
2418 
2419     GAstat.numsca++;
2420     /* determine how many processors are associated with array */
2421     p_handle = GA[handle].p_handle;
2422     if (p_handle < 0) {
2423       nproc = GAnproc;
2424       if (GA[handle].num_rstrctd > 0) {
2425         nproc = GA[handle].num_rstrctd;
2426       }
2427     } else {
2428       nproc = PGRP_LIST[p_handle].map_nproc;
2429     }
2430 
2431     /* allocate temp memory */
2432     if (GA[handle].distr_type == REGULAR) {
2433       buf1 = malloc((int) (nproc *4 +nv)* (sizeof(Integer)));
2434       if(buf1 == NULL) pnga_error("malloc failed", 3*nproc);
2435     } else {
2436       num_blocks = GA[handle].block_total;
2437       buf1 = malloc((int) (num_blocks *4 +nv)* (sizeof(Integer)));
2438       if(buf1 == NULL) pnga_error("malloc failed", 3*num_blocks);
2439     }
2440 
2441     owner = (Integer *)buf1;
2442     count = owner+ nv;
2443     if (GA[handle].distr_type == REGULAR) {
2444       nelem =  count + nproc;
2445       aproc = count + 2 * nproc;
2446       map = count + 3 * nproc;
2447     } else {
2448       nelem =  count + num_blocks;
2449       aproc = count + 2 * num_blocks;
2450       map = count + 3 * num_blocks;
2451     }
2452 
2453     /* initialize the counters and nelem */
2454     if (GA[handle].distr_type == REGULAR) {
2455       for(kk=0; kk<nproc; kk++) {
2456         count[kk] = 0; nelem[kk] = 0;
2457       }
2458     } else {
2459       for(kk=0; kk<num_blocks; kk++) {
2460         count[kk] = 0; nelem[kk] = 0;
2461       }
2462     }
2463 
2464     /* find proc that owns the (i,j) element; store it in temp:  */
2465     if (GA[handle].num_rstrctd == 0) {
2466       for(k=0; k< nv; k++) {
2467         subscrpt[0] = *(i+k);
2468         subscrpt[1] = *(j+k);
2469         if(! pnga_locate(g_a, subscrpt, owner+k)){
2470           char err_string[ERR_STR_LEN];
2471           sprintf(err_string,"invalid i/j=(%ld,%ld)", (long)i[k], (long)j[k]);
2472           pnga_error(err_string,g_a);
2473         }
2474         iproc = owner[k];
2475         nelem[iproc]++;
2476       }
2477     } else {
2478       for(k=0; k< nv; k++) {
2479         subscrpt[0] = *(i+k);
2480         subscrpt[1] = *(j+k);
2481         if(! pnga_locate(g_a, subscrpt, owner+k)){
2482           char err_string[ERR_STR_LEN];
2483           sprintf(err_string,"invalid i/j=(%ld,%ld)", (long)i[k], (long)j[k]);
2484           pnga_error(err_string,g_a);
2485         }
2486         iproc = GA[handle].rank_rstrctd[owner[k]];
2487         nelem[iproc]++;
2488       }
2489     }
2490 
2491     naproc = 0;
2492     if (GA[handle].distr_type == REGULAR) {
2493       for(k=0; k<nproc; k++) if(nelem[k] > 0) {
2494         aproc[naproc] = k;
2495         map[k] = naproc;
2496         naproc ++;
2497       }
2498     } else {
2499       for(k=0; k<num_blocks; k++) if(nelem[k] > 0) {
2500         aproc[naproc] = k;
2501         map[k] = naproc;
2502         naproc ++;
2503       }
2504     }
2505 
2506     GAstat.numsca_procs += naproc;
2507 
2508     buf2 = malloc((int)(2*naproc*sizeof(void **) + 2*nv*sizeof(void *) +
2509                       5*naproc*sizeof(Integer) + naproc*sizeof(char*)));
2510     if(buf2 == NULL) pnga_error("malloc failed", naproc);
2511 
2512     ptr_src = (void ***)buf2;
2513     ptr_dst = (void ***)(buf2 + naproc*sizeof(void **));
2514     ptr_org = (void **)(buf2 + 2*naproc*sizeof(void **));
2515     ptr_ref = (char **)(buf2+2*naproc*sizeof(void **)+2*nv*sizeof(void *));
2516     ilo = (Integer *)(((char*)ptr_ref) + naproc*sizeof(char*));
2517     ihi = ilo + naproc;
2518     jlo = ihi + naproc;
2519     jhi = jlo + naproc;
2520     ldp = jhi + naproc;
2521 
2522     if (GA[handle].distr_type == REGULAR) {
2523       for(kk=0; kk<naproc; kk++) {
2524         iproc = aproc[kk];
2525         if (GA[handle].num_rstrctd > 0)
2526                     iproc = GA[handle].rstrctd_list[iproc];
2527         pnga_distribution(g_a, iproc, lo, hi);
2528         ilo[kk] = lo[0];
2529         jlo[kk] = lo[1];
2530         ihi[kk] = hi[0];
2531         jhi[kk] = hi[1];
2532 
2533         /* get address of the first element owned by proc */
2534         gaShmemLocation(aproc[kk], g_a, ilo[kk], jlo[kk], &(ptr_ref[kk]),
2535             &(ldp[kk]));
2536       }
2537     } else {
2538       for(kk=0; kk<naproc; kk++) {
2539         iproc = aproc[kk];
2540         pnga_distribution(g_a, iproc, lo, hi);
2541         ilo[kk] = lo[0];
2542         jlo[kk] = lo[1];
2543         ihi[kk] = hi[0];
2544         jhi[kk] = hi[1];
2545 
2546         /* get address of the first element owned by proc */
2547         pnga_access_block_ptr(g_a, iproc, &(ptr_ref[kk]), &(ldp[kk]));
2548         pnga_release_block(g_a, iproc);
2549       }
2550     }
2551 
2552     /* determine limit for message size --  v,i, & j will travel together */
2553     item_size = GAsizeofM(type);
2554     GAbytes.scatot += (double)item_size*nv ;
2555     iproc = owner[GAme];
2556     GAbytes.scaloc += (double)item_size* nelem[iproc];
2557     ptr_src[0] = ptr_org; ptr_dst[0] = ptr_org + nv;
2558     for(k=1; k<naproc; k++) {
2559         ptr_src[k] = ptr_src[k-1] + nelem[aproc[k-1]];
2560         ptr_dst[k] = ptr_dst[k-1] + nelem[aproc[k-1]];
2561     }
2562 
2563     for(k=0; k<nv; k++){
2564         Integer this_count;
2565         proc = owner[k];
2566         if (GA[handle].num_rstrctd > 0)
2567                     proc = GA[handle].rank_rstrctd[owner[k]];
2568         this_count = count[proc];
2569         count[proc]++;
2570         proc = map[proc];
2571         ptr_src[proc][this_count] = ((char*)v) + k * item_size;
2572 
2573         if(i[k] < ilo[proc] || i[k] > ihi[proc]  ||
2574            j[k] < jlo[proc] || j[k] > jhi[proc]){
2575           char err_string[ERR_STR_LEN];
2576           sprintf(err_string,"proc=%d invalid i/j=(%ld,%ld)><[%ld:%ld,%ld:%ld]",
2577              (int)proc, (long)i[k], (long)j[k], (long)ilo[proc],
2578              (long)ihi[proc], (long)jlo[proc], (long)jhi[proc]);
2579             pnga_error(err_string, g_a);
2580         }
2581         ptr_dst[proc][this_count] = ptr_ref[proc] + item_size *
2582             ((j[k] - jlo[proc])* ldp[proc] + i[k] - ilo[proc]);
2583     }
2584 
2585     /* source and destination pointers are ready for all processes */
2586     if (GA[handle].distr_type == REGULAR) {
2587       for(k=0; k<naproc; k++) {
2588         int rc;
2589 
2590         desc.bytes = (int)item_size;
2591         desc.src_ptr_array = ptr_src[k];
2592         desc.dst_ptr_array = ptr_dst[k];
2593         desc.ptr_array_len = (int)nelem[aproc[k]];
2594         if (p_handle < 0) {
2595           iproc = aproc[k];
2596           if (GA[handle].num_rstrctd > 0)
2597             iproc = GA[handle].rstrctd_list[iproc];
2598         } else {
2599           iproc = PGRP_LIST[p_handle].inv_map_proc_list[aproc[k]];
2600         }
2601 
2602         rc = ARMCI_PutV(&desc, 1, (int)iproc);
2603         if(rc) pnga_error("scatter failed in armci",rc);
2604       }
2605     } else if (GA[handle].distr_type == BLOCK_CYCLIC) {
2606       for(k=0; k<naproc; k++) {
2607         int rc;
2608 
2609         desc.bytes = (int)item_size;
2610         desc.src_ptr_array = ptr_src[k];
2611         desc.dst_ptr_array = ptr_dst[k];
2612         desc.ptr_array_len = (int)nelem[aproc[k]];
2613         iproc = aproc[k];
2614         iproc = iproc%nproc;
2615         if (p_handle >= 0) {
2616           iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
2617         }
2618 
2619         rc = ARMCI_PutV(&desc, 1, (int)iproc);
2620         if(rc) pnga_error("scatter failed in armci",rc);
2621       }
2622     } else if (GA[handle].distr_type == SCALAPACK) {
2623       Integer index[MAXDIM];
2624       for(k=0; k<naproc; k++) {
2625         int rc;
2626 
2627         desc.bytes = (int)item_size;
2628         desc.src_ptr_array = ptr_src[k];
2629         desc.dst_ptr_array = ptr_dst[k];
2630         desc.ptr_array_len = (int)nelem[aproc[k]];
2631         iproc = aproc[k];
2632         gam_find_block_indices(handle, iproc, index);
2633         index[0] = index[0]%GA[handle].nblock[0];
2634         index[1] = index[1]%GA[handle].nblock[1];
2635         gam_find_proc_from_sl_indices(handle,iproc,index);
2636         if (p_handle >= 0) {
2637           iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
2638         }
2639 
2640         rc = ARMCI_PutV(&desc, 1, (int)iproc);
2641         if(rc) pnga_error("scatter failed in armci",rc);
2642       }
2643     } else if (GA[handle].distr_type == TILED ||
2644         GA[handle].distr_type == TILED_IRREG) {
2645       Integer index[MAXDIM];
2646       for(k=0; k<naproc; k++) {
2647         int rc;
2648 
2649         desc.bytes = (int)item_size;
2650         desc.src_ptr_array = ptr_src[k];
2651         desc.dst_ptr_array = ptr_dst[k];
2652         desc.ptr_array_len = (int)nelem[aproc[k]];
2653         iproc = aproc[k];
2654         gam_find_block_indices(handle, iproc, index);
2655         index[0] = index[0]%GA[handle].nblock[0];
2656         index[1] = index[1]%GA[handle].nblock[1];
2657         gam_find_tile_proc_from_indices(handle,iproc,index);
2658         if (p_handle >= 0) {
2659           iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
2660         }
2661 
2662         rc = ARMCI_PutV(&desc, 1, (int)iproc);
2663         if(rc) pnga_error("scatter failed in armci",rc);
2664       }
2665     }
2666 
2667     free(buf2);
2668     free(buf1);
2669 
2670 }
2671 
2672 /**
2673  *  Add elements of v to contents at random locations in Global Array
2674  *  specified by vectors i and j
2675  */
2676 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2677 #   pragma weak wnga_scatter_acc2d = pnga_scatter_acc2d
2678 #endif
2679 
pnga_scatter_acc2d(g_a,v,i,j,nv,alpha)2680 void pnga_scatter_acc2d(g_a, v, i, j, nv, alpha)
2681      Integer g_a, nv, *i, *j;
2682      void *v, *alpha;
2683 {
2684 register Integer k;
2685 Integer item_size;
2686 Integer first, nelem, proc, type=GA[GA_OFFSET + g_a].type;
2687 Integer *int_ptr;
2688 Integer subscrpt[2];
2689 
2690   if (nv < 1) return;
2691 
2692   ga_check_handleM(g_a, "ga_scatter_acc");
2693 
2694   GAstat.numsca++;
2695 
2696   int_ptr = (Integer*) ga_malloc(nv, MT_F_INT, "ga_scatter_acc--p");
2697 
2698   /* find proc that owns the (i,j) element; store it in temp: int_ptr */
2699   for(k=0; k< nv; k++) {
2700     subscrpt[0] = *(i+k);
2701     subscrpt[1] = *(j+k);
2702     if(! pnga_locate(g_a, subscrpt, int_ptr+k)){
2703          char err_string[ERR_STR_LEN];
2704          sprintf(err_string,"invalid i/j=(%ld,%ld)", (long)i[k], (long)j[k]);
2705          pnga_error(err_string,g_a);
2706     }
2707   }
2708 
2709   /* determine limit for message size --  v,i, & j will travel together */
2710   item_size = GAsizeofM(type);
2711   GAbytes.scatot += (double)item_size*nv ;
2712 
2713   /* Sort the entries by processor */
2714   ga_sort_scat(&nv, v, i, j, int_ptr, type );
2715 
2716   /* go through the list again executing scatter for each processor */
2717 
2718   first = 0;
2719   do {
2720       proc = int_ptr[first];
2721       nelem = 0;
2722 
2723       /* count entries for proc from "first" to last */
2724       for(k=first; k< nv; k++){
2725         if(proc == int_ptr[k]) nelem++;
2726         else break;
2727       }
2728 
2729       if(proc == GAme){
2730              GAbytes.scaloc += (double)item_size* nelem ;
2731       }
2732 
2733       gai_scatter_acc_local(g_a, ((char*)v)+item_size*first, i+first,
2734                            j+first, nelem, alpha, proc);
2735       first += nelem;
2736 
2737   }while (first< nv);
2738 
2739   ga_free(int_ptr);
2740 
2741 }
2742 
2743 #define SCATTER -99
2744 #define GATHER -98
2745 #define SCATTER_ACC -97
2746 
2747 /*\ GATHER OPERATION elements from the global array into v
2748 \*/
gai_gatscat(int op,Integer g_a,void * v,Integer subscript[],Integer nv,double * locbytes,double * totbytes,void * alpha)2749 void gai_gatscat(int op, Integer g_a, void* v, Integer subscript[],
2750                  Integer nv, double *locbytes, double* totbytes, void *alpha)
2751 {
2752     Integer k, handle=g_a+GA_OFFSET;
2753     int  ndim, item_size, type;
2754     Integer *proc;
2755     Integer nproc, p_handle, iproc;
2756 
2757     Integer *aproc, naproc; /* active processes and numbers */
2758     Integer *map;           /* map the active processes to allocated space */
2759     char *buf1, *buf2;
2760 
2761     Integer *count;        /* counters for each process */
2762     Integer *nelem;        /* number of elements for each process */
2763     /* source and destination pointers for each process */
2764     void ***ptr_src, ***ptr_dst;
2765     void **ptr_org; /* the entire pointer array */
2766     armci_giov_t desc;
2767     Integer num_blocks=0;
2768 
2769 
2770 
2771     proc=(Integer *)ga_malloc(nv, MT_F_INT, "ga_gat-p");
2772 
2773     ndim = GA[handle].ndim;
2774     type = GA[handle].type;
2775     item_size = GA[handle].elemsize;
2776     *totbytes += (double)item_size*nv;
2777 
2778     /* determine how many processors are associated with array */
2779     p_handle = GA[handle].p_handle;
2780     if (p_handle < 0) {
2781       nproc = GAnproc;
2782       if (GA[handle].num_rstrctd > 0) nproc = GA[handle].num_rstrctd;
2783     } else {
2784       nproc = PGRP_LIST[p_handle].map_nproc;
2785     }
2786 
2787     /* allocate temp memory */
2788     if (GA[handle].distr_type == REGULAR) {
2789       buf1 = malloc((int) nproc * 4 * (sizeof(Integer)));
2790       if(buf1 == NULL) pnga_error("malloc failed", 3*nproc);
2791     } else {
2792       num_blocks = GA[handle].block_total;
2793       buf1 = malloc((int) num_blocks * 4 * (sizeof(Integer)));
2794       if(buf1 == NULL) pnga_error("malloc failed", 3*num_blocks);
2795     }
2796 
2797     count = (Integer *)buf1;
2798     if (GA[handle].distr_type == REGULAR) {
2799       nelem = (Integer *)(buf1 + nproc * sizeof(Integer));
2800       aproc = (Integer *)(buf1 + 2 * nproc * sizeof(Integer));
2801       map = (Integer *)(buf1 + 3 * nproc * sizeof(Integer));
2802     } else {
2803       num_blocks = GA[handle].block_total;
2804       nelem = (Integer *)(buf1 + num_blocks * sizeof(Integer));
2805       aproc = (Integer *)(buf1 + 2 * num_blocks * sizeof(Integer));
2806       map = (Integer *)(buf1 + 3 * num_blocks * sizeof(Integer));
2807     }
2808 
2809     /* initialize the counters and nelem */
2810     if (GA[handle].distr_type == REGULAR) {
2811       for(k=0; k<nproc; k++) count[k] = 0;
2812       for(k=0; k<nproc; k++) nelem[k] = 0;
2813     } else {
2814       for(k=0; k<num_blocks; k++) count[k] = 0;
2815       for(k=0; k<num_blocks; k++) nelem[k] = 0;
2816     }
2817 
2818     /* get the process id that the element should go and count the
2819      * number of elements for each process
2820      */
2821     if (GA[handle].num_rstrctd == 0) {
2822       for(k=0; k<nv; k++) {
2823         if(!pnga_locate(g_a, subscript+k*ndim, proc+k)) {
2824           gai_print_subscript("invalid subscript",ndim, subscript+k*ndim,"\n");
2825           pnga_error("failed -element:",k);
2826         }
2827         iproc = proc[k];
2828         nelem[iproc]++;
2829       }
2830     } else {
2831       for(k=0; k<nv; k++) {
2832         if(!pnga_locate(g_a, subscript+k*ndim, proc+k)) {
2833           gai_print_subscript("invalid subscript",ndim, subscript+k*ndim,"\n");
2834           pnga_error("failed -element:",k);
2835         }
2836         iproc = GA[handle].rank_rstrctd[proc[k]];
2837         nelem[iproc]++;
2838       }
2839     }
2840 
2841     /* find the active processes (with which transfer data) */
2842     naproc = 0;
2843     if (GA[handle].distr_type == REGULAR) {
2844       for(k=0; k<nproc; k++) if(nelem[k] > 0) {
2845         aproc[naproc] = k;
2846         map[k] = naproc;
2847         naproc ++;
2848       }
2849     } else {
2850       for(k=0; k<num_blocks; k++) if(nelem[k] > 0) {
2851         aproc[naproc] = k;
2852         map[k] = naproc;
2853         naproc ++;
2854       }
2855     }
2856 
2857     buf2 = malloc((int)(2*naproc*sizeof(void **) + 2*nv*sizeof(void *)));
2858     if(buf2 == NULL) pnga_error("malloc failed", 2*naproc);
2859 
2860     ptr_src = (void ***)buf2;
2861     ptr_dst = (void ***)(buf2 + naproc * sizeof(void **));
2862     ptr_org = (void **)(buf2 + 2 * naproc * sizeof(void **));
2863 
2864     /* set the pointers as
2865      *    P0            P1                  P0          P1
2866      * ptr_src[0]   ptr_src[1] ...       ptr_dst[0]  ptr_dst[1] ...
2867      *        \          \                    \          \
2868      * ptr_org |-------------------------------|---------------------------|
2869      *         |                nv             |              nv           |
2870      *         | nelem[0] | nelem[1] |...      | nelem[0] | nelem[1] |...
2871      */
2872     ptr_src[0] = ptr_org; ptr_dst[0] = ptr_org + nv;
2873     for(k=1; k<naproc; k++) {
2874         ptr_src[k] = ptr_src[k-1] + nelem[aproc[k-1]];
2875         ptr_dst[k] = ptr_dst[k-1] + nelem[aproc[k-1]];
2876     }
2877 
2878     *locbytes += (double)item_size* nelem[GAme];
2879 
2880     switch(op) {
2881       case GATHER:
2882         /* go through all the elements
2883          * for process 0: ptr_src[0][0, 1, ...] = subscript + offset0...
2884          *                ptr_dst[0][0, 1, ...] = v + offset0...
2885          * for process 1: ptr_src[1][...] ...
2886          *                ptr_dst[1][...] ...
2887          */
2888         if (GA[handle].distr_type == REGULAR) {
2889           for(k=0; k<nv; k++){
2890             iproc = proc[k];
2891             if (GA[handle].num_rstrctd > 0)
2892               iproc = GA[handle].rank_rstrctd[iproc];
2893             ptr_dst[map[iproc]][count[iproc]] = ((char*)v) + k * item_size;
2894             if (p_handle != 0) {
2895               gam_Loc_ptr(iproc, handle,  (subscript+k*ndim),
2896                   ptr_src[map[iproc]]+count[iproc]);
2897             } else {
2898               gam_Loc_ptr(proc[k], handle,  (subscript+k*ndim),
2899                   ptr_src[map[iproc]]+count[iproc]);
2900             }
2901             count[iproc]++;
2902           }
2903         } else {
2904           Integer lo[MAXDIM], hi[MAXDIM], ld[MAXDIM];
2905           Integer j, jtot, last, offset;
2906           for(k=0; k<nv; k++){
2907             iproc = proc[k];
2908             ptr_dst[map[iproc]][count[iproc]] = ((char*)v) + k * item_size;
2909             pnga_distribution(g_a, iproc, lo, hi);
2910             pnga_access_block_ptr(g_a, iproc, &(ptr_src[map[iproc]][count[iproc]]), ld);
2911             pnga_release_block(g_a, iproc);
2912             /* calculate remaining offset */
2913             offset = 0;
2914             last = ndim - 1;
2915             jtot = 1;
2916             for (j=0; j<last; j++) {
2917               offset += ((subscript+k*ndim)[j]-lo[j])*jtot;
2918               jtot *= ld[j];
2919             }
2920             offset += ((subscript+k*ndim)[last]-lo[last])*jtot;
2921             ptr_src[map[iproc]][count[iproc]]=((char *)ptr_src[map[iproc]][count[iproc]])+offset*GA[handle].elemsize;
2922             count[iproc]++;
2923           }
2924         }
2925 
2926         /* source and destination pointers are ready for all processes */
2927         if (GA[handle].distr_type == REGULAR) {
2928           for(k=0; k<naproc; k++) {
2929             int rc;
2930 
2931             desc.bytes = (int)item_size;
2932             desc.src_ptr_array = ptr_src[k];
2933             desc.dst_ptr_array = ptr_dst[k];
2934             desc.ptr_array_len = (int)nelem[aproc[k]];
2935 
2936             if (p_handle < 0) {
2937               iproc = aproc[k];
2938               if (GA[handle].num_rstrctd > 0)
2939                 iproc = GA[handle].rstrctd_list[iproc];
2940             } else {
2941               iproc = PGRP_LIST[p_handle].inv_map_proc_list[aproc[k]];
2942             }
2943             rc=ARMCI_GetV(&desc, 1, (int)iproc);
2944             if(rc) pnga_error("gather failed in armci",rc);
2945           }
2946         } else if (GA[handle].distr_type == BLOCK_CYCLIC) {
2947           for(k=0; k<naproc; k++) {
2948             int rc;
2949 
2950             desc.bytes = (int)item_size;
2951             desc.src_ptr_array = ptr_src[k];
2952             desc.dst_ptr_array = ptr_dst[k];
2953             desc.ptr_array_len = (int)nelem[aproc[k]];
2954             iproc = aproc[k];
2955             iproc = iproc%nproc;
2956             if (p_handle >= 0) {
2957               iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
2958             }
2959             rc=ARMCI_GetV(&desc, 1, (int)iproc);
2960             if(rc) pnga_error("gather failed in armci",rc);
2961           }
2962         } else if (GA[handle].distr_type == SCALAPACK) {
2963           Integer j, index[MAXDIM];
2964           for(k=0; k<naproc; k++) {
2965             int rc;
2966 
2967             desc.bytes = (int)item_size;
2968             desc.src_ptr_array = ptr_src[k];
2969             desc.dst_ptr_array = ptr_dst[k];
2970             desc.ptr_array_len = (int)nelem[aproc[k]];
2971             iproc = aproc[k];
2972             gam_find_block_indices(handle, iproc, index);
2973             for (j=0; j<ndim; j++) {
2974               index[j] = index[j]%GA[handle].nblock[j];
2975             }
2976             gam_find_proc_from_sl_indices(handle,iproc,index);
2977             if (p_handle >= 0) {
2978               iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
2979             }
2980             rc=ARMCI_GetV(&desc, 1, (int)iproc);
2981             if(rc) pnga_error("gather failed in armci",rc);
2982           }
2983         } else if (GA[handle].distr_type == TILED ||
2984             GA[handle].distr_type == TILED_IRREG) {
2985           Integer j, index[MAXDIM];
2986           for(k=0; k<naproc; k++) {
2987             int rc;
2988 
2989             desc.bytes = (int)item_size;
2990             desc.src_ptr_array = ptr_src[k];
2991             desc.dst_ptr_array = ptr_dst[k];
2992             desc.ptr_array_len = (int)nelem[aproc[k]];
2993             iproc = aproc[k];
2994             gam_find_block_indices(handle, iproc, index);
2995             for (j=0; j<ndim; j++) {
2996               index[j] = index[j]%GA[handle].nblock[j];
2997             }
2998             gam_find_tile_proc_from_indices(handle,iproc,index);
2999             if (p_handle >= 0) {
3000               iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
3001             }
3002             rc=ARMCI_GetV(&desc, 1, (int)iproc);
3003             if(rc) pnga_error("gather failed in armci",rc);
3004           }
3005         }
3006         GAstat.numgat_procs += naproc;
3007         break;
3008       case SCATTER:
3009         /* go through all the elements
3010          * for process 0: ptr_src[0][0, 1, ...] = v + offset0...
3011          *                ptr_dst[0][0, 1, ...] = subscript + offset0...
3012          * for process 1: ptr_src[1][...] ...
3013          *                ptr_dst[1][...] ...
3014          */
3015         if (GA[handle].distr_type == REGULAR) {
3016           for(k=0; k<nv; k++){
3017             iproc = proc[k];
3018             if (GA[handle].num_rstrctd > 0)
3019               iproc = GA[handle].rank_rstrctd[iproc];
3020             ptr_src[map[iproc]][count[iproc]] = ((char*)v) + k * item_size;
3021             if (p_handle != 0) {
3022               gam_Loc_ptr(iproc, handle,  (subscript+k*ndim),
3023                   ptr_dst[map[iproc]]+count[iproc]);
3024             } else {
3025               gam_Loc_ptr(proc[k], handle,  (subscript+k*ndim),
3026                   ptr_dst[map[iproc]]+count[iproc]);
3027             }
3028             count[iproc]++;
3029           }
3030         } else {
3031           Integer lo[MAXDIM], hi[MAXDIM], ld[MAXDIM];
3032           Integer j, jtot, last, offset;
3033           for(k=0; k<nv; k++){
3034             iproc = proc[k];
3035             ptr_src[map[iproc]][count[iproc]] = ((char*)v) + k * item_size;
3036             pnga_distribution(g_a, iproc, lo, hi);
3037             pnga_access_block_ptr(g_a, iproc, &(ptr_dst[map[iproc]][count[iproc]]), ld);
3038             pnga_release_block(g_a, iproc);
3039             /* calculate remaining offset */
3040             offset = 0;
3041             last = ndim - 1;
3042             jtot = 1;
3043             for (j=0; j<last; j++) {
3044               offset += ((subscript+k*ndim)[j]-lo[j])*jtot;
3045               jtot *= ld[j];
3046             }
3047             offset += ((subscript+k*ndim)[last]-lo[last])*jtot;
3048             ptr_dst[map[iproc]][count[iproc]]=((char*)ptr_dst[map[iproc]][count[iproc]])+offset*GA[handle].elemsize;
3049             count[iproc]++;
3050           }
3051         }
3052 
3053         /* source and destination pointers are ready for all processes */
3054         if (GA[handle].distr_type == REGULAR) {
3055           for(k=0; k<naproc; k++) {
3056             int rc;
3057 
3058             desc.bytes = (int)item_size;
3059             desc.src_ptr_array = ptr_src[k];
3060             desc.dst_ptr_array = ptr_dst[k];
3061             desc.ptr_array_len = (int)nelem[aproc[k]];
3062 
3063 
3064             if (p_handle < 0) {
3065               iproc = aproc[k];
3066               if (GA[handle].num_rstrctd > 0)
3067                 iproc = GA[handle].rstrctd_list[iproc];
3068             } else {
3069               iproc = PGRP_LIST[p_handle].inv_map_proc_list[aproc[k]];
3070             }
3071             if(GA_fence_set) fence_array[iproc]=1;
3072 
3073             rc=ARMCI_PutV(&desc, 1, (int)iproc);
3074             if(rc) pnga_error("scatter failed in armci",rc);
3075           }
3076         } else if (GA[handle].distr_type == BLOCK_CYCLIC) {
3077           for(k=0; k<naproc; k++) {
3078             int rc;
3079 
3080             desc.bytes = (int)item_size;
3081             desc.src_ptr_array = ptr_src[k];
3082             desc.dst_ptr_array = ptr_dst[k];
3083             desc.ptr_array_len = (int)nelem[aproc[k]];
3084 
3085             iproc = aproc[k];
3086             iproc = iproc%nproc;
3087             if (p_handle >= 0) {
3088               iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
3089             }
3090             if(GA_fence_set) fence_array[iproc]=1;
3091 
3092             rc=ARMCI_PutV(&desc, 1, (int)iproc);
3093             if(rc) pnga_error("scatter failed in armci",rc);
3094           }
3095         } else if (GA[handle].distr_type == SCALAPACK) {
3096           Integer j, index[MAXDIM];
3097           for(k=0; k<naproc; k++) {
3098             int rc;
3099 
3100             desc.bytes = (int)item_size;
3101             desc.src_ptr_array = ptr_src[k];
3102             desc.dst_ptr_array = ptr_dst[k];
3103             desc.ptr_array_len = (int)nelem[aproc[k]];
3104 
3105             iproc = aproc[k];
3106             gam_find_block_indices(handle, iproc, index);
3107             for (j=0; j<ndim; j++) {
3108               index[j] = index[j]%GA[handle].nblock[j];
3109             }
3110             gam_find_proc_from_sl_indices(handle,iproc,index);
3111             if (p_handle >= 0) {
3112               iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
3113             }
3114             if(GA_fence_set) fence_array[iproc]=1;
3115 
3116             rc=ARMCI_PutV(&desc, 1, (int)iproc);
3117             if(rc) pnga_error("scatter failed in armci",rc);
3118           }
3119         } else if (GA[handle].distr_type == TILED ||
3120             GA[handle].distr_type == TILED_IRREG) {
3121           Integer j, index[MAXDIM];
3122           for(k=0; k<naproc; k++) {
3123             int rc;
3124 
3125             desc.bytes = (int)item_size;
3126             desc.src_ptr_array = ptr_src[k];
3127             desc.dst_ptr_array = ptr_dst[k];
3128             desc.ptr_array_len = (int)nelem[aproc[k]];
3129 
3130             iproc = aproc[k];
3131             gam_find_block_indices(handle, iproc, index);
3132             for (j=0; j<ndim; j++) {
3133               index[j] = index[j]%GA[handle].nblock[j];
3134             }
3135             gam_find_tile_proc_from_indices(handle,iproc,index);
3136             if (p_handle >= 0) {
3137               iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
3138             }
3139             if(GA_fence_set) fence_array[iproc]=1;
3140 
3141             rc=ARMCI_PutV(&desc, 1, (int)iproc);
3142             if(rc) pnga_error("scatter failed in armci",rc);
3143           }
3144         }
3145         GAstat.numsca_procs += naproc;
3146         break;
3147       case SCATTER_ACC:
3148         /* go through all the elements
3149          * for process 0: ptr_src[0][0, 1, ...] = v + offset0...
3150          *                ptr_dst[0][0, 1, ...] = subscript + offset0...
3151          * for process 1: ptr_src[1][...] ...
3152          *                ptr_dst[1][...] ...
3153          */
3154         if (GA[handle].distr_type == REGULAR) {
3155           for(k=0; k<nv; k++){
3156             iproc = proc[k];
3157             if (GA[handle].num_rstrctd > 0)
3158               iproc = GA[handle].rank_rstrctd[iproc];
3159             ptr_src[map[iproc]][count[iproc]] = ((char*)v) + k * item_size;
3160             if (p_handle != 0) {
3161               gam_Loc_ptr(iproc, handle,  (subscript+k*ndim),
3162                   ptr_dst[map[iproc]]+count[iproc]);
3163             } else {
3164               gam_Loc_ptr(proc[k], handle,  (subscript+k*ndim),
3165                   ptr_dst[map[iproc]]+count[iproc]);
3166             }
3167             count[iproc]++;
3168           }
3169         } else {
3170           Integer lo[MAXDIM], hi[MAXDIM], ld[MAXDIM];
3171           Integer j, jtot, last, offset;
3172           for(k=0; k<nv; k++){
3173             iproc = proc[k];
3174             ptr_src[map[iproc]][count[iproc]] = ((char*)v) + k * item_size;
3175             pnga_distribution(g_a, iproc, lo, hi);
3176             pnga_access_block_ptr(g_a, iproc, &(ptr_dst[map[iproc]][count[iproc]]), ld);
3177             pnga_release_block(g_a, iproc);
3178             /* calculate remaining offset */
3179             offset = 0;
3180             last = ndim - 1;
3181             jtot = 1;
3182             for (j=0; j<last; j++) {
3183               offset += ((subscript+k*ndim)[j]-lo[j])*jtot;
3184               jtot *= ld[j];
3185             }
3186             offset += ((subscript+k*ndim)[last]-lo[last])*jtot;
3187             ptr_dst[map[iproc]][count[iproc]]=((char*)ptr_dst[map[iproc]][count[iproc]])+offset*GA[handle].elemsize;
3188             count[iproc]++;
3189           }
3190         }
3191 
3192         /* source and destination pointers are ready for all processes */
3193         if (GA[handle].distr_type == REGULAR) {
3194           for(k=0; k<naproc; k++) {
3195             int rc=0;
3196 
3197             desc.bytes = (int)item_size;
3198             desc.src_ptr_array = ptr_src[k];
3199             desc.dst_ptr_array = ptr_dst[k];
3200             desc.ptr_array_len = (int)nelem[aproc[k]];
3201 
3202             if (p_handle < 0) {
3203               iproc = aproc[k];
3204               if (GA[handle].num_rstrctd > 0)
3205                 iproc = GA[handle].rstrctd_list[iproc];
3206             } else {
3207               iproc = PGRP_LIST[p_handle].inv_map_proc_list[aproc[k]];
3208             }
3209             if(GA_fence_set) fence_array[iproc]=1;
3210 
3211             if(alpha != NULL) {
3212               int optype=-1;
3213               if(type==C_DBL) optype= ARMCI_ACC_DBL;
3214               else if(type==C_DCPL)optype= ARMCI_ACC_DCP;
3215               else if(type==C_SCPL)optype= ARMCI_ACC_CPL;
3216               else if(type==C_INT)optype= ARMCI_ACC_INT;
3217               else if(type==C_LONG)optype= ARMCI_ACC_LNG;
3218               else if(type==C_FLOAT)optype= ARMCI_ACC_FLT;
3219               else pnga_error("type not supported",type);
3220               rc= ARMCI_AccV(optype, alpha, &desc, 1, (int)iproc);
3221             }
3222             if(rc) pnga_error("scatter_acc failed in armci",rc);
3223           }
3224         } else if (GA[handle].distr_type == BLOCK_CYCLIC) {
3225           for(k=0; k<naproc; k++) {
3226             int rc=0;
3227 
3228             desc.bytes = (int)item_size;
3229             desc.src_ptr_array = ptr_src[k];
3230             desc.dst_ptr_array = ptr_dst[k];
3231             desc.ptr_array_len = (int)nelem[aproc[k]];
3232 
3233             iproc = aproc[k];
3234             iproc = iproc%nproc;
3235             if (p_handle >= 0) {
3236               iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
3237             }
3238             if(GA_fence_set) fence_array[iproc]=1;
3239 
3240             if(alpha != NULL) {
3241               int optype=-1;
3242               if(type==C_DBL) optype= ARMCI_ACC_DBL;
3243               else if(type==C_DCPL)optype= ARMCI_ACC_DCP;
3244               else if(type==C_SCPL)optype= ARMCI_ACC_CPL;
3245               else if(type==C_INT)optype= ARMCI_ACC_INT;
3246               else if(type==C_LONG)optype= ARMCI_ACC_LNG;
3247               else if(type==C_FLOAT)optype= ARMCI_ACC_FLT;
3248               else pnga_error("type not supported",type);
3249               rc= ARMCI_AccV(optype, alpha, &desc, 1, (int)iproc);
3250             }
3251             if(rc) pnga_error("scatter_acc failed in armci",rc);
3252           }
3253         } else if (GA[handle].distr_type == SCALAPACK) {
3254           Integer j, index[MAXDIM];
3255           for(k=0; k<naproc; k++) {
3256             int rc=0;
3257 
3258             desc.bytes = (int)item_size;
3259             desc.src_ptr_array = ptr_src[k];
3260             desc.dst_ptr_array = ptr_dst[k];
3261             desc.ptr_array_len = (int)nelem[aproc[k]];
3262 
3263             iproc = aproc[k];
3264             gam_find_block_indices(handle, iproc, index);
3265             for (j=0; j<ndim; j++) {
3266               index[j] = index[j]%GA[handle].nblock[j];
3267             }
3268             gam_find_proc_from_sl_indices(handle,iproc,index);
3269             if (p_handle >= 0) {
3270               iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
3271             }
3272             if(GA_fence_set) fence_array[iproc]=1;
3273 
3274             if(alpha != NULL) {
3275               int optype=-1;
3276               if(type==C_DBL) optype= ARMCI_ACC_DBL;
3277               else if(type==C_DCPL)optype= ARMCI_ACC_DCP;
3278               else if(type==C_SCPL)optype= ARMCI_ACC_CPL;
3279               else if(type==C_INT)optype= ARMCI_ACC_INT;
3280               else if(type==C_LONG)optype= ARMCI_ACC_LNG;
3281               else if(type==C_FLOAT)optype= ARMCI_ACC_FLT;
3282               else pnga_error("type not supported",type);
3283               rc= ARMCI_AccV(optype, alpha, &desc, 1, (int)iproc);
3284             }
3285             if(rc) pnga_error("scatter_acc failed in armci",rc);
3286           }
3287         } else if (GA[handle].distr_type == TILED ||
3288             GA[handle].distr_type == TILED_IRREG) {
3289           Integer j, index[MAXDIM];
3290           for(k=0; k<naproc; k++) {
3291             int rc=0;
3292 
3293             desc.bytes = (int)item_size;
3294             desc.src_ptr_array = ptr_src[k];
3295             desc.dst_ptr_array = ptr_dst[k];
3296             desc.ptr_array_len = (int)nelem[aproc[k]];
3297 
3298             iproc = aproc[k];
3299             gam_find_block_indices(handle, iproc, index);
3300             for (j=0; j<ndim; j++) {
3301               index[j] = index[j]%GA[handle].nblock[j];
3302             }
3303             gam_find_tile_proc_from_indices(handle,iproc,index);
3304             if (p_handle >= 0) {
3305               iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
3306             }
3307             if(GA_fence_set) fence_array[iproc]=1;
3308 
3309             if(alpha != NULL) {
3310               int optype=-1;
3311               if(type==C_DBL) optype= ARMCI_ACC_DBL;
3312               else if(type==C_DCPL)optype= ARMCI_ACC_DCP;
3313               else if(type==C_SCPL)optype= ARMCI_ACC_CPL;
3314               else if(type==C_INT)optype= ARMCI_ACC_INT;
3315               else if(type==C_LONG)optype= ARMCI_ACC_LNG;
3316               else if(type==C_FLOAT)optype= ARMCI_ACC_FLT;
3317               else pnga_error("type not supported",type);
3318               rc= ARMCI_AccV(optype, alpha, &desc, 1, (int)iproc);
3319             }
3320             if(rc) pnga_error("scatter_acc failed in armci",rc);
3321           }
3322         }
3323         break;
3324       default: pnga_error("operation not supported",op);
3325     }
3326 
3327     free(buf2); free(buf1);
3328 
3329     ga_free(proc);
3330 }
3331 
3332 /**
3333  *  Preallocate data for internal linked-list arrays for gather/scatter calls.
3334  *  @param nelems: the maximum number of elements that will be moved in
3335  *  gather/scatter call using these arrays.
3336  */
3337 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3338 #   pragma weak wnga_alloc_gatscat_buf = pnga_alloc_gatscat_buf
3339 #endif
pnga_alloc_gatscat_buf(Integer nelems)3340 void pnga_alloc_gatscat_buf(Integer nelems)
3341 {
3342   Integer nprocs = pnga_nnodes();
3343   if (GA_prealloc_gatscat)
3344     pnga_error("Gather/scatter buffers already allocated",nelems);
3345   GA_prealloc_gatscat = nelems;
3346   GA_header =(Integer *)ga_malloc(nprocs, MT_F_INT, "ga_gat_header");
3347   GA_list =(Integer *)ga_malloc(nelems, MT_F_INT, "ga_gat_list");
3348   GA_elems =(Integer *)ga_malloc(nprocs, MT_F_INT, "ga_gat_nelems");
3349 }
3350 
3351 /**
3352  *  Free up preallocate data for internal linked-list arrays for gather/scatter calls.
3353  */
3354 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3355 #   pragma weak wnga_free_gatscat_buf = pnga_free_gatscat_buf
3356 #endif
pnga_free_gatscat_buf()3357 void pnga_free_gatscat_buf()
3358 {
3359   if (!GA_prealloc_gatscat)
3360     pnga_error("Gather/scatter buffers not allocated",0);
3361   GA_prealloc_gatscat = 0;
3362   ga_free(GA_elems);
3363   ga_free(GA_list);
3364   ga_free(GA_header);
3365 }
3366 
3367 #define gam_c2f_index(index_c, index_f, ndim)        \
3368 {                                                    \
3369   Integer _i;                                        \
3370   for (_i = 0; _i < (ndim); _i++) {                  \
3371     index_f[_i] = (Integer)(index_c[(ndim)-1-_i]+1); \
3372   }                                                  \
3373 }
3374 
3375 /*\ GATHER OPERATION elements from the global array into v
3376 \*/
gai_gatscat_new(int op,Integer g_a,void * v,void * subscript,Integer c_flag,Integer nv,double * locbytes,double * totbytes,void * alpha)3377 void gai_gatscat_new(int op, Integer g_a, void* v, void *subscript,
3378                      Integer c_flag, Integer nv, double *locbytes,
3379                      double* totbytes, void *alpha)
3380 {
3381     Integer handle=g_a+GA_OFFSET;
3382     int  ndim, i, j, k, type, item_size;
3383     Integer idx, p_handle, num_rstrctd;
3384     Integer *proc;
3385     Integer nprocs, me, iproc, tproc, index[MAXDIM];
3386     Integer lo[MAXDIM], hi[MAXDIM], ld[MAXDIM-1];
3387     Integer jtot, last, offset;
3388     Integer *subscript_ptr;
3389 
3390     Integer *header, *list, *nelems;
3391     char *buf;
3392     void **ptr_rem, **ptr_loc;
3393     int rc, maxlen;
3394     int *nblock;
3395     armci_giov_t desc;
3396 
3397 
3398 
3399     me = pnga_nodeid();
3400     num_rstrctd = GA[handle].num_rstrctd;
3401     nblock = GA[handle].nblock;
3402 
3403     /* determine how many processors are associated with array */
3404     p_handle = GA[handle].p_handle;
3405     if (p_handle < 0) {
3406       nprocs = GAnproc;
3407       if (num_rstrctd > 0) nprocs = num_rstrctd;
3408     } else {
3409       nprocs = PGRP_LIST[p_handle].map_nproc;
3410     }
3411 
3412     if (!GA_prealloc_gatscat) {
3413       header =(Integer *)ga_malloc(nprocs, MT_F_INT, "ga_gat_header");
3414       list =(Integer *)ga_malloc(nv, MT_F_INT, "ga_gat_list");
3415       nelems =(Integer *)ga_malloc(nprocs, MT_F_INT, "ga_gat_nelems");
3416     } else {
3417       if (GA_prealloc_gatscat < nv)
3418         pnga_error("Gather/scatter vector exceeds allocation length ",
3419                    GA_prealloc_gatscat);
3420       header = (Integer*)GA_header;
3421       list = (Integer*)GA_list;
3422       nelems = (Integer*)GA_elems;
3423     }
3424 
3425     ndim = GA[handle].ndim;
3426     type = GA[handle].type;
3427     item_size = GA[handle].elemsize;
3428     *totbytes = item_size * nv;
3429 
3430     /* Initialize linked list data structures */
3431     for (i=0; i<nprocs; i++) {
3432       nelems[i] = 0;
3433       header[i] = -1;
3434     }
3435     for (i=0; i<nv; i++) {
3436       list[i] = 0;
3437     }
3438 
3439     /* Set up linked list that partitions elements defined in subscripts array
3440      * into groups corresponding to the elements home processor
3441      */
3442     maxlen = 1;
3443     for (i=0; i<nv; i++) {
3444       if (c_flag) {
3445         gam_c2f_index(((int**)subscript)[i], index, ndim);
3446         subscript_ptr = index;
3447       } else {
3448         subscript_ptr = ((Integer*)subscript)+i*ndim;
3449       }
3450       if(!pnga_locate(g_a, subscript_ptr, &idx)) {
3451         gai_print_subscript("invalid subscript",ndim, subscript_ptr,"\n");
3452         pnga_error("failed -element:",i);
3453       }
3454       if (num_rstrctd > 0) idx = GA[handle].rank_rstrctd[idx];
3455       /* map from block to processor, if necessary */
3456       if (GA[handle].distr_type == BLOCK_CYCLIC) {
3457         idx = idx%nprocs;
3458       } else if (GA[handle].distr_type == SCALAPACK) {
3459         gam_find_block_indices(handle,idx,index);
3460         for (j=0; j<ndim; j++) {
3461           index[j] = index[j]%nblock[j];
3462         }
3463         gam_find_proc_from_sl_indices(handle,idx,index);
3464       } else if (GA[handle].distr_type == TILED ||
3465           GA[handle].distr_type == TILED_IRREG) {
3466         gam_find_block_indices(handle,idx,index);
3467         for (j=0; j<ndim; j++) {
3468           index[j] = index[j]%nblock[j];
3469         }
3470         gam_find_tile_proc_from_indices(handle,idx,index);
3471       }
3472       nelems[idx]++;
3473       if (maxlen<nelems[idx]) maxlen=nelems[idx];
3474       j = header[idx];
3475       header[idx] = i;
3476       list[i] = j;
3477     }
3478     *locbytes = item_size * nelems[me];
3479 
3480     /* allocate buffers for individual vector calls */
3481     buf = (char*)malloc(2*maxlen*sizeof(void*));
3482     ptr_loc = (void**)buf;
3483     ptr_rem = (void**)(buf+maxlen*sizeof(void*));
3484 
3485     for (iproc=0; iproc<nprocs; iproc++) {
3486       if (nelems[iproc] > 0) {
3487         /* loop through linked list to find all data elements associated with
3488          * the remote processor iproc
3489          */
3490         idx = header[iproc];
3491         j = 0;
3492         while (idx > -1) {
3493           if (c_flag) {
3494             gam_c2f_index(((int**)subscript)[idx], index, ndim);
3495             subscript_ptr = index;
3496           } else {
3497             subscript_ptr = ((Integer*)subscript)+idx*ndim;
3498           }
3499           if (GA[handle].distr_type == REGULAR) {
3500           /* gam_Loc_ptr modifies the value of the processor variable for
3501            * restricted arrays or processor groups, so make a temporary copy
3502            */
3503             tproc = iproc;
3504             gam_Loc_ptr(tproc, handle,  (subscript_ptr),
3505                 ptr_rem+j);
3506           } else {
3507             /* TODO: Figure out how to get correct pointers for block cyclic
3508              * distributions */
3509             /* find block index from subscript */
3510             if(!pnga_locate(g_a, subscript_ptr, &tproc)) {
3511               gai_print_subscript("invalid subscript for block-cyclic array",
3512                                    ndim, subscript_ptr,"\n");
3513               pnga_error("failed -element:",idx);
3514             }
3515             pnga_distribution(g_a, tproc, lo, hi);
3516             pnga_access_block_ptr(g_a, tproc, ptr_rem+j, ld);
3517             pnga_release_block(g_a, tproc);
3518             offset = 0;
3519             last = ndim -1;
3520             jtot = 1;
3521             for (k=0; k<last; k++) {
3522               offset += ((subscript_ptr)[k]-lo[k])*jtot;
3523               jtot *= ld[k];
3524             }
3525             offset += ((subscript_ptr)[last]-lo[last])*jtot;
3526             ptr_rem[j] = (void*)(((char*)ptr_rem[j])+offset*item_size);
3527           }
3528           ptr_loc[j] = (void*)(((char*)v) + idx * item_size);
3529           idx = list[idx];
3530           j++;
3531         }
3532         /* correct remote proc if restricted arrays or processor groups are
3533          * being used
3534          */
3535         if (num_rstrctd > 0) {
3536           tproc = GA[handle].rstrctd_list[iproc];
3537         } else {
3538           if (p_handle < 0) {
3539             tproc = iproc;
3540           } else {
3541             tproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
3542           }
3543         }
3544         /* perform vector operation */
3545         switch(op) {
3546           case GATHER:
3547             desc.bytes = (int)item_size;
3548             desc.src_ptr_array = ptr_rem;
3549             desc.dst_ptr_array = ptr_loc;
3550             desc.ptr_array_len = (int)nelems[iproc];
3551             rc=ARMCI_GetV(&desc, 1, (int)tproc);
3552             if(rc) pnga_error("gather failed in armci",rc);
3553             break;
3554           case SCATTER:
3555             desc.bytes = (int)item_size;
3556             desc.src_ptr_array = ptr_loc;
3557             desc.dst_ptr_array = ptr_rem;
3558             desc.ptr_array_len = (int)nelems[iproc];
3559             rc=ARMCI_PutV(&desc, 1, (int)tproc);
3560             if(rc) pnga_error("scatter failed in armci",rc);
3561             break;
3562           case SCATTER_ACC:
3563             desc.bytes = (int)item_size;
3564             desc.src_ptr_array = ptr_loc;
3565             desc.dst_ptr_array = ptr_rem;
3566             desc.ptr_array_len = (int)nelems[iproc];
3567             if(alpha != NULL) {
3568               int optype=-1;
3569               if(type==C_DBL) optype= ARMCI_ACC_DBL;
3570               else if(type==C_DCPL)optype= ARMCI_ACC_DCP;
3571               else if(type==C_SCPL)optype= ARMCI_ACC_CPL;
3572               else if(type==C_INT)optype= ARMCI_ACC_INT;
3573               else if(type==C_LONG)optype= ARMCI_ACC_LNG;
3574               else if(type==C_FLOAT)optype= ARMCI_ACC_FLT;
3575               else pnga_error("type not supported",type);
3576               rc= ARMCI_AccV(optype, alpha, &desc, 1, (int)tproc);
3577             }
3578             if(rc) pnga_error("scatter_acc failed in armci",rc);
3579             break;
3580           default: pnga_error("operation not supported",op);
3581         }
3582       }
3583     }
3584     free(buf);
3585     if (!GA_prealloc_gatscat) {
3586       ga_free(nelems);
3587       ga_free(list);
3588       ga_free(header);
3589     }
3590 
3591 }
3592 
3593 /**
3594  *  Gather random elements from a global array into local buffer v
3595 \*/
3596 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3597 #   pragma weak wnga_gather = pnga_gather
3598 #endif
3599 
pnga_gather(Integer g_a,void * v,void * subscript,Integer c_flag,Integer nv)3600 void pnga_gather(Integer g_a, void* v, void *subscript, Integer c_flag, Integer nv)
3601 {
3602 
3603   if (nv < 1) return;
3604   ga_check_handleM(g_a, "nga_gather");
3605 
3606   GAstat.numgat++;
3607 
3608 #ifdef USE_GATSCAT_NEW
3609   gai_gatscat_new(GATHER,g_a,v,subscript,c_flag,nv,&GAbytes.gattot,&GAbytes.gatloc, NULL);
3610 #else
3611   gai_gatscat(GATHER,g_a,v,subscript,nv,&GAbytes.gattot,&GAbytes.gatloc, NULL);
3612 #endif
3613 
3614 }
3615 
3616 /**
3617  *  Scatter elements from a local buffer v into random locations in a Global
3618  *  Array
3619  */
3620 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3621 #   pragma weak wnga_scatter = pnga_scatter
3622 #endif
3623 
pnga_scatter(Integer g_a,void * v,void * subscript,Integer c_flag,Integer nv)3624 void pnga_scatter(Integer g_a, void* v, void *subscript, Integer c_flag, Integer nv)
3625 {
3626 
3627   if (nv < 1) return;
3628   ga_check_handleM(g_a, "nga_scatter");
3629 
3630   GAstat.numsca++;
3631 
3632 #ifdef USE_GATSCAT_NEW
3633   gai_gatscat_new(SCATTER,g_a,v,subscript,c_flag,nv,&GAbytes.scatot,&GAbytes.scaloc, NULL);
3634 #else
3635   gai_gatscat(SCATTER,g_a,v,subscript,nv,&GAbytes.scatot,&GAbytes.scaloc, NULL);
3636 #endif
3637 
3638 }
3639 
3640 /**
3641  *  Add elements of v to contents at random locations in Global Array
3642  *  specified by array subscript
3643  */
3644 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3645 #   pragma weak wnga_scatter_acc = pnga_scatter_acc
3646 #endif
3647 
pnga_scatter_acc(Integer g_a,void * v,void * subscript,Integer c_flag,Integer nv,void * alpha)3648 void pnga_scatter_acc(Integer g_a, void* v, void *subscript, Integer c_flag,
3649                       Integer nv, void *alpha)
3650 {
3651 
3652   if (nv < 1) return;
3653   ga_check_handleM(g_a, "nga_scatter_acc");
3654 
3655   GAstat.numsca++;
3656 
3657 #ifdef USE_GATSCAT_NEW
3658   gai_gatscat_new(SCATTER_ACC, g_a, v, subscript, c_flag, nv, &GAbytes.scatot,
3659               &GAbytes.scaloc, alpha);
3660 #else
3661   gai_gatscat(SCATTER_ACC, g_a, v, subscript, nv, &GAbytes.scatot,
3662               &GAbytes.scaloc, alpha);
3663 #endif
3664 
3665 }
3666 
3667 /**
3668  *  Gather random elements from 2D global array into local buffer v
3669  */
3670 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3671 #   pragma weak wnga_gather2d = pnga_gather2d
3672 #endif
3673 
pnga_gather2d(Integer g_a,void * v,Integer * i,Integer * j,Integer nv)3674 void pnga_gather2d(Integer g_a, void *v, Integer *i, Integer *j,
3675                          Integer nv)
3676 {
3677     Integer k, kk, proc, item_size;
3678     Integer *aproc, naproc; /* active processes and numbers */
3679     Integer *map;           /* map the active processes to allocated space */
3680     char *buf1, *buf2;
3681     Integer handle = g_a + GA_OFFSET;
3682     Integer nproc, p_handle, iproc;
3683 
3684     Integer *count;   /* counters for each process */
3685     Integer *nelem;   /* number of elements for each process */
3686     /* source and destination pointers for each process */
3687     void ***ptr_src, ***ptr_dst;
3688     void **ptr_org; /* the entire pointer array */
3689     armci_giov_t desc;
3690     Integer *ilo, *ihi, *jlo, *jhi, *ldp, *owner;
3691     Integer lo[2], hi[2];
3692     char **ptr_ref;
3693     Integer num_blocks=0;
3694     Integer subscrpt[2];
3695 
3696     if (nv < 1) return;
3697 
3698     ga_check_handleM(g_a, "ga_gather");
3699 
3700     GAstat.numgat++;
3701 
3702     /* determine how many processors are associated with array */
3703     p_handle = GA[handle].p_handle;
3704     if (p_handle < 0) {
3705       nproc = GAnproc;
3706       if (GA[handle].num_rstrctd > 0) {
3707         nproc = GA[handle].num_rstrctd;
3708       }
3709     } else {
3710       nproc = PGRP_LIST[p_handle].map_nproc;
3711     }
3712 
3713     /* allocate temp memory */
3714     if (GA[handle].distr_type == REGULAR) {
3715       buf1 = malloc((int)(nproc *4  + nv)*  (sizeof(Integer)));
3716       if(buf1 == NULL) pnga_error("malloc failed", 3*nproc);
3717     } else {
3718       num_blocks = GA[handle].block_total;
3719       buf1 = malloc((int)(num_blocks *4  + nv)*  (sizeof(Integer)));
3720       if(buf1 == NULL) pnga_error("malloc failed", 3*num_blocks);
3721     }
3722 
3723     owner = (Integer *)buf1;
3724     count = owner+ nv;
3725     if (GA[handle].distr_type == REGULAR) {
3726       nelem = count + nproc;
3727       aproc = count + 2 * nproc;
3728       map =   count + 3 * nproc;
3729     } else {
3730       nelem = count + num_blocks;
3731       aproc = count + 2 * num_blocks;
3732       map =   count + 3 * num_blocks;
3733     }
3734 
3735     if (GA[handle].distr_type == REGULAR) {
3736       /* initialize the counters and nelem */
3737       for(kk=0; kk<nproc; kk++) {
3738         count[kk] = 0; nelem[kk] = 0;
3739       }
3740     } else {
3741       for(kk=0; kk<num_blocks; kk++) {
3742         count[kk] = 0; nelem[kk] = 0;
3743       }
3744     }
3745 
3746     /* find proc or block that owns the (i,j) element; store it in temp: */
3747     if (GA[handle].num_rstrctd == 0) {
3748       for(k=0; k< nv; k++) {
3749         subscrpt[0] = *(i+k);
3750         subscrpt[1] = *(j+k);
3751         if(! pnga_locate(g_a, subscrpt, owner+k)){
3752           char err_string[ERR_STR_LEN];
3753           sprintf(err_string,"invalid i/j=(%ld,%ld)", (long)i[k], (long)j[k]);
3754           pnga_error(err_string, g_a);
3755         }
3756         iproc = owner[k];
3757         nelem[iproc]++;
3758       }
3759     } else {
3760       for(k=0; k< nv; k++) {
3761         subscrpt[0] = *(i+k);
3762         subscrpt[1] = *(j+k);
3763         if(! pnga_locate(g_a, subscrpt, owner+k)){
3764           char err_string[ERR_STR_LEN];
3765           sprintf(err_string,"invalid i/j=(%ld,%ld)", (long)i[k], (long)j[k]);
3766           pnga_error(err_string, g_a);
3767         }
3768         iproc = GA[handle].rank_rstrctd[owner[k]];
3769         nelem[iproc]++;
3770       }
3771     }
3772 
3773     naproc = 0;
3774     if (GA[handle].distr_type == REGULAR) {
3775       for(k=0; k<nproc; k++) if(nelem[k] > 0) {
3776         aproc[naproc] = k;
3777         map[k] = naproc;
3778         naproc ++;
3779       }
3780     } else {
3781       for(k=0; k<num_blocks; k++) if(nelem[k] > 0) {
3782         aproc[naproc] = k;
3783         map[k] = naproc;
3784         naproc ++;
3785       }
3786     }
3787     GAstat.numgat_procs += naproc;
3788 
3789     buf2 = malloc((int)(2*naproc*sizeof(void **) + 2*nv*sizeof(void *) +
3790                       5*naproc*sizeof(Integer) + naproc*sizeof(char*)));
3791     if(buf2 == NULL) pnga_error("malloc failed", naproc);
3792 
3793     ptr_src = (void ***)buf2;
3794     ptr_dst = (void ***)(buf2 + naproc*sizeof(void **));
3795     ptr_org = (void **)(buf2 + 2*naproc*sizeof(void **));
3796     ptr_ref = (char **)(buf2+2*naproc*sizeof(void **)+2*nv*sizeof(void *));
3797     ilo = (Integer *)(((char*)ptr_ref) + naproc*sizeof(char*));
3798     ihi = ilo + naproc;
3799     jlo = ihi + naproc;
3800     jhi = jlo + naproc;
3801     ldp = jhi + naproc;
3802 
3803     if (GA[handle].distr_type == REGULAR) {
3804       for(kk=0; kk<naproc; kk++) {
3805         iproc = aproc[kk];
3806         if (GA[handle].num_rstrctd > 0)
3807           iproc = GA[handle].rstrctd_list[iproc];
3808         pnga_distribution(g_a, iproc, lo, hi);
3809         ilo[kk] = lo[0];
3810         jlo[kk] = lo[1];
3811         ihi[kk] = hi[0];
3812         jhi[kk] = hi[1];
3813 
3814         /* get address of the first element owned by proc */
3815         gaShmemLocation(aproc[kk], g_a, ilo[kk], jlo[kk], &(ptr_ref[kk]),
3816             &(ldp[kk]));
3817       }
3818     } else {
3819       for(kk=0; kk<naproc; kk++) {
3820         iproc = aproc[kk];
3821         pnga_distribution(g_a, iproc, lo, hi);
3822         ilo[kk] = lo[0];
3823         jlo[kk] = lo[1];
3824         ihi[kk] = hi[0];
3825         jhi[kk] = hi[1];
3826 
3827         /* get address of the first element owned by proc */
3828         pnga_access_block_ptr(g_a, iproc, &(ptr_ref[kk]), &(ldp[kk]));
3829         pnga_release_block(g_a, iproc);
3830       }
3831     }
3832 
3833     item_size = GA[GA_OFFSET + g_a].elemsize;
3834     GAbytes.gattot += (double)item_size*nv;
3835     /*This stuff is probably completely wrong. Doesn't affect performance,
3836      * just statistics. */
3837     iproc = owner[GAme];
3838     GAbytes.gatloc += (double)item_size * nelem[iproc];
3839 
3840     ptr_src[0] = ptr_org; ptr_dst[0] = ptr_org + nv;
3841     for(k=1; k<naproc; k++) {
3842         ptr_src[k] = ptr_src[k-1] + nelem[aproc[k-1]];
3843         ptr_dst[k] = ptr_dst[k-1] + nelem[aproc[k-1]];
3844     }
3845 
3846     for(k=0; k<nv; k++){
3847         Integer this_count;
3848         proc = owner[k];
3849         if (GA[handle].num_rstrctd > 0)
3850           proc = GA[handle].rank_rstrctd[owner[k]];
3851         this_count = count[proc];
3852         count[proc]++;
3853         proc = map[proc];
3854         ptr_dst[proc][this_count] = ((char*)v) + k * item_size;
3855 
3856         if(i[k] < ilo[proc] || i[k] > ihi[proc]  ||
3857            j[k] < jlo[proc] || j[k] > jhi[proc]){
3858           char err_string[ERR_STR_LEN];
3859           sprintf(err_string,"proc=%d invalid i/j=(%ld,%ld)><[%ld:%ld,%ld:%ld]",
3860                  (int)proc,(long)i[k],(long)j[k],
3861                  (long)ilo[proc],(long)ihi[proc],
3862                  (long)jlo[proc], (long)jhi[proc]);
3863             pnga_error(err_string, g_a);
3864         }
3865         ptr_src[proc][this_count] = ptr_ref[proc] + item_size *
3866             ((j[k] - jlo[proc])* ldp[proc] + i[k] - ilo[proc]);
3867     }
3868     /* source and destination pointers are ready for all processes */
3869     if (GA[handle].distr_type == REGULAR) {
3870       for(k=0; k<naproc; k++) {
3871         int rc;
3872 
3873         desc.bytes = (int)item_size;
3874         desc.src_ptr_array = ptr_src[k];
3875         desc.dst_ptr_array = ptr_dst[k];
3876         desc.ptr_array_len = (int)nelem[aproc[k]];
3877         if (p_handle < 0) {
3878           iproc = aproc[k];
3879           if (GA[handle].num_rstrctd > 0)
3880             iproc = GA[handle].rstrctd_list[iproc];
3881         } else {
3882           iproc = PGRP_LIST[p_handle].inv_map_proc_list[aproc[k]];
3883         }
3884         rc=ARMCI_GetV(&desc, 1, (int)iproc);
3885         if(rc) pnga_error("gather failed in armci",rc);
3886       }
3887     } else if (GA[handle].distr_type == BLOCK_CYCLIC) {
3888       for(k=0; k<naproc; k++) {
3889         int rc;
3890 
3891         desc.bytes = (int)item_size;
3892         desc.src_ptr_array = ptr_src[k];
3893         desc.dst_ptr_array = ptr_dst[k];
3894         desc.ptr_array_len = (int)nelem[aproc[k]];
3895         iproc = aproc[k];
3896         iproc = iproc%nproc;
3897         if (p_handle >= 0) {
3898           iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
3899         }
3900         rc=ARMCI_GetV(&desc, 1, (int)iproc);
3901         if(rc) pnga_error("gather failed in armci",rc);
3902       }
3903     } else if (GA[handle].distr_type == SCALAPACK) {
3904       Integer index[MAXDIM];
3905       for(k=0; k<naproc; k++) {
3906         int rc;
3907 
3908         desc.bytes = (int)item_size;
3909         desc.src_ptr_array = ptr_src[k];
3910         desc.dst_ptr_array = ptr_dst[k];
3911         desc.ptr_array_len = (int)nelem[aproc[k]];
3912         iproc = aproc[k];
3913         gam_find_block_indices(handle, iproc, index);
3914         index[0] = index[0]%GA[handle].nblock[0];
3915         index[1] = index[1]%GA[handle].nblock[1];
3916         gam_find_proc_from_sl_indices(handle,iproc,index);
3917         if (p_handle >= 0) {
3918           iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
3919         }
3920         rc=ARMCI_GetV(&desc, 1, (int)iproc);
3921         if(rc) pnga_error("gather failed in armci",rc);
3922       }
3923     } else if (GA[handle].distr_type == TILED ||
3924         GA[handle].distr_type == TILED_IRREG) {
3925       Integer index[MAXDIM];
3926       for(k=0; k<naproc; k++) {
3927         int rc;
3928 
3929         desc.bytes = (int)item_size;
3930         desc.src_ptr_array = ptr_src[k];
3931         desc.dst_ptr_array = ptr_dst[k];
3932         desc.ptr_array_len = (int)nelem[aproc[k]];
3933         iproc = aproc[k];
3934         gam_find_block_indices(handle, iproc, index);
3935         index[0] = index[0]%GA[handle].nblock[0];
3936         index[1] = index[1]%GA[handle].nblock[1];
3937         gam_find_tile_proc_from_indices(handle,iproc,index);
3938         if (p_handle >= 0) {
3939           iproc = PGRP_LIST[p_handle].inv_map_proc_list[iproc];
3940         }
3941         rc=ARMCI_GetV(&desc, 1, (int)iproc);
3942         if(rc) pnga_error("gather failed in armci",rc);
3943       }
3944     }
3945 
3946     free(buf2);
3947     free(buf1);
3948 }
3949 
3950 /**
3951  *  Read and increment an element of a Global Array
3952  */
3953 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3954 #   pragma weak wnga_read_inc = pnga_read_inc
3955 #endif
3956 
pnga_read_inc(Integer g_a,Integer * subscript,Integer inc)3957 Integer pnga_read_inc(Integer g_a, Integer* subscript, Integer inc)
3958 {
3959 GA_Internal_Threadsafe_Lock();
3960 char *ptr;
3961 Integer ldp[MAXDIM], proc, handle=GA_OFFSET+g_a, p_handle, ndim;
3962 int optype,ivalue;
3963 long lvalue;
3964 void *pval;
3965 
3966     ga_check_handleM(g_a, "nga_read_inc");
3967 
3968     /* BJP printf("p[%d] g_a: %d subscript: %d inc: %d\n",GAme, g_a, subscript[0], inc); */
3969 
3970     if(GA[handle].type!=C_INT && GA[handle].type!=C_LONG &&
3971        GA[handle].type!=C_LONGLONG)
3972        pnga_error("type must be integer",GA[handle].type);
3973 
3974     GAstat.numrdi++;
3975     GAbytes.rditot += (double)sizeof(Integer);
3976     p_handle = GA[handle].p_handle;
3977     ndim = GA[handle].ndim;
3978 
3979     /* find out who owns it */
3980     pnga_locate(g_a, subscript, &proc);
3981 
3982     /* get an address of the g_a(subscript) element */
3983     if (GA[handle].distr_type == REGULAR) {
3984       if (GA[handle].num_rstrctd == 0) {
3985         gam_Location(proc, handle,  subscript, (char**)&ptr, ldp);
3986       } else {
3987         gam_Location(GA[handle].rank_rstrctd[proc], handle,  subscript, (char**)&ptr, ldp);
3988       }
3989     } else {
3990       Integer lo[MAXDIM], hi[MAXDIM];
3991       Integer j, jtot, last, offset;
3992       pnga_distribution(g_a, proc, lo, hi);
3993       pnga_access_block_ptr(g_a, proc, (char**)&ptr, ldp);
3994       pnga_release_block(g_a, proc);
3995       offset = 0;
3996       last = ndim - 1;
3997       jtot = 1;
3998       for (j=0; j<last; j++) {
3999         offset += (subscript[j]-lo[j])*jtot;
4000         jtot *= ldp[j];
4001       }
4002       offset += (subscript[last]-lo[last])*jtot;
4003       ptr += offset*GA[handle].elemsize;
4004     }
4005 
4006     if(GA[handle].type==C_INT){
4007        optype = ARMCI_FETCH_AND_ADD;
4008        pval = &ivalue;
4009     }else{
4010        optype = ARMCI_FETCH_AND_ADD_LONG;
4011        pval = &lvalue;
4012     }
4013 
4014     if(GAme == proc)GAbytes.rdiloc += (double)sizeof(Integer);
4015 
4016     if (GA[handle].distr_type == BLOCK_CYCLIC) {
4017       proc = proc%pnga_nnodes();
4018     } else if (GA[handle].distr_type == SCALAPACK) {
4019       Integer j, index[MAXDIM];
4020       gam_find_block_indices(handle, proc, index);
4021       for (j=0; j<ndim; j++) {
4022         index[j] = index[j]%GA[handle].nblock[j];
4023       }
4024       gam_find_proc_from_sl_indices(handle,proc,index);
4025     } else if (GA[handle].distr_type == TILED ||
4026         GA[handle].distr_type == TILED_IRREG) {
4027       Integer j, index[MAXDIM];
4028       gam_find_block_indices(handle, proc, index);
4029       for (j=0; j<ndim; j++) {
4030         index[j] = index[j]%GA[handle].nblock[j];
4031       }
4032       gam_find_tile_proc_from_indices(handle,proc,index);
4033     }
4034     if (p_handle != -1) {
4035        proc=PGRP_LIST[p_handle].inv_map_proc_list[proc];
4036        /*printf("\n%d:proc=%d",GAme,proc);fflush(stdout);*/
4037     }
4038 
4039     ARMCI_Rmw(optype, pval, (int*)ptr, (int)inc, (int)proc);
4040 
4041 
4042    GA_Internal_Threadsafe_Unlock();
4043    if(GA[handle].type==C_INT)
4044        return (Integer) ivalue;
4045    else
4046        return (Integer) lvalue;
4047 }
4048 
4049 /**
4050  *  Utility function to correct patch indices for presence of
4051  *  skips. If no data is left on patch it returns false.
4052  */
gai_correct_strided_patch(Integer ndim,Integer * lo,Integer * skip,Integer * plo,Integer * phi)4053 Integer gai_correct_strided_patch(Integer ndim,
4054                                   Integer *lo,
4055                                   Integer *skip,
4056                                   Integer *plo,
4057                                   Integer *phi)
4058 {
4059   Integer i, delta;
4060   for (i=0; i<ndim; i++) {
4061     delta = plo[i]-lo[i];
4062     if (delta%skip[i] != 0) {
4063       plo[i] = plo[i] - delta%skip[i] + skip[i];
4064     }
4065     delta = phi[i]-lo[i];
4066     if (delta%skip[i] != 0) {
4067       phi[i] = phi[i] - delta%skip[i];
4068     }
4069     if (phi[i]<plo[i]) return 0;
4070   }
4071   return 1;
4072 }
4073 
4074 /**
4075  *  Utility function to calculate the number of elements in each
4076  *  stride direction
4077  */
4078 
gai_ComputeCountWithSkip(Integer ndim,Integer * lo,Integer * hi,Integer * skip,int * count,Integer * nstride)4079 int gai_ComputeCountWithSkip(Integer ndim, Integer *lo, Integer *hi,
4080                              Integer *skip, int *count, Integer *nstride)
4081 {
4082 #if 1
4083   Integer idx;
4084   int i, istride = 0;
4085   /*
4086   if (GAme==0) {
4087     printf("p[%d] ComputeCount lo[0]: %d hi[0]: %d lo[1]: %d hi[1]: %d\n",GAme,
4088         lo[0],hi[0],lo[1],hi[1]);
4089   }
4090   */
4091   count[0]=1;
4092   istride++;
4093   for (i=0; i<(int)ndim; i++) {
4094     idx = hi[i] - lo[i];
4095     if (idx < 0) return 0;
4096     if (skip[i] > 1) {
4097       count[istride] = (int)(idx/skip[i]+1);
4098       istride++;
4099     } else {
4100       count[istride] = (int)idx+1;
4101       istride++;
4102     }
4103   }
4104   *nstride = istride;
4105   return 1;
4106 #else
4107   Integer idx;
4108   int i, istride = 0;
4109   /*
4110   if (GAme==0) {
4111     printf("p[%d] ComputeCount lo[0]: %d hi[0]: %d lo[1]: %d hi[1]: %d\n",GAme,
4112         lo[0],hi[0],lo[1],hi[1]);
4113   }
4114   */
4115   for (i=0; i<(int)ndim; i++) {
4116     idx = hi[i] - lo[i];
4117     if (idx < 0) return 0;
4118     if (skip[i] > 1) {
4119       count[istride] = 1;
4120       istride++;
4121       count[istride] = (int)(idx/skip[i]+1);
4122       istride++;
4123     } else {
4124       count[istride] = (int)idx+1;
4125       istride++;
4126     }
4127   }
4128   *nstride = istride;
4129   return 1;
4130 #endif
4131 }
4132 
4133 /**
4134  *  Utility function to calculate stride lengths on local and
4135  *  remote processors, taking into account the presence of skips
4136  */
gai_SetStrideWithSkip(Integer ndim,Integer size,Integer * ld,Integer * ldrem,int * stride_rem,int * stride_loc,Integer * skip)4137 void gai_SetStrideWithSkip(Integer ndim, Integer size, Integer *ld,
4138                           Integer *ldrem, int *stride_rem,
4139                           int *stride_loc, Integer *skip)
4140 {
4141 #if 1
4142   int i;
4143   stride_rem[0] = stride_loc[0] = (int)size;
4144   for (i=0; i<ndim; i++) {
4145     stride_rem[i+1] = stride_rem[i];
4146     stride_rem[i] *= skip[i];
4147     stride_rem[i+1] *= (int)ldrem[i];
4148     stride_loc[i+1] = stride_loc[i];
4149     stride_loc[i+1] *= (int)ld[i];
4150   }
4151 #else
4152   int i, istride;
4153   stride_rem[0] = stride_loc[0] = (int)size;
4154   istride = 0;
4155   for (i=0; i<ndim; i++) {
4156     if (skip[i] > 1) {
4157       stride_rem[istride] *= (int)skip[i];
4158       stride_rem[istride+1] = stride_rem[istride]/((int)skip[i]);
4159       stride_loc[istride+1] = stride_loc[istride];
4160       istride++;
4161       if (i<ndim-1) {
4162         stride_rem[istride] *= (int)ldrem[i];
4163         stride_rem[istride+1] = stride_rem[istride];
4164         stride_loc[istride] *= (int)ld[i];
4165         stride_loc[istride+1] = stride_loc[istride];
4166       }
4167     } else {
4168       if (i<ndim-1) {
4169         stride_rem[istride] *= (int)ldrem[i];
4170         stride_rem[istride+1] = stride_rem[istride];
4171         stride_loc[istride] *= ld[i];
4172         stride_loc[istride+1] = stride_loc[istride];
4173       }
4174     }
4175     istride++;
4176   }
4177 #endif
4178 }
4179 
gai_ComputePatchIndexWithSkip(Integer ndim,Integer * lo,Integer * plo,Integer * skip,Integer * ld,Integer * idx_buf)4180 void gai_ComputePatchIndexWithSkip(Integer ndim, Integer *lo, Integer *plo,
4181                                    Integer *skip, Integer *ld, Integer *idx_buf)
4182 {
4183   Integer i, delta, inc, factor;
4184   delta = plo[0] - lo[0];
4185   inc = delta%skip[0];
4186   delta -= inc;
4187   delta /=  skip[0];
4188   *idx_buf = delta;
4189   for (i=0; i<ndim-1; i++) {
4190     factor = ld[i];
4191     delta = plo[i+1]-lo[i+1];
4192     inc = delta%skip[i+1];
4193     delta -= inc;
4194     delta /=  skip[i+1];
4195     *idx_buf += factor*delta;
4196   }
4197 }
4198 
4199 /* Utility function to find offset from start of data segment
4200  * ndim: dimension of block
4201  * lo: Lower corner of block
4202  * plo: Point within the block
4203  * ld: Physical dimensions of data block
4204  * offset: Number of elements plo is located from start of block
4205 */
gai_FindOffset(Integer ndim,Integer * lo,Integer * plo,Integer * ld,Integer * offset)4206 void  gai_FindOffset(Integer ndim,Integer *lo, Integer *plo,
4207     Integer *ld, Integer *offset)
4208 {
4209   Integer i, factor;
4210   *offset = 0;
4211   factor = 1;
4212   for (i=0; i<ndim; i++) {
4213     *offset += (plo[i]-lo[i])*factor;
4214     if (i<ndim-1) factor *= ld[i];
4215   }
4216 }
4217 
4218 /**
4219  *  Put an N-dimensional patch of strided data into a Global Array. This routine
4220  *  can by used with user-defined data types to modify a single element instead
4221  *  of copying over the entire data
4222  */
4223 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4224 #   pragma weak wnga_strided_put = pnga_strided_put
4225 #endif
4226 
pnga_strided_put(Integer g_a,Integer * lo,Integer * hi,Integer * skip,void * buf,Integer * ld)4227 void pnga_strided_put(Integer g_a, Integer *lo, Integer *hi, Integer *skip,
4228                      void    *buf, Integer *ld)
4229 {
4230   /* g_a:    Global Array handle
4231      lo[]:   Array of lower indices of patch of global array
4232      hi[]:   Array of upper indices of patch of global array
4233      skip[]: Array of skips for each dimension
4234      buf[]:  Local buffer that patch will be copied from
4235      ld[]:   ndim-1 physical dimensions of local buffer */
4236   Integer p, np, handle = GA_OFFSET + g_a;
4237   Integer idx, size, nstride, p_handle, nproc;
4238   Integer ldrem[MAXDIM];
4239   Integer idx_buf, *blo, *bhi;
4240   Integer plo[MAXDIM],phi[MAXDIM];
4241   char *pbuf, *prem;
4242   int count[2*MAXDIM], stride_rem[2*MAXDIM], stride_loc[2*MAXDIM];
4243   int i, proc, ndim;
4244   _iterator_hdl it_hdl;
4245 
4246   size = GA[handle].elemsize;
4247   ndim = GA[handle].ndim;
4248   nproc = pnga_nnodes();
4249   p_handle = GA[handle].p_handle;
4250 
4251   /* check values of skips to make sure they are legitimate */
4252   for (i = 0; i<ndim; i++) {
4253     if (skip[i]<1) {
4254       pnga_error("nga_strided_put: Invalid value of skip along coordinate ",i);
4255     }
4256   }
4257 
4258   gai_iterator_init(g_a, lo, hi, &it_hdl);
4259   while (gai_iterator_next(&it_hdl, &proc, &blo, &bhi, &prem, ldrem)) {
4260       /* Correct ranges to account for skips in original patch. If no
4261          data is left in patch jump to next processor in loop. */
4262       for (i=0; i<ndim; i++) {
4263         plo[i] = blo[i];
4264         phi[i] = bhi[i];
4265       }
4266       if (!gai_correct_strided_patch((Integer)ndim, lo, skip, plo, phi))
4267         continue;
4268       /* May need to correct location of remote buffer */
4269       gai_FindOffset(ndim,blo,plo,ldrem,&idx_buf);
4270       prem += size*idx_buf;
4271 
4272       /* get pointer in local buffer to point indexed by plo given that
4273          the corner of the buffer corresponds to the point indexed by lo */
4274       gai_ComputePatchIndexWithSkip(ndim, lo, plo, skip, ld, &idx_buf);
4275       pbuf = size*idx_buf + (char*)buf;
4276 
4277       /* Compute number of elements in each stride region and compute the
4278          number of stride regions. Store the results in count and nstride */
4279       if (!gai_ComputeCountWithSkip(ndim, plo, phi, skip, count, &nstride))
4280         continue;
4281 
4282       /* Scale first element in count by element size. The ARMCI_PutS routine
4283          uses this convention to figure out memory sizes. */
4284       count[0] *= size;
4285 
4286       /* Calculate strides in memory for remote processor indexed by proc and
4287          local buffer */
4288       gai_SetStrideWithSkip(ndim, size, ld, ldrem, stride_rem, stride_loc,
4289           skip);
4290 
4291       /* BJP */
4292       if (p_handle != -1) {
4293         proc = PGRP_LIST[p_handle].inv_map_proc_list[proc];
4294       }
4295       ARMCI_PutS(pbuf, stride_loc, prem, stride_rem, count, nstride-1, proc);
4296   }
4297   gai_iterator_destroy(&it_hdl);
4298 }
4299 
4300 /**
4301  *  Get an N-dimensional patch of strided data from a Global Array
4302  */
4303 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4304 #   pragma weak wnga_strided_get = pnga_strided_get
4305 #endif
4306 
pnga_strided_get(Integer g_a,Integer * lo,Integer * hi,Integer * skip,void * buf,Integer * ld)4307 void pnga_strided_get(Integer g_a, Integer *lo, Integer *hi, Integer *skip,
4308                    void    *buf, Integer *ld)
4309 {
4310   /* g_a:    Global Array handle
4311      lo[]:   Array of lower indices of patch of global array
4312      hi[]:   Array of upper indices of patch of global array
4313      skip[]: Array of skips for each dimension
4314      buf[]:  Local buffer that patch will be copied from
4315      ld[]:   ndim-1 physical dimensions of local buffer */
4316   Integer p, np, handle = GA_OFFSET + g_a;
4317   Integer idx, size, nstride, p_handle, nproc;
4318   int i, proc, ndim;
4319   Integer ldrem[MAXDIM];
4320   Integer idx_buf, *blo, *bhi;
4321   Integer plo[MAXDIM],phi[MAXDIM];
4322   char *pbuf, *prem;
4323   int count[2*MAXDIM], stride_rem[2*MAXDIM], stride_loc[2*MAXDIM];
4324   _iterator_hdl it_hdl;
4325 
4326   size = GA[handle].elemsize;
4327   ndim = GA[handle].ndim;
4328   nproc = pnga_nnodes();
4329   p_handle = GA[handle].p_handle;
4330 
4331   /* check values of skips to make sure they are legitimate */
4332   for (i = 0; i<ndim; i++) {
4333     if (skip[i]<1) {
4334       pnga_error("nga_strided_get: Invalid value of skip along coordinate ",i);
4335     }
4336   }
4337 
4338   gai_iterator_init(g_a, lo, hi, &it_hdl);
4339   while (gai_iterator_next(&it_hdl, &proc, &blo, &bhi, &prem, ldrem)) {
4340       /* Correct ranges to account for skips in original patch. If no
4341          data is left in patch jump to next processor in loop. */
4342       for (i=0; i<ndim; i++) {
4343         plo[i] = blo[i];
4344         phi[i] = bhi[i];
4345       }
4346       if (!gai_correct_strided_patch((Integer)ndim, lo, skip, plo, phi))
4347         continue;
4348       /* May need to correct location of remote buffer */
4349       gai_FindOffset(ndim,blo,plo,ldrem,&idx_buf);
4350       prem += size*idx_buf;
4351 
4352       /* get pointer in local buffer to point indexed by plo given that
4353          the corner of the buffer corresponds to the point indexed by lo */
4354       gai_ComputePatchIndexWithSkip(ndim, lo, plo, skip, ld, &idx_buf);
4355       pbuf = size*idx_buf + (char*)buf;
4356 
4357       /* Compute number of elements in each stride region and compute the
4358          number of stride regions. Store the results in count and nstride */
4359       if (!gai_ComputeCountWithSkip(ndim, plo, phi, skip, count, &nstride))
4360         continue;
4361 
4362       /* Scale first element in count by element size. The ARMCI_PutS routine
4363          uses this convention to figure out memory sizes. */
4364       count[0] *= size;
4365 
4366       /* Calculate strides in memory for remote processor indexed by proc and
4367          local buffer */
4368       gai_SetStrideWithSkip(ndim, size, ld, ldrem, stride_rem, stride_loc,
4369           skip);
4370 
4371       /* BJP */
4372       if (p_handle != -1) {
4373         proc = PGRP_LIST[p_handle].inv_map_proc_list[proc];
4374       }
4375       ARMCI_GetS(prem, stride_rem, pbuf, stride_loc, count, nstride-1, proc);
4376   }
4377   gai_iterator_destroy(&it_hdl);
4378 }
4379 
4380 /**
4381  *  Accumulate operation for an N-dimensional patch of strided data of
4382  *  a Global Array
4383  *       g_a += alpha * patch
4384 \*/
4385 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4386 #   pragma weak wnga_strided_acc = pnga_strided_acc
4387 #endif
4388 
pnga_strided_acc(Integer g_a,Integer * lo,Integer * hi,Integer * skip,void * buf,Integer * ld,void * alpha)4389 void pnga_strided_acc(Integer g_a, Integer *lo, Integer *hi, Integer *skip,
4390                       void *buf, Integer *ld, void *alpha)
4391 {
4392   /* g_a:    Global Array handle
4393      lo[]:   Array of lower indices of patch of global array
4394      hi[]:   Array of upper indices of patch of global array
4395      skip[]: Array of skips for each dimension
4396      buf[]:  Local buffer that patch will be copied from
4397      ld[]:   ndim-1 physical dimensions of local buffer
4398      alpha:  muliplicative scale factor */
4399   Integer p, np, handle = GA_OFFSET + g_a;
4400   Integer idx, size, nstride, type, p_handle, nproc;
4401   int i, optype=-1, proc, ndim;
4402   Integer ldrem[MAXDIM];
4403   Integer idx_buf, *blo, *bhi;
4404   Integer plo[MAXDIM],phi[MAXDIM];
4405   char *pbuf, *prem;
4406   int count[2*MAXDIM], stride_rem[2*MAXDIM], stride_loc[2*MAXDIM];
4407   _iterator_hdl it_hdl;
4408 
4409   size = GA[handle].elemsize;
4410   ndim = GA[handle].ndim;
4411   type = GA[handle].type;
4412   nproc = pnga_nnodes();
4413   p_handle = GA[handle].p_handle;
4414 
4415   if (type == C_DBL) optype = ARMCI_ACC_DBL;
4416   else if (type == C_FLOAT) optype = ARMCI_ACC_FLT;
4417   else if (type == C_DCPL) optype = ARMCI_ACC_DCP;
4418   else if (type == C_SCPL) optype = ARMCI_ACC_CPL;
4419   else if (type == C_INT) optype = ARMCI_ACC_INT;
4420   else if (type == C_LONG) optype = ARMCI_ACC_LNG;
4421   else pnga_error("nga_strided_acc: type not supported",type);
4422 
4423   /* check values of skips to make sure they are legitimate */
4424   for (i = 0; i<ndim; i++) {
4425     if (skip[i]<1) {
4426       pnga_error("nga_strided_acc: Invalid value of skip along coordinate ",i);
4427     }
4428   }
4429 
4430   gai_iterator_init(g_a, lo, hi, &it_hdl);
4431   while (gai_iterator_next(&it_hdl, &proc, &blo, &bhi, &prem, ldrem)) {
4432       /* Correct ranges to account for skips in original patch. If no
4433          data is left in patch jump to next processor in loop. */
4434       for (i=0; i<ndim; i++) {
4435         plo[i] = blo[i];
4436         phi[i] = bhi[i];
4437       }
4438       if (!gai_correct_strided_patch((Integer)ndim, lo, skip, plo, phi))
4439         continue;
4440       /* May need to correct location of remote buffer */
4441       gai_FindOffset(ndim,blo,plo,ldrem,&idx_buf);
4442       prem += size*idx_buf;
4443 
4444       /* get pointer in local buffer to point indexed by plo given that
4445          the corner of the buffer corresponds to the point indexed by lo */
4446       gai_ComputePatchIndexWithSkip(ndim, lo, plo, skip, ld, &idx_buf);
4447       pbuf = size*idx_buf + (char*)buf;
4448 
4449       /* Compute number of elements in each stride region and compute the
4450          number of stride regions. Store the results in count and nstride */
4451       if (!gai_ComputeCountWithSkip(ndim, plo, phi, skip, count, &nstride))
4452         continue;
4453 
4454       /* Scale first element in count by element size. The ARMCI_PutS routine
4455          uses this convention to figure out memory sizes. */
4456       count[0] *= size;
4457 
4458       /* Calculate strides in memory for remote processor indexed by proc and
4459          local buffer */
4460       gai_SetStrideWithSkip(ndim, size, ld, ldrem, stride_rem, stride_loc,
4461           skip);
4462 
4463       /* BJP */
4464       if (p_handle != -1) {
4465         proc = PGRP_LIST[p_handle].inv_map_proc_list[proc];
4466       }
4467       ARMCI_AccS(optype, alpha, pbuf, stride_loc, prem, stride_rem, count,
4468           nstride-1, proc);
4469   }
4470   gai_iterator_destroy(&it_hdl);
4471 }
4472