1 #if HAVE_CONFIG_H
2 #   include "config.h"
3 #endif
4 
5 /* $Id: base.c,v 1.149.2.19 2007/12/18 18:42:20 d3g293 Exp $ */
6 /*
7  * module: base.c
8  * author: Jarek Nieplocha
9  * description: implements GA primitive operations --
10  *              create (regular& irregular) and duplicate, destroy
11  *
12  * DISCLAIMER
13  *
14  * This material was prepared as an account of work sponsored by an
15  * agency of the United States Government.  Neither the United States
16  * Government nor the United States Department of Energy, nor Battelle,
17  * nor any of their employees, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
18  * ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
19  * COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
20  * SOFTWARE, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT
21  * INFRINGE PRIVATELY OWNED RIGHTS.
22  *
23  *
24  * ACKNOWLEDGMENT
25  *
26  * This software and its documentation were produced with United States
27  * Government support under Contract Number DE-AC06-76RLO-1830 awarded by
28  * the United States Department of Energy.  The United States Government
29  * retains a paid-up non-exclusive, irrevocable worldwide license to
30  * reproduce, prepare derivative works, perform publicly and display
31  * publicly by or for the US Government, including the right to
32  * distribute to other US Government contractors.
33  */
34 
35 #if HAVE_STDIO_H
36 #   include <stdio.h>
37 #endif
38 #if HAVE_STRING_H
39 #   include <string.h>
40 #endif
41 #if HAVE_STDLIB_H
42 #   include <stdlib.h>
43 #endif
44 #if HAVE_MATH_H
45 #   include <math.h>
46 #endif
47 #if HAVE_ASSERT_H
48 #   include <assert.h>
49 #endif
50 
51 #include <ctype.h>
52 #include "farg.h"
53 #include "globalp.h"
54 #include "message.h"
55 #include "base.h"
56 #include "macdecls.h"
57 #include "armci.h"
58 #include "ga-papi.h"
59 #include "ga-wapi.h"
60 #include "thread-safe.h"
61 
62 static int calc_maplen(int handle);
63 
64 #ifdef PROFILE_OLD
65 #include "ga_profile.h"
66 #endif
67 /*#define AVOID_MA_STORAGE 1*/
68 #define DEBUG 0
69 #define USE_MALLOC 1
70 #define INVALID_MA_HANDLE -1
/* Round x to the nearest integer value (halves round away from zero).
 * BUG FIX: the whole conditional is now parenthesized; previously the
 * unparenthesized ?: pulled surrounding operators into its condition,
 * e.g. `1.0 + NEAR_INT(0.6)` parsed as `(1.0 + 0.6) < 0.0 ? ... : ...`. */
#define NEAR_INT(x) ((x) < 0.0 ? ceil((x) - 0.5) : floor((x) + 0.5))
72 
73 #define FLEN        80              /* length of Fortran strings */
74 
75 /*uncomment line below to verify consistency of MA in every sync */
76 /*#define CHECK_MA yes */
77 
78 /*uncomment line below to verify if MA base address is alligned wrt datatype*/
79 #if !(defined(LINUX) || defined(CRAY) || defined(CYGWIN))
80 #define CHECK_MA_ALGN 1
81 #endif
82 
83 /*uncomment line below to initialize arrays in ga_create/duplicate */
84 /*#define GA_CREATE_INDEF yes */
85 
86 /*uncomment line below to introduce padding between shared memory regions
87   of a GA when the region spans in more than 1 process within SMP */
88 #define GA_ELEM_PADDING yes
89 
90 #define OLD_DISTRIBUTION 1
91 #if OLD_DISTRIBUTION
92     extern void ddb_h2(Integer ndims, Integer dims[], Integer npes,
93                     double threshold, Integer bias, Integer blk[],
94                     Integer pedims[]);
95 #else
96     extern void ddb(Integer ndims, Integer dims[], Integer npes,
97                     Integer blk[], Integer pedims[]);
98 #endif
99 
/* Backing storage and public aliases for the table of all global arrays and
 * the table of process groups; both are allocated in pnga_initialize with
 * MAX_ARRAYS entries. */
global_array_t *_ga_main_data_structure;
global_array_t *GA;                 /* alias for _ga_main_data_structure */
proc_list_t *_proc_list_main_data_structure;
proc_list_t *PGRP_LIST;             /* alias for _proc_list_main_data_structure */
static int GAinitialized = 0;       /* nonzero once pnga_initialize has completed */
static int ARMCIinitialized = 0;    /* nonzero once the ARMCI runtime is up */
int _ga_sync_begin = 1;
int _ga_sync_end = 1;
int _max_global_array = MAX_ARRAYS;
int GA_World_Proc_Group = -1;
int GA_Default_Proc_Group = -1;
int ga_armci_world_group=0;
int GA_Init_Proc_Group = -2;        /* sentinel p_handle for not-yet-created arrays */
Integer GA_Debug_flag = 0;

/* MA addressing */
DoubleComplex   *DCPL_MB;           /* double precision complex base address */
SingleComplex   *SCPL_MB;           /* single precision complex base address */
DoublePrecision *DBL_MB;            /* double precision base address */
Integer         *INT_MB;            /* integer base address */
float           *FLT_MB;            /* float base address */
int** GA_Update_Flags;              /* per-process update flags; allocated in pnga_initialize */
int* GA_Update_Signal;              /* local signal word; allocated in pnga_initialize */

/* Header describing a block of memory obtained through the GA allocators */
typedef struct {
long id;
long type;
long size;
long dummy;                         /* padding/alignment placeholder */
} getmem_t;

/*\
 * Copy of GA's internal communicator
\*/
#ifdef MSG_COMMS_MPI
MPI_Comm GA_MPI_World_comm_dup;
#endif

/* set total limit (bytes) for memory usage per processor to "unlimited" */
static Integer GA_total_memory = -1;
static Integer GA_memory_limited = 0;
struct ga_stat_t GAstat;
struct ga_bytes_t GAbytes ={0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.};
long   *GAstat_arr;
static Integer GA_memory_limit=0;
Integer GAme, GAnproc;              /* this process rank and world size (set in pnga_initialize) */
static Integer MPme;                /* message-passing rank of this process */
Integer *mapALL;                    /* scratch map array sized in pnga_initialize */

static Integer _mirror_gop_grp;     /* group used for cluster merge of mirrored arrays */

/* Function prototypes */
int gai_getmem(char* name, char **ptr_arr, C_Long bytes, int type, long *id,
               int grp_id);
int gai_get_devmem(char *name, char **ptr_arr, C_Long bytes, int type, long *adj,
		  int grp_id, int dev_flag, const char *device);
#ifdef ENABLE_CHECKPOINT
static int ga_group_is_for_ft=0;    /* nonzero while creating a fault-tolerance group */
int ga_spare_procs;
#endif
160 
161 
162 /*************************************************************************/
163 
164 /*\ This macro computes index (place in ordered set) for the element
165  *  identified by _subscript in ndim- dimensional array of dimensions _dim[]
166  *  assume that first subscript component changes first
167 \*/
168 #define ga_ComputeIndexM(_index, _ndim, _subscript, _dims)                     \
169 {                                                                              \
170   Integer  _i, _factor=1;                                                      \
171   __CRAYX1_PRAGMA("_CRI novector");                                            \
172   for(_i=0,*(_index)=0; _i<_ndim; _i++){                                       \
173       *(_index) += _subscript[_i]*_factor;                                     \
174       if(_i<_ndim-1)_factor *= _dims[_i];                                      \
175   }                                                                            \
176 }
177 
178 
179 /*\ updates subscript corresponding to next element in a patch <lo[]:hi[]>
180 \*/
181 #define ga_UpdateSubscriptM(_ndim, _subscript, _lo, _hi, _dims)\
182 {                                                                              \
183   Integer  _i;                                                                 \
184   __CRAYX1_PRAGMA("_CRI novector");                                            \
185   for(_i=0; _i<_ndim; _i++){                                                   \
186        if(_subscript[_i] < _hi[_i]) { _subscript[_i]++; break;}                \
187        _subscript[_i] = _lo[_i];                                               \
188   }                                                                            \
189 }
190 
191 
192 /*\ Initialize n-dimensional loop by counting elements and setting subscript=lo
193 \*/
194 #define ga_InitLoopM(_elems, _ndim, _subscript, _lo, _hi, _dims)\
195 {                                                                              \
196   Integer  _i;                                                                 \
197   *_elems = 1;                                                                 \
198   __CRAYX1_PRAGMA("_CRI novector");                                            \
199   for(_i=0; _i<_ndim; _i++){                                                   \
200        *_elems *= _hi[_i]-_lo[_i] +1;                                          \
201        _subscript[_i] = _lo[_i];                                               \
202   }                                                                            \
203 }
204 
205 
/**
 * Return the size in bytes of one element of the given GA data type,
 * or 0 for an unrecognized type code.
 */
Integer GAsizeof(Integer type)
{
  switch (type) {
    case C_INT:      return sizeof(int);
    case C_LONG:     return sizeof(long);
    case C_LONGLONG: return sizeof(long long);
    case C_FLOAT:    return sizeof(float);
    case C_DBL:      return sizeof(double);
    case C_SCPL:     return sizeof(SingleComplex);
    case C_DCPL:     return sizeof(DoubleComplex);
    default:         return 0;
  }
}
219 
220 
/*\ Register process list (deprecated Fortran-callable API)
 *  process list could be used to:
 *   1. permute process ids w.r.t. message-passing ids (set PERMUTE_PIDS), or
 *   2. change logical mapping of array blocks to processes
 *  The routine is retained only for binary compatibility and does nothing.
\*/
void ga_register_proclist_(Integer *list, Integer* np)
{
    /* no longer used */
}
230 
231 
/* C counterpart of ga_register_proclist_; retained for API compatibility only. */
void GA_Register_proclist(int *list, int np)
{
    /* no longer used */
}
236 
237 
/*\ FINAL CLEANUP of shmem when terminating
\*/
void ga_clean_resources()
{
    /* delegate release of shared-memory resources to the ARMCI runtime */
    ARMCI_Cleanup();
}
244 
245 
/*\ CHECK GA HANDLE and if it's wrong TERMINATE
 *  C version
\*/
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
#   pragma weak wnga_check_handle = pnga_check_handle
#endif
/* Validate g_a via the ga_check_handleM macro; `string` names the caller
 * and is included in the error message on failure. Aborts on a bad handle. */
void pnga_check_handle(Integer g_a, char * string)
{
  ga_check_handleM(g_a, string);
}
256 
257 
/*\ Initialize MA-like addressing:
 *  get addresses for the base arrays for double, complex and int types
\*/
static int ma_address_init=0;   /* set once the MA base pointers have been cached */
void gai_ma_address_init()
{
#ifdef CHECK_MA_ALGN
Integer  off_dbl, off_int, off_dcpl, off_flt,off_scpl;
#endif
     ma_address_init=1;
     /* cache the MA base pointer for each supported element type */
     INT_MB = (Integer*)MA_get_mbase(MT_F_INT);
     DBL_MB = (DoublePrecision*)MA_get_mbase(MT_F_DBL);
     DCPL_MB= (DoubleComplex*)MA_get_mbase(MT_F_DCPL);
     SCPL_MB= (SingleComplex*)MA_get_mbase(MT_F_SCPL);
     FLT_MB = (float*)MA_get_mbase(MT_F_REAL);
#   ifdef CHECK_MA_ALGN
        /* Verify each base address is aligned for its element type.
         * NOTE: the complex types are checked against the alignment of their
         * component type (double for DCPL, float for SCPL), not the full
         * struct size. */
        off_dbl = 0 != ((long)DBL_MB)%sizeof(DoublePrecision);
        off_int = 0 != ((long)INT_MB)%sizeof(Integer);
        off_dcpl= 0 != ((long)DCPL_MB)%sizeof(DoublePrecision);
        off_scpl= 0 != ((long)SCPL_MB)%sizeof(float);
        off_flt = 0 != ((long)FLT_MB)%sizeof(float);
        if(off_dbl)
           pnga_error("GA initialize: MA DBL_MB not alligned", (Integer)DBL_MB);

        if(off_int)
           pnga_error("GA initialize: INT_MB not alligned", (Integer)INT_MB);

        if(off_dcpl)
          pnga_error("GA initialize: DCPL_MB not alligned", (Integer)DCPL_MB);

        if(off_scpl)
          pnga_error("GA initialize: SCPL_MB not alligned", (Integer)SCPL_MB);

        if(off_flt)
           pnga_error("GA initialize: FLT_MB not alligned", (Integer)FLT_MB);

#   endif

    if(DEBUG) printf("%d INT_MB=%p DBL_MB=%p DCPL_MB=%p FLT_MB=%p SCPL_MB=%p\n",
                     (int)GAme, (void*)INT_MB, (void*)DBL_MB, (void*)DCPL_MB, (void*)FLT_MB, (void*)SCPL_MB);
}
299 
300 
301 
302 extern int *_ga_argc;
303 extern char ***_ga_argv;
304 extern int _ga_initialize_args;
305 extern int _ga_initialize_c;
306 extern int _ga_initialize_f;
307 
308 /**
309  *  Initialize library structures in Global Arrays.
310  *  either ga_initialize_ltd or ga_initialize must be the first
311  *         GA routine called (except ga_uses_ma)
312  */
313 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
314 #   pragma weak wnga_initialize = pnga_initialize
315 #endif
316 
pnga_initialize()317 void pnga_initialize()
318 {
319     Integer  i, j,nproc, nnode, zero;
320     int bytes;
321     GA_Internal_Threadsafe_Lock();
322 #ifdef MSG_COMMS_MPI
323         MPI_Comm comm;
324 #endif
325 
326     if(GAinitialized)
327     {
328         GA_Internal_Threadsafe_Unlock();
329         return;
330     }
331 
332 #if HAVE_ARMCI_INITIALIZED_FUNCTION
333     if (!ARMCI_Initialized())
334 #else
335     if (!ARMCIinitialized)
336 #endif
337     {
338         /* assure that GA will not alocate more shared memory than specified */
339         if(GA_memory_limited) ARMCI_Set_shm_limit(GA_total_memory);
340         if (_ga_initialize_c) {
341             if (_ga_initialize_args) {
342                 ARMCI_Init_args(_ga_argc, _ga_argv);
343             }
344             else {
345                 ARMCI_Init();
346             }
347         }
348         else if (_ga_initialize_f) {
349             _ga_argc = malloc(sizeof(int));
350             _ga_argv = malloc(sizeof(char**));
351             if (!_ga_argc) pnga_error("malloc argc failed",1);
352             ga_f2c_get_cmd_args(_ga_argc, _ga_argv);
353             ARMCI_Init_args(_ga_argc, _ga_argv);
354         }
355         else {
356             pnga_error("pnga_initialize called outside of C or F APIs",1);
357         }
358         ARMCIinitialized = 1;
359     }
360 
361     GA_Default_Proc_Group = -1;
362     /* zero in pointers in GA array */
363     _ga_main_data_structure
364        = (global_array_t *)malloc(sizeof(global_array_t)*MAX_ARRAYS);
365     _proc_list_main_data_structure
366        = (proc_list_t *)malloc(sizeof(proc_list_t)*MAX_ARRAYS);
367     if(!_ga_main_data_structure)
368        pnga_error("ga_init:malloc ga failed",0);
369     if(!_proc_list_main_data_structure)
370        pnga_error("ga_init:malloc proc_list failed",0);
371     GA = _ga_main_data_structure;
372     PGRP_LIST = _proc_list_main_data_structure;
373     for(i=0;i<MAX_ARRAYS; i++) {
374        GA[i].ptr  = (char**)0;
375        GA[i].mapc = (C_Integer*)0;
376        GA[i].rstrctd_list = (C_Integer*)0;
377        GA[i].rank_rstrctd = (C_Integer*)0;
378        GA[i].property = NO_PROPERTY;
379        GA[i].mem_dev_set = 0;
380 #ifdef ENABLE_CHECKPOINT
381        GA[i].record_id = 0;
382 #endif
383        GA[i].actv = 0;
384        GA[i].p_handle = GA_Init_Proc_Group;
385        GA[i].overlay = 0;
386        PGRP_LIST[i].map_proc_list = (int*)0;
387        PGRP_LIST[i].inv_map_proc_list = (int*)0;
388        PGRP_LIST[i].actv = 0;
389     }
390 
391     bzero(&GAstat,sizeof(GAstat));
392 
393     /* initialize some data structures used in non-blocking communication */
394     gai_nb_init();
395 
396     GAnproc = (Integer)armci_msg_nproc();
397 
398     /* Allocate arrays used by library */
399     mapALL = (Integer*)malloc((GAnproc+MAXDIM-1)*sizeof(Integer*));
400 
401     GAme = (Integer)armci_msg_me();
402     if(GAme<0 || GAme>GAnproc)
403        pnga_error("ga_init:message-passing initialization problem: my ID=",GAme);
404 
405     MPme= (Integer)armci_msg_me();
406 
407     gai_init_onesided();
408 
409     /* set activity status for all arrays to inactive */
410     for(i=0;i<_max_global_array;i++)GA[i].actv=0;
411     for(i=0;i<_max_global_array;i++)GA[i].actv_handle=0;
412 
413     /* Create proc list for mirrored arrays */
414     PGRP_LIST[0].map_proc_list = (int*)malloc(GAnproc*sizeof(int)*2);
415     PGRP_LIST[0].inv_map_proc_list = PGRP_LIST[0].map_proc_list + GAnproc;
416     for (i=0; i<GAnproc; i++) PGRP_LIST[0].map_proc_list[i] = -1;
417     for (i=0; i<GAnproc; i++) PGRP_LIST[0].inv_map_proc_list[i] = -1;
418     nnode = pnga_cluster_nodeid();
419     nproc = pnga_cluster_nprocs(nnode);
420     zero = 0;
421     j = pnga_cluster_procid(nnode, zero);
422     PGRP_LIST[0].parent = -1;
423     PGRP_LIST[0].actv = 1;
424     PGRP_LIST[0].map_nproc = nproc;
425     PGRP_LIST[0].mirrored = 1;
426     for (i=0; i<nproc; i++) {
427        PGRP_LIST[0].map_proc_list[i+j] = i;
428        PGRP_LIST[0].inv_map_proc_list[i] = i+j;
429     }
430 
431     /* Set up group for doing cluster merge. Start by checking if
432      * number of procs per node is the same for all nodes */
433     i = nproc;
434     pnga_pgroup_gop(GA_World_Proc_Group,pnga_type_f2c(MT_F_INT),&i,1,"max");
435     j = nproc;
436     pnga_pgroup_gop(GA_World_Proc_Group,pnga_type_f2c(MT_F_INT),&j,1,"min");
437     if (i == j) {
438       /* construct a group that containing all processors with the same local
439        * proc ID across all nodes in the system */
440       Integer numnodes = pnga_cluster_nnodes();
441       Integer *nodelist = (Integer*)malloc(numnodes*sizeof(Integer));
442       Integer myproc = GAme-pnga_cluster_procid(nnode,zero);
443       for (i=0; i<numnodes; i++) {
444         nodelist[i] = i*nproc+myproc;
445       }
446       _mirror_gop_grp = pnga_pgroup_create(nodelist,numnodes);
447       free(nodelist);
448     } else {
449       _mirror_gop_grp = GA_World_Proc_Group;
450     }
451 
452 
453 
454     /* Allocate memory for update flags and signal*/
455     bytes = 2*MAXDIM*sizeof(int);
456     GA_Update_Flags = (int**)malloc(GAnproc*sizeof(void*));
457     if (!GA_Update_Flags)
458       pnga_error("ga_init: Failed to initialize GA_Update_Flags",(int)GAme);
459     if (ARMCI_Malloc((void**)GA_Update_Flags, (armci_size_t) bytes))
460       pnga_error("ga_init:Failed to initialize memory for update flags",GAme);
461     if(GA_Update_Flags[GAme]==NULL)pnga_error("ga_init:ARMCIMalloc failed",GAme);
462 
463     bytes = sizeof(int);
464     GA_Update_Signal = ARMCI_Malloc_local((armci_size_t) bytes);
465 
466     /* Zero update flags */
467     for (i=0; i<2*MAXDIM; i++) {
468 		GA_Update_Flags[GAme][i] = 0;
469 	}
470 
471     /* set MA error function */
472     MA_set_error_callback(ARMCI_Error);
473 
474     GAinitialized = 1;
475 
476 #ifdef PROFILE_OLD
477     ga_profile_init();
478 #endif
479 #ifdef ENABLE_CHECKPOINT
480     {
481     Integer tmplist[1000];
482     Integer tmpcount;
483     tmpcount = GAnproc-ga_spare_procs;
484     for(i=0;i<tmpcount;i++)
485             tmplist[i]=i;
486     ga_group_is_for_ft=1;
487     GA_Default_Proc_Group = pnga_pgroup_create(tmplist,tmpcount);
488     ga_group_is_for_ft=0;
489     if(GAme>=tmpcount)
490       ga_irecover(0);
491     printf("\n%d:here done with initialize\n",GAme);
492 
493     }
494 #endif
495     /* create duplicate of world communicator */
496 #ifdef MSG_COMMS_MPI
497     comm =  GA_MPI_Comm_pgroup(-1);
498     MPI_Comm_dup(comm, &GA_MPI_World_comm_dup);
499 #endif
500     GA_Internal_Threadsafe_Unlock();
501 }
502 
503 
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
#   pragma weak wnga_initialized = pnga_initialized
#endif
/* Return nonzero once pnga_initialize has completed, zero otherwise. */
int pnga_initialized()
{
    return GAinitialized;
}
511 
#if ENABLE_CHECKPOINT
/* Set the flag marking subsequently created process groups as
 * fault-tolerance groups (checkpoint/restart support). */
void set_ga_group_is_for_ft(int val)
{
    ga_group_is_for_ft = val;
}
#endif
518 
519 /**
520  *  Is MA used for allocation of GA memory?
521  */
522 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
523 #   pragma weak wnga_uses_ma =  pnga_uses_ma
524 #endif
525 
pnga_uses_ma()526 logical pnga_uses_ma()
527 {
528 #ifdef AVOID_MA_STORAGE
529    return FALSE;
530 #else
531    if(!GAinitialized) return FALSE;
532 
533    if(ARMCI_Uses_shm()) return FALSE;
534    else return TRUE;
535 #endif
536 }
537 
538 /**
539  *  Is memory limit set
540  */
541 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
542 #   pragma weak wnga_memory_limited =  pnga_memory_limited
543 #endif
544 
pnga_memory_limited()545 logical pnga_memory_limited()
546 {
547    if(GA_memory_limited) return TRUE;
548    else                  return FALSE;
549 }
550 
551 /**
552  *  Returns the amount of memory on each processor used in active Global Arrays
553  */
554 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
555 #   pragma weak wnga_inquire_memory =  pnga_inquire_memory
556 #endif
557 
pnga_inquire_memory()558 Integer pnga_inquire_memory()
559 {
560 Integer i, sum=0;
561     for(i=0; i<_max_global_array; i++)
562         if(GA[i].actv) sum += (Integer)GA[i].size;
563     return(sum);
564 }
565 
566 /**
567  *  Returns the amount of memory available on the calling processor
568  */
569 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
570 #   pragma weak wnga_memory_avail =  pnga_memory_avail
571 #endif
572 
pnga_memory_avail()573 Integer pnga_memory_avail()
574 {
575    if(!pnga_uses_ma()) return(GA_total_memory);
576    else{
577       Integer ma_limit = MA_inquire_avail(MT_F_BYTE);
578 
579       if ( GA_memory_limited ) return( GA_MIN(GA_total_memory, ma_limit) );
580       else return( ma_limit );
581    }
582 }
583 
584 
585 
586 /**
587  *  (re)set limit on GA memory usage
588  */
589 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
590 #   pragma weak wnga_set_memory_limit =  pnga_set_memory_limit
591 #endif
592 
/**
 * (Re)set the per-process limit on GA memory usage.
 * A negative value means "unlimited".  When a limit is already active,
 * a non-negative argument adjusts the remaining budget by the difference
 * between the new and the previously recorded limit.
 */
void pnga_set_memory_limit(Integer mem_limit)
{
  if (!GA_memory_limited) {
    /* no limit active yet: record it and enable limiting if non-negative */
    GA_memory_limit = mem_limit;
    GA_total_memory = mem_limit;
    if (mem_limit >= 0) GA_memory_limited = 1;
    return;
  }

  if (mem_limit < 0) {
    /* negative values reset the limit to "unlimited" */
    GA_memory_limited = 0;
    GA_total_memory = -1;
  } else {
    /* adjust the current budget by the change in the limit */
    GA_total_memory += mem_limit - GA_memory_limit;
  }
}
614 
615 /**
616  *  Initialize Global Array library structures and set a limit on memory
617  *  usage by GA.
618  *    the byte limit is per processor (even for shared memory)
619  *    either ga_initialize_ltd or ga_initialize must be the first
620  *         GA routine called (except ga_uses_ma)
621  *    ga_initialize is another version of ga_initialize_ltd, except
622  *         without memory control
623  *    mem_limit < 0 means "memory unlimited"
624  */
625 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
626 #   pragma weak wnga_initialize_ltd = pnga_initialize_ltd
627 #endif
628 
/* Initialize GA with a per-process memory limit (bytes); mem_limit < 0
 * means "unlimited".  Must be the first GA call (except pnga_uses_ma).
 * The limit is recorded before delegating to pnga_initialize. */
void pnga_initialize_ltd(Integer mem_limit)
{
  GA_total_memory =GA_memory_limit  = mem_limit;
  if(mem_limit >= 0) GA_memory_limited = 1;
  pnga_initialize();
}
635 
636 /* #define gam_checktype(_type)\ */
637 /*        if(_type != C_DBL  && _type != C_INT &&  \ */
638 /*           _type != C_DCPL && _type != C_SCPL && _type != C_FLOAT && \ */
639 /*           _type != C_LONG &&_type != C_LONGLONG)\ */
640 /*          pnga_error("ttype not yet supported ",  _type) */
641 
642 #define gam_checktype(_type) if(!GAvalidtypeM(_type))pnga_error("type not yet supported", (_type))
643 
644 #define gam_checkdim(ndim, dims)\
645 {\
646 int _d;\
647     if(ndim<1||ndim>MAXDIM) pnga_error("unsupported number of dimensions",ndim);\
648   __CRAYX1_PRAGMA("_CRI novector");                                         \
649     for(_d=0; _d<ndim; _d++)\
650          if(dims[_d]<1)pnga_error("wrong dimension specified",dims[_d]);\
651 }
652 
653 /**
654  * Utility function to tell whether or not an array is mirrored
655  */
656 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
657 #   pragma weak wnga_is_mirrored = pnga_is_mirrored
658 #endif
659 
pnga_is_mirrored(Integer g_a)660 logical pnga_is_mirrored(Integer g_a)
661 {
662   Integer ret = FALSE;
663   Integer handle = GA_OFFSET + g_a;
664   Integer p_handle = (Integer)GA[handle].p_handle;
665   if (p_handle >= 0) {
666      if (PGRP_LIST[p_handle].mirrored) ret = TRUE;
667   }
668   return ret;
669 }
670 
671 /**
672  * map_ij: pointer to map array containing axis partitions
673  * n: number of blocks along axis
674  * scale: factor for coming up with an initial guess
675  * elem: array element index that we are trying to find
676  * block: index of block containing elem
677  */
678 #define findblock(map_ij,n,scale,elem, block)\
679 {\
680 int candidate, found, b; \
681 C_Integer *map= (map_ij);\
682 \
683     candidate = (int)(scale*(elem));\
684     found = 0;\
685     if(map[candidate] <= (elem)){ /* search downward */\
686          b= candidate;\
687          while(b<(n)-1){ \
688             found = (map[b+1]>(elem));\
689             if(found)break;\
690             b++;\
691          } \
692     }else{ /* search upward */\
693          b= candidate-1;\
694          while(b>=0){\
695             found = (map[b]<=(elem));\
696             if(found)break;\
697             b--;\
698          }\
699     }\
700     if(!found)b=(n)-1;\
701     *(block) = b;\
702 }
703 
704 /*\
705  * Find indices of block containing the array element at the location
706  * in subscript.
707 \*/
708 #define gam_find_block_indices_from_subscript(handle,subscript,index)\
709 {                                                                    \
710   int _type = GA[handle].distr_type;                                 \
711   Integer *_mapc = GA[handle].mapc;                                  \
712   Integer _offset;                                                   \
713   int _i;                                                            \
714   int _ndim = GA[handle].ndim;                                       \
715   if (_type == REGULAR) {                                            \
716     for (_i=0, _offset=0; _i<_ndim; _i++) {                          \
717       findblock(_mapc+_offset, GA[handle].nblock[_i],                \
718           GA[handle].scale[_i],subscript[_i],&index[_i]);            \
719       _offset += GA[handle].nblock[_i];                              \
720     }                                                                \
721   } else if (_type == TILED_IRREG) {                                 \
722     for (_i=0, _offset=0; _i<_ndim; _i++) {                          \
723       findblock(_mapc+_offset, GA[handle].num_blocks[_i],            \
724           GA[handle].scale[_i],subscript[_i],&index[_i]);            \
725       _offset += GA[handle].num_blocks[_i];                          \
726     }                                                                \
727   } else {                                                           \
728     for (_i=0; _i<_ndim; _i++) {                                     \
729       index[_i] = (subscript[_i]-1)/GA[handle].block_dims[_i];       \
730     }                                                                \
731   }                                                                  \
732 }
733 
734 
735 /**
736  *  Locate the owner of an element of a Global Array specified by the array
737  *  subscript
738  */
739 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
740 #   pragma weak wnga_locate =  pnga_locate
741 #endif
742 
/* Locate the owner of the array element given by `subscript` (1-based).
 * For REGULAR distributions `owner` receives the owning process id
 * (translated through rstrctd_list for restricted arrays); for other
 * distributions it receives the owning block index.  Returns FALSE when
 * the subscript is out of bounds for a REGULAR array, TRUE otherwise. */
logical pnga_locate(Integer g_a, Integer* subscript, Integer* owner)
{
Integer d, proc, dpos, ndim, ga_handle = GA_OFFSET + g_a, proc_s[MAXDIM];

   ga_check_handleM(g_a, "nga_locate");
   ndim = GA[ga_handle].ndim;

   if (GA[ga_handle].distr_type == REGULAR) {
     /* reject out-of-range subscripts (valid range is 1..dims[d]) */
     for(d=0, *owner=-1; d< ndim; d++)
       if(subscript[d]< 1 || subscript[d]>GA[ga_handle].dims[d]) return FALSE;

     /* find the block index along each dimension via the mapc partitions */
     for(d = 0, dpos = 0; d< ndim; d++){
       findblock(GA[ga_handle].mapc + dpos, GA[ga_handle].nblock[d],
           GA[ga_handle].scale[d], subscript[d], &proc_s[d]);
       dpos += GA[ga_handle].nblock[d];
     }

     /* fold the per-dimension block indices into a linear process id */
     ga_ComputeIndexM(&proc, ndim, proc_s, GA[ga_handle].nblock);

     *owner = proc;
     if (GA[ga_handle].num_rstrctd > 0) {
       /* restricted arrays: map logical owner to the hosting process */
       *owner = GA[ga_handle].rstrctd_list[*owner];
     }
   } else {
     /* tiled / block-cyclic distributions: report the owning block index */
     Integer i;
     Integer index[MAXDIM];
     gam_find_block_indices_from_subscript(ga_handle,subscript,index);
     gam_find_block_from_indices(ga_handle,i,index);
     *owner = i;
   }

   return TRUE;
}
776 
777 
778 
779 /*\ UTILITY FUNCTION TO LOCATE THE BOUNDING INDICES OF A CONTIGUOUS CHUNK OF
780  *  SHARED MEMORY FOR A MIRRORED ARRAY
781 \*/
ngai_get_first_last_indices(Integer g_a)782 void ngai_get_first_last_indices( Integer g_a)  /* array handle (input) */
783 {
784 
785   Integer  lo[MAXDIM], hi[MAXDIM];
786   Integer  nelems, nnodes, inode, nproc;
787   Integer  ifirst, ilast, nfirst, nlast, icnt, np;
788   Integer  i, j, itmp, ndim, map_offset[MAXDIM];
789   /* Integer  icheck; */
790   Integer  index[MAXDIM], subscript[MAXDIM];
791   Integer  handle = GA_OFFSET + g_a;
792   Integer  type, size=0, id;
793   /* Integer  grp_id; */
794   int Save_default_group;
795   char     *fptr, *lptr;
796 
797   /* find total number of elements */
798   ndim = GA[handle].ndim;
799   nelems = 1;
800   for (i=0; i<ndim; i++) nelems *= GA[handle].dims[i];
801 
802   /* If array is mirrored, evaluate first and last indices */
803   if (pnga_is_mirrored(g_a)) {
804     /* If default group is not world group, change default group to world group
805        temporarily */
806     Save_default_group = GA_Default_Proc_Group;
807     GA_Default_Proc_Group = -1;
808     nnodes = pnga_cluster_nnodes();
809     inode = pnga_cluster_nodeid();
810     nproc = pnga_cluster_nprocs(inode);
811     /* grp_id = GA[handle].p_handle; */
812     ifirst = (Integer)((double)(inode*nelems)/((double)nnodes));
813     if (inode != nnodes-1) {
814       ilast = (Integer)((double)((inode+1)*nelems)/((double)nnodes))-1;
815     } else {
816       ilast = nelems-1;
817     }
818     /* ifirst and ilast correspond to offsets in shared memory. Find the
819        actual indices of the data elements corresponding to these offsets.
820        The map_offset array provides a convenient mechanism for extracting
821        the first indices on each processor along each coordinate dimension
822        from the mapc array. */
823     for (i = 0; i<ndim; i++) {
824       map_offset[i] = 0;
825       for (j = 0; j<i; j++) {
826         map_offset[i] += GA[handle].nblock[j];
827       }
828     }
829     icnt = 0;
830     nfirst = -1;
831     nlast = -1;
832     for (i = 0; i<nproc; i++) {
833       /* find block indices corresponding to proc i */
834       pnga_proc_topology(g_a, i, index);
835       nelems = 1;
836       for (j = 0; j<ndim; j++) {
837         if (index[j] < GA[handle].nblock[j]-1) {
838 
839           itmp = ((Integer)GA[handle].mapc[map_offset[j]+index[j]+1]
840                - (Integer)GA[handle].mapc[map_offset[j]+index[j]]);
841           nelems *= itmp;
842         } else {
843           itmp = ((Integer)GA[handle].dims[j]
844                - (Integer)GA[handle].mapc[map_offset[j]+index[j]] + 1);
845           nelems *= itmp;
846         }
847       }
848       icnt += nelems;
849       if (icnt-1 >= ifirst && nfirst < 0) {
850         nfirst = i;
851       }
852       if (ilast <= icnt-1 && nfirst >= 0 && nlast < 0) {
853         nlast = i;
854       }
855     }
856     /* Adjust indices corresponding to start and end of block of
857        shared memory so that it can be decomposed into large
858        rectangular blocks of the global array. Start by
859        adusting the lower index */
860     icnt = 0;
861     for (i = 0; i<nfirst; i++) {
862       pnga_distribution(g_a, i, lo, hi);
863       nelems = 1;
864       for (j = 0; j<ndim; j++) {
865         if (hi[j] >= lo[j]) {
866           nelems *= (hi[j] - lo[j] + 1);
867         } else {
868           nelems = 0;
869         }
870       }
871       icnt += nelems;
872     }
873     /* calculate offset in local block of memory */
874     ifirst = ifirst - icnt;
875     /* find dimensions of data on block nfirst */
876     np = nfirst;
877     pnga_distribution(g_a, np, lo, hi);
878     nelems = 1;
879     for (i=0; i<ndim-1; i++) {
880       nelems *= (hi[i] - lo[i] + 1);
881     }
882     if (ifirst%nelems == 0) {
883       ifirst = ifirst/nelems;
884     } else {
885       ifirst = (ifirst-ifirst%nelems)/nelems;
886       ifirst++;
887     }
888     if (ifirst > GA[handle].dims[ndim-1]-1) ifirst=GA[handle].dims[ndim-1]-1;
889     /* adjust value of ifirst */
890     pnga_proc_topology(g_a, nfirst, index);
891     subscript[ndim-1] = ifirst;
892     for (i=0; i<ndim-1; i++) {
893       subscript[i] = 0;
894     }
895     /* Finally, evaluate absolute indices of first data point */
896     for (i=0; i<ndim; i++) {
897       GA[handle].first[i] = GA[handle].mapc[map_offset[i]+index[i]]
898                           + (C_Integer)subscript[i];
899     }
900     /* adjust upper bound. If nlast = nfirst, just use old value of icnt */
901     if (nlast > nfirst) {
902       icnt = 0;
903       for (i = 0; i<nlast; i++) {
904         pnga_distribution(g_a, i, lo, hi);
905         nelems = 1;
906         for (j = 0; j<ndim; j++) {
907           if (hi[j] >= lo[j]) {
908             nelems *= (hi[j] - lo[j] + 1);
909           } else {
910             nelems = 0;
911           }
912         }
913         icnt += nelems;
914       }
915     }
916     ilast = ilast - icnt;
917     /* find dimensions of data on block nlast */
918     np = nlast;
919     pnga_distribution(g_a, np, lo, hi);
920     nelems = 1;
921     for (i=0; i<ndim-1; i++) {
922       nelems *= (hi[i] - lo[i] + 1);
923     }
924     ilast = (ilast-ilast%nelems)/nelems;
925     /* adjust value of ilast */
926     subscript[ndim-1] = ilast;
927     for (i=0; i<ndim-1; i++) {
928       subscript[i] = (hi[i] - lo[i]);
929     }
930     pnga_proc_topology(g_a, nlast, index);
931     /*
932     icheck = 1;
933     for (i=1; i<ndim; i++) {
934       if (index[i] < GA[handle].nblock[i]-1) {
935         itmp = (Integer)GA[handle].mapc[map_offset[i]+index[i]+1]
936              - (Integer)GA[handle].mapc[map_offset[i]+index[i]];
937       } else {
938         itmp = (Integer)GA[handle].dims[i]
939              - (Integer)GA[handle].mapc[map_offset[i]+index[i]] + 1;
940       }
941       if (subscript[i] < itmp-1) icheck = 0;
942       subscript[i] = itmp-1;
943     }
944     if (!icheck) {
945       subscript[0]--;
946     } */
947     /* Finally, evaluate absolute indices of last data point */
948     for (i=0; i<ndim; i++) {
949       GA[handle].last[i] = GA[handle].mapc[map_offset[i]+index[i]]
950                           + (C_Integer)subscript[i];
951       if (GA[handle].last[i] > GA[handle].dims[i]) {
952         GA[handle].last[i] = GA[handle].dims[i];
953       }
954     }
955     /* find length of shared memory segment owned by this node. Adjust
956      * length, if necessary, to account for gaps in memory between
957      * processors */
958     type = GA[handle].type;
959     switch(type) {
960       case C_FLOAT: size = sizeof(float); break;
961       case C_DBL: size = sizeof(double); break;
962       case C_LONG: size = sizeof(long); break;
963       case C_LONGLONG: size = sizeof(long long); break;
964       case C_INT: size = sizeof(int); break;
965       case C_SCPL: size = 2*sizeof(float); break;
966       case C_DCPL: size = 2*sizeof(double); break;
967       default: pnga_error("type not supported",type);
968     }
969     for (i=0; i<ndim; i++) index[i] = (Integer)GA[handle].first[i];
970     i = (int)pnga_locate(g_a, index, &id);
971     gam_Loc_ptr(id, handle, (Integer)GA[handle].first, &fptr);
972 
973     for (i=0; i<ndim; i++) index[i] = (Integer)GA[handle].last[i];
974     i = (int)pnga_locate(g_a, index, &id);
975     gam_Loc_ptr(id, handle, (Integer)GA[handle].last, &lptr);
976 
977     GA[handle].shm_length = (C_Long)(lptr - fptr + size);
978     GA_Default_Proc_Group = Save_default_group;
979   } else {
980     for (i=0; i<ndim; i++) {
981       GA[handle].first[i] = 0;
982       GA[handle].last[i] = -1;
983       GA[handle].shm_length = -1;
984     }
985   }
986 }
987 
988 /*\ print subscript of ndim dimensional array with two strings before and after
989 \*/
/*\ Print an ndim-dimensional subscript bracketed as "pre [s0,s1,...] post".
\*/
void gai_print_subscript(char *pre, int ndim, Integer subscript[], char *post)
{
        int d;

        printf("%s [", pre);
        for (d = 0; d < ndim; d++) {
                printf("%ld", (long)subscript[d]);
                if (d < ndim-1) printf(",");
                else printf("] %s", post);
        }
}
1001 
gai_init_struct(int handle)1002 void gai_init_struct(int handle)
1003 {
1004      if(!GA[handle].ptr){
1005         int len = (int)GAnproc;
1006         GA[handle].ptr = (char**)malloc(len*sizeof(char**));
1007      }
1008      if(!GA[handle].ptr)pnga_error("malloc failed: ptr:",0);
1009      GA[handle].ndim = -1;
1010 }
1011 
1012 /**
1013  *  Function to set default processor group
1014  */
1015 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1016 #   pragma weak wnga_pgroup_set_default = pnga_pgroup_set_default
1017 #endif
1018 
/* Set the default processor group to grp (-1 denotes the world group).
 * Subsequent GA calls that do not name a group explicitly use this group.
 * Also clears any sync masking and, in MPI builds, propagates the choice
 * to ARMCI. */
void pnga_pgroup_set_default(Integer grp)
{
#if 0
    int local_sync_begin,local_sync_end;

    local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
#endif
    _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous sync masking*/

    /* force a hang if default group is not being set correctly */
#if 0
    if (local_sync_begin || local_sync_end) pnga_pgroup_sync(grp);
#endif
    GA_Default_Proc_Group = (int)(grp);

#ifdef MSG_COMMS_MPI
    {
       /* mirror the new default into ARMCI; non-positive handles
        * fall back to the ARMCI world group */
       ARMCI_Group parent_grp;
       if(GA_Default_Proc_Group > 0)
          parent_grp = PGRP_LIST[GA_Default_Proc_Group].group;
       else
          ARMCI_Group_get_world(&parent_grp);
       ARMCI_Group_set_default(&parent_grp);
    }
#endif
}
1045 
1046 /**
1047  *  Create a new processor group containing count processors with
1048  *  process IDs (in the default group) in list. Return process group
1049  *  handle.
1050  */
1051 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1052 #   pragma weak wnga_pgroup_create = pnga_pgroup_create
1053 #endif
1054 
/* Create a new process group from the `count` ranks in `list` (ranks are
 * relative to the current default group) and return its handle.
 * Aborts via pnga_error on duplicate/out-of-range entries or when no
 * group slot is free. */
Integer pnga_pgroup_create(Integer *list, Integer count)
{
    Integer pgrp_handle, i, j, nprocs, itmp;
    Integer parent;
    int tmp_count;
    Integer *tmp_list;   /* list remapped to absolute processor IDs */
    int *tmp2_list;      /* sorted copy of the caller's list */
#ifdef MSG_COMMS_MPI
    ARMCI_Group *tmpgrp;
#endif


    /* Allocate temporary arrays */
    tmp_list = (Integer*)malloc(GAnproc*sizeof(Integer));
    tmp2_list = (int*)malloc(GAnproc*sizeof(int));

    /*** Get next free process group handle ***/
    pgrp_handle =-1; i=0;
    do{
       if(!PGRP_LIST[i].actv) pgrp_handle=i;
       i++;
    }while(i<_max_global_array && pgrp_handle==-1);
    if( pgrp_handle == -1)
       pnga_error(" Too many process groups ", (Integer)_max_global_array);

    /* Check list for validity (no duplicates and no out of range entries) */
    nprocs = GAnproc;
    for (i=0; i<count; i++) {
       if (list[i] <0 || list[i] >= nprocs)
	  pnga_error(" invalid element in list ", list[i]);
       for (j=i+1; j<count; j++) {
	  if (list[i] == list[j])
	     pnga_error(" Duplicate elements in list ", list[i]);
       }
    }

    /* Allocate memory for arrays containg processor maps and initialize
       values (the two maps share one allocation; -1 marks "not in group") */
    PGRP_LIST[pgrp_handle].map_proc_list
       = (int*)malloc(GAnproc*sizeof(int)*2);
    PGRP_LIST[pgrp_handle].inv_map_proc_list
       = PGRP_LIST[pgrp_handle].map_proc_list + GAnproc;
    for (i=0; i<GAnproc; i++)
       PGRP_LIST[pgrp_handle].map_proc_list[i] = -1;
    for (i=0; i<GAnproc; i++)
       PGRP_LIST[pgrp_handle].inv_map_proc_list[i] = -1;

    for (i=0; i<count; i++) {
       tmp2_list[i] = (int)list[i];
    }

    /* use a simple sort routine to reorder list into assending order
       (insertion sort; group members are kept in rank order) */
    for (j=1; j<count; j++) {
       itmp = tmp2_list[j];
       i = j-1;
       while(i>=0  && tmp2_list[i] > itmp) {
          tmp2_list[i+1] = tmp2_list[i];
          i--;
       }
       tmp2_list[i+1] = itmp;
    }

    /* Remap elements in list to absolute processor indices (if necessary)*/
    if (GA_Default_Proc_Group != -1) {
       parent = GA_Default_Proc_Group;
       for (i=0; i<count; i++) {
          tmp_list[i] = (int)PGRP_LIST[parent].inv_map_proc_list[tmp2_list[i]];
       }
    } else {
       /* default group is the world group: ranks are already absolute */
       for (i=0; i<count; i++) {
          tmp_list[i] = (int)tmp2_list[i];
       }
    }

    tmp_count = (int)(count);
    /* Create proc list maps: map_proc_list[abs_id] -> group rank,
       inv_map_proc_list[group rank] -> abs_id */
    for (i=0; i<count; i++) {
       j = tmp_list[i];
       PGRP_LIST[pgrp_handle].map_proc_list[j] = i;
       PGRP_LIST[pgrp_handle].inv_map_proc_list[i] = j;
    }
    PGRP_LIST[pgrp_handle].actv = 1;
    PGRP_LIST[pgrp_handle].parent = GA_Default_Proc_Group;
    PGRP_LIST[pgrp_handle].mirrored = 0;
    PGRP_LIST[pgrp_handle].map_nproc = tmp_count;
#ifdef MSG_COMMS_MPI
    tmpgrp = &PGRP_LIST[pgrp_handle].group;
#if ENABLE_CHECKPOINT
    if(ga_group_is_for_ft)
       tmpgrp = ARMCI_Get_ft_group();
    else
#endif
       ARMCI_Group_create(tmp_count, tmp2_list, &PGRP_LIST[pgrp_handle].group);
#endif

    /* Clean up temporary arrays */
    free(tmp_list);
    free(tmp2_list);

#ifdef MSG_COMMS_MPI
    return pgrp_handle;
#else
    /* without MPI there is only one group; hand back the default */
    return pnga_pgroup_get_default();
#endif
}
1160 
1161 /**
1162  *  Duplicate and existing processor group
1163  */
1164 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1165 #   pragma weak wnga_pgroup_duplicate = pnga_pgroup_duplicate
1166 #endif
1167 
/**
 *  Duplicate an existing processor group grp and return the handle of
 *  the copy. grp == -1 (the world group) is duplicated as an identity
 *  mapping over all processes.
 *
 *  Fix: the original indexed PGRP_LIST[grp] unconditionally in several
 *  places (map_nproc, inv_map_proc_list, parent), which reads
 *  PGRP_LIST[-1] out of bounds when grp is the world group.
 */
Integer pnga_pgroup_duplicate(Integer grp)
{
    Integer pgrp_handle, i;
    int tmp_count;
    int *tmp_list, *tmp2_list;
#ifdef MSG_COMMS_MPI
    ARMCI_Group *tmpgrp;
#endif
    Integer save_grp;
    if (grp != -1 && !PGRP_LIST[grp].actv) {
       pnga_error(" Group is not active ", grp);
    }

    /*** Get next free process group handle ***/
    pgrp_handle =-1; i=0;
    do{
       if(!PGRP_LIST[i].actv) pgrp_handle=i;
       i++;
    }while(i<_max_global_array && pgrp_handle==-1);
    if( pgrp_handle == -1)
       pnga_error(" Too many process groups ", (Integer)_max_global_array);

    /* Allocate memory for the processor maps and copy the source group's
       maps (identity maps when duplicating the world group). PGRP_LIST
       must never be indexed with grp == -1. */
    PGRP_LIST[pgrp_handle].map_proc_list
       = (int*)malloc(GAnproc*sizeof(int)*2);
    PGRP_LIST[pgrp_handle].inv_map_proc_list
       = PGRP_LIST[pgrp_handle].map_proc_list + GAnproc;
    if (grp != -1) {
      for (i=0; i<GAnproc; i++) {
        PGRP_LIST[pgrp_handle].map_proc_list[i]
          = PGRP_LIST[grp].map_proc_list[i];
        PGRP_LIST[pgrp_handle].inv_map_proc_list[i]
          = PGRP_LIST[grp].inv_map_proc_list[i];
      }
      tmp_count = (int)PGRP_LIST[grp].map_nproc;
    } else {
      for (i=0; i<GAnproc; i++) {
        PGRP_LIST[pgrp_handle].map_proc_list[i] = i;
        PGRP_LIST[pgrp_handle].inv_map_proc_list[i] = i;
      }
      tmp_count = (int)GAnproc;
    }

    /* Build the processor ID list handed to ARMCI */
    tmp_list = (int*)malloc(GAnproc*sizeof(int));
    save_grp = GA_Default_Proc_Group;
    GA_Default_Proc_Group = (grp != -1) ? PGRP_LIST[grp].parent : -1;
    if (grp != -1 && GA_Default_Proc_Group != -1) {
       int parent = GA_Default_Proc_Group;
       tmp2_list = PGRP_LIST[grp].inv_map_proc_list;
       for (i=0; i<tmp_count; i++) {
          tmp_list[i] = (int)PGRP_LIST[parent].map_proc_list[tmp2_list[i]];
       }
    } else if (grp != -1) {
       /* parent is the world group */
       tmp2_list = PGRP_LIST[grp].inv_map_proc_list;
       for (i=0; i<tmp_count; i++) {
          tmp_list[i] = (int)PGRP_LIST[grp].map_proc_list[tmp2_list[i]];
       }
    } else {
       for (i=0; i<GAnproc; i++) {
          tmp_list[i] = i;
       }
    }

    PGRP_LIST[pgrp_handle].actv = 1;
    PGRP_LIST[pgrp_handle].parent = (grp != -1) ? PGRP_LIST[grp].parent : -1;
    PGRP_LIST[pgrp_handle].mirrored = 0;
    PGRP_LIST[pgrp_handle].map_nproc = tmp_count;
#ifdef MSG_COMMS_MPI
    tmpgrp = &PGRP_LIST[pgrp_handle].group;
#if ENABLE_CHECKPOINT
    if(ga_group_is_for_ft)
       tmpgrp = ARMCI_Get_ft_group();
    else
#endif
       ARMCI_Group_create(tmp_count, tmp_list, &PGRP_LIST[pgrp_handle].group);
#endif

    GA_Default_Proc_Group = save_grp;
    /* Clean up temporary arrays */
    free(tmp_list);

#ifdef MSG_COMMS_MPI
    return pgrp_handle;
#else
    return pnga_pgroup_get_default();
#endif
}
1261 
1262 /**
1263  * Create a duplicate of the group with only the calling processor in it
1264  */
1265 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1266 #   pragma weak wnga_pgroup_self = pnga_pgroup_self
1267 #endif
1268 
pnga_pgroup_self()1269 Integer pnga_pgroup_self()
1270 {
1271   Integer one = 1;
1272   Integer me = pnga_nodeid();
1273   return pnga_pgroup_create(&me,one);
1274 }
1275 
1276 /**
1277  *  Free up processor group handle for reuse
1278  */
1279 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1280 #   pragma weak wnga_pgroup_destroy = pnga_pgroup_destroy
1281 #endif
1282 
pnga_pgroup_destroy(Integer grp_id)1283 logical pnga_pgroup_destroy(Integer grp_id)
1284 {
1285   logical ret = TRUE;
1286   int i, ok;
1287 
1288   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous sync masking*/
1289 
1290 #ifdef MSG_COMMS_MPI
1291        ARMCI_Group_free(&PGRP_LIST[grp_id].group);
1292 #endif
1293   /* check to make sure there are no GAs that depend on this process group */
1294   i=0;
1295   ok = 1;
1296   do{
1297       if (GA[i].actv) {
1298         if(GA[i].p_handle == (int)grp_id && GA[i].actv) ok = 0;
1299       }
1300       i++;
1301   }while(i<_max_global_array && ok);
1302   if (!ok) pnga_error("Attempt to destroy process group with attached GAs",grp_id);
1303 
1304   if (PGRP_LIST[grp_id].actv == 0) {
1305     ret = FALSE;
1306   }
1307   PGRP_LIST[grp_id].actv = 0;
1308 
1309   /* Deallocate memory for lists */
1310   free(PGRP_LIST[grp_id].map_proc_list);
1311   return ret;
1312 }
1313 
1314 /**
1315  *  Simple function to recover handle of current default processor group
1316  */
1317 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1318 #   pragma weak wnga_pgroup_get_default = pnga_pgroup_get_default
1319 #endif
1320 
pnga_pgroup_get_default()1321 Integer pnga_pgroup_get_default()
1322 {
1323   return GA_Default_Proc_Group;
1324 }
1325 
1326 /**
1327  *  Simple function to recover handle of mirror group
1328  */
1329 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1330 #   pragma weak wnga_pgroup_get_mirror = pnga_pgroup_get_mirror
1331 #endif
1332 
pnga_pgroup_get_mirror()1333 Integer pnga_pgroup_get_mirror()
1334 {
1335   return 0;
1336 }
1337 
1338 /**
1339  *  Simple function to recover handle of world group
1340  */
1341 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1342 #   pragma weak wnga_pgroup_get_world = pnga_pgroup_get_world
1343 #endif
1344 
pnga_pgroup_get_world()1345 Integer pnga_pgroup_get_world()
1346 {
1347   return -1;
1348 }
1349 
1350 /**
1351  *  Create new process groups by splitting the group grp into grp_num new
1352  *  groups. If mod(size(grp),grp_num) != 0, then one group consists of a smaller
1353  *  number of processes than the others. The new group handle is returned by
1354  *  the call.
1355  */
1356 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1357 #   pragma weak wnga_pgroup_split = pnga_pgroup_split
1358 #endif
1359 
/**
 *  Create new process groups by splitting grp into grp_num groups of
 *  consecutive ranks. If mod(size(grp),grp_num) != 0 the last group is
 *  smaller than the others. Returns the handle of the group containing
 *  the calling process; grp_num == 0 returns grp unchanged.
 *
 *  Fix: the temporary `nodes` array was allocated before the
 *  grp_num == 0 early return and therefore leaked; validation now
 *  precedes the allocation. Dead #if 0 block removed.
 */
Integer pnga_pgroup_split(Integer grp, Integer grp_num)
{
  Integer nprocs, me, default_grp;
  Integer ratio, start, end, grp_size;
  Integer i, icnt;
  Integer *nodes;
  Integer grp_id, ret=-1;

  /* Validate arguments before allocating anything */
  if(grp_num<0) pnga_error("Invalid argument (number of groups < 0)",grp_num);
  if(grp_num==0) return grp;

  /* Allocate temporary array */
  nodes = (Integer*)malloc(GAnproc*sizeof(Integer));

  default_grp = pnga_pgroup_get_default();
  pnga_pgroup_set_default(grp);

  nprocs = pnga_nnodes();
  me = pnga_nodeid();
  /* Figure out how big groups are (ceiling of nprocs/grp_num) */
  grp_size = nprocs/grp_num;
  if (nprocs > grp_size*grp_num) grp_size++;
  /* Figure out what procs are in my group */
  ratio = me/grp_size;
  start = ratio*grp_size;
  end = (ratio+1)*grp_size-1;
  end = GA_MIN(end,nprocs-1);
  if (end<start)
    pnga_error("Invalid proc range encountered",0);
  /* Create every group (the call is collective); remember the handle of
   * the one that contains this process */
  icnt = 0;
  for (i= 0; i<nprocs; i++) {
    if (icnt%grp_size == 0 && i>0) {
      grp_id = pnga_pgroup_create(nodes, grp_size);
      if (i == end + 1) {
        ret = grp_id;
      }
      icnt = 0;
    }
    nodes[icnt] = i;
    icnt++;
  }
  /* last (possibly short) group */
  grp_id = pnga_pgroup_create(nodes, icnt);
  if (end == nprocs-1) {
    ret = grp_id;
  }
  pnga_pgroup_set_default(default_grp);
  if(ret==-1) pnga_error("ga_pgroup_split failed",ret);
  /* Free temporary array */
  free(nodes);
  return ret;
}
1415 
1416 /**
1417  *  Split grp into multiple groups based on the color in mycolor. All processes
1418  *  in grp with the same color are assigned to the same group.
1419  */
1420 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1421 #   pragma weak wnga_pgroup_split_irreg = pnga_pgroup_split_irreg
1422 #endif
1423 
/* Split grp by color: every process in grp that passes the same
 * mycolor value ends up in the same new group. Returns the handle of
 * the new group containing the calling process. */
Integer pnga_pgroup_split_irreg(Integer grp, Integer mycolor)
{
  Integer nprocs, me, saved_default, new_grp;
  Integer p, nsel;
  Integer *members, *colors;


  /* Allocate temporary arrays */
  members = (Integer*)malloc(GAnproc*sizeof(Integer));
  colors = (Integer*)malloc(GAnproc*sizeof(Integer));

  if(mycolor<0) pnga_error("Invalid argument (color < 0)",mycolor);

  saved_default = pnga_pgroup_get_default();
  pnga_pgroup_set_default(grp);
  nprocs = pnga_nnodes();
  me = pnga_nodeid();

  /* Gather every process's color via a global sum: each rank
   * contributes its own color in its own slot */
  for (p=0; p<nprocs; p++) colors[p] = 0;
  colors[me] = mycolor;
  pnga_gop(pnga_type_f2c(MT_F_INT), colors, nprocs, "+");

  /* Collect the ranks that share my color */
  nsel = 0;
  for (p=0; p<nprocs; p++) {
     if (colors[p] == mycolor) {
        members[nsel] = p;
        nsel++;
     }
  }

  new_grp = pnga_pgroup_create(members, nsel);

  pnga_pgroup_set_default(saved_default);

  /* Free temporary arrays */
  free(members);
  free(colors);


  return new_grp;
}
1465 
1466 #ifdef MSG_COMMS_MPI
/* Return a pointer to the ARMCI group object backing GA process group
 * handle grp_id (MPI builds only; grp_id is not validated here). */
ARMCI_Group* ga_get_armci_group_(int grp_id)
{
  return &PGRP_LIST[grp_id].group;
}
1471 #endif
1472 
1473 /**
1474  * Return a new global array handle
1475  */
1476 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1477 #   pragma weak wnga_create_handle = pnga_create_handle
1478 #endif
1479 
pnga_create_handle()1480 Integer pnga_create_handle()
1481 {
1482   Integer ga_handle, i, g_a;
1483   /*** Get next free global array handle ***/
1484   ga_handle =-1; i=0;
1485   do{
1486       if(!GA[i].actv_handle) ga_handle=i;
1487       i++;
1488   }while(i<_max_global_array && ga_handle==-1);
1489   if( ga_handle == -1)
1490       pnga_error(" too many arrays ", (Integer)_max_global_array);
1491   g_a = (Integer)ga_handle - GA_OFFSET;
1492 
1493   /*** fill in Global Info Record for g_a ***/
1494   gai_init_struct(ga_handle);
1495   GA[ga_handle].p_handle = GA_Init_Proc_Group;
1496   GA[ga_handle].ndim = -1;
1497   GA[ga_handle].name[0] = '\0';
1498   GA[ga_handle].mapc = NULL;
1499   GA[ga_handle].irreg = 0;
1500   GA[ga_handle].ghosts = 0;
1501   GA[ga_handle].corner_flag = -1;
1502   GA[ga_handle].cache = NULL;
1503   GA[ga_handle].distr_type = REGULAR;
1504   GA[ga_handle].block_total = -1;
1505   GA[ga_handle].rstrctd_list = NULL;
1506   GA[ga_handle].rank_rstrctd = NULL;
1507   GA[ga_handle].num_rstrctd = 0;   /* This is also used as a flag for   */
1508                                    /* restricted arrays. If it is zero, */
1509                                    /* then array is not restricted.     */
1510   GA[ga_handle].actv_handle = 1;
1511   GA[ga_handle].has_data = 1;
1512   GA[ga_handle].property = NO_PROPERTY;
1513   return g_a;
1514 }
1515 
1516 /**
1517  *  Set the dimensions and data type on a new global array handle
1518  */
1519 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1520 #   pragma weak wnga_set_data = pnga_set_data
1521 #endif
1522 
/**
 *  Set the dimensions and data type on a new (unallocated) global array
 *  handle; also zeroes the per-dimension chunk and ghost-width arrays.
 *
 *  Improvement: pnga_type_f2c was evaluated twice (once for the check,
 *  once for the assignment); the conversion is now hoisted and reused.
 */
void pnga_set_data(Integer g_a, Integer ndim, Integer *dims, Integer type)
{
  Integer i;
  Integer ctype;
  Integer ga_handle = g_a + GA_OFFSET;
  if (GA[ga_handle].actv == 1)
    pnga_error("Cannot set data on array that has been allocated",0);
  gam_checkdim(ndim, dims);
  /* convert the Fortran type constant once and reuse it */
  ctype = pnga_type_f2c((int)(type));
  gam_checktype(ctype);

  GA[ga_handle].type = ctype;
  GA[ga_handle].elemsize = GAsizeofM(GA[ga_handle].type);

  for (i=0; i<ndim; i++) {
    GA[ga_handle].dims[i] = (C_Integer)dims[i];
    GA[ga_handle].chunk[i] = 0;
    GA[ga_handle].width[i] = 0;
  }
  GA[ga_handle].ndim = (int)(ndim);
}
1542 
1543 /**
1544  *  Set the chunk array on a new global array
1545  */
1546 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1547 #   pragma weak wnga_set_chunk = pnga_set_chunk
1548 #endif
1549 
/* Copy the caller's chunk hints into a new (unallocated) global array.
 * A NULL chunk pointer leaves the defaults in place. */
void pnga_set_chunk(Integer g_a, Integer *chunk)
{
  Integer d;
  Integer handle = g_a + GA_OFFSET;
  if (GA[handle].actv == 1)
    pnga_error("Cannot set chunk on array that has been allocated",0);
  if (GA[handle].ndim < 1)
    pnga_error("Dimensions must be set before chunk array is specified",0);
  if (chunk == NULL) return;
  for (d = 0; d < GA[handle].ndim; d++)
    GA[handle].chunk[d] = (C_Integer)chunk[d];
}
1564 
1565 /**
1566  * Set the array name on a new global array
1567  */
1568 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1569 #   pragma weak wnga_set_array_name = pnga_set_array_name
1570 #endif
1571 
/* Store array_name as the name of a new (unallocated) global array.
 * The name must fit in FNAM characters. */
void pnga_set_array_name(Integer g_a, char *array_name)
{
  Integer handle = g_a + GA_OFFSET;
  if (GA[handle].actv == 1)
    pnga_error("Cannot set array name on array that has been allocated",0);
  if (strlen(array_name) > FNAM)
    pnga_error("Array name exceeds maximum array name length",FNAM);
  strcpy(GA[handle].name, array_name);
}
1581 
1582 /**
1583  * Get the array name of a global array
1584  */
1585 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1586 #   pragma weak wnga_get_array_name = pnga_get_array_name
1587 #endif
1588 
/**
 *  Copy the name of global array g_a into the caller-supplied buffer
 *  array_name (which must hold at least FNAM+1 characters).
 *
 *  Fixes: the activity check was inverted (it rejected arrays that HAD
 *  been allocated, contradicting its own error message), and the length
 *  check examined the uninitialized output buffer instead of the stored
 *  name.
 */
void pnga_get_array_name(Integer g_a, char *array_name)
{
  Integer ga_handle = g_a + GA_OFFSET;
  if (GA[ga_handle].actv == 0)
    pnga_error("Cannot get array name on array that has not been allocated",0);
  if (strlen(GA[ga_handle].name) > FNAM)
    pnga_error("Array name exceeds maximum array name length",FNAM);
  strcpy(array_name, GA[ga_handle].name);
}
1598 
1599 /**
1600  *  Set the processor group on a new global array
1601  */
1602 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1603 #   pragma weak wnga_set_pgroup = pnga_set_pgroup
1604 #endif
1605 
/**
 *  Attach a processor group to a new (unallocated) global array.
 *
 *  Fix: PGRP_LIST was indexed before verifying that p_handle is in
 *  range, so a stray negative or oversized handle caused an
 *  out-of-bounds read; the bounds are now checked first.
 */
void pnga_set_pgroup(Integer g_a, Integer p_handle)
{
  Integer ga_handle = g_a + GA_OFFSET;
  if (GA[ga_handle].actv == 1)
    pnga_error("Cannot set processor configuration on array that has been allocated",0);
  if (p_handle == GA_World_Proc_Group ||
      (p_handle >= 0 && p_handle < _max_global_array &&
       PGRP_LIST[p_handle].actv == 1)) {
    GA[ga_handle].p_handle = (int) (p_handle);
  } else {
    pnga_error("Processor group does not exist",0);
  }
}
1617 
1618 /**
1619  *  Get the processor group handle associated with g_a
1620  */
1621 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1622 #   pragma weak wnga_get_pgroup = pnga_get_pgroup
1623 #endif
1624 
/* Return the processor group handle attached to global array g_a. */
Integer pnga_get_pgroup(Integer g_a)
{
    Integer handle = g_a + GA_OFFSET;
    Integer grp = (Integer)GA[handle].p_handle;
    return grp;
}
1630 
1631 /**
1632  *  Return the number of processors associated with a processor group
1633  */
1634 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1635 #   pragma weak wnga_get_pgroup_size = pnga_get_pgroup_size
1636 #endif
1637 
/* Return the number of processors in group grp_id; non-positive
 * handles report the total processor count. */
Integer pnga_get_pgroup_size(Integer grp_id)
{
    int grp = (int)(grp_id);
    if (grp <= 0) return GAnproc;
    return (Integer)PGRP_LIST[grp].map_nproc;
}
1647 
1648 /**
1649  *  Add ghost cells to a new global array
1650  */
1651 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1652 #   pragma weak wnga_set_ghosts = pnga_set_ghosts
1653 #endif
1654 
/* Set per-dimension ghost-cell widths on a new (unallocated) global
 * array. Each width must lie in [0, dims[d]]. */
void pnga_set_ghosts(Integer g_a, Integer *width)
{
  Integer d;
  Integer handle = g_a + GA_OFFSET;
  if (GA[handle].actv == 1)
    pnga_error("Cannot set ghost widths on array that has been allocated",0);
  if (GA[handle].ndim < 1)
    pnga_error("Dimensions must be set before array widths are specified",0);
  /* validate every width before committing any of them */
  for (d = 0; d < GA[handle].ndim; d++) {
    if ((C_Integer)width[d] > GA[handle].dims[d])
      pnga_error("Boundary width must be <= corresponding dimension",d);
    if ((C_Integer)width[d] < 0)
      pnga_error("Boundary width must be >= 0",d);
  }
  for (d = 0; d < GA[handle].ndim; d++) {
    GA[handle].width[d] = (C_Integer)width[d];
    if (width[d] > 0) GA[handle].ghosts = 1;
  }
}
1674 
1675 /**
1676  *  Set irregular distribution in a new global array
1677  */
1678 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1679 #   pragma weak wnga_set_irreg_distr = pnga_set_irreg_distr
1680 #endif
1681 
/**
 *  Set an irregular distribution on a new (unallocated) global array.
 *  mapc holds, dimension by dimension, the starting index of each block
 *  (nblock[i] entries for dimension i) and must be monotonic and within
 *  the array bounds.
 *
 *  Fix: the mapc copy was allocated with sizeof(C_Integer*) (pointer
 *  size) instead of sizeof(C_Integer) (element size) — under-allocates
 *  wherever C_Integer is wider than a pointer.
 */
void pnga_set_irreg_distr(Integer g_a, Integer *mapc, Integer *nblock)
{
  Integer i, j, ichk, maplen;
  Integer ga_handle = g_a + GA_OFFSET;
  if (GA[ga_handle].actv == 1)
    pnga_error("Cannot set irregular data distribution on array that has been allocated",0);
  if (GA[ga_handle].ndim < 1)
    pnga_error("Dimensions must be set before irregular distribution is specified",0);
  for (i=0; i<GA[ga_handle].ndim; i++)
    if ((C_Integer)nblock[i] > GA[ga_handle].dims[i])
      pnga_error("number of blocks must be <= corresponding dimension",i);
  /* Check to see that mapc array is sensible: entries for each dimension
   * must be in [1, dims[i]] and non-decreasing */
  maplen = 0;
  for (i=0; i<GA[ga_handle].ndim; i++) {
    ichk = mapc[maplen];
    if (ichk < 1 || ichk > GA[ga_handle].dims[i])
      pnga_error("Mapc entry outside array dimension limits",ichk);
    maplen++;
    for (j=1; j<nblock[i]; j++) {
      if (mapc[maplen] < ichk) {
        pnga_error("Mapc entries are not properly monotonic",ichk);
      }
      ichk = mapc[maplen];
      if (ichk < 1 || ichk > GA[ga_handle].dims[i])
        pnga_error("Mapc entry outside array dimension limits",ichk);
      maplen++;
    }
  }

  maplen = 0;
  for (i=0; i<GA[ga_handle].ndim; i++) {
    maplen += nblock[i];
    GA[ga_handle].nblock[i] = (C_Integer)nblock[i];
  }
  /* copy the map; the extra slot holds a -1 terminator */
  GA[ga_handle].mapc = (C_Integer*)malloc((maplen+1)*sizeof(C_Integer));
  for (i=0; i<maplen; i++) {
    GA[ga_handle].mapc[i] = (C_Integer)mapc[i];
  }
  GA[ga_handle].mapc[maplen] = -1;
  GA[ga_handle].irreg = 1;
}
1723 
1724 /**
1725  *  Overide the irregular data distribution flag on a new global array
1726  */
1727 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1728 #   pragma weak wnga_set_irreg_flag = pnga_set_irreg_flag
1729 #endif
1730 
/* Override the irregular-distribution flag on global array g_a. */
void pnga_set_irreg_flag(Integer g_a, logical flag)
{
  Integer handle = g_a + GA_OFFSET;
  GA[handle].irreg = (int)(flag);
}
1736 
1737 /**
1738  *  Get dimension on a new global array
1739  */
1740 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1741 #   pragma weak wnga_get_dimension = pnga_get_dimension
1742 #endif
1743 
/* Return the number of dimensions of global array g_a. */
Integer pnga_get_dimension(Integer g_a)
{
  Integer handle = g_a + GA_OFFSET;
  Integer ndim = (Integer)GA[handle].ndim;
  return ndim;
}
1749 
1750 /**
1751  *  Use a simple block-cyclic data distribution for array
1752  */
1753 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1754 #   pragma weak wnga_set_block_cyclic = pnga_set_block_cyclic
1755 #endif
1756 
/* Configure a simple block-cyclic distribution for a new (unallocated)
 * global array: record the block size per dimension and derive the
 * per-dimension and total block counts. */
void pnga_set_block_cyclic(Integer g_a, Integer *dims)
{
  Integer d, nblk, total;
  Integer handle = g_a + GA_OFFSET;
  if (GA[handle].actv == 1)
    pnga_error("Cannot set block-cyclic data distribution on array that has been allocated",0);
  if (!(GA[handle].ndim > 0))
    pnga_error("Cannot set block-cyclic data distribution if array size not set",0);
  if (GA[handle].distr_type != REGULAR)
    pnga_error("Cannot reset block-cyclic data distribution on array that has been set",0);
  GA[handle].distr_type = BLOCK_CYCLIC;
  /* number of blocks per dimension = ceil(dims / block_dims),
   * accumulating the total block count in the same pass */
  total = 1;
  for (d = 0; d < GA[handle].ndim; d++) {
    if (dims[d] < 1)
      pnga_error("Block dimensions must all be greater than zero",0);
    GA[handle].block_dims[d] = dims[d];
    nblk = (GA[handle].dims[d] + dims[d] - 1)/dims[d];
    GA[handle].num_blocks[d] = nblk;
    total *= nblk;
  }
  GA[handle].block_total = total;
}
1783 
1784 /**
1785  *  Use block-cyclic data distribution with ScaLAPACK-type proc grid for array
1786  */
1787 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1788 #   pragma weak wnga_set_block_cyclic_proc_grid = pnga_set_block_cyclic_proc_grid
1789 #endif
1790 
/* Configure a block-cyclic distribution with a ScaLAPACK-style
 * processor grid on a new (unallocated) global array. The processor
 * grid must multiply out to the total number of processors. */
void pnga_set_block_cyclic_proc_grid(Integer g_a, Integer *dims, Integer *proc_grid)
{
  Integer d, nblk, total;
  Integer handle = g_a + GA_OFFSET;
  if (GA[handle].actv == 1)
    pnga_error("Cannot set block-cyclic data distribution on array that has been allocated",0);
  if (!(GA[handle].ndim > 0))
    pnga_error("Cannot set block-cyclic data distribution if array size not set",0);
  if (GA[handle].distr_type != REGULAR)
    pnga_error("Cannot reset block-cyclic data distribution on array that has been set",0);
  GA[handle].distr_type = SCALAPACK;
  /* validate the processor grid against the available processor count */
  total = 1;
  for (d = 0; d < GA[handle].ndim; d++) {
    if (proc_grid[d] < 1)
      pnga_error("Processor grid dimensions must all be greater than zero",0);
    GA[handle].nblock[d] = proc_grid[d];
    total *= proc_grid[d];
  }
  if (total != GAnproc)
    pnga_error("Number of processors in processor grid must equal available processors",0);
  /* number of blocks per dimension = ceil(dims / block_dims),
   * accumulating the total block count in the same pass */
  total = 1;
  for (d = 0; d < GA[handle].ndim; d++) {
    if (dims[d] < 1)
      pnga_error("Block dimensions must all be greater than zero",0);
    GA[handle].block_dims[d] = dims[d];
    nblk = (GA[handle].dims[d] + dims[d] - 1)/dims[d];
    GA[handle].num_blocks[d] = nblk;
    total *= nblk;
  }
  GA[handle].block_total = total;
}
1827 
/**
 *  Use tiled data distribution with ScaLAPACK-type proc grid for array
 */
1831 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1832 #   pragma weak wnga_set_tiled_proc_grid = pnga_set_tiled_proc_grid
1833 #endif
1834 
void pnga_set_tiled_proc_grid(Integer g_a, Integer *dims, Integer *proc_grid)
{
  Integer d, nblk, grid_size, total_blocks;
  Integer handle = g_a + GA_OFFSET;

  /* The distribution may only be changed on an array that has not yet been
   * allocated, whose dimensions are already known, and whose distribution
   * is still the default. */
  if (GA[handle].actv == 1)
    pnga_error("Cannot set tiled data distribution on array that has been allocated",0);
  if (!(GA[handle].ndim > 0))
    pnga_error("Cannot set tiled data distribution if array size not set",0);
  if (GA[handle].distr_type != REGULAR)
    pnga_error("Cannot reset tiled data distribution on array that has been set",0);
  GA[handle].distr_type = TILED;

  /* The processor grid must tile exactly onto the available processors. */
  grid_size = 1;
  for (d=0; d<GA[handle].ndim; d++) {
    if (proc_grid[d] < 1)
      pnga_error("Processor grid dimensions must all be greater than zero",0);
    GA[handle].nblock[d] = proc_grid[d];
    grid_size *= proc_grid[d];
  }
  if (grid_size != GAnproc)
    pnga_error("Number of processors in processor grid must equal available processors",0);

  /* Number of blocks along each dimension: ceiling of dim/block_dim so a
   * partial block at the upper edge still counts. */
  for (d=0; d<GA[handle].ndim; d++) {
    if (dims[d] < 1)
      pnga_error("Block dimensions must all be greater than zero",0);
    GA[handle].block_dims[d] = dims[d];
    nblk = (GA[handle].dims[d] + dims[d] - 1)/dims[d];
    GA[handle].num_blocks[d] = nblk;
  }

  /* Total block count is the product over all dimensions. */
  total_blocks = 1;
  for (d=0; d<GA[handle].ndim; d++) {
    total_blocks *= GA[handle].num_blocks[d];
  }
  GA[handle].block_total = total_blocks;
}
1871 
1872 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1873 #   pragma weak wnga_set_tiled_irreg_proc_grid = pnga_set_tiled_irreg_proc_grid
1874 #endif
1875 
/**
 * Set an irregular tiled data distribution with an explicit processor grid.
 *
 * @param g_a       global array handle
 * @param mapc      concatenated lists of starting indices for the blocks in
 *                  each dimension (nblocks[0] entries for dim 0, then
 *                  nblocks[1] entries for dim 1, ...); entries within each
 *                  dimension must be monotonic and within the array bounds
 * @param nblocks   number of blocks along each dimension
 * @param proc_grid processor grid; its dimensions must multiply out to the
 *                  total number of available processors
 */
void pnga_set_tiled_irreg_proc_grid(Integer g_a, Integer *mapc, Integer *nblocks,
    Integer *proc_grid)
{
  Integer i, j, ichk, maplen, tot, jsize;
  Integer ga_handle = g_a + GA_OFFSET;
  if (GA[ga_handle].actv == 1)
    pnga_error("Cannot set irregular tiled data distribution on array"
        " that has been allocated",0);
  if (!(GA[ga_handle].ndim > 0))
    pnga_error("Cannot set irregular tiled data distribution if array size not set",0);
  /* NOTE: this check is logically redundant with the one above (ndim < 1 is
   * exactly !(ndim > 0)); retained for its distinct error message. */
  if (GA[ga_handle].ndim < 1)
    pnga_error("Dimensions must be set before irregular distribution is specified",0);
  for (i=0; i<GA[ga_handle].ndim; i++)
    if ((C_Integer)nblocks[i] > GA[ga_handle].dims[i])
      pnga_error("number of blocks must be <= corresponding dimension",i);
  if (GA[ga_handle].distr_type != REGULAR)
    pnga_error("Cannot reset irregular tiled data distribution on array that has been set",0);
  GA[ga_handle].distr_type = TILED_IRREG;
  /* Check to see that mapc array is sensible: each dimension's entries must
   * lie inside the array bounds and be monotonically non-decreasing. */
  maplen = 0;
  for (i=0; i<GA[ga_handle].ndim; i++) {
    ichk = mapc[maplen];
    if (ichk < 1 || ichk > GA[ga_handle].dims[i])
      pnga_error("Mapc entry outside array dimension limits",ichk);
    maplen++;
    for (j=1; j<nblocks[i]; j++) {
      if (mapc[maplen] < ichk) {
        pnga_error("Mapc entries are not properly monotonic",ichk);
      }
      ichk = mapc[maplen];
      if (ichk < 1 || ichk > GA[ga_handle].dims[i])
        pnga_error("Mapc entry outside array dimension limits",ichk);
      maplen++;
    }
  }

  /* Record block counts and copy the map, terminated with a -1 sentinel. */
  maplen = 0;
  for (i=0; i<GA[ga_handle].ndim; i++) {
    maplen += nblocks[i];
    GA[ga_handle].num_blocks[i] = (C_Integer)nblocks[i];
  }
  /* BUG FIX: previously allocated (maplen+1)*sizeof(C_Integer*) — the size of
   * a POINTER, not of the stored element type C_Integer. */
  GA[ga_handle].mapc = (C_Integer*)malloc((maplen+1)*sizeof(C_Integer));
  if (GA[ga_handle].mapc == NULL)
    pnga_error("Failed to allocate map array",maplen+1);
  for (i=0; i<maplen; i++) {
    GA[ga_handle].mapc[i] = (C_Integer)mapc[i];
  }
  GA[ga_handle].mapc[maplen] = -1;
  GA[ga_handle].irreg = 1;

  /* Check to make sure processor grid is compatible with total number of processors */
  tot = 1;
  for (i=0; i<GA[ga_handle].ndim; i++) {
    if (proc_grid[i] < 1)
      pnga_error("Processor grid dimensions must all be greater than zero",0);
    GA[ga_handle].nblock[i] = proc_grid[i];
    tot *= proc_grid[i];
  }
  if (tot != GAnproc)
    pnga_error("Number of processors in processor grid must equal available processors",0);
  /* Find total number of blocks */
  jsize = 1;
  for (i=0; i<GA[ga_handle].ndim; i++) {
    jsize *= GA[ga_handle].num_blocks[i];
  }
  GA[ga_handle].block_total = jsize;
}
1941 
1942 /**
1943  *  Restrict processors that actually contain data in the global array. Can also
1944  *  be used to rearrange the distribution of data amongst processors
1945  */
1946 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1947 #   pragma weak wnga_set_restricted = pnga_set_restricted
1948 #endif
1949 
void pnga_set_restricted(Integer g_a, Integer *list, Integer size)
{
  Integer idx, proc, myid, grp, found, nprocs;
  Integer my_idx = 0;
  Integer handle = g_a + GA_OFFSET;

  /* Record the restricted processor list on the array. */
  GA[handle].num_rstrctd = size;
  GA[handle].rstrctd_list = (Integer*)malloc(size*sizeof(Integer));
  GA[handle].rank_rstrctd = (Integer*)malloc((GAnproc)*sizeof(Integer));

  /* Resolve my rank and the processor count relative to the array's group. */
  grp = GA[handle].p_handle;
  if (grp == -2) grp = pnga_pgroup_get_default();
  if (grp > 0) {
    myid = PGRP_LIST[grp].map_proc_list[GAme];
    nprocs = PGRP_LIST[grp].map_nproc;
  } else {
    myid = GAme;
    nprocs = GAnproc;
  }

  /* Initially, no processor has a rank in the restricted list. */
  for (idx=0; idx<GAnproc; idx++) {
    GA[handle].rank_rstrctd[idx] = -1;
  }

  found = 0;
  for (idx=0; idx<size; idx++) {
    proc = list[idx];
    GA[handle].rstrctd_list[idx] = proc;
    /* note whether this processor appears in the list and where */
    if (myid == proc) {
      found = 1;
      my_idx = idx;
    }
    /* every listed processor must belong to the group */
    if (proc < 0 || proc >= nprocs)
      pnga_error("Invalid processor in list",proc);
    GA[handle].rank_rstrctd[proc] = idx;
  }
  GA[handle].has_data = found;
  GA[handle].rstrctd_id = my_idx;
}
1988 
1989 /**
1990  *  Restrict processors that actually contain data in the global array
1991  *  by specifying a range of processors
1992  */
1993 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1994 #   pragma weak wnga_set_restricted_range = pnga_set_restricted_range
1995 #endif
1996 
/**
 * Restrict the data-holding processors of a global array to the contiguous
 * range [lo_proc, hi_proc] (inclusive).
 *
 * @param g_a     global array handle
 * @param lo_proc first processor in the range
 * @param hi_proc last processor in the range; must be >= lo_proc
 */
void pnga_set_restricted_range(Integer g_a, Integer lo_proc, Integer hi_proc)
{
  Integer i, ig, id=0, me, p_handle, has_data, icnt, nproc, size;
  Integer ga_handle = g_a + GA_OFFSET;
  /* BUG FIX: validate the range before sizing allocations. Previously an
   * inverted range produced a negative element count that was passed
   * straight to malloc. */
  if (hi_proc < lo_proc)
    pnga_error("Invalid processor range: hi_proc < lo_proc",hi_proc-lo_proc);
  size = hi_proc - lo_proc + 1;
  GA[ga_handle].num_rstrctd = size;
  GA[ga_handle].rstrctd_list = (Integer*)malloc((size)*sizeof(Integer));
  GA[ga_handle].rank_rstrctd = (Integer*)malloc((GAnproc)*sizeof(Integer));
  /* Resolve my rank and processor count relative to the array's group. */
  p_handle = GA[ga_handle].p_handle;
  if (p_handle == -2) p_handle = pnga_pgroup_get_default();
  if (p_handle > 0) {
    me = PGRP_LIST[p_handle].map_proc_list[GAme];
    nproc = PGRP_LIST[p_handle].map_nproc;
  } else {
    me = GAme;
    nproc = GAnproc;
  }
  has_data = 0;

  /* Initially, no processor has a rank in the restricted list. */
  for (i=0; i<GAnproc; i++) {
    GA[ga_handle].rank_rstrctd[i] = -1;
  }

  icnt = 0;
  for (i=lo_proc; i<=hi_proc; i++) {
    GA[ga_handle].rstrctd_list[icnt] = i;
    /* check if this is my processor and remember its index in the range */
    if (me == i) {
      has_data = 1;
      id = icnt;
    }
    /* every processor in the range must belong to the group */
    if (i < 0 || i >= nproc)
      pnga_error("Invalid processor in list",i);
    ig = i;
    GA[ga_handle].rank_rstrctd[ig] = icnt;
    icnt++;
  }
  GA[ga_handle].has_data = has_data;
  GA[ga_handle].rstrctd_id = id;
}
2038 
2039 /**
2040  *  Set the property on a global array.
2041  */
2042 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2043 #   pragma weak wnga_set_property = pnga_set_property
2044 #endif
/*
 * Set a named property on global array g_a. Recognized property strings:
 *   "read_only"  - redistribute the array so each SMP node's processors
 *                  collectively hold a copy (array is replicated per node);
 *                  the original distribution is saved so it can be restored
 *                  by pnga_unset_property
 *   "read_cache" - attach an (initially empty) cache list to the array
 * Any other string is an error. A property may only be set on an array that
 * currently has NO_PROPERTY.
 */
void pnga_set_property(Integer g_a, char* property) {
  Integer ga_handle = g_a + GA_OFFSET;
  _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous sync masking*/
  pnga_pgroup_sync(GA[ga_handle].p_handle);
  /* Check to see if property conflicts with properties already set on the
   * global array. This check may need more refinement as additional properties
   * are added. */
  if (GA[ga_handle].property != NO_PROPERTY) {
    pnga_error("Cannot set property on an array that already has property set",0);
  }
  if (strcmp(property,"read_only")==0) {
    /* TODO: copy global array to new configuration */
    int i, d, ndim, btot, chk;
    Integer nprocs, nodeid, origin_id, dflt_grp, handle, maplen;
    /* NOTE(review): me_local is declared but never used in this branch */
    Integer nelem, mem_size, status, grp_me, g_tmp, me_local;
    Integer *list;
    Integer blk[MAXDIM], dims[MAXDIM], pe[MAXDIM], chunk[MAXDIM];
    Integer lo[MAXDIM], hi[MAXDIM], ld[MAXDIM];
    Integer *pmap[MAXDIM], *map;
    /* buf is only referenced in the disabled #else branch below */
    char *buf;
    /* READ_ONLY is only implemented for regular distributions on the
     * world group */
    if (GA[ga_handle].distr_type != REGULAR) {
      pnga_error("Block-cyclic arrays not supported for READ_ONLY",0);
    }
    if (GA[ga_handle].p_handle != pnga_pgroup_get_world()) {
      pnga_error("Arrays on subgroups not supported for READ_ONLY",0);
    }
    /* Save the current distribution (block counts, offsets, chunks, map)
     * so pnga_unset_property can restore it later */
    ndim = (int)GA[ga_handle].ndim;
    btot = 0;
    for (i=0; i<ndim; i++) {
      GA[ga_handle].old_nblock[i] = GA[ga_handle].nblock[i];
      GA[ga_handle].old_lo[i] = GA[ga_handle].lo[i];
      GA[ga_handle].old_chunk[i] = GA[ga_handle].chunk[i];
      btot += GA[ga_handle].nblock[i];
    }
    GA[ga_handle].old_mapc = (Integer*)malloc((btot+1)*sizeof(Integer));
    for (i=0; i<btot+1; i++) {
      GA[ga_handle].old_mapc[i] = GA[ga_handle].mapc[i];
    }
    /* Make a temporary copy of GA (holds the data while this array's
     * memory is freed and reallocated with the new distribution) */
    g_tmp = pnga_create_handle();
    pnga_set_data(g_tmp,ndim,GA[ga_handle].dims,GA[ga_handle].type);
    pnga_set_pgroup(g_tmp,GA[ga_handle].p_handle);
    if (!pnga_allocate(g_tmp)) {
      pnga_error("Failed to allocate temporary array",0);
    }
    pnga_copy(g_a, g_tmp);
    /* Create a group containing all the processors on this node */
    GA[ga_handle].old_handle = GA[ga_handle].p_handle;
    nodeid = pnga_cluster_nodeid();
    nprocs = pnga_cluster_nprocs(nodeid);
    origin_id = pnga_cluster_procid(nodeid,0);
    /* NOTE(review): the default group is saved in dflt_grp but never
     * restored in this function — confirm this is intentional */
    dflt_grp = pnga_pgroup_get_default();
    pnga_pgroup_set_default(pnga_pgroup_get_world());
    list = (Integer*)malloc(nprocs*sizeof(Integer));
    for (i=0; i<nprocs; i++) {
      list[i] = origin_id+i;
    }
    handle = pnga_pgroup_create(list, nprocs);
    free(list);
    GA[ga_handle].p_handle = handle;
    GA[ga_handle].property = READ_ONLY;

    /* Ignore hints on data distribution (chunk) and just go with default
     * distribution on the node, except if chunk dimension is same as array
     * dimension (no partitioning on that axis) */
    for (i=0; i<ndim; i++) {
      /* eliminate dimension=1 from analysis, otherwise set blk to -1*/
      if (GA[ga_handle].chunk[i] == GA[ga_handle].dims[i]) {
        chunk[i] = GA[ga_handle].chunk[i];
      } else {
        chunk[i] = -1;
      }
      dims[i] = GA[ga_handle].dims[i];
    }
    if (chunk[0] != 0)
      for (d=0; d<ndim; d++) blk[d]=(Integer)GA_MIN(chunk[d],dims[d]);
    else
      for (d=0; d<ndim; d++) blk[d]=-1;
    for (d=0; d<ndim; d++) if (dims[d]==1) blk[d]=1;
    /* compute a processor grid pe[] for the node-local group; this mirrors
     * the default-distribution logic in pnga_allocate */
    ddb_h2(ndim, dims, PGRP_LIST[handle].map_nproc,0.0,(Integer)0,blk,pe);
    /* Build the block map (mapALL) for each dimension */
    for(d=0, map=mapALL; d< ndim; d++){
      Integer nblock;
      Integer pcut; /* # procs that get full blk[] elements; the rest gets less*/
      int p;

      pmap[d] = map;

      /* RJH ... don't leave some nodes without data if possible
       * but respect the users block size */

      if (chunk[d] > 1) {
        Integer ddim = ((dims[d]-1)/GA_MIN(chunk[d],dims[d]) + 1);
        pcut = (ddim -(blk[d]-1)*pe[d]) ;
      } else {
        pcut = (dims[d]-(blk[d]-1)*pe[d]) ;
      }

      /* record the 1-based starting index of each block along dimension d */
      for (nblock=i=p=0; (p<pe[d]) && (i<dims[d]); p++, nblock++) {
        Integer b = blk[d];
        if (p >= pcut)
          b = b-1;
        map[nblock] = i+1;
        if (chunk[d]>1) b *= GA_MIN(chunk[d],dims[d]);
        i += b;
      }

      pe[d] = GA_MIN(pe[d],nblock);
      map +=  pe[d];
    }
    /* Install the new map in place of the old one */
    maplen = 0;
    for( i = 0; i< ndim; i++){
      GA[ga_handle].nblock[i] = pe[i];
      maplen += pe[i];
    }
    free(GA[ga_handle].mapc);
    GA[ga_handle].mapc = (Integer*)malloc((maplen+1)*sizeof(Integer));
    for(i = 0; i< maplen; i++) {
      GA[ga_handle].mapc[i] = (C_Integer)mapALL[i];
    }
    GA[ga_handle].mapc[maplen] = -1;
    /* Set remaining paramters and determine memory size if regular data
     * distribution is being used */

    for( i = 0; i< ndim; i++){
      GA[ga_handle].scale[i] = (double)GA[ga_handle].nblock[i]
        / (double)GA[ga_handle].dims[i];
    }

    /*** determine which portion of the array I am supposed
     * to hold ***/
    pnga_distribution(g_a, GAme, GA[ga_handle].lo, hi);
    chk = 1;
    for( i = 0, nelem=1; i< ndim; i++){
      if (hi[i]-(Integer)GA[ga_handle].lo[i]+1 <= 0) chk = 0;
      nelem *= (hi[i]-(Integer)GA[ga_handle].lo[i]+1);
    }
    mem_size = nelem * GA[ga_handle].elemsize;
    if (!chk) mem_size = 0;   /* empty local portion -> no allocation */
    grp_me = pnga_pgroup_nodeid(handle);
    /* Clean up old memory first */
#ifndef AVOID_MA_STORAGE
    if(gai_uses_shm((int)handle)){
#endif
      /* make sure that we free original (before address allignment)
       * pointer */
#ifdef MSG_COMMS_MPI
      if (GA[ga_handle].old_handle > 0){
        ARMCI_Free_group(
            GA[ga_handle].ptr[pnga_pgroup_nodeid(GA[ga_handle].old_handle)]
            - GA[ga_handle].id,
            &PGRP_LIST[GA[ga_handle].old_handle].group);
      }
      else
#endif
      {
        ARMCI_Free(
            GA[ga_handle].ptr[pnga_pgroup_nodeid(GA[ga_handle].old_handle)]
            - GA[ga_handle].id);
      }
#ifndef AVOID_MA_STORAGE
    }else{
      if(GA[ga_handle].id != INVALID_MA_HANDLE) MA_free_heap(GA[ga_handle].id);
    }
#endif

    /* Update the memory accounting: credit the old size, debit the new */
    if(GA_memory_limited) GA_total_memory += GA[ga_handle].size;
    GAstat.curmem -= GA[ga_handle].size;
    /* if requested, enforce limits on memory consumption */
    if(GA_memory_limited) GA_total_memory -= mem_size;
    GAstat.curmem += mem_size;
    /* check if everybody has enough memory left */
    if(GA_memory_limited){
      status = (GA_total_memory >= 0) ? 1 : 0;
      pnga_pgroup_gop(handle,pnga_type_f2c(MT_F_INT), &status, 1, "&&");
    } else status = 1;
    /* allocate memory */
    if (status) {
      /* Allocate new memory */
      if (GA[ga_handle].mem_dev_set) {
        status = !gai_get_devmem(GA[ga_handle].name, GA[ga_handle].ptr,mem_size,
            GA[ga_handle].type, &GA[ga_handle].id, handle,
            GA[ga_handle].mem_dev_set,GA[ga_handle].mem_dev);
      } else {
        status = !gai_getmem(GA[ga_handle].name, GA[ga_handle].ptr,mem_size,
            GA[ga_handle].type, &GA[ga_handle].id, handle);
      }
    } else {
      GA[ga_handle].ptr[grp_me]=NULL;
    }
    GA[ga_handle].size = (C_Long)mem_size;
    if (!status) {
      pnga_error("Memory failure when unsetting READ_ONLY",0);
    }
    /* Copy data from copy of old GA to new GA and then get rid of copy*/
    pnga_distribution(g_a,GAme,lo,hi);
    chk = 1;
    nelem = 1;
    for (i=0; i<ndim; i++) {
      /*
      GA[ga_handle].chunk[i] = ((C_Integer)hi[i]-GA[ga_handle].lo[i]+1);
      */
      ld[i] = hi[i] - lo[i] + 1;
      nelem *= ld[i];
      if (hi[i] < lo[i]) chk = 0;
    }
    if (chk) {
#if 1
      /* pull my portion of the data directly into the new local buffer */
      pnga_get(g_tmp,lo,hi,GA[ga_handle].ptr[grp_me],ld);
#else
      /* MPI RMA does not allow you to use memory assigned to one window as
       * local buffer for another buffer. Create a local buffer to get around
       * this problem */
      buf = (char*)malloc(nelem*GA[ga_handle].elemsize);
      pnga_get(g_tmp,lo,hi,buf,ld);
      memcpy(GA[ga_handle].ptr[grp_me],buf,nelem*GA[ga_handle].elemsize);
      free(buf);
#endif
    }
    pnga_destroy(g_tmp);
  } else if (strcmp(property, "read_cache") == 0) {
    /* cache list is created lazily; start with an empty head */
    GA[ga_handle].property = READ_CACHE;
    GA[ga_handle].cache_head = NULL; /* (cache_struct_t *)malloc(sizeof(cache_struct_t)) */
  } else {
    pnga_error("Trying to set unknown property",0);
  }
}
2271 
2272 /**
2273  *  Set the memory device on a global array.
2274  */
2275 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2276 #   pragma weak wnga_set_memory_dev = pnga_set_memory_dev
2277 #endif
/**
 * Record the memory device name for a global array.
 *
 * The name is normalized in place (lower-cased, blanks removed) and then
 * copied into GA[].mem_dev; the caller's buffer is modified.
 *
 * @param g_a    global array handle
 * @param device memory device name; at most FNAM characters
 */
void pnga_set_memory_dev(Integer g_a, char *device) {
  Integer ga_handle = g_a + GA_OFFSET;
  int len = strlen(device);
  int i, ilen;
  if (len>FNAM) {
    pnga_error("Illegal memory device name specified. Device name exceeds length: ",
        FNAM);
  }
  /* convert device name to lower case; cast through unsigned char because
   * passing a negative char value to tolower() is undefined behavior */
  for (i=0; i<len; i++) {
    device[i] = (char)tolower((unsigned char)device[i]);
  }
  /* remove blanks by compacting the string in place */
  ilen = 0;
  for (i=0; i<len; i++) {
    if (device[i] != ' ') {
      device[ilen] = device[i];
      ilen++;
    }
  }
  /* BUG FIX: the original condition was (ilen > len && ilen < FNAM), which
   * can never hold since compaction only shortens the string; as a result
   * the terminator was never rewritten and trailing characters of the
   * original name survived after blank removal. ilen < len <= FNAM, so the
   * write below stays within the original string. */
  if (ilen < len) device[ilen] = '\0';
  GA[ga_handle].mem_dev_set = 1;
  strcpy(GA[ga_handle].mem_dev,device);
}
2302 
2303 /**
2304  *  Clear property from global array.
2305  */
2306 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2307 #   pragma weak wnga_unset_property = pnga_unset_property
2308 #endif
/*
 * Remove the property from global array g_a.
 * For READ_ONLY this is the inverse of pnga_set_property("read_only"): the
 * data is staged into a temporary array, the node-replicated memory is
 * freed, the distribution saved in old_nblock/old_lo/old_chunk/old_mapc is
 * restored, memory is reallocated on the original group, and the data is
 * copied back. For READ_CACHE the cache list is freed. Otherwise the
 * property field is simply cleared.
 */
void pnga_unset_property(Integer g_a) {
  Integer ga_handle = g_a + GA_OFFSET;
  if (GA[ga_handle].property == READ_ONLY) {
    /* TODO: Copy global array to original configuration */
    int i, d, ndim, btot, chk;
    Integer g_tmp, grp_me;
    /* NOTE(review): nprocs, nodeid, origin_id, dflt_grp, maplen and list
     * are declared but not used in this branch */
    Integer nprocs, nodeid, origin_id, dflt_grp, handle, maplen;
    Integer nelem, mem_size, status;
    Integer *list;
    Integer dims[MAXDIM], chunk[MAXDIM];
    Integer lo[MAXDIM], hi[MAXDIM], ld[MAXDIM];
    /* buf is only referenced in the disabled #else branch below */
    void *ptr, *buf;

    ndim = (int)GA[ga_handle].ndim;
    /* Start by making a copy of the GA */
    /* NOTE(review): chunk[] and dims[] are filled here but not read
     * afterwards in this function */
    for (i=0; i<ndim; i++) {
      chunk[i] = GA[ga_handle].chunk[i];
      dims[i] = GA[ga_handle].dims[i];
    }
    /* Make a temporary copy of GA on the ORIGINAL (pre-READ_ONLY) group */
    g_tmp = pnga_create_handle();
    pnga_set_data(g_tmp,ndim,GA[ga_handle].dims,GA[ga_handle].type);
    pnga_set_pgroup(g_tmp,GA[ga_handle].old_handle);
    if (!pnga_allocate(g_tmp)) {
      pnga_error("Failed to allocate temporary array",0);
    }
    /* Copy portion of global array to locally held portion of tmp array */
    grp_me = pnga_pgroup_nodeid(GA[ga_handle].old_handle);
    pnga_distribution(g_tmp,grp_me,lo,hi);
    chk = 1;
    for (i=0; i<ndim; i++) {
      ld[i] = hi[i]-lo[i]+1;
      if (hi[i] < lo[i]) chk = 0;  /* empty local portion */
    }
    if (chk) {
      pnga_access_ptr(g_tmp,lo,hi,&ptr,ld);
      pnga_get(g_a,lo,hi,ptr,ld);
    }

    /* Get rid of current memory allocation */
#ifndef AVOID_MA_STORAGE
    if(gai_uses_shm((int)GA[ga_handle].p_handle)){
#endif
      /* make sure that we free original (before address allignment)
       * pointer */
#ifdef MSG_COMMS_MPI
      if (GA[ga_handle].p_handle > 0){
        ARMCI_Free_group(
            GA[ga_handle].ptr[pnga_pgroup_nodeid(GA[ga_handle].p_handle)]
            - GA[ga_handle].id,
            &PGRP_LIST[GA[ga_handle].p_handle].group);
      }
      else
#endif
      {
        ARMCI_Free(
            GA[ga_handle].ptr[pnga_pgroup_nodeid(GA[ga_handle].p_handle)]
            - GA[ga_handle].id);
      }
#ifndef AVOID_MA_STORAGE
    }else{
      if(GA[ga_handle].id != INVALID_MA_HANDLE) MA_free_heap(GA[ga_handle].id);
    }
#endif

    /* Reset distribution parameters back to original values */
    btot = 0;
    for (i=0; i<ndim; i++) {
      GA[ga_handle].nblock[i] = GA[ga_handle].old_nblock[i];
      GA[ga_handle].lo[i] = GA[ga_handle].old_lo[i];
      GA[ga_handle].chunk[i] = GA[ga_handle].old_chunk[i];
      btot += GA[ga_handle].nblock[i];
    }
    /* restore the saved block map and release the backup copy */
    free(GA[ga_handle].mapc);
    GA[ga_handle].mapc = (Integer*)malloc((btot+1)*sizeof(Integer));
    for (i=0; i<btot+1; i++) {
      GA[ga_handle].mapc[i] = GA[ga_handle].old_mapc[i];
    }
    free(GA[ga_handle].old_mapc);

    /* size of the locally held portion under the restored distribution */
    pnga_distribution(g_a, grp_me, GA[ga_handle].lo, hi);
    chk = 1;
    for( i = 0, nelem=1; i< ndim; i++){
      if (hi[i]-(Integer)GA[ga_handle].lo[i]+1 <= 0) chk = 0;
      nelem *= (hi[i]-(Integer)GA[ga_handle].lo[i]+1);
    }
    mem_size = nelem * GA[ga_handle].elemsize;
    if (!chk) mem_size = 0;   /* empty local portion -> no allocation */

    /* Update memory accounting: credit the old size, debit the new */
    if(GA_memory_limited) GA_total_memory += GA[ga_handle].size;
    GAstat.curmem -= GA[ga_handle].size;
    /* if requested, enforce limits on memory consumption */
    if(GA_memory_limited) GA_total_memory -= mem_size;
    /* check if everybody has enough memory left */
    if(GA_memory_limited){
      status = (GA_total_memory >= 0) ? 1 : 0;
      pnga_pgroup_gop(GA[ga_handle].old_handle,pnga_type_f2c(MT_F_INT),
          &status, 1, "&&");
    } else status = 1;
    /* restore original group; keep the node group handle for destruction */
    handle = (Integer)GA[ga_handle].p_handle;
    GA[ga_handle].p_handle = GA[ga_handle].old_handle;
    if (status) {
      /* Allocate new memory */
      if (GA[ga_handle].mem_dev_set) {
        status = !gai_get_devmem(GA[ga_handle].name, GA[ga_handle].ptr,mem_size,
            GA[ga_handle].type, &GA[ga_handle].id, GA[ga_handle].p_handle,
            GA[ga_handle].mem_dev_set,GA[ga_handle].mem_dev);
      } else {
        status = !gai_getmem(GA[ga_handle].name, GA[ga_handle].ptr,mem_size,
            GA[ga_handle].type, &GA[ga_handle].id, GA[ga_handle].p_handle);
      }
    } else {
      GA[ga_handle].ptr[grp_me]=NULL;
    }
    GA[ga_handle].size = (C_Long)mem_size;
    if (!status) {
      pnga_error("Memory failure when setting READ_ONLY",0);
    }
    /* Get rid of read-only group */
    pnga_pgroup_destroy(handle);
    /* Generate parameters for new memory allocation. Distribution function
     * should work, so we can use that to find out how much data is on this
     * processor */
    GA[ga_handle].property = NO_PROPERTY;
    pnga_distribution(g_a,GAme,lo,hi);
    chk = 1;
    nelem = 1;
    for (i=0; i<ndim; i++) {
      ld[i] = hi[i]-lo[i]+1;
      nelem *= ld[i];
      if (hi[i] < lo[i]) chk = 0;  /* empty local portion */
    }
    if (chk) {
#if 1
      /* copy my portion of the data back from the temporary array */
      pnga_access_ptr(g_a,lo,hi,&ptr,ld);
      pnga_get(g_tmp,lo,hi,ptr,ld);
#else
      /* MPI RMA does not allow you to use memory assigned to one window as
       * local buffer for another buffer. Create a local buffer to get around
       * this problem */
      buf = (void*)malloc(nelem*GA[ga_handle].elemsize);
      pnga_get(g_tmp,lo,hi,buf,ld);
      pnga_access_ptr(g_a,lo,hi,&ptr,ld);
      memcpy(ptr,buf,nelem*GA[ga_handle].elemsize);
      free(buf);
#endif
    }
    pnga_destroy(g_tmp);
  } else if (GA[ga_handle].property == READ_CACHE) {
    /* walk the singly linked cache list, freeing each buffer and node */
    if (GA[ga_handle].cache_head != NULL) {
      cache_struct_t *next;
      next = GA[ga_handle].cache_head->next;
      if (GA[ga_handle].cache_head->cache_buf)
        free(GA[ga_handle].cache_head->cache_buf);
      free(GA[ga_handle].cache_head);
      while (next) {
        GA[ga_handle].cache_head = next;
        next = next->next;
        if (GA[ga_handle].cache_head->cache_buf)
          free(GA[ga_handle].cache_head->cache_buf);
        free(GA[ga_handle].cache_head);
      }
    }
    GA[ga_handle].cache_head = NULL;
  } else {
    GA[ga_handle].property = NO_PROPERTY;
  }
}
2477 
2478 /**
2479  *  Allocate memory and complete setup of global array
2480  */
2481 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2482 #   pragma weak wnga_allocate = pnga_allocate
2483 #endif
2484 
pnga_allocate(Integer g_a)2485 logical pnga_allocate(Integer g_a)
2486 {
2487 
2488   Integer hi[MAXDIM];
2489   Integer ga_handle = g_a + GA_OFFSET;
2490   Integer d, width[MAXDIM], ndim;
2491   Integer mem_size, nelem;
2492   Integer i, status, maplen=0, p_handle;
2493   Integer dims[MAXDIM], chunk[MAXDIM];
2494   Integer pe[MAXDIM], *pmap[MAXDIM], *map;
2495   Integer blk[MAXDIM];
2496   Integer grp_me=GAme, grp_nproc=GAnproc;
2497   Integer block_size = 0;
2498 
2499   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous sync masking*/
2500   if (GA[ga_handle].ndim == -1)
2501     pnga_error("Insufficient data to create global array",0);
2502 
2503   p_handle = (Integer)GA[ga_handle].p_handle;
2504   if (p_handle == (Integer)GA_Init_Proc_Group) {
2505     GA[ga_handle].p_handle = GA_Default_Proc_Group;
2506     p_handle = GA_Default_Proc_Group;
2507   }
2508   pnga_pgroup_sync(p_handle);
2509 
2510   if (p_handle > 0) {
2511      grp_nproc  = PGRP_LIST[p_handle].map_nproc;
2512      grp_me = PGRP_LIST[p_handle].map_proc_list[GAme];
2513   }
2514 
2515   if(!GAinitialized) pnga_error("GA not initialized ", 0);
2516   if(!ma_address_init) gai_ma_address_init();
2517 
2518   ndim = GA[ga_handle].ndim;
2519   for (i=0; i<ndim; i++) width[i] = (C_Integer)GA[ga_handle].width[i];
2520 
2521   /* The data distribution has not been specified by the user. Create
2522      default distribution */
2523   if (GA[ga_handle].mapc == NULL && GA[ga_handle].distr_type == REGULAR) {
2524     for (d=0; d<ndim; d++) {
2525       dims[d] = (Integer)GA[ga_handle].dims[d];
2526       chunk[d] = (Integer)GA[ga_handle].chunk[d];
2527     }
2528     if(chunk[0]!=0) /* for chunk[0]=0 compute all */
2529       for(d=0; d< ndim; d++) blk[d]=(Integer)GA_MIN(chunk[d],dims[d]);
2530     else
2531       for(d=0; d< ndim; d++) blk[d]=-1;
2532 
2533     /* eliminate dimensions =1 from ddb analysis */
2534     for(d=0; d<ndim; d++)if(dims[d]==1)blk[d]=1;
2535 
2536     if (GAme==0 && DEBUG )
2537       for (d=0;d<ndim;d++) fprintf(stderr,"b[%ld]=%ld\n",(long)d,(long)blk[d]);
2538     pnga_pgroup_sync(p_handle);
2539 
2540     /* ddb(ndim, dims, GAnproc, blk, pe);*/
2541     if(p_handle == 0) /* for mirrored arrays */
2542 #if OLD_DISTRIBUTION
2543        ddb_h2(ndim, dims, PGRP_LIST[p_handle].map_nproc,0.0,(Integer)0, blk, pe);
2544 #else
2545        ddb(ndim, dims, PGRP_LIST[p_handle].map_nproc, blk, pe);
2546 #endif
2547     else
2548        if (GA[ga_handle].num_rstrctd == 0) {
2549          /* Data is normally distributed on processors */
2550 #if OLD_DISTRIBUTION
2551          ddb_h2(ndim, dims, grp_nproc,0.0,(Integer)0, blk, pe);
2552 #else
2553          ddb(ndim, dims, grp_nproc, blk, pe);
2554 #endif
2555        } else {
2556          /* Data is only distributed on subset of processors */
2557 #if OLD_DISTRIBUTION
2558          ddb_h2(ndim, dims, GA[ga_handle].num_rstrctd, 0.0, (Integer)0, blk, pe);
2559 #else
2560          ddb(ndim, dims, GA[ga_handle].num_rstrctd, blk, pe);
2561 #endif
2562        }
2563 
2564     for(d=0, map=mapALL; d< ndim; d++){
2565       Integer nblock;
2566       Integer pcut; /* # procs that get full blk[] elements; the rest gets less*/
2567       int p;
2568 
2569       pmap[d] = map;
2570 
2571       /* RJH ... don't leave some nodes without data if possible
2572        but respect the users block size */
2573 
2574       if (chunk[d] > 1) {
2575         Integer ddim = ((dims[d]-1)/GA_MIN(chunk[d],dims[d]) + 1);
2576         pcut = (ddim -(blk[d]-1)*pe[d]) ;
2577       }
2578       else {
2579         pcut = (dims[d]-(blk[d]-1)*pe[d]) ;
2580       }
2581 
2582       for (nblock=i=p=0; (p<pe[d]) && (i<dims[d]); p++, nblock++) {
2583         Integer b = blk[d];
2584         if (p >= pcut)
2585           b = b-1;
2586         map[nblock] = i+1;
2587         if (chunk[d]>1) b *= GA_MIN(chunk[d],dims[d]);
2588         i += b;
2589       }
2590 
2591       pe[d] = GA_MIN(pe[d],nblock);
2592       map +=  pe[d];
2593     }
2594     if(GAme==0&& DEBUG){
2595       gai_print_subscript("pe ",(int)ndim, pe,"\n");
2596       gai_print_subscript("blocks ",(int)ndim, blk,"\n");
2597       printf("decomposition map\n");
2598       for(d=0; d< ndim; d++){
2599         printf("dim=%ld: ",(long)d);
2600         for (i=0;i<pe[d];i++)printf("%ld ",(long)pmap[d][i]);
2601         printf("\n");
2602       }
2603       fflush(stdout);
2604     }
2605     maplen = 0;
2606     for( i = 0; i< ndim; i++){
2607       GA[ga_handle].nblock[i] = pe[i];
2608       maplen += pe[i];
2609     }
2610     GA[ga_handle].mapc = (C_Integer*)malloc((maplen+1)*sizeof(C_Integer*));
2611     for(i = 0; i< maplen; i++) {
2612       GA[ga_handle].mapc[i] = (C_Integer)mapALL[i];
2613     }
2614     GA[ga_handle].mapc[maplen] = -1;
2615   } else if (GA[ga_handle].distr_type == BLOCK_CYCLIC) {
2616     /* Regular block-cyclic data distribution has been specified. Figure
2617        out how much memory is needed by each processor to store blocks */
2618     Integer nblocks = GA[ga_handle].block_total;
2619     Integer tsize, j;
2620     Integer lo[MAXDIM];
2621     block_size = 0;
2622     for (i=GAme; i<nblocks; i +=GAnproc) {
2623       ga_ownsM(ga_handle,i,lo,hi);
2624       tsize = 1;
2625       for (j=0; j<ndim; j++) {
2626         tsize *= (hi[j] - lo[j] + 1);
2627       }
2628       block_size += tsize;
2629     }
2630   } else if (GA[ga_handle].distr_type == SCALAPACK) {
2631     /* ScaLAPACK block-cyclic data distribution has been specified. Figure
2632        out how much memory is needed by each processor to store blocks */
2633     Integer j, jtot, skip, imin, imax;
2634     Integer index[MAXDIM];
2635     gam_find_proc_indices(ga_handle,GAme,index);
2636     block_size = 1;
2637     for (i=0; i<ndim; i++) {
2638       skip = GA[ga_handle].nblock[i];
2639       jtot = 0;
2640       for (j=index[i]; j<GA[ga_handle].num_blocks[i]; j += skip) {
2641         imin = j*GA[ga_handle].block_dims[i] + 1;
2642         imax = (j+1)*GA[ga_handle].block_dims[i];
2643         if (imax > GA[ga_handle].dims[i]) imax = GA[ga_handle].dims[i];
2644         jtot += (imax-imin+1);
2645       }
2646       block_size *= jtot;
2647     }
2648   } else if (GA[ga_handle].distr_type == TILED) {
2649     /* Tiled data distribution has been specified. Figure
2650        out how much memory is needed by each processor to store blocks */
2651     Integer j, jtot, skip, imin, imax;
2652     Integer index[MAXDIM];
2653     gam_find_tile_proc_indices(ga_handle,GAme,index);
2654     block_size = 1;
2655     for (i=0; i<ndim; i++) {
2656       skip = GA[ga_handle].nblock[i];
2657       jtot = 0;
2658       for (j=index[i]; j<GA[ga_handle].num_blocks[i]; j += skip) {
2659         imin = j*GA[ga_handle].block_dims[i] + 1;
2660         imax = (j+1)*GA[ga_handle].block_dims[i];
2661         if (imax > GA[ga_handle].dims[i]) imax = GA[ga_handle].dims[i];
2662         jtot += (imax-imin+1);
2663       }
2664       block_size *= jtot;
2665     }
2666   } else if (GA[ga_handle].distr_type == TILED_IRREG) {
2667     /* Tiled data distribution has been specified. Figure
2668        out how much memory is needed by each processor to store blocks */
2669     Integer j, jtot, skip, imin, imax;
2670     Integer index[MAXDIM];
2671     Integer offset = 0;
2672     gam_find_tile_proc_indices(ga_handle,GAme,index);
2673     block_size = 1;
2674     for (i=0; i<ndim; i++) {
2675       skip = GA[ga_handle].nblock[i];
2676       jtot = 0;
2677       for (j=index[i]; j<GA[ga_handle].num_blocks[i]; j += skip) {
2678         imin = GA[ga_handle].mapc[offset+j];
2679         if (j<GA[ga_handle].num_blocks[i]-1) {
2680           imax = GA[ga_handle].mapc[offset+j+1]-1;
2681         } else {
2682           imax = GA[ga_handle].dims[i];
2683         }
2684         jtot += (imax-imin+1);
2685       }
2686       block_size *= jtot;
2687       offset += GA[ga_handle].num_blocks[i];
2688     }
2689   }
2690 
2691   GAstat.numcre ++;
2692 
2693   GA[ga_handle].actv = 1;
2694   /* If only one node is being used and array is mirrored,
2695    * set proc list to world group */
2696   if (pnga_cluster_nnodes() == 1 && GA[ga_handle].p_handle == 0) {
2697     GA[ga_handle].p_handle = pnga_pgroup_get_world();
2698   }
2699 
2700   /* Set remaining parameters and determine memory size if regular data
2701    * distribution is being used */
2702   if (GA[ga_handle].distr_type == REGULAR) {
2703     /* set corner flag, if it has not already been set and set up message
2704        passing data */
2705     if (GA[ga_handle].corner_flag == -1) {
2706        i = 1;
2707     } else {
2708        i = GA[ga_handle].corner_flag;
2709     }
2710     for( i = 0; i< ndim; i++){
2711        GA[ga_handle].scale[i] = (double)GA[ga_handle].nblock[i]
2712          / (double)GA[ga_handle].dims[i];
2713     }
2714     pnga_set_ghost_corner_flag(g_a, i);
2715 
2716     /*** determine which portion of the array I am supposed to hold ***/
2717     if (p_handle == 0) { /* for mirrored arrays */
2718        Integer me_local = (Integer)PGRP_LIST[p_handle].map_proc_list[GAme];
2719        pnga_distribution(g_a, me_local, GA[ga_handle].lo, hi);
2720     } else {
2721        pnga_distribution(g_a, grp_me, GA[ga_handle].lo, hi);
2722     }
2723     if (GA[ga_handle].num_rstrctd == 0 || GA[ga_handle].has_data == 1) {
2724       for( i = 0, nelem=1; i< ndim; i++){
2725         /*
2726         GA[ga_handle].chunk[i] = ((C_Integer)hi[i]-GA[ga_handle].lo[i]+1);
2727         */
2728         nelem *= (hi[i]-(Integer)GA[ga_handle].lo[i]+1+2*width[i]);
2729       }
2730     } else {
2731       nelem = 0;
2732     }
2733     mem_size = nelem * GA[ga_handle].elemsize;
2734   } else {
2735     mem_size = block_size * GA[ga_handle].elemsize;
2736   }
2737   GA[ga_handle].id = INVALID_MA_HANDLE;
2738   GA[ga_handle].size = (C_Long)mem_size;
2739   /* if requested, enforce limits on memory consumption */
2740   if(GA_memory_limited) GA_total_memory -= mem_size;
2741   /* check if everybody has enough memory left */
2742   if(GA_memory_limited){
2743      status = (GA_total_memory >= 0) ? 1 : 0;
2744      if (p_handle > 0) {
2745         /* pnga_pgroup_gop(p_handle,pnga_type_f2c(MT_F_INT), &status, 1, "*"); */
2746         pnga_pgroup_gop(p_handle,pnga_type_f2c(MT_F_INT), &status, 1, "&&");
2747      } else {
2748         /* pnga_gop(pnga_type_f2c(MT_F_INT), &status, 1, "*"); */
2749         pnga_gop(pnga_type_f2c(MT_F_INT), &status, 1, "&&");
2750      }
2751   }else status = 1;
2752 
2753   if (status) {
2754     if (GA[ga_handle].mem_dev_set) {
2755       status = !gai_get_devmem(GA[ga_handle].name, GA[ga_handle].ptr,mem_size,
2756           GA[ga_handle].type, &GA[ga_handle].id, p_handle,
2757           GA[ga_handle].mem_dev_set, GA[ga_handle].mem_dev);
2758     } else {
2759       status = !gai_getmem(GA[ga_handle].name, GA[ga_handle].ptr,mem_size,
2760           GA[ga_handle].type, &GA[ga_handle].id, p_handle);
2761     }
2762   } else {
2763      GA[ga_handle].ptr[grp_me]=NULL;
2764   }
2765 
2766   if (GA[ga_handle].distr_type == REGULAR) {
2767     /* Finish setting up information for ghost cell updates */
2768     if (GA[ga_handle].ghosts == 1) {
2769       if (!pnga_set_ghost_info(g_a))
2770         pnga_error("Could not allocate update information for ghost cells",0);
2771     }
2772     /* If array is mirrored, evaluate first and last indices */
2773     /* ngai_get_first_last_indices(&g_a); */
2774   }
2775 
2776   pnga_pgroup_sync(p_handle);
2777   if (status) {
2778     GAstat.curmem += (long)GA[ga_handle].size;
2779     GAstat.maxmem  = (long)GA_MAX(GAstat.maxmem, GAstat.curmem);
2780     status = TRUE;
2781   } else {
2782     if(GA_memory_limited) GA_total_memory += mem_size;
2783     pnga_destroy(g_a);
2784     status = FALSE;
2785   }
2786   return status;
2787 }
2788 
2789 /**
2790  *  Use memory from another GA and complete setup of global array
2791  */
2792 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2793 #   pragma weak wnga_overlay = pnga_overlay
2794 #endif
2795 
pnga_overlay(Integer g_a,Integer g_parent)2796 logical pnga_overlay(Integer g_a, Integer g_parent)
2797 {
2798 
2799   Integer hi[MAXDIM];
2800   Integer ga_handle = g_a + GA_OFFSET;
2801   Integer g_p = g_parent + GA_OFFSET;
2802   Integer d, width[MAXDIM], ndim;
2803   Integer mem_size, nelem;
2804   Integer i, status, maplen=0, p_handle;
2805   Integer dims[MAXDIM], chunk[MAXDIM];
2806   Integer pe[MAXDIM], *pmap[MAXDIM], *map;
2807   Integer blk[MAXDIM];
2808   Integer grp_me=GAme, grp_nproc=GAnproc;
2809   Integer block_size = 0;
2810 
2811   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous sync masking*/
2812   if (GA[ga_handle].ndim == -1)
2813     pnga_error("Insufficient data to create global array",0);
2814 
2815   p_handle = (Integer)GA[ga_handle].p_handle;
2816   if (p_handle == (Integer)GA_Init_Proc_Group) {
2817     GA[ga_handle].p_handle = GA_Default_Proc_Group;
2818     p_handle = GA_Default_Proc_Group;
2819   }
2820   pnga_pgroup_sync(p_handle);
2821   if (p_handle != (Integer)GA[g_p].p_handle) {
2822     pnga_error("Parent and overlay global array must be on same procesor group",0);
2823   }
2824 
2825   if (p_handle > 0) {
2826      grp_nproc  = PGRP_LIST[p_handle].map_nproc;
2827      grp_me = PGRP_LIST[p_handle].map_proc_list[GAme];
2828   }
2829 
2830   if(!GAinitialized) pnga_error("GA not initialized ", 0);
2831   if(!ma_address_init) gai_ma_address_init();
2832 
2833   ndim = GA[ga_handle].ndim;
2834   for (i=0; i<ndim; i++) width[i] = (C_Integer)GA[ga_handle].width[i];
2835 
2836   /* The data distribution has not been specified by the user. Create
2837      default distribution */
2838   if (GA[ga_handle].mapc == NULL && GA[ga_handle].distr_type == REGULAR) {
2839     for (d=0; d<ndim; d++) {
2840       dims[d] = (Integer)GA[ga_handle].dims[d];
2841       chunk[d] = (Integer)GA[ga_handle].chunk[d];
2842     }
2843     if(chunk[0]!=0) /* for chunk[0]=0 compute all */
2844       for(d=0; d< ndim; d++) blk[d]=(Integer)GA_MIN(chunk[d],dims[d]);
2845     else
2846       for(d=0; d< ndim; d++) blk[d]=-1;
2847 
2848     /* eliminate dimensions =1 from ddb analysis */
2849     for(d=0; d<ndim; d++)if(dims[d]==1)blk[d]=1;
2850 
2851     if (GAme==0 && DEBUG )
2852       for (d=0;d<ndim;d++) fprintf(stderr,"b[%ld]=%ld\n",(long)d,(long)blk[d]);
2853     pnga_pgroup_sync(p_handle);
2854 
2855     /* ddb(ndim, dims, GAnproc, blk, pe);*/
2856     if(p_handle == 0) /* for mirrored arrays */
2857 #if OLD_DISTRIBUTION
2858        ddb_h2(ndim, dims, PGRP_LIST[p_handle].map_nproc,0.0,(Integer)0, blk, pe);
2859 #else
2860        ddb(ndim, dims, PGRP_LIST[p_handle].map_nproc, blk, pe);
2861 #endif
2862     else
2863        if (GA[ga_handle].num_rstrctd == 0) {
2864          /* Data is normally distributed on processors */
2865 #if OLD_DISTRIBUTION
2866          ddb_h2(ndim, dims, grp_nproc,0.0,(Integer)0, blk, pe);
2867 #else
2868          ddb(ndim, dims, grp_nproc, blk, pe);
2869 #endif
2870        } else {
2871          /* Data is only distributed on subset of processors */
2872 #if OLD_DISTRIBUTION
2873          ddb_h2(ndim, dims, GA[ga_handle].num_rstrctd, 0.0, (Integer)0, blk, pe);
2874 #else
2875          ddb(ndim, dims, GA[ga_handle].num_rstrctd, blk, pe);
2876 #endif
2877        }
2878 
2879     for(d=0, map=mapALL; d< ndim; d++){
2880       Integer nblock;
2881       Integer pcut; /* # procs that get full blk[] elements; the rest gets less*/
2882       int p;
2883 
2884       pmap[d] = map;
2885 
2886       /* RJH ... don't leave some nodes without data if possible
2887        but respect the users block size */
2888 
2889       if (chunk[d] > 1) {
2890         Integer ddim = ((dims[d]-1)/GA_MIN(chunk[d],dims[d]) + 1);
2891         pcut = (ddim -(blk[d]-1)*pe[d]) ;
2892       }
2893       else {
2894         pcut = (dims[d]-(blk[d]-1)*pe[d]) ;
2895       }
2896 
2897       for (nblock=i=p=0; (p<pe[d]) && (i<dims[d]); p++, nblock++) {
2898         Integer b = blk[d];
2899         if (p >= pcut)
2900           b = b-1;
2901         map[nblock] = i+1;
2902         if (chunk[d]>1) b *= GA_MIN(chunk[d],dims[d]);
2903         i += b;
2904       }
2905 
2906       pe[d] = GA_MIN(pe[d],nblock);
2907       map +=  pe[d];
2908     }
2909     if(GAme==0&& DEBUG){
2910       gai_print_subscript("pe ",(int)ndim, pe,"\n");
2911       gai_print_subscript("blocks ",(int)ndim, blk,"\n");
2912       printf("decomposition map\n");
2913       for(d=0; d< ndim; d++){
2914         printf("dim=%ld: ",(long)d);
2915         for (i=0;i<pe[d];i++)printf("%ld ",(long)pmap[d][i]);
2916         printf("\n");
2917       }
2918       fflush(stdout);
2919     }
2920     maplen = 0;
2921     for( i = 0; i< ndim; i++){
2922       GA[ga_handle].nblock[i] = pe[i];
2923       maplen += pe[i];
2924     }
2925     GA[ga_handle].mapc = (C_Integer*)malloc((maplen+1)*sizeof(C_Integer*));
2926     for(i = 0; i< maplen; i++) {
2927       GA[ga_handle].mapc[i] = (C_Integer)mapALL[i];
2928     }
2929     GA[ga_handle].mapc[maplen] = -1;
2930   } else if (GA[ga_handle].distr_type == BLOCK_CYCLIC) {
2931     /* Regular block-cyclic data distribution has been specified. Figure
2932        out how much memory is needed by each processor to store blocks */
2933     Integer nblocks = GA[ga_handle].block_total;
2934     Integer tsize, j;
2935     Integer lo[MAXDIM];
2936     block_size = 0;
2937     for (i=GAme; i<nblocks; i +=GAnproc) {
2938       ga_ownsM(ga_handle,i,lo,hi);
2939       tsize = 1;
2940       for (j=0; j<ndim; j++) {
2941         tsize *= (hi[j] - lo[j] + 1);
2942       }
2943       block_size += tsize;
2944     }
2945   } else if (GA[ga_handle].distr_type == SCALAPACK) {
2946     /* ScaLAPACK block-cyclic data distribution has been specified. Figure
2947        out how much memory is needed by each processor to store blocks */
2948     Integer j, jtot, skip, imin, imax;
2949     Integer index[MAXDIM];
2950     gam_find_proc_indices(ga_handle,GAme,index);
2951     block_size = 1;
2952     for (i=0; i<ndim; i++) {
2953       skip = GA[ga_handle].nblock[i];
2954       jtot = 0;
2955       for (j=index[i]; j<GA[ga_handle].num_blocks[i]; j += skip) {
2956         imin = j*GA[ga_handle].block_dims[i] + 1;
2957         imax = (j+1)*GA[ga_handle].block_dims[i];
2958         if (imax > GA[ga_handle].dims[i]) imax = GA[ga_handle].dims[i];
2959         jtot += (imax-imin+1);
2960       }
2961       block_size *= jtot;
2962     }
2963   } else if (GA[ga_handle].distr_type == TILED) {
2964     /* Tiled data distribution has been specified. Figure
2965        out how much memory is needed by each processor to store blocks */
2966     Integer j, jtot, skip, imin, imax;
2967     Integer index[MAXDIM];
2968     gam_find_tile_proc_indices(ga_handle,GAme,index);
2969     block_size = 1;
2970     for (i=0; i<ndim; i++) {
2971       skip = GA[ga_handle].nblock[i];
2972       jtot = 0;
2973       for (j=index[i]; j<GA[ga_handle].num_blocks[i]; j += skip) {
2974         imin = j*GA[ga_handle].block_dims[i] + 1;
2975         imax = (j+1)*GA[ga_handle].block_dims[i];
2976         if (imax > GA[ga_handle].dims[i]) imax = GA[ga_handle].dims[i];
2977         jtot += (imax-imin+1);
2978       }
2979       block_size *= jtot;
2980     }
2981   } else if (GA[ga_handle].distr_type == TILED_IRREG) {
2982     /* Tiled data distribution has been specified. Figure
2983        out how much memory is needed by each processor to store blocks */
2984     Integer j, jtot, skip, imin, imax;
2985     Integer index[MAXDIM];
2986     Integer offset = 0;
2987     gam_find_tile_proc_indices(ga_handle,GAme,index);
2988     block_size = 1;
2989     for (i=0; i<ndim; i++) {
2990       skip = GA[ga_handle].nblock[i];
2991       jtot = 0;
2992       for (j=index[i]; j<GA[ga_handle].num_blocks[i]; j += skip) {
2993         imin = GA[ga_handle].mapc[offset+j];
2994         if (j<GA[ga_handle].num_blocks[i]-1) {
2995           imax = GA[ga_handle].mapc[offset+j+1]-1;
2996         } else {
2997           imax = GA[ga_handle].dims[i];
2998         }
2999         jtot += (imax-imin+1);
3000       }
3001       block_size *= jtot;
3002       offset += GA[ga_handle].num_blocks[i];
3003     }
3004   }
3005 
3006   GAstat.numcre ++;
3007 
3008   GA[ga_handle].actv = 1;
3009   /* If only one node is being used and array is mirrored,
3010    * set proc list to world group */
3011   if (pnga_cluster_nnodes() == 1 && GA[ga_handle].p_handle == 0) {
3012     GA[ga_handle].p_handle = pnga_pgroup_get_world();
3013   }
3014 
3015   /* Set remaining parameters and determine memory size if regular data
3016    * distribution is being used */
3017   if (GA[ga_handle].distr_type == REGULAR) {
3018     /* set corner flag, if it has not already been set and set up message
3019        passing data */
3020     if (GA[ga_handle].corner_flag == -1) {
3021        i = 1;
3022     } else {
3023        i = GA[ga_handle].corner_flag;
3024     }
3025     for( i = 0; i< ndim; i++){
3026        GA[ga_handle].scale[i] = (double)GA[ga_handle].nblock[i]
3027          / (double)GA[ga_handle].dims[i];
3028     }
3029     pnga_set_ghost_corner_flag(g_a, i);
3030 
3031     /*** determine which portion of the array I am supposed to hold ***/
3032     if (p_handle == 0) { /* for mirrored arrays */
3033        Integer me_local = (Integer)PGRP_LIST[p_handle].map_proc_list[GAme];
3034        pnga_distribution(g_a, me_local, GA[ga_handle].lo, hi);
3035     } else {
3036        pnga_distribution(g_a, grp_me, GA[ga_handle].lo, hi);
3037     }
3038     if (GA[ga_handle].num_rstrctd == 0 || GA[ga_handle].has_data == 1) {
3039       for( i = 0, nelem=1; i< ndim; i++){
3040         /*
3041         GA[ga_handle].chunk[i] = ((C_Integer)hi[i]-GA[ga_handle].lo[i]+1);
3042         */
3043         nelem *= (hi[i]-(Integer)GA[ga_handle].lo[i]+1+2*width[i]);
3044       }
3045     } else {
3046       nelem = 0;
3047     }
3048     mem_size = nelem * GA[ga_handle].elemsize;
3049   } else {
3050     mem_size = block_size * GA[ga_handle].elemsize;
3051   }
3052   GA[ga_handle].id = INVALID_MA_HANDLE;
3053   GA[ga_handle].size = (C_Long)mem_size;
3054   /* check if everybody has enough memory to fit overlay GA */
3055   status = (GA[ga_handle].size <= GA[g_p].size) ? 1 : 0;
3056   if (p_handle > 0) {
3057     pnga_pgroup_gop(p_handle,pnga_type_f2c(MT_F_INT), &status, 1, "&&");
3058   } else {
3059     pnga_gop(pnga_type_f2c(MT_F_INT), &status, 1, "&&");
3060   }
3061 
3062   if (status) {
3063     GA[ga_handle].overlay = 1;
3064     GA[ga_handle].id = GA[g_p].id;
3065     for (i=0; i<grp_nproc; i++) {
3066       GA[ga_handle].ptr[i] = GA[g_p].ptr[i];
3067     }
3068   }
3069 
3070   if (GA[ga_handle].distr_type == REGULAR) {
3071     /* Finish setting up information for ghost cell updates */
3072     if (GA[ga_handle].ghosts == 1) {
3073       if (!pnga_set_ghost_info(g_a))
3074         pnga_error("Could not allocate update information for ghost cells",0);
3075     }
3076     /* If array is mirrored, evaluate first and last indices */
3077     /* ngai_get_first_last_indices(&g_a); */
3078   }
3079 
3080   pnga_pgroup_sync(p_handle);
3081   if (status) {
3082     status = TRUE;
3083   } else {
3084     pnga_destroy(g_a);
3085     status = FALSE;
3086   }
3087   return status;
3088 }
3089 
3090 /**
3091  *  Create an N-dimensional Global Array with ghost cells using an
3092  *  irregular distribution on a user-specified process group.
3093  *  This is the master routine. All other creation routines are derived
3094  *  from this one.
3095  */
3096 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3097 #   pragma weak wnga_create_ghosts_irreg_config =  pnga_create_ghosts_irreg_config
3098 #endif
3099 
pnga_create_ghosts_irreg_config(Integer type,Integer ndim,Integer * dims,Integer * width,char * array_name,Integer * map,Integer * nblock,Integer p_handle,Integer * g_a)3100 logical pnga_create_ghosts_irreg_config(
3101         Integer type,     /* MA type */
3102         Integer ndim,     /* number of dimensions */
3103         Integer *dims,    /* array of dimensions */
3104         Integer *width,   /* width of boundary cells for each dimension */
3105         char *array_name, /* array name */
3106         Integer *map,     /* decomposition map array */
3107         Integer *nblock,  /* number of blocks for each dimension in map */
3108         Integer p_handle, /* processor list handle */
3109         Integer *g_a)     /* array handle (output) */
3110 {
3111   logical status;
3112   Integer g_A;
3113 
3114   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous sync masking*/
3115   pnga_pgroup_sync(p_handle);
3116 
3117   g_A = pnga_create_handle();
3118   *g_a = g_A;
3119   pnga_set_data(g_A,ndim,dims,type);
3120   pnga_set_ghosts(g_A,width);
3121   pnga_set_array_name(g_A,array_name);
3122   pnga_set_irreg_distr(g_A,map,nblock);
3123   pnga_set_pgroup(g_A,p_handle);
3124   status = pnga_allocate(g_A);
3125 
3126   return status;
3127 }
3128 
3129 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3130 #   pragma weak wnga_create_ghosts_irreg =  pnga_create_ghosts_irreg
3131 #endif
3132 
pnga_create_ghosts_irreg(Integer type,Integer ndim,Integer dims[],Integer width[],char * array_name,Integer map[],Integer nblock[],Integer * g_a)3133 logical pnga_create_ghosts_irreg(
3134         Integer type,     /* MA type */
3135         Integer ndim,     /* number of dimensions */
3136         Integer dims[],   /* array of dimensions */
3137         Integer width[],  /* width of boundary cells for each dimension */
3138         char *array_name, /* array name */
3139         Integer map[],    /* decomposition map array */
3140         Integer nblock[], /* number of blocks for each dimension in map */
3141         Integer *g_a)     /* array handle (output) */
3142 {
3143    Integer p_handle = pnga_pgroup_get_default();
3144    return pnga_create_ghosts_irreg_config(type, ndim, dims, width,
3145                 array_name, map, nblock, p_handle, g_a);
3146 }
3147 
3148 
3149 /** Create an N-dimensional Global Array on user-specified process group.
3150  *  Allow machine to choose location of array boundaries on individual
3151  *  processors
3152  */
3153 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3154 #   pragma weak wnga_create_config = pnga_create_config
3155 #endif
3156 
pnga_create_config(Integer type,Integer ndim,Integer dims[],char * array_name,Integer * chunk,Integer p_handle,Integer * g_a)3157 logical pnga_create_config(Integer type,
3158                          Integer ndim,
3159                          Integer dims[],
3160                          char* array_name,
3161                          Integer *chunk,
3162                          Integer p_handle,
3163                          Integer *g_a)
3164 {
3165   logical status;
3166   Integer g_A;
3167   g_A = pnga_create_handle();
3168   *g_a = g_A;
3169   pnga_set_data(g_A,ndim,dims,type);
3170   pnga_set_array_name(g_A,array_name);
3171   pnga_set_chunk(g_A,chunk);
3172   pnga_set_pgroup(g_A,p_handle);
3173   status = pnga_allocate(g_A);
3174   return status;
3175 }
3176 
3177 /** Create an N-dimensional Global Array on default processor group.
3178  *  Allow machine to choose location of array boundaries on individual
3179  *  processors
3180  */
3181 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3182 #   pragma weak wnga_create = pnga_create
3183 #endif
3184 
pnga_create(Integer type,Integer ndim,Integer dims[],char * array_name,Integer * chunk,Integer * g_a)3185 logical pnga_create(Integer type,
3186                    Integer ndim,
3187                    Integer dims[],
3188                    char* array_name,
3189                    Integer *chunk,
3190                    Integer *g_a)
3191 {
3192   GA_Internal_Threadsafe_Lock();
3193   Integer p_handle = pnga_pgroup_get_default();
3194   logical result = pnga_create_config(type, ndim, dims, array_name, chunk, p_handle, g_a);
3195   GA_Internal_Threadsafe_Unlock();
3196   return result;
3197 }
3198 
3199 
3200 /*\ CREATE AN N-DIMENSIONAL GLOBAL ARRAY WITH GHOST CELLS
3201  *  Allow machine to choose location of array boundaries on individual
3202  *  processors.
3203 \*/
3204 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3205 #   pragma weak wnga_create_ghosts_config = pnga_create_ghosts_config
3206 #endif
3207 
pnga_create_ghosts_config(Integer type,Integer ndim,Integer dims[],Integer width[],char * array_name,Integer chunk[],Integer p_handle,Integer * g_a)3208 logical pnga_create_ghosts_config(Integer type,
3209                    Integer ndim,
3210                    Integer dims[],
3211                    Integer width[],
3212                    char* array_name,
3213                    Integer chunk[],
3214                    Integer p_handle,
3215                    Integer *g_a)
3216 {
3217   logical status;
3218   Integer g_A;
3219   g_A = pnga_create_handle();
3220   *g_a = g_A;
3221   pnga_set_data(g_A,ndim,dims,type);
3222   pnga_set_ghosts(g_A,width);
3223   pnga_set_array_name(g_A,array_name);
3224   pnga_set_chunk(g_A,chunk);
3225   pnga_set_pgroup(g_A,p_handle);
3226   status = pnga_allocate(g_A);
3227   return status;
3228 }
3229 
3230 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3231 #   pragma weak wnga_create_ghosts = pnga_create_ghosts
3232 #endif
3233 
pnga_create_ghosts(Integer type,Integer ndim,Integer dims[],Integer width[],char * array_name,Integer chunk[],Integer * g_a)3234 logical pnga_create_ghosts(Integer type,
3235                    Integer ndim,
3236                    Integer dims[],
3237                    Integer width[],
3238                    char* array_name,
3239                    Integer chunk[],
3240                    Integer *g_a)
3241 {
3242   Integer p_handle = pnga_pgroup_get_default();
3243   return pnga_create_ghosts_config(type, ndim, dims, width, array_name,
3244                   chunk, p_handle, g_a);
3245 }
3246 
3247 /**
3248  *  Create a Global Array with an irregular distribution and a user-specified
3249  *  process group. The user can specify location of array boundaries on
3250  *  individual processors.
3251  */
3252 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3253 #   pragma weak wnga_create_irreg_config = pnga_create_irreg_config
3254 #endif
3255 
pnga_create_irreg_config(Integer type,Integer ndim,Integer dims[],char * array_name,Integer map[],Integer nblock[],Integer p_handle,Integer * g_a)3256 logical pnga_create_irreg_config(
3257         Integer type,     /* MA type */
3258         Integer ndim,     /* number of dimensions */
3259         Integer dims[],   /* array of dimensions */
3260         char *array_name, /* array name */
3261         Integer map[],    /* decomposition map array */
3262         Integer nblock[], /* number of blocks for each dimension in map */
3263         Integer p_handle, /* processor list hande */
3264         Integer *g_a)     /* array handle (output) */
3265 {
3266 Integer  d,width[MAXDIM];
3267 logical status;
3268 
3269       for (d=0; d<ndim; d++) width[d] = 0;
3270       status = pnga_create_ghosts_irreg_config(type, ndim, dims, width,
3271           array_name, map, nblock, p_handle, g_a);
3272 
3273       return status;
3274 }
3275 
3276 
3277 /**
3278  *  Create a Global Array with an irregular distribution. The user can specify
3279  *  location of array boundaries on individual
3280  *  processors.
3281  */
3282 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3283 #   pragma weak wnga_create_irreg = pnga_create_irreg
3284 #endif
3285 
pnga_create_irreg(Integer type,Integer ndim,Integer dims[],char * array_name,Integer map[],Integer nblock[],Integer * g_a)3286 logical pnga_create_irreg(
3287         Integer type,     /* MA type */
3288         Integer ndim,     /* number of dimensions */
3289         Integer dims[],   /* array of dimensions */
3290         char *array_name, /* array name */
3291         Integer map[],    /* decomposition map array */
3292         Integer nblock[], /* number of blocks for each dimension in map */
3293         Integer *g_a)     /* array handle (output) */
3294 {
3295 
3296 Integer  d,width[MAXDIM];
3297 logical status;
3298 
3299       for (d=0; d<ndim; d++) width[d] = 0;
3300       status = pnga_create_ghosts_irreg(type, ndim, dims, width,
3301           array_name, map, nblock, g_a);
3302 
3303       return status;
3304 }
3305 
/*\ get memory aligned w.r.t. MA base
3307  *  required on Linux as g77 ignores natural data alignment in common blocks
3308 \*/
int gai_get_shmem(char **ptr_arr, C_Long bytes, int type, long *adj,
		  int grp_id)
{
/* Allocate shared (ARMCI) memory for a global array and align each
 * process's pointer to a natural boundary for the element type,
 * measured relative to the MA base arrays.
 *   ptr_arr - output: per-process data pointers (length = #procs in group)
 *   bytes   - requested size in bytes on this process
 *   type    - MA data type (determines alignment granularity)
 *   adj     - output: local alignment adjustment applied, in bytes
 *   grp_id  - processor group handle (>0 selects a subgroup)
 * Returns the ARMCI allocation status (0 on success). */
int status=0;
#ifndef _CHECK_MA_ALGN
char *base;
long diff, item_size;
Integer *adjust;
int i, nproc,grp_me=GAme;

    if (grp_id > 0) {
       nproc  = PGRP_LIST[grp_id].map_nproc;
       grp_me = PGRP_LIST[grp_id].map_proc_list[GAme];
    }
    else
       nproc = GAnproc;

    /* need to enforce proper, natural alignment (on size boundary)  */
    switch (pnga_type_c2f(type)){
      case MT_F_DBL:   base =  (char *) DBL_MB; break;
      case MT_F_INT:   base =  (char *) INT_MB; break;
      case MT_F_DCPL:  base =  (char *) DCPL_MB; break;
      case MT_F_SCPL:  base =  (char *) SCPL_MB; break;
      case MT_F_REAL:  base =  (char *) FLT_MB; break;
      default:        base = (char*)0;
    }

    item_size = GAsizeofM(type);
#   ifdef GA_ELEM_PADDING
       /* reserve room for the worst-case alignment shift below */
       bytes += (C_Long)item_size;
#   endif

#endif

    *adj = 0;

    /* use ARMCI_Malloc_group for groups if proc group is not world group
       or mirror group */
#ifdef MSG_COMMS_MPI
    if (grp_id > 0) {
       status = ARMCI_Malloc_group((void**)ptr_arr, (armci_size_t)bytes,
				   &PGRP_LIST[grp_id].group);
    } else {
#endif
      status = ARMCI_Malloc((void**)ptr_arr, (armci_size_t)bytes);
#ifdef MSG_COMMS_MPI
    }
#endif

    if(bytes!=0 && ptr_arr[grp_me]==NULL)
       pnga_error("gai_get_shmem: ARMCI Malloc failed", GAme);
    if(status) return status;

#ifndef _CHECK_MA_ALGN

    /* adjust all addresses if they are not aligned on corresponding nodes*/

    /* we need storage for GAnproc*sizeof(Integer) */
    /* JAD -- fixed bug where _ga_map was reused before gai_getmem was done
     * with it. Now malloc/free needed memory. */
    adjust = (Integer*)malloc(GAnproc*sizeof(Integer));
    if (adjust == NULL)
       pnga_error("gai_get_shmem: malloc of adjustment array failed", GAme);

    diff = (GA_ABS( base - (char *) ptr_arr[grp_me])) % item_size;
    for(i=0;i<nproc;i++)adjust[i]=0;
    adjust[grp_me] = (diff > 0) ? item_size - diff : 0;
    *adj = adjust[grp_me];

    /* exchange adjustments so every process knows everyone's shift */
    if (grp_id > 0)
       pnga_pgroup_gop(grp_id, pnga_type_f2c(MT_F_INT), adjust, nproc, "+");
    else
       pnga_gop(pnga_type_f2c(MT_F_INT), adjust, nproc, "+");

    for(i=0;i<nproc;i++){
       ptr_arr[i] = adjust[i] + (char*)ptr_arr[i];
    }
    free(adjust);

#endif
    return status;
}
3389 
int gai_uses_shm(int grp_id)
{
    /* Report whether shared memory is used for the given processor
     * group (subgroups queried through ARMCI group API when MPI
     * messaging is enabled). */
#ifdef MSG_COMMS_MPI
    if (grp_id > 0) {
        return ARMCI_Uses_shm_grp(&PGRP_LIST[grp_id].group);
    }
#endif
    return ARMCI_Uses_shm();
}
3398 
int gai_getmem(char* name, char **ptr_arr, C_Long bytes, int type, long *id,
	       int grp_id)
{
/* Allocate memory for a global array, preferring shared (ARMCI) memory
 * and falling back to MA storage when shared memory is not in use.
 *   name    - array name (used as the MA block label)
 *   ptr_arr - output: per-process data pointers
 *   bytes   - requested size in bytes on this process
 *   type    - MA data type
 *   id      - output: MA handle (or alignment adjustment in shmem path)
 *   grp_id  - processor group handle (>0 selects a subgroup)
 * Returns 0 on success, non-zero on failure. */
#ifdef AVOID_MA_STORAGE
   return gai_get_shmem(ptr_arr, bytes, type, id, grp_id);
#else
Integer handle = INVALID_MA_HANDLE, index;
Integer nproc=GAnproc, grp_me=GAme, item_size = GAsizeofM(type);
C_Long nelem;
char *ptr = (char*)0;

   if (grp_id > 0) {
     nproc  = PGRP_LIST[grp_id].map_nproc;
     grp_me = PGRP_LIST[grp_id].map_proc_list[GAme];
   }

   if(gai_uses_shm(grp_id)) return gai_get_shmem(ptr_arr, bytes, type, id, grp_id);
   else{
     nelem = bytes/((C_Long)item_size) + 1;
     if(bytes) {
        if(MA_alloc_get(type, nelem, name, &handle, &index)) {
                MA_get_pointer(handle, &ptr);
        }
     }
     *id   = (long)handle;

     /* bzero is legacy (removed from POSIX.1-2008); use standard memset */
     memset((char*)ptr_arr, 0, (size_t)nproc*sizeof(char*));
     ptr_arr[grp_me] = ptr;

#   ifndef _CHECK_MA_ALGN /* align local pointer on an item_size boundary */
     {
        long diff, adjust;
        diff = ((unsigned long)ptr_arr[grp_me]) % item_size;
        adjust = (diff > 0) ? item_size - diff : 0;
        ptr_arr[grp_me] = adjust + (char*)ptr_arr[grp_me];
     }
#   endif

     /* make every process's pointer visible to all others */
#   ifdef MSG_COMMS_MPI
     if (grp_id > 0) {
        armci_exchange_address_grp((void**)ptr_arr,(int)nproc,
                                   &PGRP_LIST[grp_id].group);
     } else
#   endif
        armci_exchange_address((void**)ptr_arr,(int)nproc);
     if(bytes && !ptr) return 1;
     else return 0;
   }
#endif /* AVOID_MA_STORAGE */
}
3452 
/*\ get device memory aligned w.r.t. MA base
3454  *  required on Linux as g77 ignores natural data alignment in common blocks
3455 \*/
/**
 * Allocate (device or host) memory for the local piece of a global array via
 * ARMCI and align every process's pointer on an element-size boundary
 * relative to the MA base arrays.
 *
 * name     - array name (unused here except for symmetry with gai_getmem)
 * ptr_arr  - out: table of nproc pointers, one per process, address-exchanged
 * bytes    - local size request in bytes
 * type     - GA data type, used for alignment and element size
 * adj      - out: number of bytes this process's pointer was advanced
 * grp_id   - process group handle; >0 selects a proper subgroup
 * dev_flag - nonzero to allocate on a memory device
 * device   - device name, used when dev_flag is set
 *
 * Returns the ARMCI allocation status (0 on success).
 *
 * Fixes vs. previous revision:
 *  - grp_me was declared inside "#ifndef _CHECK_MA_ALGN" but used
 *    unconditionally in the allocation-failure check, which failed to
 *    compile when _CHECK_MA_ALGN was defined; it is now always in scope.
 *  - the failure message named gai_get_shmem instead of this function.
 *  - the temporary adjustment buffer allocation is now checked.
 */
int gai_get_devmem(char *name, char **ptr_arr, C_Long bytes, int type, long *adj,
		  int grp_id, int dev_flag, const char *device)
{
  int status=0;
  int grp_me=GAme;
#ifndef _CHECK_MA_ALGN
  char *base;
  long diff, item_size;
  Integer *adjust;
  int i, nproc;
#endif

  if (grp_id > 0)
    grp_me = PGRP_LIST[grp_id].map_proc_list[GAme];

#ifndef _CHECK_MA_ALGN
  if (grp_id > 0)
    nproc  = PGRP_LIST[grp_id].map_nproc;
  else
    nproc = GAnproc;

  /* need to enforce proper, natural alignment (on size boundary)  */
  switch (pnga_type_c2f(type)){
    case MT_F_DBL:   base =  (char *) DBL_MB; break;
    case MT_F_INT:   base =  (char *) INT_MB; break;
    case MT_F_DCPL:  base =  (char *) DCPL_MB; break;
    case MT_F_SCPL:  base =  (char *) SCPL_MB; break;
    case MT_F_REAL:  base =  (char *) FLT_MB; break;
    default:        base = (char*)0;
  }

  item_size = GAsizeofM(type);
#   ifdef GA_ELEM_PADDING
  /* reserve one extra element so the alignment shift below cannot run
   * past the end of the allocation */
  bytes += (C_Long)item_size;
#   endif

#endif

  *adj = 0;

  /* use ARMCI_Malloc_group for groups if proc group is not world group
     or mirror group */
#ifdef MSG_COMMS_MPI
  if (grp_id > 0) {
    if (dev_flag) {
      status = ARMCI_Malloc_group_memdev((void**)ptr_arr, (armci_size_t)bytes,
          &PGRP_LIST[grp_id].group, device);
    } else {
      status = ARMCI_Malloc_group((void**)ptr_arr, (armci_size_t)bytes,
          &PGRP_LIST[grp_id].group);
    }
  } else {
#endif
    if (dev_flag) {
      status = ARMCI_Malloc_memdev((void**)ptr_arr, (armci_size_t)bytes, device);
    } else {
      status = ARMCI_Malloc((void**)ptr_arr, (armci_size_t)bytes);
    }
#ifdef MSG_COMMS_MPI
  }
#endif

  if(bytes!=0 && ptr_arr[grp_me]==NULL)
    pnga_error("gai_get_devmem: ARMCI Malloc failed", GAme);
  if(status) return status;

#ifndef _CHECK_MA_ALGN

  /* adjust all addresses if they are not aligned on corresponding nodes*/

  /* we need storage for GAnproc*sizeof(Integer) */
  /* JAD -- fixed bug where _ga_map was reused before gai_getmem was done
   * with it. Now malloc/free needed memory. */
  adjust = (Integer*)malloc(GAnproc*sizeof(Integer));
  if (adjust == NULL)
    pnga_error("gai_get_devmem: malloc of adjust array failed", GAnproc);

  /* compute this process's shift; the global sum below distributes every
   * process's shift to everyone so remote pointers can be corrected too */
  diff = (GA_ABS( base - (char *) ptr_arr[grp_me])) % item_size;
  for(i=0;i<nproc;i++)adjust[i]=0;
  adjust[grp_me] = (diff > 0) ? item_size - diff : 0;
  *adj = adjust[grp_me];

  if (grp_id > 0)
    pnga_pgroup_gop(grp_id, pnga_type_f2c(MT_F_INT), adjust, nproc, "+");
  else
    pnga_gop(pnga_type_f2c(MT_F_INT), adjust, nproc, "+");

  for(i=0;i<nproc;i++){
    ptr_arr[i] = adjust[i] + (char*)ptr_arr[i];
  }
  free(adjust);

#endif
  return status;
}
3545 
3546 /*\ externalized version of gai_getmem to facilitate two-step array creation
3547 \*/
/**
 * Externalized version of gai_getmem to facilitate two-step array creation.
 * Allocates nelem elements of the given (Fortran-convention) type on every
 * process, prefixing the block with a hidden getmem_t header plus the table
 * of all processes' pointers, and returns a pointer to the user-visible
 * region. The memory is released with GA_Freemem.
 */
void *GA_Getmem(int type, int nelem, int grp_id)
{
char **ptr_arr=(char**)0;
int  rc,i;
long id;
int bytes;
/* hidden prefix: bookkeeping header followed by the per-process ptr table */
int extra=sizeof(getmem_t)+GAnproc*sizeof(char*);
char *myptr;
Integer status;
     type = pnga_type_f2c(type);
     bytes = nelem *  GAsizeofM(type);
     if(GA_memory_limited){
         GA_total_memory -= bytes+extra;
         status = (GA_total_memory >= 0) ? 1 : 0;
         /* pnga_gop(pnga_type_f2c(MT_F_INT), &status, 1, "*"); */
         pnga_gop(pnga_type_f2c(MT_F_INT), &status, 1, "&&");
         if(!status)GA_total_memory +=bytes+extra;
     }else status = 1;
     /* NOTE(review): status is computed above but never consulted before
      * allocating below -- the allocation proceeds even when the collective
      * memory-limit check failed; verify whether an early error was intended */

     ptr_arr=malloc(GAnproc*sizeof(char**));
     rc= gai_getmem("ga_getmem", ptr_arr,(Integer)bytes+extra, type, &id, grp_id);
     if(rc)pnga_error("ga_getmem: failed to allocate memory",bytes+extra);

     myptr = ptr_arr[GAme];

     /* make sure that remote memory addresses point to user memory */
     for(i=0; i<GAnproc; i++)ptr_arr[i] += extra;

#ifndef AVOID_MA_STORAGE
     if(ARMCI_Uses_shm())
#endif
        id += extra; /* id is used to store offset */

     /* stuff the type and id info at the beginning */
     ((getmem_t*)myptr)->id = id;
     ((getmem_t*)myptr)->type = type;
     ((getmem_t*)myptr)->size = bytes+extra;

     /* add ptr info */
     memcpy(myptr+sizeof(getmem_t),ptr_arr,(size_t)GAnproc*sizeof(char**));
     free(ptr_arr);

     /* hand back the address just past the hidden header */
     return (void*)(myptr+extra);
}
3592 
3593 
/**
 * Release memory obtained from GA_Getmem.
 *
 * ptr is the user-visible pointer returned by GA_Getmem; the getmem_t header
 * and the per-process pointer table live immediately before it. For
 * shared-memory allocations, info->id holds the byte offset that was added
 * to the raw pointer, so subtracting it recovers the original address to
 * free; on the MA path, info->id is the MA handle instead.
 */
void GA_Freemem(void *ptr)
{
int extra = sizeof(getmem_t)+GAnproc*sizeof(char*);
getmem_t *info = (getmem_t *)((char*)ptr - extra);
char **ptr_arr = (char**)(info+1);

#ifndef AVOID_MA_STORAGE
    if(ARMCI_Uses_shm()){
#endif
      /* make sure that we free original (before address alignment) pointer */
      ARMCI_Free(ptr_arr[GAme] - info->id);
#ifndef AVOID_MA_STORAGE
    }else{
      /* MA path: id is the MA handle (or INVALID_MA_HANDLE for 0-byte pieces) */
      if(info->id != INVALID_MA_HANDLE) MA_free_heap(info->id);
    }
#endif

    /* return the bytes to the collective memory budget, if one is enforced */
    if(GA_memory_limited) GA_total_memory += info->size;
}
3613 
3614 /**
3615  * Return coordinates of a GA patch associated with processor proc
3616  */
3617 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3618 #   pragma weak wnga_distribution = pnga_distribution
3619 #endif
3620 
/**
 * Return the patch coordinates [lo,hi] owned by processor proc for array g_a.
 * Restricted arrays map proc through the restricted-rank table first;
 * READ_ONLY arrays are replicated per node, so proc is reduced modulo the
 * node size (assumes the read-only property lives on the world group).
 */
void pnga_distribution(Integer g_a, Integer proc, Integer *lo, Integer * hi)
{
  Integer hdl, owner;

  ga_check_handleM(g_a, "nga_distribution");
  hdl = GA_OFFSET + g_a;

  owner = proc;
  if (GA[hdl].num_rstrctd > 0) {
    owner = GA[hdl].rank_rstrctd[owner];
  }

  /* This currently assumes that read-only property can only be applied to
   * processors on the world group */
  if (GA[hdl].property == READ_ONLY) {
    Integer node     = pnga_cluster_proc_nodeid(proc);
    Integer nodesize = pnga_cluster_nprocs(node);
    owner = proc % nodesize;
  }

  ga_ownsM(hdl, owner, lo, hi);
}
3641 
3642 /**
3643  * Check to see if array has ghost cells.
3644  */
3645 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3646 #   pragma weak wnga_has_ghosts =  pnga_has_ghosts
3647 #endif
3648 
pnga_has_ghosts(Integer g_a)3649 logical pnga_has_ghosts(Integer g_a)
3650 {
3651       int h_a = (int)g_a + GA_OFFSET;
3652       return GA[h_a].ghosts;
3653 }
3654 /**
3655  *  Return the dimension of a Global Array
3656  */
3657 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3658 #   pragma weak wnga_ndim =  pnga_ndim
3659 #endif
3660 
/** Return the number of dimensions of global array g_a. */
Integer pnga_ndim(Integer g_a)
{
      Integer hdl;
      ga_check_handleM(g_a,"ga_ndim");
      hdl = g_a + GA_OFFSET;
      return GA[hdl].ndim;
}
3666 
3667 
3668 
3669 /**
3670  * Duplicate an existing global array
3671  *  -- new array g_b will have properties of g_a
3672  * array_name    - a character string [input]
3673  * g_a           - Integer handle for reference array [input]
3674  * g_b           - Integer handle for new array [output]
3675 \*/
3676 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3677 #   pragma weak wnga_duplicate =  pnga_duplicate
3678 #endif
3679 
pnga_duplicate(Integer g_a,Integer * g_b,char * array_name)3680 logical pnga_duplicate(Integer g_a, Integer *g_b, char* array_name)
3681 {
3682   char     **save_ptr;
3683   C_Long  mem_size, mem_size_proc;
3684   Integer  i, ga_handle, status;
3685   int local_sync_begin,local_sync_end;
3686   Integer grp_id, grp_me=GAme;
3687   /* Integer grp_nproc=GAnproc; */
3688   int maplen;
3689 
3690   local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
3691   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
3692   grp_id = pnga_get_pgroup(g_a);
3693   if(local_sync_begin)pnga_pgroup_sync(grp_id);
3694 
3695   if (grp_id > 0) {
3696     /* grp_nproc  = PGRP_LIST[grp_id].map_nproc; */
3697     grp_me = PGRP_LIST[grp_id].map_proc_list[GAme];
3698   }
3699 
3700   GAstat.numcre ++;
3701 
3702   ga_check_handleM(g_a,"ga_duplicate");
3703 
3704   /* find a free global_array handle for g_b */
3705   ga_handle =-1; i=0;
3706   do{
3707     if(!GA[i].actv_handle) ga_handle=i;
3708     i++;
3709   }while(i<_max_global_array && ga_handle==-1);
3710   if( ga_handle == -1)
3711     pnga_error("ga_duplicate: too many arrays", (Integer)_max_global_array);
3712   *g_b = (Integer)ga_handle - GA_OFFSET;
3713    GA[ga_handle].actv_handle = 1;
3714 
3715   gai_init_struct(ga_handle);
3716 
3717   /*** copy content of the data structure ***/
3718   save_ptr = GA[ga_handle].ptr;
3719   GA[ga_handle] = GA[GA_OFFSET + g_a]; /* <--- shallow copy */
3720   strcpy(GA[ga_handle].name, array_name);
3721   GA[ga_handle].ptr = save_ptr;
3722   GA[ga_handle].distr_type = GA[GA_OFFSET + g_a].distr_type;
3723   maplen = calc_maplen(GA_OFFSET + g_a);
3724   if (maplen > 0) {
3725     GA[ga_handle].mapc = (C_Integer*)malloc((maplen+1)*sizeof(C_Integer*));
3726     for(i=0;i<maplen; i++)GA[ga_handle].mapc[i] = GA[GA_OFFSET+ g_a].mapc[i];
3727     GA[ga_handle].mapc[maplen] = -1;
3728   }
3729 
3730   /*** if ghost cells are used, initialize ghost cache data ***/
3731   GA[ga_handle].cache = NULL;
3732   pnga_set_ghost_info(*g_b);
3733 
3734   /*** initialize and copy info for restricted arrays, if relevant ***/
3735   GA[ga_handle].rstrctd_list = NULL;
3736   GA[ga_handle].rank_rstrctd = NULL;
3737   GA[ga_handle].num_rstrctd = 0;
3738   if (GA[GA_OFFSET + g_a].num_rstrctd > 0) {
3739     GA[ga_handle].num_rstrctd = GA[GA_OFFSET + g_a].num_rstrctd;
3740     pnga_set_restricted(*g_b, GA[GA_OFFSET + g_a].rstrctd_list,
3741         GA[GA_OFFSET + g_a].num_rstrctd);
3742   }
3743 
3744   /*** Memory Allocation & Initialization of GA Addressing Space ***/
3745   mem_size = mem_size_proc = GA[ga_handle].size;
3746   GA[ga_handle].id = INVALID_MA_HANDLE;
3747   /* if requested, enforce limits on memory consumption */
3748   if(GA_memory_limited) GA_total_memory -= mem_size_proc;
3749 
3750   /* check if everybody has enough memory left */
3751   if(GA_memory_limited){
3752     status = (GA_total_memory >= 0) ? 1 : 0;
3753     if (grp_id > 0) {
3754       /* pnga_pgroup_gop(grp_id, pnga_type_f2c(MT_F_INT), &status, 1, "*"); */
3755       pnga_pgroup_gop(grp_id, pnga_type_f2c(MT_F_INT), &status, 1, "&&");
3756       status = (Integer)status;
3757     } else {
3758       /* pnga_gop(pnga_type_f2c(MT_F_INT), &status, 1, "*"); */
3759       pnga_gop(pnga_type_f2c(MT_F_INT), &status, 1, "&&");
3760     }
3761   }else status = 1;
3762 
3763   if(status)
3764   {
3765     if (GA[ga_handle].mem_dev_set) {
3766       status = !gai_get_devmem(array_name, GA[ga_handle].ptr,mem_size,
3767           (int)GA[ga_handle].type, &GA[ga_handle].id,
3768           (int)grp_id,GA[ga_handle].mem_dev_set,GA[ga_handle].mem_dev);
3769     } else {
3770       status = !gai_getmem(array_name, GA[ga_handle].ptr,mem_size,
3771           (int)GA[ga_handle].type, &GA[ga_handle].id,
3772           (int)grp_id);
3773     }
3774 }
3775   else{
3776     GA[ga_handle].ptr[grp_me]=NULL;
3777   }
3778 
3779   if(local_sync_end)pnga_pgroup_sync(grp_id);
3780 
3781 #     ifdef GA_CREATE_INDEF
3782   /* This code is incorrect. It needs to fixed if INDEF is ever used */
3783   if(status){
3784     Integer one = 1;
3785     Integer dim1 =(Integer)GA[ga_handle].dims[1], dim2=(Integer)GA[ga_handle].dims[2];
3786     if(GAme==0)fprintf(stderr,"duplicate:initializing GA array%ld\n",g_b);
3787     if(GA[ga_handle].type == C_DBL) {
3788       double bad = (double) DBL_MAX;
3789       ga_fill_patch_(g_b, &one, &dim1, &one, &dim2,  &bad);
3790     } else if (GA[ga_handle].type == C_INT) {
3791       int bad = (int) INT_MAX;
3792       ga_fill_patch_(g_b, &one, &dim1, &one, &dim2,  &bad);
3793     } else if (GA[ga_handle].type == C_LONG) {
3794       long bad = LONG_MAX;
3795       ga_fill_patch_(g_b, &one, &dim1, &one, &dim2,  &bad);
3796     } else if (GA[ga_handle].type == C_LONGLONG) {
3797       long long bad = LONG_MAX;
3798       ga_fill_patch_(g_b, &one, &dim1, &one, &dim2,  &bad);
3799     } else if (GA[ga_handle].type == C_DCPL) {
3800       DoubleComplex bad = {DBL_MAX, DBL_MAX};
3801       ga_fill_patch_(g_b, &one, &dim1, &one, &dim2,  &bad);
3802     } else if (GA[ga_handle].type == C_SCPL) {
3803       SingleComplex bad = {FLT_MAX, FLT_MAX};
3804       ga_fill_patch_(g_b, &one, &dim1, &one, &dim2,  &bad);
3805     } else if (GA[ga_handle].type == C_FLOAT) {
3806       float bad = FLT_MAX;
3807       ga_fill_patch_(g_b, &one, &dim1, &one, &dim2,  &bad);
3808     } else {
3809       pnga_error("ga_duplicate: type not supported ",GA[ga_handle].type);
3810     }
3811   }
3812 #     endif
3813 
3814   if(status){
3815     GAstat.curmem += (long)GA[ga_handle].size;
3816     GAstat.maxmem  = (long)GA_MAX(GAstat.maxmem, GAstat.curmem);
3817     return(TRUE);
3818   }else{
3819     if (GA_memory_limited) GA_total_memory += mem_size_proc;
3820     pnga_destroy(*g_b);
3821     return(FALSE);
3822   }
3823 }
3824 
3825 /*\ DUPLICATE A GLOBAL ARRAY -- memory comes from user
3826  *  -- new array g_b will have properties of g_a
3827 \*/
GA_Assemble_duplicate(int g_a,char * array_name,void * ptr)3828 int GA_Assemble_duplicate(int g_a, char* array_name, void* ptr)
3829 {
3830 char     **save_ptr;
3831 int      i, ga_handle;
3832 int extra = sizeof(getmem_t)+GAnproc*sizeof(char*);
3833 getmem_t *info = (getmem_t *)((char*)ptr - extra);
3834 char **ptr_arr = (char**)(info+1);
3835 int g_b;
3836 int maplen = calc_maplen(GA_OFFSET + g_a);
3837 
3838 
3839       pnga_sync();
3840 
3841       GAstat.numcre ++;
3842 
3843       ga_check_handleM(g_a,"ga_assemble_duplicate");
3844 
3845       /* find a free global_array handle for g_b */
3846       ga_handle =-1; i=0;
3847       do{
3848         if(!GA[i].actv_handle) ga_handle=i;
3849         i++;
3850       }while(i<_max_global_array && ga_handle==-1);
3851       if( ga_handle == -1)
3852           pnga_error("ga_assemble_duplicate: too many arrays ",
3853                                            (Integer)_max_global_array);
3854       g_b = ga_handle - GA_OFFSET;
3855 
3856       gai_init_struct(ga_handle);
3857       GA[ga_handle].actv_handle = 1;
3858 
3859       /*** copy content of the data structure ***/
3860       save_ptr = GA[ga_handle].ptr;
3861       GA[ga_handle] = GA[GA_OFFSET + g_a];
3862       strcpy(GA[ga_handle].name, array_name);
3863       GA[ga_handle].ptr = save_ptr;
3864       if (maplen > 0) {
3865         GA[ga_handle].mapc = (C_Integer*)malloc((maplen+1)*sizeof(C_Integer*));
3866         for(i=0;i<maplen; i++)GA[ga_handle].mapc[i] = GA[GA_OFFSET+ g_a].mapc[i];
3867         GA[ga_handle].mapc[maplen] = -1;
3868       }
3869 
3870       /* get ptrs and datatype from user memory */
3871       gam_checktype(pnga_type_f2c(info->type));
3872       GA[ga_handle].type = pnga_type_f2c(info->type);
3873       GA[ga_handle].size = (C_Long)info->size;
3874       GA[ga_handle].id = info->id;
3875       memcpy(GA[ga_handle].ptr,ptr_arr,(size_t)GAnproc*sizeof(char**));
3876 
3877       GAstat.curmem += (long)GA[ga_handle].size;
3878       GAstat.maxmem  = (long)GA_MAX(GAstat.maxmem, GAstat.curmem);
3879 
3880       pnga_sync();
3881 
3882       return(g_b);
3883 }
3884 
3885 /**
3886  *  Destroy a Global Array and clean up memory
3887  */
3888 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3889 #   pragma weak wnga_destroy =  pnga_destroy
3890 #endif
3891 
/**
 * Destroy a Global Array and release the memory backing it.
 *
 * Returns TRUE on success, FALSE when the handle is out of range or the
 * array is not active. Frees, in order: ghost cache, restricted-array
 * tables, the distribution map, any read-cache buffers, the read-only
 * bookkeeping, and finally the data itself (ARMCI or MA, unless the array
 * is an overlay that borrows another array's memory).
 */
logical pnga_destroy(Integer g_a)
{
Integer ga_handle = GA_OFFSET + g_a, grp_id, grp_me=GAme;
int local_sync_begin,local_sync_end;

    local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
    _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
    grp_id = (Integer)GA[ga_handle].p_handle;
    if(local_sync_begin)pnga_pgroup_sync(grp_id);

    /* rank within the owning process group (world rank otherwise) */
    if (grp_id > 0) grp_me = PGRP_LIST[grp_id].map_proc_list[GAme];
    else grp_me=GAme;

    GAstat.numdes ++; /*regardless of array status we count this call */
    /* fails if handle is out of range or array not active */
    if(ga_handle < 0 || ga_handle >= _max_global_array){
       return FALSE;
    }
    if(GA[ga_handle].actv==0){
       return FALSE;
    }
    /* mark the slot inactive before freeing so the handle can be reused */
    if (GA[ga_handle].cache)
      free(GA[ga_handle].cache);
    GA[ga_handle].cache = NULL;
    GA[ga_handle].actv = 0;
    GA[ga_handle].actv_handle = 0;
    GA[ga_handle].mem_dev_set = 0;

    /* release restricted-array bookkeeping, if any */
    if (GA[ga_handle].num_rstrctd > 0) {
      GA[ga_handle].num_rstrctd = 0;
      if (GA[ga_handle].rstrctd_list)
        free(GA[ga_handle].rstrctd_list);
      GA[ga_handle].rstrctd_list = NULL;

      if (GA[ga_handle].rank_rstrctd)
        free(GA[ga_handle].rank_rstrctd);
      GA[ga_handle].rank_rstrctd = NULL;
    }

    /* release the (deep-copied) distribution map */
    if(GA[ga_handle].mapc != NULL){
       free(GA[ga_handle].mapc);
       GA[ga_handle].mapc = NULL;
    }

    /* walk and free the read-cache linked list */
    if (GA[ga_handle].property == READ_CACHE) {
      if (GA[ga_handle].cache_head != NULL) {
        cache_struct_t *next;
        next = GA[ga_handle].cache_head->next;
        if (GA[ga_handle].cache_head->cache_buf)
          free(GA[ga_handle].cache_head->cache_buf);
        free(GA[ga_handle].cache_head);
        while (next) {
          GA[ga_handle].cache_head = next;
          next = next->next;
          if (GA[ga_handle].cache_head->cache_buf)
            free(GA[ga_handle].cache_head->cache_buf);
          free(GA[ga_handle].cache_head);
        }
      }
    }
    GA[ga_handle].cache_head = NULL;

    /* read-only arrays keep a saved map and a private group to tear down */
    if (GA[ga_handle].property == READ_ONLY) {
      free(GA[ga_handle].old_mapc);
      pnga_pgroup_destroy(GA[ga_handle].p_handle);
    }

    /* processes that owned no data have nothing more to free */
    if(GA[ga_handle].ptr[grp_me]==NULL){
       return TRUE;
    }
    /* overlay arrays borrow another array's memory: skip the data free */
    if (!GA[ga_handle].overlay) {
#ifndef AVOID_MA_STORAGE
      if(gai_uses_shm((int)grp_id)){
#endif
        /* make sure that we free original (before address alignment) pointer;
         * id stores the byte offset that was added for alignment */
#ifdef MSG_COMMS_MPI
        if (grp_id > 0){
          ARMCI_Free_group(GA[ga_handle].ptr[grp_me] - GA[ga_handle].id,
              &PGRP_LIST[grp_id].group);
        }
        else
#endif
          if (GA[ga_handle].mem_dev_set) {
            ARMCI_Free_memdev(GA[ga_handle].ptr[GAme]-GA[ga_handle].id);
          } else {
            ARMCI_Free(GA[ga_handle].ptr[GAme] - GA[ga_handle].id);
          }
#ifndef AVOID_MA_STORAGE
      }else{
        /* MA path: here id is the MA handle, not an offset */
        if(GA[ga_handle].id != INVALID_MA_HANDLE) MA_free_heap(GA[ga_handle].id);
      }
#endif
      /* return the bytes to the budget and usage statistics */
      if(GA_memory_limited) GA_total_memory += GA[ga_handle].size;
      GAstat.curmem -= GA[ga_handle].size;
    } else {
      GA[ga_handle].overlay = 0;
    }


    if(local_sync_end)pnga_pgroup_sync(grp_id);
    return(TRUE);
}
3994 
3995 
3996 
3997 /**
3998  *  Terminate Global Array structures
3999  *
 *  All GA arrays are destroyed and shared memory is deallocated.
4001  *  GA routines (except for ga_initialize) should not be called thereafter
4002  */
4003 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4004 #   pragma weak wnga_terminate =  pnga_terminate
4005 #endif
4006 
/**
 * Shut down the Global Arrays runtime: destroy every active array, free all
 * internal tables and flags, then finalize ARMCI. Safe to call when GA was
 * never initialized (returns immediately). After this, only ga_initialize
 * may be called again.
 */
void pnga_terminate()
{
    //GA_Internal_Threadsafe_Lock();
Integer i, handle;

    _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
    if(!GAinitialized)
    {
        //GA_Internal_Threadsafe_Unlock();
        return;
    }

#ifdef PROFILE_OLD
    ga_profile_terminate();
#endif
    /* destroy all still-active arrays and free their per-array tables */
    for (i=0;i<_max_global_array;i++){
          handle = i - GA_OFFSET ;
          if(GA[i].actv) pnga_destroy(handle);
          if(GA[i].ptr) free(GA[i].ptr);
          if(GA[i].mapc) free(GA[i].mapc);
    }
    /* don't free groups list until all arrays destroyed */
    for (i=0;i<_max_global_array;i++){
          if(PGRP_LIST[i].actv) free(PGRP_LIST[i].map_proc_list);
    }
    pnga_sync();

    GA_total_memory = -1; /* restore "unlimited" memory usage status */
    GA_memory_limited = 0;
    gai_finalize_onesided();
    /* release the global bookkeeping structures */
    free(mapALL);
    free(_ga_main_data_structure);
    free(_proc_list_main_data_structure);
    /* each process frees its own update flag; the table itself is local */
    ARMCI_Free(GA_Update_Flags[GAme]);
    free(GA_Update_Flags);
    ARMCI_Free_local(GA_Update_Signal);

    /* final barrier before tearing down the communication layer */
    pnga_sync();
    ARMCI_Finalize();
    ARMCIinitialized = 0;
    GAinitialized = 0;
    //GA_Internal_Threadsafe_Unlock();
}
4050 
4051 
4052 /**
4053  *  Is array active or inactive
4054  */
4055 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4056 #   pragma weak wnga_verify_handle =  pnga_verify_handle
4057 #endif
4058 
/** Return 1 if g_a refers to an in-range, active global array, else 0. */
Integer pnga_verify_handle(Integer g_a)
{
  Integer hdl = g_a + GA_OFFSET;
  if (hdl < 0 || hdl >= _max_global_array) return (Integer)0;
  return (Integer)(GA[hdl].actv != 0);
}
4065 
4066 
4067 
4068 /**
4069  *  Fill array with random values in [0,val)
4070  */
4071 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4072 #   pragma weak wnga_randomize =  pnga_randomize
4073 #endif
4074 
pnga_randomize(Integer g_a,void * val)4075 void pnga_randomize(Integer g_a, void* val)
4076 {
4077   int i,handle=GA_OFFSET + (int)g_a;
4078   char *ptr;
4079   int local_sync_begin,local_sync_end;
4080   C_Long elems;
4081   Integer grp_id;
4082   Integer num_blocks;
4083 
4084 
4085   local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
4086   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous sync masking*/
4087   grp_id = pnga_get_pgroup(g_a);
4088   if(local_sync_begin)pnga_pgroup_sync(grp_id);
4089 
4090 
4091   ga_check_handleM(g_a, "ga_randomize");
4092   gam_checktype(GA[handle].type);
4093   elems = GA[handle].size/((C_Long)GA[handle].elemsize);
4094   num_blocks = GA[handle].block_total;
4095 
4096   if (num_blocks < 0) {
4097     /* Bruce..Please CHECK if this is correct */
4098     if (grp_id >= 0){
4099       Integer grp_me = PGRP_LIST[GA[handle].p_handle].map_proc_list[GAme];
4100       ptr = GA[handle].ptr[grp_me];
4101     }
4102     else  ptr = GA[handle].ptr[GAme];
4103 
4104     switch (GA[handle].type){
4105 /*
4106       case C_DCPL:
4107         for(i=0; i<elems;i++)((DoubleComplex*)ptr)[i]=*(DoubleComplex*) rand();
4108         break;
4109       case C_SCPL:
4110         for(i=0; i<elems;i++)((SingleComplex*)ptr)[i]=*(SingleComplex*)val;
4111         break;
4112 */
4113       case C_DBL:
4114         for(i=0; i<elems;i++)((double*)ptr)[i]=*(double*) val * ((double)rand())/RAND_MAX;
4115         break;
4116       case C_INT:
4117         for(i=0; i<elems;i++)((int*)ptr)[i]=*(int*) val * ((int)rand())/RAND_MAX;
4118         break;
4119       case C_FLOAT:
4120         for(i=0; i<elems;i++)((float*)ptr)[i]=*(float*) val * ((float)rand())/RAND_MAX;
4121         break;
4122       case C_LONG:
4123         for(i=0; i<elems;i++)((long*)ptr)[i]=*(long*) val * ((long)rand())/RAND_MAX;
4124         break;
4125       case C_LONGLONG:
4126         for(i=0; i<elems;i++)((long long*)ptr)[i]=*( long long*) val * ((long long)rand())/RAND_MAX;
4127         break;
4128       default:
4129         pnga_error("type not supported",GA[handle].type);
4130     }
4131   } else {
4132     Integer I_elems = (Integer)elems;
4133     pnga_access_block_segment_ptr(g_a,GAme,&ptr,&I_elems);
4134     elems = (C_Long)I_elems;
4135     switch (GA[handle].type){
4136 /*
4137       case C_DCPL:
4138         for(i=0; i<elems;i++)((DoubleComplex*)ptr)[i]=*(DoubleComplex*)val;
4139         break;
4140       case C_SCPL:
4141         for(i=0; i<elems;i++)((SingleComplex*)ptr)[i]=*(SingleComplex*)val;
4142         break;
4143 */
4144       case C_DBL:
4145         for(i=0; i<elems;i++)((double*)ptr)[i]=*(double*)val * ((double)rand())/RAND_MAX;
4146         break;
4147       case C_INT:
4148         for(i=0; i<elems;i++)((int*)ptr)[i]=*(int*)val * ((int)rand())/RAND_MAX;
4149         break;
4150       case C_FLOAT:
4151         for(i=0; i<elems;i++)((float*)ptr)[i]=*(float*)val * ((float)rand())/RAND_MAX;
4152         break;
4153       case C_LONG:
4154         for(i=0; i<elems;i++)((long*)ptr)[i]=*(long*)val * ((long)rand())/RAND_MAX;
4155         break;
4156       case C_LONGLONG:
4157         for(i=0; i<elems;i++)((long long*)ptr)[i]=*(long long*)val * ((long long)rand())/RAND_MAX;
4158         break;
4159       default:
4160         pnga_error("type not supported",GA[handle].type);
4161     }
4162     pnga_release_block_segment(g_a,GAme);
4163   }
4164 
4165   if(local_sync_end)pnga_pgroup_sync(grp_id);
4166 
4167 }
4168 
4169 /**
4170  * Fill array with value
4171  */
4172 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4173 #   pragma weak wnga_fill =  pnga_fill
4174 #endif
4175 
/**
 * Fill the local portion of array g_a with the scalar pointed to by val
 * (interpreted as the array's element type). Honors sync masking and
 * handles both regular and block-cyclic distributions.
 */
void pnga_fill(Integer g_a, void* val)
{
  int idx, hdl = GA_OFFSET + (int)g_a;
  char *buf;
  int local_sync_begin, local_sync_end;
  C_Long nelems;
  Integer grp_id;
  Integer num_blocks;
  int from_block_segment = 0;

  local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
  _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous sync masking*/
  grp_id = pnga_get_pgroup(g_a);
  if (local_sync_begin) pnga_pgroup_sync(grp_id);

  ga_check_handleM(g_a, "ga_fill");
  gam_checktype(GA[hdl].type);
  nelems = GA[hdl].size/((C_Long)GA[hdl].elemsize);
  num_blocks = GA[hdl].block_total;

  /* locate the local data: direct pointer for a regular distribution,
   * block-segment access for block-cyclic layouts */
  if (num_blocks < 0) {
    if (grp_id >= 0) {
      Integer grp_me = PGRP_LIST[GA[hdl].p_handle].map_proc_list[GAme];
      buf = GA[hdl].ptr[grp_me];
    } else {
      buf = GA[hdl].ptr[GAme];
    }
  } else {
    Integer I_elems = (Integer)nelems;
    pnga_access_block_segment_ptr(g_a, GAme, &buf, &I_elems);
    nelems = (C_Long)I_elems;
    from_block_segment = 1;
  }

  switch (GA[hdl].type){
    case C_DCPL:
      for (idx = 0; idx < nelems; idx++) ((DoubleComplex*)buf)[idx] = *(DoubleComplex*)val;
      break;
    case C_SCPL:
      for (idx = 0; idx < nelems; idx++) ((SingleComplex*)buf)[idx] = *(SingleComplex*)val;
      break;
    case C_DBL:
      for (idx = 0; idx < nelems; idx++) ((double*)buf)[idx] = *(double*)val;
      break;
    case C_INT:
      for (idx = 0; idx < nelems; idx++) ((int*)buf)[idx] = *(int*)val;
      break;
    case C_FLOAT:
      for (idx = 0; idx < nelems; idx++) ((float*)buf)[idx] = *(float*)val;
      break;
    case C_LONG:
      for (idx = 0; idx < nelems; idx++) ((long*)buf)[idx] = *(long*)val;
      break;
    case C_LONGLONG:
      for (idx = 0; idx < nelems; idx++) ((long long*)buf)[idx] = *(long long*)val;
      break;
    default:
      pnga_error("type not supported",GA[hdl].type);
  }

  if (from_block_segment) pnga_release_block_segment(g_a, GAme);

  if (local_sync_end) pnga_pgroup_sync(grp_id);
}
4265 
4266 /**
4267  *  Get properties of Global Array. Note that type variable will be
4268  *  using C conventions
4269  */
4270 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4271 #   pragma weak wnga_inquire =  pnga_inquire
4272 #endif
4273 
/**
 * Report type (C convention), rank, and per-dimension extents of array g_a.
 */
void pnga_inquire(Integer g_a, Integer *type, Integer *ndim, Integer *dims)
{
Integer d, hdl = GA_OFFSET + g_a;
   ga_check_handleM(g_a, "nga_inquire");
   *type = GA[hdl].type;
   *ndim = GA[hdl].ndim;
   for (d = 0; d < *ndim; d++) {
       dims[d] = (Integer)GA[hdl].dims[d];
   }
}
4282 
4283 /**
4284  *  Get type of Global Array. Note that type variable will be
4285  *  using C conventions
4286  */
4287 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4288 #   pragma weak wnga_inquire_type =  pnga_inquire_type
4289 #endif
4290 
/** Report the element type (C convention) of array g_a. */
void pnga_inquire_type(Integer g_a, Integer *type)
{
   ga_check_handleM(g_a, "nga_inquire");
   *type = GA[GA_OFFSET + g_a].type;
}
4297 
4298 /**
4299  *  Inquire name of Global Array
4300  */
4301 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4302 #   pragma weak wnga_inquire_name =  pnga_inquire_name
4303 #endif
4304 
/** Return (by reference) the name string stored for array g_a. */
void pnga_inquire_name(Integer g_a, char **array_name)
{
   Integer hdl = GA_OFFSET + g_a;
   ga_check_handleM(g_a, "ga_inquire_name");
   *array_name = GA[hdl].name;
}
4310 
4311 /**
4312  *  Return processor coordinates in processor grid
4313  */
4314 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4315 #   pragma weak wnga_proc_topology =  pnga_proc_topology
4316 #endif
4317 
/**
 * Decompose processor rank proc into its coordinates within the processor
 * grid of array g_a (mixed-radix decode over nblock[]).
 */
void pnga_proc_topology(Integer g_a, Integer proc, Integer* subscript)
{
Integer dim, rem, nd, hdl = GA_OFFSET + g_a;

   ga_check_handleM(g_a, "nga_proc_topology");
   nd = GA[hdl].ndim;

   rem = proc;
   for (dim = 0; dim < nd; dim++) {
       subscript[dim] = rem % GA[hdl].nblock[dim];
       rem = rem / GA[hdl].nblock[dim];
   }
}
4332 
4333 /**
4334  * Return dimensions of processor grid associate with a Global Array
4335  */
4336 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4337 #   pragma weak wnga_get_proc_grid =  pnga_get_proc_grid
4338 #endif
4339 
/** Copy the per-dimension processor-grid extents of array g_a into dims. */
void pnga_get_proc_grid(Integer g_a, Integer *dims)
{
  Integer d, nd, hdl = GA_OFFSET + g_a;
  ga_check_handleM(g_a, "ga_get_proc_grid");
  nd = GA[hdl].ndim;
  for (d = 0; d < nd; d++) dims[d] = GA[hdl].nblock[d];
}
4349 
#if 0
/* Disabled (never compiled): map a multidimensional block index to the
 * owning processor rank. NOTE(review): in the proc-grid branch, proc_id[]
 * is computed but never used, and the grid branch reads index[] directly
 * without the modulo reduction -- verify before re-enabling this code. */
static void gai_get_proc_from_block_index_(Integer g_a, Integer *index, Integer *proc)
{
  Integer ga_handle = GA_OFFSET + g_a;
  Integer ndim = GA[ga_handle].ndim;
  Integer i, ld;
  if (pnga_uses_proc_grid(g_a)) {
    int *proc_grid = GA[ga_handle].nblock;
    Integer proc_id[MAXDIM];
    for (i=0; i<ndim; i++) {
      proc_id[i] = index[i]%proc_grid[i];
    }
    /* mixed-radix encode of the index over the processor grid */
    ld = 1;
    *proc = index[0];
    for (i=1; i<ndim; i++) {
      ld *= proc_grid[i];
      *proc *= ld;
      *proc += index[i];
    }
  } else {
    /* block-cyclic without a proc grid: encode over the block grid and
     * wrap onto the available processes */
    C_Integer *block_grid = GA[ga_handle].num_blocks;
    *proc = index[ndim-1];
    ld = 1;
    for (i=ndim-2; i >= 0; i--) {
       ld *= block_grid[i];
       *proc *= ld;
       *proc += index[i];
    }
    *proc = *proc%pnga_nnodes();
  }
}
#endif
4382 
4383 /*\
4384  * RETURN HOW MANY PROCESSORS/OWNERS THERE ARE FOR THE SPECIFIED PATCH OF A
4385  * GLOBAL ARRAY
4386 \*/
4387 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4388 #   pragma weak wnga_locate_nnodes = pnga_locate_nnodes
4389 #endif
pnga_locate_nnodes(Integer g_a,Integer * lo,Integer * hi,Integer * np)4390 logical pnga_locate_nnodes( Integer g_a,
4391                                  Integer *lo,
4392                                  Integer *hi,
4393                                  Integer *np)
4394 /*    g_a      [input]  global array handle
4395       lo       [input]  lower indices of patch in global array
4396       hi       [input]  upper indices of patch in global array
4397       np       [output] total number of processors containing a portion
4398                         of the patch
4399 
4400       For a block cyclic data distribution, this function returns a list of
4401       blocks that cover the region along with the lower and upper indices of
4402       each block.
4403 */
4404 {
4405   int  procT[MAXDIM], procB[MAXDIM], proc_subscript[MAXDIM];
4406   Integer  proc, i, ga_handle;
4407   Integer  d, dpos, ndim, elems, use_blocks;
4408   /* Integer  p_handle; */
4409 
4410   ga_check_handleM(g_a, "nga_locate_nnodes");
4411 
4412   ga_handle = GA_OFFSET + g_a;
4413 #ifdef __crayx1
4414 #pragma _CRI novector
4415 #endif
4416   for(d = 0; d< GA[ga_handle].ndim; d++)
4417     if((lo[d]<1 || hi[d]>GA[ga_handle].dims[d]) ||(lo[d]>hi[d]))return FALSE;
4418 
4419   ndim = GA[ga_handle].ndim;
4420 
4421   if (GA[ga_handle].distr_type == REGULAR) {
4422     /* find "processor coordinates" for the top left corner and store them
4423      * in ProcT */
4424 #ifdef __crayx1
4425 #pragma _CRI novector
4426 #endif
4427     for(d = 0, dpos = 0; d< GA[ga_handle].ndim; d++){
4428       findblock(GA[ga_handle].mapc + dpos, GA[ga_handle].nblock[d],
4429           GA[ga_handle].scale[d], lo[d], &procT[d]);
4430       dpos += GA[ga_handle].nblock[d];
4431     }
4432 
4433     /* find "processor coordinates" for the right bottom corner and store
4434      * them in procB */
4435 #ifdef __crayx1
4436 #pragma _CRI novector
4437 #endif
4438     for(d = 0, dpos = 0; d< GA[ga_handle].ndim; d++){
4439       findblock(GA[ga_handle].mapc + dpos, GA[ga_handle].nblock[d],
4440           GA[ga_handle].scale[d], hi[d], &procB[d]);
4441       dpos += GA[ga_handle].nblock[d];
4442     }
4443 
4444     *np = 0;
4445 
4446     /* Find total number of processors containing data and return the
4447      * result in elems. Also find the lowest "processor coordinates" of the
4448      * processor block containing data and return these in proc_subscript.
4449      */
4450     ga_InitLoopM(&elems, ndim, proc_subscript, procT,procB,GA[ga_handle].nblock);
4451 
4452     /* p_handle = (Integer)GA[ga_handle].p_handle; */
4453     for(i= 0; i< elems; i++){
4454       Integer _lo[MAXDIM], _hi[MAXDIM];
4455 
4456       /* convert i to owner processor id using the current values in
4457          proc_subscript */
4458       ga_ComputeIndexM(&proc, ndim, proc_subscript, GA[ga_handle].nblock);
4459       /* get range of global array indices that are owned by owner */
4460       ga_ownsM(ga_handle, proc, _lo, _hi);
4461 
4462       /* Update to proc_subscript so that it corresponds to the next
4463        * processor in the block of processors containing the patch */
4464       ga_UpdateSubscriptM(ndim,proc_subscript,procT,procB,GA[ga_handle].nblock);
4465       (*np)++;
4466     }
4467   } else {
4468     Integer nblocks = GA[ga_handle].block_total;
4469     Integer chk, j, tlo[MAXDIM], thi[MAXDIM], cnt;
4470     cnt = 0;
4471     for (i=0; i<nblocks; i++) {
4472       /* check to see if this block overlaps with requested block
4473        * defined by lo and hi */
4474       chk = 1;
4475       /* get limits on block i */
4476       pnga_distribution(g_a,i,tlo,thi);
4477       for (j=0; j<ndim && chk==1; j++) {
4478         /* check to see if at least one end point of the interval
4479          * represented by blo and bhi falls in the interval
4480          * represented by lo and hi */
4481         if (!((tlo[j] >= lo[j] && tlo[j] <= hi[j]) ||
4482               (thi[j] >= lo[j] && thi[j] <= hi[j]))) {
4483           chk = 0;
4484         }
4485       }
4486       /* store blocks that overlap request region in proclist */
4487       if (chk) {
4488         cnt++;
4489       }
4490     }
4491     *np = cnt;
4492   }
4493   return(TRUE);
4494 }
4495 #ifdef __crayx1
4496 #pragma _CRI inline nga_locate_nnodes_
4497 #endif
4498 
4499 
4500 /**
4501  *  Locate individual patches and their owner of specified patch of a
4502  *  Global Array
4503  */
4504 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4505 #   pragma weak wnga_locate_region =  pnga_locate_region
4506 #endif
4507 
pnga_locate_region(Integer g_a,Integer * lo,Integer * hi,Integer * map,Integer * proclist,Integer * np)4508 logical pnga_locate_region( Integer g_a,
4509                             Integer *lo,
4510                             Integer *hi,
4511                             Integer *map,
4512                             Integer *proclist,
4513                             Integer *np)
4514 /*    g_a      [input]  global array handle
4515       lo       [input]  lower indices of patch in global array
4516       hi       [input]  upper indices of patch in global array
4517       map      [output] list of lower and upper indices for portion of
4518                         patch that exists on each processor containing a
4519                         portion of the patch. The map is constructed so
4520                         that for a D dimensional global array, the first
4521                         D elements are the lower indices on the first
4522                         processor in proclist, the next D elements are
4523                         the upper indices of the first processor in
4524                         proclist, the next D elements are the lower
4525                         indices for the second processor in proclist, and
4526                         so on.
4527       proclist [output] list of processors containing some portion of the
4528                         patch
4529       np       [output] total number of processors containing a portion
4530                         of the patch
4531 
4532       For a block cyclic data distribution, this function returns a list of
4533       blocks that cover the region along with the lower and upper indices of
4534       each block.
4535 */
4536 {
4537   int  procT[MAXDIM], procB[MAXDIM], proc_subscript[MAXDIM];
4538   Integer  proc, owner, i, ga_handle;
4539   Integer  d, dpos, ndim, elems, use_blocks;
4540   /* Integer  p_handle; */
4541 
4542   ga_check_handleM(g_a, "nga_locate_region");
4543 
4544   ga_handle = GA_OFFSET + g_a;
4545 #ifdef __crayx1
4546 #pragma _CRI novector
4547 #endif
4548   for(d = 0; d< GA[ga_handle].ndim; d++)
4549     if((lo[d]<1 || hi[d]>GA[ga_handle].dims[d]) ||(lo[d]>hi[d]))return FALSE;
4550 
4551   ndim = GA[ga_handle].ndim;
4552 
4553   if (GA[ga_handle].distr_type == REGULAR) {
4554     /* find "processor coordinates" for the top left corner and store them
4555      * in ProcT */
4556 #ifdef __crayx1
4557 #pragma _CRI novector
4558 #endif
4559     for(d = 0, dpos = 0; d< GA[ga_handle].ndim; d++){
4560       findblock(GA[ga_handle].mapc + dpos, GA[ga_handle].nblock[d],
4561           GA[ga_handle].scale[d], lo[d], &procT[d]);
4562       dpos += GA[ga_handle].nblock[d];
4563     }
4564 
4565     /* find "processor coordinates" for the right bottom corner and store
4566      * them in procB */
4567 #ifdef __crayx1
4568 #pragma _CRI novector
4569 #endif
4570     for(d = 0, dpos = 0; d< GA[ga_handle].ndim; d++){
4571       findblock(GA[ga_handle].mapc + dpos, GA[ga_handle].nblock[d],
4572           GA[ga_handle].scale[d], hi[d], &procB[d]);
4573       dpos += GA[ga_handle].nblock[d];
4574     }
4575 
4576     *np = 0;
4577 
4578     /* Find total number of processors containing data and return the
4579      * result in elems. Also find the lowest "processor coordinates" of the
4580      * processor block containing data and return these in proc_subscript.
4581      */
4582     ga_InitLoopM(&elems, ndim, proc_subscript, procT,procB,GA[ga_handle].nblock);
4583 
4584     /* p_handle = (Integer)GA[ga_handle].p_handle; */
4585     for(i= 0; i< elems; i++){
4586       Integer _lo[MAXDIM], _hi[MAXDIM];
4587       Integer  offset;
4588 
4589       /* convert i to owner processor id using the current values in
4590          proc_subscript */
4591       ga_ComputeIndexM(&proc, ndim, proc_subscript, GA[ga_handle].nblock);
4592       /* get range of global array indices that are owned by owner */
4593       ga_ownsM(ga_handle, proc, _lo, _hi);
4594 
4595       offset = *np *(ndim*2); /* location in map to put patch range */
4596 
4597 #ifdef __crayx1
4598 #pragma _CRI novector
4599 #endif
4600       for(d = 0; d< ndim; d++)
4601         map[d + offset ] = lo[d] < _lo[d] ? _lo[d] : lo[d];
4602 #ifdef __crayx1
4603 #pragma _CRI novector
4604 #endif
4605       for(d = 0; d< ndim; d++)
4606         map[ndim + d + offset ] = hi[d] > _hi[d] ? _hi[d] : hi[d];
4607 
4608       owner = proc;
4609       if (GA[ga_handle].num_rstrctd == 0) {
4610         proclist[i] = owner;
4611       } else {
4612         proclist[i] = GA[ga_handle].rstrctd_list[owner];
4613       }
4614       /* Update to proc_subscript so that it corresponds to the next
4615        * processor in the block of processors containing the patch */
4616       ga_UpdateSubscriptM(ndim,proc_subscript,procT,procB,GA[ga_handle].nblock);
4617       (*np)++;
4618     }
4619   } else {
4620     Integer nblocks = GA[ga_handle].block_total;
4621     Integer chk, j, tlo[MAXDIM], thi[MAXDIM], cnt;
4622     Integer offset;
4623     cnt = 0;
4624     for (i=0; i<nblocks; i++) {
4625       /* check to see if this block overlaps with requested block
4626        * defined by lo and hi */
4627       chk = 1;
4628       /* get limits on block i */
4629       pnga_distribution(g_a,i,tlo,thi);
4630       for (j=0; j<ndim && chk==1; j++) {
4631         /* check to see if at least one end point of the interval
4632          * represented by blo and bhi falls in the interval
4633          * represented by lo and hi */
4634         if (!((tlo[j] >= lo[j] && tlo[j] <= hi[j]) ||
4635               (thi[j] >= lo[j] && thi[j] <= hi[j]))) {
4636           chk = 0;
4637         }
4638       }
4639       /* store blocks that overlap request region in proclist */
4640       if (chk) {
4641         proclist[cnt] = i;
4642         cnt++;
4643       }
4644     }
4645     *np = cnt;
4646 
4647     /* fill map array with block coordinates */
4648     for (i=0; i<cnt; i++) {
4649       offset = i*2*ndim;
4650       j = proclist[i];
4651       pnga_distribution(g_a,j,tlo,thi);
4652       for (j=0; j<ndim; j++) {
4653         map[offset + j] = lo[j] < tlo[j] ? tlo[j] : lo[j];
4654         map[offset + ndim + j] = hi[j] > thi[j] ? thi[j] : hi[j];
4655       }
4656     }
4657   }
4658   return(TRUE);
4659 }
4660 #ifdef __crayx1
4661 #pragma _CRI inline pnga_locate_region
4662 #endif
4663 
4664 /**
4665  *  Returns the processor grid for the global array
4666  */
4667 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4668 #   pragma weak wnga_nblock =  pnga_nblock
4669 #endif
4670 
/**
 * Return the processor grid (REGULAR distributions) or the block grid
 * (all other distributions) for global array g_a.
 */
void pnga_nblock(Integer g_a, Integer *nblock)
{
  Integer ga_handle = GA_OFFSET + g_a;
  int d, ndim;

  ga_check_handleM(g_a, "ga_nblock");
  ndim = GA[ga_handle].ndim;

  if (GA[ga_handle].distr_type == REGULAR) {
    for (d = 0; d < ndim; d++) {
      nblock[d] = (Integer)GA[ga_handle].nblock[d];
    }
  } else {
    for (d = 0; d < ndim; d++) {
      nblock[d] = (Integer)GA[ga_handle].num_blocks[d];
    }
  }
}
4686 
4687 /**
4688  * Returns a character string describing internal data distribution
4689  */
4690 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4691 #   pragma weak wnga_get_distribution_type =  pnga_get_distribution_type
4692 #endif
4693 
/**
 * Write a short string describing the internal data distribution of
 * g_a into the caller-supplied buffer 'type'.
 */
void pnga_get_distribution_type(Integer g_a, char *type)
{
  Integer ga_handle = GA_OFFSET + g_a;

  /* map the internal distribution tag onto its descriptive string */
  switch (GA[ga_handle].distr_type) {
    case REGULAR:      strcpy(type,"regular");      break;
    case BLOCK_CYCLIC: strcpy(type,"block_cyclic"); break;
    case SCALAPACK:    strcpy(type,"scalapack");    break;
    case TILED:        strcpy(type,"tiled");        break;
    case TILED_IRREG:  strcpy(type,"tiled_irreg");  break;
    default:           strcpy(type,"unknown");      break;
  }
}
4712 
4713 /**
4714  * Return ID of calling process in default group
4715  */
4716 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4717 #   pragma weak wnga_nodeid =  pnga_nodeid
4718 #endif
4719 
pnga_nodeid()4720 Integer pnga_nodeid()
4721 {
4722     if (GA_Default_Proc_Group > 0) {
4723        return (Integer)PGRP_LIST[GA_Default_Proc_Group].map_proc_list[GAme];
4724     } else {
4725        return ((Integer)GAme);
4726     }
4727 }
4728 
4729 /**
4730  * Return ID of calling process in group grp
4731  */
4732 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4733 #   pragma weak wnga_pgroup_nodeid =  pnga_pgroup_nodeid
4734 #endif
4735 
/** Return the rank of the calling process within process group grp. */
Integer pnga_pgroup_nodeid(Integer grp)
{
    /* a negative group handle denotes the world group */
    if (grp < 0) {
       return GAme;
    }
    return (Integer)PGRP_LIST[(int)grp].map_proc_list[GAme];
}
4744 
4745 /**
4746  * Return number of nodes in default group
4747  */
4748 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4749 #   pragma weak wnga_nnodes =  pnga_nnodes
4750 #endif
4751 
pnga_nnodes()4752 Integer pnga_nnodes()
4753 {
4754     if (GA_Default_Proc_Group > 0) {
4755        return (Integer)PGRP_LIST[GA_Default_Proc_Group].map_nproc;
4756     } else {
4757        return ((Integer)GAnproc);
4758     }
4759 }
4760 
4761 /**
4762  * Return number of nodes in group grp
4763  */
4764 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4765 #   pragma weak wnga_pgroup_nnodes =  pnga_pgroup_nnodes
4766 #endif
4767 
/** Return the number of processes in process group grp. */
Integer pnga_pgroup_nnodes(Integer grp)
{
    /* a negative group handle denotes the world group */
    if (grp < 0) {
       return ((Integer)GAnproc);
    }
    return (Integer)PGRP_LIST[(int)grp].map_nproc;
}
4775 
4776 /**
4777  *  Compare distributions of two global arrays
4778  */
4779 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4780 #   pragma weak wnga_compare_distr = pnga_compare_distr
4781 #endif
4782 
/**
 * Compare the data distributions of two global arrays.  Returns TRUE
 * only when the arrays agree in rank, extents, distribution type, the
 * type-specific distribution parameters, and any restricted process
 * lists.
 */
logical pnga_compare_distr(Integer g_a, Integer g_b)
{
int h_a =(int)g_a + GA_OFFSET;
int h_b =(int)g_b + GA_OFFSET;
int h_a_maplen = calc_maplen(h_a);
int h_b_maplen = calc_maplen(h_b);
int i;

   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
   ga_check_handleM(g_a, "distribution a");
   ga_check_handleM(g_b, "distribution b");

   /* ranks and extents must agree before anything else is compared */
   if(GA[h_a].ndim != GA[h_b].ndim) return FALSE;

   for(i=0; i <GA[h_a].ndim; i++)
       if(GA[h_a].dims[i] != GA[h_b].dims[i]) return FALSE;

   if (GA[h_a].distr_type != GA[h_b].distr_type) return FALSE;
   if (GA[h_a].distr_type == REGULAR) {
     /* regular distributions: map arrays must be identical; a -1 entry
        marks the end of the significant portion of mapc, so comparison
        can stop there */
     if (h_a_maplen != h_b_maplen) return FALSE;
     for(i=0; i <h_a_maplen; i++){
       if(GA[h_a].mapc[i] != GA[h_b].mapc[i]) return FALSE;
       if(GA[h_a].mapc[i] == -1) break;
     }
   } else if (GA[h_a].distr_type == BLOCK_CYCLIC ||
       GA[h_a].distr_type == SCALAPACK || GA[h_a].distr_type == TILED) {
     /* blocked distributions: block sizes and block counts must match */
     for (i=0; i<GA[h_a].ndim; i++) {
       if (GA[h_a].block_dims[i] != GA[h_b].block_dims[i]) return FALSE;
     }
     for (i=0; i<GA[h_a].ndim; i++) {
       if (GA[h_a].num_blocks[i] != GA[h_b].num_blocks[i]) return FALSE;
     }
     /* these two types additionally carry an explicit processor grid */
     if (GA[h_a].distr_type == SCALAPACK || GA[h_a].distr_type == TILED) {
       for (i=0; i<GA[h_a].ndim; i++) {
         if (GA[h_a].nblock[i] != GA[h_b].nblock[i]) return FALSE;
       }
     }
   } else if (GA[h_a].distr_type == TILED_IRREG) {
     /* irregular tiled: compare map arrays (with -1 sentinel) and the
        processor grid */
     if (h_a_maplen != h_b_maplen) return FALSE;
     for(i=0; i <h_a_maplen; i++){
       if(GA[h_a].mapc[i] != GA[h_b].mapc[i]) return FALSE;
       if(GA[h_a].mapc[i] == -1) break;
     }
     for (i=0; i<GA[h_a].ndim; i++) {
       if (GA[h_a].nblock[i] != GA[h_b].nblock[i]) return FALSE;
     }
   }
   /* restricted-process configurations must match exactly */
   if (GA[h_a].num_rstrctd == GA[h_b].num_rstrctd) {
     if (GA[h_a].num_rstrctd > 0) {
       for (i=0; i<GA[h_a].num_rstrctd; i++) {
         if (GA[h_a].rstrctd_list[i] != GA[h_b].rstrctd_list[i]) return FALSE;
       }
     }
   } else {
     return FALSE;
   }
   return TRUE;
}
4842 
4843 
/* Number of mutexes in the currently active set (0 when none exist). */
static int num_mutexes=0;
/* Mutexes hosted per process: ceil(num_mutexes/GAnproc); set by
   pnga_create_mutexes and used by pnga_lock/pnga_unlock. */
static int chunk_mutex;
4846 /**
4847  * Create a set of mutexes
4848  */
4849 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4850 #   pragma weak wnga_create_mutexes =  pnga_create_mutexes
4851 #endif
4852 
pnga_create_mutexes(Integer num)4853 logical pnga_create_mutexes(Integer num)
4854 {
4855 int myshare;
4856 
4857    _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
4858    if (num <= 0 || num > MAX_MUTEXES) return(FALSE);
4859    if(num_mutexes) pnga_error("mutexes already created",num_mutexes);
4860 
4861    num_mutexes= (int)num;
4862 
4863    if(GAnproc == 1){
4864       return(TRUE);
4865    }
4866    chunk_mutex = (int)((num + GAnproc-1)/GAnproc);
4867    if(GAme * chunk_mutex >= num)myshare =0;
4868    else myshare=chunk_mutex;
4869 
4870    /* need work here to use permutation */
4871    if(ARMCI_Create_mutexes(myshare)){
4872       return FALSE;
4873    }
4874    return TRUE;
4875 }
4876 
4877 
4878 /**
4879  * Lock an object defined by the mutex number
4880  */
4881 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4882 #   pragma weak wnga_lock =  pnga_lock
4883 #endif
4884 
/**
 * Acquire the distributed mutex identified by 'mutex' (an index into
 * the set created by pnga_create_mutexes).
 */
void pnga_lock(Integer mutex)
{
int m,p;

   /* single-process runs need no locking */
   if(GAnproc == 1) return;
   /* NOTE(review): this range check rejects only mutex > num_mutexes,
      so mutex == num_mutexes (one past the end of a 0-based set) and
      negative values slip through -- confirm the intended valid range. */
   if(num_mutexes< mutex)pnga_error("invalid mutex",mutex);

   /* NOTE(review): p and m are derived from num_mutexes, which is
      constant for the life of the mutex set, so every mutex id maps to
      the same (m,p) pair and all mutexes alias one ARMCI lock.
      Presumably this should be p = mutex/chunk_mutex and
      m = mutex%chunk_mutex; any fix must be applied identically in
      pnga_unlock or lock/unlock pairs will target different locks. */
   p = num_mutexes/chunk_mutex -1;
   m = num_mutexes%chunk_mutex;

   ARMCI_Lock(m,p);
}
4897 
4898 /**
4899  *  Unlock a mutex
4900  */
4901 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4902 #   pragma weak wnga_unlock =  pnga_unlock
4903 #endif
4904 
/**
 * Release the distributed mutex identified by 'mutex' (an index into
 * the set created by pnga_create_mutexes).
 */
void pnga_unlock(Integer mutex)
{
int m,p;

   /* single-process runs need no locking */
   if(GAnproc == 1) return;
   /* NOTE(review): same off-by-one/negative range-check concern as in
      pnga_lock -- confirm the intended valid range. */
   if(num_mutexes< mutex)pnga_error("invalid mutex",mutex);

   /* NOTE(review): p and m are derived from num_mutexes rather than
      'mutex', so every mutex id maps to the same (m,p) pair; see the
      matching note in pnga_lock.  Any fix must change both functions
      together, otherwise a process would unlock a different ARMCI lock
      than it acquired. */
   p = num_mutexes/chunk_mutex -1;
   m = num_mutexes%chunk_mutex;

   ARMCI_Unlock(m,p);
}
4917 
4918 
4919 /**
4920  * Destroy mutexes
4921  */
4922 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4923 #   pragma weak wnga_destroy_mutexes =  pnga_destroy_mutexes
4924 #endif
4925 
pnga_destroy_mutexes()4926 logical pnga_destroy_mutexes()
4927 {
4928    _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
4929    if(num_mutexes<1) pnga_error("mutexes destroyed",0);
4930 
4931    num_mutexes= 0;
4932    if(GAnproc == 1){
4933       return TRUE;
4934    }
4935    if(ARMCI_Destroy_mutexes()){
4936       return FALSE;
4937    }
4938    return TRUE;
4939 }
4940 
4941 /**
4942  * Return a list that maps GA process IDs to message-passing process IDs
4943  */
4944 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4945 #   pragma weak wnga_list_nodeid =  pnga_list_nodeid
4946 #endif
4947 
pnga_list_nodeid(Integer * list,Integer num_procs)4948 void pnga_list_nodeid(Integer *list, Integer num_procs)
4949 {
4950   Integer proc;
4951   for( proc = 0; proc < num_procs; proc++)
4952       list[proc]=proc;
4953 }
4954 
4955 /**
4956  *  Returns true/false depending on validity of the handle
4957  */
4958 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4959 #   pragma weak wnga_valid_handle =  pnga_valid_handle
4960 #endif
4961 
pnga_valid_handle(Integer g_a)4962 logical pnga_valid_handle(Integer g_a)
4963 {
4964    if(GA_OFFSET+ (g_a) < 0 || GA_OFFSET+(g_a) >= _max_global_array ||
4965       ! (GA[GA_OFFSET+(g_a)].actv) ) return FALSE;
4966    else return TRUE;
4967 }
4968 
4969 
/**
 *     A function that helps users avoid syncs inside a collective call
 *     that they think are unnecessary.
 *
 *     Mask flags have to be reset in every collective call, even if that
 *     collective call does not perform any sync at all.
 *     If masking only the beginning sync is possible, make sure to
 *     clear even the _sync_end mask, to avoid a mask intended for this
 *     collective function call being carried over to the next collective
 *     function call or to a collective function called by this function.
 *     Similarly, make sure to copy the mask values to local variables
 *     and reset the global mask variables, to avoid carrying the mask
 *     into a collective call made inside the current collective call.
 */
4984 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4985 #   pragma weak wnga_mask_sync =  pnga_mask_sync
4986 #endif
4987 
/**
 * Set the one-shot sync masks: a zero argument suppresses the
 * corresponding sync in the next collective call, nonzero enables it.
 */
void pnga_mask_sync(Integer begin, Integer end)
{
  /* normalize any nonzero flag to 1, zero stays 0 */
  _ga_sync_begin = begin ? 1 : 0;
  _ga_sync_end = end ? 1 : 0;
}
4996 
4997 /**
4998  *  Merge all copies of a mirrored array by adding them together
4999  */
5000 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5001 #   pragma weak wnga_merge_mirrored =  pnga_merge_mirrored
5002 #endif
5003 
/* Sum together all node-local copies of a mirrored array so that every
 * node ends up with the same merged data.  No-op for non-mirrored
 * arrays. */
void pnga_merge_mirrored(Integer g_a)
{
  Integer handle = GA_OFFSET + g_a;
  Integer inode, nprocs, nnodes, zero, zproc, nblocks;
  int *blocks;
  C_Integer  *map, *dims, *width;
  Integer i, j, index[MAXDIM], itmp, ndim;
  Integer lo[MAXDIM], hi[MAXDIM], ld[MAXDIM];
  Integer nelem, count, type, atype=ARMCI_INT;
  char *zptr=NULL, *bptr=NULL, *nptr=NULL;
  Integer bytes, total;
  int local_sync_begin, local_sync_end;
  long bigint;
  int chk = 1;
  void *ptr_a;

  /* capture and reset the one-shot sync masks before any early return */
  local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
  _ga_sync_begin = 1; _ga_sync_end = 1; /*remove any previous masking */
  if (local_sync_begin) pnga_sync();
  /* don't perform update if node is not mirrored */
  if (!pnga_is_mirrored(g_a)) return;

  inode = pnga_cluster_nodeid();
  nnodes = pnga_cluster_nnodes();
  nprocs = pnga_cluster_nprocs(inode);
  zero = 0;

  /* zproc is the master process of this node; zptr is the base of the
     node-local copy of the array */
  zproc = pnga_cluster_procid(inode, zero);
  zptr = GA[handle].ptr[zproc];
  map = GA[handle].mapc;
  blocks = GA[handle].nblock;
  dims = GA[handle].dims;
  width = GA[handle].width;
  type = GA[handle].type;
  ndim = GA[handle].ndim;
  /* largest element count per message: INT_MAX bytes / element size */
  bigint = 2147483647L/GAsizeof(type);

#ifdef OPENIB
  /* Check whether or not all nodes contain the same number
     of processors. */
  if (nnodes*nprocs == pnga_nnodes())  {
    /* check to see if there is any buffer space between the data
       associated with each processor that needs to be zeroed out
       before performing the merge */
    if (zproc == GAme) {
      /* the use of nblocks instead of nprocs is designed to support a peculiar
         coding style in which the dimensions of the block array are all set to
         1 and all the data is restricted to the master processor on the node */
      nblocks = 1;
      for (i=0; i<ndim; i++) {
        nblocks *= blocks[i];
      }
      for (i=0; i<nblocks; i++) {
        /* Find out from mapc data how many elements are supposed to be located
           on this processor. Start by converting processor number to indices */
        itmp = i;
        for (j=0; j<ndim; j++) {
          index[j] = itmp%(Integer)blocks[j];
          itmp = (itmp - index[j])/(Integer)blocks[j];
        }

        /* multiply out the per-dimension extents (including ghost
           widths) owned by block i */
        nelem = 1;
        count = 0;
        for (j=0; j<ndim; j++) {
          if (index[j] < (Integer)blocks[j]-1) {
            nelem *= (Integer)(map[index[j]+1+count] - map[index[j]+count]
                   + 2*width[j]);
          } else {
            nelem *= (Integer)(dims[j] - map[index[j]+count] + 1 + 2*width[j]);
          }
          count += (Integer)blocks[j];
        }
        /* We now have the total number of elements located on this processor.
           Find out if the location of the end of this data set matches the
           origin of the data on the next processor. If not, then zero data in
           the gap. */
        nelem *= GAsizeof(type);
        bptr = GA[handle].ptr[pnga_cluster_procid(inode, i)];
        bptr += nelem;
        if (i<nblocks-1) {
          j = i+1;
          nptr = GA[handle].ptr[pnga_cluster_procid(inode, j)];
          if (bptr != nptr) {
            bytes = (long)nptr - (long)bptr;
            /* BJP printf("p[%d] Gap on proc %d is %d\n",GAme,i,bytes); */
            /* NOTE(review): bzero is a legacy API (memset(bptr,0,bytes)
               is the portable equivalent) -- left unchanged here */
            bzero(bptr, bytes);
          }
        }
      }
      /* find total number of bytes containing global array; relies on
         bptr pointing just past the last block after the loop above */
      total = (long)bptr - (long)zptr;
      total /= GAsizeof(type);
      /*convert from C data type to ARMCI type */
      switch(type) {
        case C_FLOAT: atype=ARMCI_FLOAT; break;
        case C_DBL: atype=ARMCI_DOUBLE; break;
        case C_LONG: atype=ARMCI_LONG; break;
        case C_INT: atype=ARMCI_INT; break;
        case C_DCPL: atype=ARMCI_DOUBLE; break;
        case C_SCPL: atype=ARMCI_FLOAT; break;
        default: pnga_error("type not supported",type);
      }
      /* now that gap data has been zeroed, do a global sum on data,
         split into messages of at most bigint elements each */
      {
        int i, len;
        int nsteps = (int) ceil(((double)total)/((double)bigint));
        long istart=0;
        /* printf("%ld total %ld bigint %ld  nsteps %d \n",GAme,total,bigint,nsteps); */
        for (i=0; i < nsteps; i++){

          len=bigint;
          if (istart+len > total) len=((long)(total - istart));
          /* printf("%ld step %d of %d  len= %d total=%ld istart= %ld\n",GAme,(i+1),nsteps,len,total,istart); */
          armci_msg_gop_scope(SCOPE_MASTERS, zptr+istart*GAsizeof(type), len, "+", atype);
          istart+=len;
        }
      }
    }
  } else {
    Integer _ga_tmp;
    Integer idims[MAXDIM], iwidth[MAXDIM], ichunk[MAXDIM];
    void *one = NULL;
    double d_one = 1.0;
    int i_one = 1;
    float f_one = 1.0;
    long l_one = 1;
    double c_one[2];
    float cf_one[2];
    c_one[0] = 1.0;
    c_one[1] = 0.0;

    cf_one[0] = 1.0;
    cf_one[1] = 0.0;

    /* choose one as scaling factor in accumulate */
    switch (type) {
      case C_FLOAT: one = &f_one; break;
      case C_DBL: one = &d_one; break;
      case C_LONG: one = &l_one; break;
      case C_INT: one = &i_one; break;
      case C_DCPL: one = &c_one; break;
      case C_SCPL: one = &cf_one; break;
      default: pnga_error("type not supported",type);
    }

  /* Nodes contain a mixed number of processors. Create a temporary GA to
     complete merge operation. */
    count = 0;
    for (i=0; i<ndim; i++) {
      idims[i] = (Integer)dims[i];
      iwidth[i] = (Integer)width[i];
      ichunk[i] = 0;
    }
    if (!pnga_create_ghosts(type, ndim, idims,
        iwidth, "temporary", ichunk, &_ga_tmp))
      pnga_error("Unable to create work array for merge",GAme);
    pnga_zero(_ga_tmp);
    /* Find data on this processor and accumulate in temporary global array */
    inode = GAme - zproc;
    pnga_distribution(g_a,inode,lo,hi);

    /* Check to make sure processor has data */
    chk = 1;
    for (i=0; i<ndim; i++) {
      if (hi[i]<lo[i]) {
        chk = 0;
      }
    }
    /* accumulate the local copy into the distributed temporary; the sum
       over all contributions performs the merge */
    if (chk) {
      pnga_access_ptr(g_a, lo, hi, &ptr_a, ld);
      pnga_acc(_ga_tmp, lo, hi, ptr_a, ld, one);
    }
    /* copy data back to original global array */
    pnga_sync();
    if (chk) {
      pnga_get(_ga_tmp, lo, hi, ptr_a, ld);
    }
    pnga_destroy(_ga_tmp);
  }
#else
  /* non-OPENIB path: sum the local patch directly across node masters
     using the mirror group reduction */
  inode = GAme - zproc;
  pnga_distribution(g_a,inode,lo,hi);
  /* Check to make sure processor has data */
  chk = 1;
  nelem = 1;
  for (i=0; i<ndim; i++) {
    nelem *= (hi[i]-lo[i]+1);
    if (hi[i]<lo[i]) {
      chk = 0;
    }
  }
  if (chk) {
    pnga_access_ptr(g_a, lo, hi, &ptr_a, ld);
    pnga_pgroup_gop(_mirror_gop_grp,type,ptr_a,nelem,"+");
  }
#endif
  if (local_sync_end) pnga_sync();
}
5202 
5203 /**
5204  *  Merge all copies of a  patch of a mirrored array into a patch in a
5205  *  distributed array
5206  */
5207 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5208 #   pragma weak wnga_merge_distr_patch =  pnga_merge_distr_patch
5209 #endif
5210 
/**
 * Merge all copies of a patch of a mirrored array (g_a) into a patch of a
 * distributed array (g_b).  The distributed patch is first zeroed; each
 * process then accumulates the piece of the mirrored patch it holds locally
 * into the corresponding piece of the distributed patch, so after the final
 * sync g_b(blo:bhi) holds the element-wise sum over all mirrored copies.
 */
void pnga_merge_distr_patch(Integer g_a, Integer *alo, Integer *ahi,
                            Integer g_b, Integer *blo, Integer *bhi)
/*    Integer g_a  handle to mirrored array
      Integer *alo  indices of lower corner of mirrored array patch
      Integer *ahi  indices of upper corner of mirrored array patch
      Integer g_b  handle to distributed array
      Integer *blo  indices of lower corner of distributed array patch
      Integer *bhi  indices of upper corner of distributed array patch
*/
{
  Integer local_sync_begin, local_sync_end;
  Integer a_handle, b_handle, adim, bdim;
  Integer mlo[MAXDIM], mhi[MAXDIM], mld[MAXDIM];  /* my piece of g_a + strides */
  Integer dlo[MAXDIM], dhi[MAXDIM];               /* matching indices in g_b */
  char trans[2];
  double d_one;
  Integer type, i_one;
  double z_one[2];
  float  c_one[2];
  float f_one;
  long l_one;
  void *src_data_ptr;
  void *one = NULL;  /* points at the type-matched scale factor "1" below */
  Integer i, idim, intersect, p_handle;

  /* capture caller's sync mask, then force full sync semantics locally */
  local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
  _ga_sync_begin = 1; _ga_sync_end = 1; /*remove any previous masking */
  if (local_sync_begin) pnga_sync();
  pnga_check_handle(g_a, "nga_merge_distr_patch");
  pnga_check_handle(g_b, "nga_merge_distr_patch");

  /* check to make sure that both patches lie within global arrays and
     that patches are the same dimensions */
  a_handle = GA_OFFSET + g_a;
  b_handle = GA_OFFSET + g_b;

  /* A non-mirrored g_a is only legal on a single cluster node, where the
     merge degenerates to a plain copy of the patch. */
  if (!pnga_is_mirrored(g_a)) {
    if (pnga_cluster_nnodes() > 1) {
      pnga_error("Handle to a non-mirrored array passed",0);
    } else {
      trans[0] = 'N';
      trans[1] = '\0';
      pnga_copy_patch(trans, g_a, alo, ahi, g_b, blo, bhi);
      return;
    }
  }

  /* NOTE(review): unlike the g_a check above, this tests
     pnga_cluster_nnodes() for non-zero rather than "> 1"; if the node
     count is always >= 1 this rejects a mirrored g_b even on a single
     node -- confirm this asymmetry is intended. */
  if (pnga_is_mirrored(g_b) && pnga_cluster_nnodes())
    pnga_error("Distributed array is mirrored",0);

  adim = GA[a_handle].ndim;
  bdim = GA[b_handle].ndim;

  p_handle = GA[a_handle].p_handle;

  if (adim != bdim)
    pnga_error("Global arrays must have same dimension",0);

  type = GA[a_handle].type;
  if (type != GA[b_handle].type)
    pnga_error("Global arrays must be of same type",0);

  /* NOTE(review): these bounds checks treat patch indices as 0-based
     (alo >= 0, ahi < dim) whereas e.g. pnga_locate_num_blocks validates
     1-based indices (lo >= 1, hi <= dim) -- confirm which convention the
     callers of this routine use. */
  for (i=0; i<adim; i++) {
    idim = (Integer)GA[a_handle].dims[i];
    if (alo[i] < 0 || alo[i] >= idim || ahi[i] < 0 || ahi[i] >= idim ||
        alo[i] > ahi[i])
      pnga_error("Invalid patch index on mirrored GA",0);
  }
  for (i=0; i<bdim; i++) {
    idim = GA[b_handle].dims[i];
    if (blo[i] < 0 || blo[i] >= idim || bhi[i] < 0 || bhi[i] >= idim ||
        blo[i] > bhi[i])
      pnga_error("Invalid patch index on distributed GA",0);
  }
  /* patches must be congruent in every dimension
     (NOTE(review): idim is assigned but unused in this loop) */
  for (i=0; i<bdim; i++) {
    idim = (Integer)GA[b_handle].dims[i];
    if (ahi[i] - alo[i] != bhi[i] - blo[i])
      pnga_error("Patch dimensions do not match for index ",i);
  }
  /* clear the target so the accumulates below produce a pure sum */
  pnga_zero_patch(g_b, blo, bhi);

  /* Find coordinates of mirrored array patch that I own */
  i = PGRP_LIST[p_handle].map_proc_list[GAme];
  pnga_distribution(g_a, i, mlo, mhi);
  /* Check to see if mirrored array patch intersects my portion of
     mirrored array */
  intersect = 1;
  for (i=0; i<adim; i++) {
    if (mhi[i] < alo[i]) intersect = 0;
    if (mlo[i] > ahi[i]) intersect = 0;
  }
  if (intersect) {
    /* get portion of mirrored array patch that actually resides on this
       processor */
    for (i=0; i<adim; i++) {
      mlo[i] = GA_MAX(alo[i],mlo[i]);
      mhi[i] = GA_MIN(ahi[i],mhi[i]);
    }

    /* get pointer to locally held distribution */
    pnga_access_ptr(g_a, mlo, mhi, &src_data_ptr, mld);

    /* find indices in distributed array corresponding to this patch */
    for (i=0; i<adim; i++) {
      dlo[i] = blo[i] + mlo[i]-alo[i];
      dhi[i] = blo[i] + mhi[i]-alo[i];
    }

    /* perform accumulate with a scale factor of 1 in the array's type */
    if (type == C_DBL) {
      d_one = 1.0;
      one = &d_one;
    } else if (type == C_DCPL) {
      z_one[0] = 1.0;
      z_one[1] = 0.0;
      one = &z_one;
    } else if (type == C_SCPL) {
      c_one[0] = 1.0;
      c_one[1] = 0.0;
      one = &c_one;
    } else if (type == C_FLOAT) {
      f_one = 1.0;
      one = &f_one;
    } else if (type == C_INT) {
      i_one = 1;
      one = &i_one;
    } else if (type == C_LONG) {
      l_one = 1;
      one = &l_one;
    } else {
      pnga_error("Type not supported",type);
    }
    pnga_acc(g_b, dlo, dhi, src_data_ptr, mld, one);
  }
  if (local_sync_end) pnga_sync();
}
5347 
5348 /**
5349  * Return the total number of blocks in a region (if any)
5350  */
5351 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5352 #   pragma weak wnga_locate_num_blocks =  pnga_locate_num_blocks
5353 #endif
5354 
/**
 * Return the number of blocks of a block-cyclic/tiled Global Array that
 * intersect the region [lo,hi] (1-based, inclusive corners).  Returns -1
 * for arrays with a REGULAR distribution.
 *
 * @param g_a  GA handle
 * @param lo   lower corner of requested region
 * @param hi   upper corner of requested region
 */
Integer pnga_locate_num_blocks(Integer g_a, Integer *lo, Integer *hi)
{
  Integer ga_handle = GA_OFFSET + g_a;
  Integer ndim = GA[ga_handle].ndim;
  Integer ret = -1, d;
  Integer cnt;

  /* validate the requested region against the array bounds */
  for(d = 0; d< GA[ga_handle].ndim; d++)
    if((lo[d]<1 || hi[d]>GA[ga_handle].dims[d]) ||(lo[d]>hi[d]))
      pnga_error("Requested region out of bounds",0);

  if (GA[ga_handle].distr_type != REGULAR) {
    Integer nblocks = GA[ga_handle].block_total;
    Integer chk, i, j, tlo[MAXDIM], thi[MAXDIM];
    cnt = 0;
    for (i=0; i<nblocks; i++) {
      /* check to see if this block overlaps with requested block
       * defined by lo and hi */
      chk = 1;
      /* get limits on block i */
      pnga_distribution(g_a,i,tlo,thi);
      for (j=0; j<ndim && chk==1; j++) {
        /* Intervals [tlo,thi] and [lo,hi] overlap iff tlo <= hi and
         * thi >= lo.  The previous endpoint-containment test missed the
         * case where a block strictly contains the requested region in
         * a dimension (tlo < lo && thi > hi), undercounting blocks when
         * the region is smaller than a block.  Any companion code that
         * enumerates the intersecting blocks must use the same
         * criterion so counts and lists stay consistent. */
        if (tlo[j] > hi[j] || thi[j] < lo[j]) {
          chk = 0;
        }
      }

      if (chk) {
        cnt++;
      }
    }
    ret = cnt;
  }

  return ret;
}
5395 
5396 /**
5397  *  Return the total number of blocks in a Global Array (if any). Only returns
5398  *  non-zero values for block-cyclic data distributions.
5399  */
5400 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5401 #   pragma weak wnga_total_blocks =  pnga_total_blocks
5402 #endif
5403 
/**
 * Return the total block count recorded in the GA descriptor.  Only
 * block-cyclic style distributions record a non-zero value here.
 */
Integer pnga_total_blocks(Integer g_a)
{
  return GA[GA_OFFSET + g_a].block_total;
}
5409 
5410 /**
 *  Return true if GA uses a ScaLAPACK data distribution
5412  */
5413 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5414 #   pragma weak wnga_uses_proc_grid =  pnga_uses_proc_grid
5415 #endif
5416 
pnga_uses_proc_grid(Integer g_a)5417 logical pnga_uses_proc_grid(Integer g_a)
5418 {
5419   Integer ga_handle = GA_OFFSET + g_a;
5420   return (logical)(GA[ga_handle].distr_type == SCALAPACK
5421       || GA[ga_handle].distr_type == TILED ||
5422       GA[ga_handle].distr_type == TILED_IRREG);
5423 }
5424 
5425 /**
5426  *  Return the index of a processor based on the block partition associated
5427  *  with a particular Global Array, assuming GA uses some sort of block-cyclic
5428  *  data distribution based on an underlying processor grid. (e.g. a
5429  *  ScaLAPACK data distribution)
5430  */
5431 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5432 #   pragma weak wnga_get_proc_index =  pnga_get_proc_index
5433 #endif
5434 
/**
 * Translate a processor number into its index on the processor grid
 * underlying the array's block-cyclic distribution.  Aborts for arrays
 * that do not use a processor-grid distribution.
 *
 * @param g_a    GA handle
 * @param iproc  processor number to translate
 * @param index  [out] per-dimension processor-grid indices
 */
void pnga_get_proc_index(Integer g_a, Integer iproc, Integer *index)
{
  Integer handle = GA_OFFSET + g_a;
  switch (GA[handle].distr_type) {
    case SCALAPACK:
      gam_find_proc_indices(handle, iproc, index);
      break;
    case TILED:
    case TILED_IRREG:
      gam_find_tile_proc_indices(handle, iproc, index);
      break;
    default:
      pnga_error("Global array does not use ScaLAPACK data distribution",0);
  }
}
5448 
5449 /**
5450  *  Return proc grid dimension and block dimension for a particular
5451  *  Global Array
5452  */
5453 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5454 #   pragma weak wnga_get_block_info =  pnga_get_block_info
5455 #endif
5456 
/**
 * Return the per-dimension block counts and block dimensions for a
 * block-cyclic Global Array.
 *
 * @param g_a         GA handle
 * @param num_blocks  [out] number of blocks along each dimension
 * @param block_dims  [out] block size along each dimension
 */
void pnga_get_block_info(Integer g_a, Integer *num_blocks, Integer *block_dims)
{
  Integer ga_handle = GA_OFFSET + g_a;
  Integer i, ndim;
  ndim = GA[ga_handle].ndim;
  if (GA[ga_handle].distr_type == SCALAPACK ||
      GA[ga_handle].distr_type == TILED) {
    /* counts and block sizes are stored directly in the descriptor */
    for (i=0; i<ndim; i++) {
      num_blocks[i] = GA[ga_handle].num_blocks[i];
      block_dims[i] = GA[ga_handle].block_dims[i];
    }
  } else if (GA[ga_handle].distr_type == TILED_IRREG) {
    /* irregular tiles have no single per-dimension block size
       (error message typo "infor" fixed) */
    pnga_error("Don't know how to respond to get_block_info for"
        " irregular tiled array",0);
  } else {
    /* simple block-cyclic: block count is ceil(dim/bsize); a
       non-positive block size yields a count of zero */
    Integer dim, bsize;
    for (i=0; i<ndim; i++) {
      dim = GA[ga_handle].dims[i];
      bsize = GA[ga_handle].block_dims[i];
      num_blocks[i] = (bsize > 0) ? (dim + bsize - 1)/bsize : 0;
      block_dims[i] = bsize;
    }
  }
  return;
}
5491 
5492 /**
5493  *  Set the value of internal debug flag
5494  */
5495 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5496 #   pragma weak wnga_set_debug =  pnga_set_debug
5497 #endif
5498 
/**
 * Set the library-wide debug flag from the caller-supplied value.
 */
void pnga_set_debug(logical flag)
{
  GA_Debug_flag = (Integer)flag;
}
5503 
5504 /**
5505  *  Get current value of internal debug flag
5506  */
5507 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5508 #   pragma weak wnga_get_debug =  pnga_get_debug
5509 #endif
5510 
pnga_get_debug()5511 logical pnga_get_debug()
5512 {
5513   return (logical)GA_Debug_flag;
5514 }
5515 
5516 #ifdef ENABLE_CHECKPOINT
/**
 * Checkpoint a list of global arrays (ENABLE_CHECKPOINT builds only),
 * initializing the checkpoint record for the first array if needed.
 *
 * @param gas  list of GA handles
 * @param num  number of handles in gas
 */
void ga_checkpoint_arrays(Integer *gas,int num)
{
   int ga = (int)gas[0];
   int hdl = GA_OFFSET + ga;
   /* BUG FIX: num is an int, not a pointer -- the original printed *num,
      which is not valid C.  Integer-typed arguments are cast to int to
      match the %d conversions. */
   printf("\n%d:in checkpoint %d %d %d\n",(int)GAme,ga,(int)gas[1],num);
   fflush(stdout);
   if(GA[hdl].record_id==0)
     ga_icheckpoint_init(gas,num);
   ga_icheckpoint(gas,num);
}
5526 
/**
 * Attempt to recover each array in the list that has a checkpoint record.
 *
 * @param gas  list of GA handles
 * @param num  number of handles in gas
 * @return number of arrays for which recovery was attempted
 *         (BUG FIX: the original non-void function had no return
 *         statement, which is undefined behavior if the result is used)
 */
int ga_recover_arrays(Integer *gas, int num)
{
    int i, nrec = 0;
    for(i=0;i<num;i++){
       int ga = (int)gas[i];
       int hdl = GA_OFFSET + ga;
       if(GA[hdl].record_id!=0) {
         ga_irecover(ga);
         nrec++;
       }
    }
    return nrec;
}
5537 #endif
5538 /**
5539  *  Return the world group ID of the pid in process group grp
5540  */
5541 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5542 #   pragma weak wnga_pgroup_absolute_id = pnga_pgroup_absolute_id
5543 #endif
5544 
/**
 * Translate a process id within process group grp into its world-group id.
 * Only available when GA is built on MPI message passing.
 */
Integer pnga_pgroup_absolute_id(Integer grp, Integer pid)
{
#ifdef MSG_COMMS_MPI
  /* in the world group (a.k.a. -1) ids are already absolute */
  if (grp != GA_World_Proc_Group) {
    return ARMCI_Absolute_id(&PGRP_LIST[grp].group, pid);
  }
  return pid;
#else
  pnga_error("ga_pgroup_absolute_id(): Defined only when using MPI groups",0);
  return -1;
#endif
}
5557 
calc_maplen(int handle)5558 static int calc_maplen(int handle)
5559 {
5560     if (GA[handle].mapc != NULL) {
5561         int i,len=0;
5562         if (GA[handle].distr_type != TILED_IRREG) {
5563           for (i=0; i<GA[handle].ndim; i++) {
5564             len += GA[handle].nblock[i];
5565           }
5566         } else {
5567           for (i=0; i<GA[handle].ndim; i++) {
5568             len += GA[handle].num_blocks[i];
5569           }
5570         }
5571         return len;
5572     }
5573     return 0;
5574 }
5575 
5576 
5577 /***************************************************************
5578  *
5579  * GA types related functions
5580  *
5581  ***************************************************************/
5582 
/* Type descriptor table, indexed by (type constant - MT_BASE).  Each entry
 * holds an "active" flag (all built-in slots start active) and the element
 * size in bytes.  Slots at or above GA_TYPES_RESERVED can be recycled via
 * pnga_register_type()/pnga_deregister_type().
 * NOTE(review): the entry order presumably mirrors the C- and then
 * Fortran-type constants (char, int, long, float, double, ...) -- confirm
 * against the type-constant definitions before indexing by hand. */
ga_typeinfo_t ga_types[GA_TYPES_MAX] = {
  {1, sizeof(char)},
  {1, sizeof(int)},
  {1, sizeof(long)},
  {1, sizeof(float)},
  {1, sizeof(double)},
  {1, sizeof(long double)},
  {1, sizeof(SingleComplex)},
  {1, sizeof(DoubleComplex)},
  {1, -1 /*sizeof(LongDoubleComplex)*/}, /* unsupported type: sentinel size */
  {1, sizeof(char)},
  {1, sizeof(Integer)},
  {1, sizeof(logical)},
  {1, sizeof(Real)},
  {1, sizeof(DoublePrecision)},
  {1, sizeof(SingleComplex)},
  {1, sizeof(DoubleComplex)},
  {1, sizeof(long long)},
};
5602 
5603 /* #define GAsizeofM(_type)   ga_types[_type-MT_BASE]  */
5604 
5605 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5606 #   pragma weak wnga_register_type = pnga_register_type
5607 #endif
pnga_register_type(size_t bytes)5608 int pnga_register_type(size_t bytes) {
5609   int i;
5610   for(i=GA_TYPES_RESERVED; i<GA_TYPES_MAX && ga_types[i].active==1; i++);
5611   if(i==GA_TYPES_MAX) {
5612     return -1;
5613   }
5614   ga_types[i].active = 1;
5615   ga_types[i].size = bytes;
5616   return i+MT_BASE;
5617 }
5618 
5619 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5620 #   pragma weak wnga_deregister_type = pnga_deregister_type
5621 #endif
pnga_deregister_type(int type)5622 int pnga_deregister_type(int type) {
5623   int tp = type-MT_BASE;
5624   if(tp<GA_TYPES_RESERVED) {
5625     return -1;
5626   }
5627   if(ga_types[tp].active==0) {
5628     return -2;
5629   }
5630   ga_types[tp].active = 0;
5631   ga_types[tp].size = 0;
5632   return 0;
5633 }
5634 
5635 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5636 #   pragma weak wnga_version = pnga_version
5637 #endif
/**
 * Report the compile-time Global Arrays version number.
 *
 * @param major_version  [out] major version
 * @param minor_version  [out] minor version
 * @param patch          [out] patch level
 */
void pnga_version(Integer *major_version, Integer *minor_version, Integer *patch)
{
  *patch         = GA_VERSION_PATCH;
  *minor_version = GA_VERSION_MINOR;
  *major_version = GA_VERSION_MAJOR;
}
5644