1 #if HAVE_CONFIG_H
2 # include "config.h"
3 #endif
4
5 /* $Id: base.c,v 1.149.2.19 2007/12/18 18:42:20 d3g293 Exp $ */
6 /*
7 * module: base.c
8 * author: Jarek Nieplocha
9 * description: implements GA primitive operations --
10 * create (regular& irregular) and duplicate, destroy
11 *
12 * DISCLAIMER
13 *
14 * This material was prepared as an account of work sponsored by an
15 * agency of the United States Government. Neither the United States
16 * Government nor the United States Department of Energy, nor Battelle,
17 * nor any of their employees, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
18 * ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
19 * COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
20 * SOFTWARE, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT
21 * INFRINGE PRIVATELY OWNED RIGHTS.
22 *
23 *
24 * ACKNOWLEDGMENT
25 *
26 * This software and its documentation were produced with United States
27 * Government support under Contract Number DE-AC06-76RLO-1830 awarded by
28 * the United States Department of Energy. The United States Government
29 * retains a paid-up non-exclusive, irrevocable worldwide license to
30 * reproduce, prepare derivative works, perform publicly and display
31 * publicly by or for the US Government, including the right to
32 * distribute to other US Government contractors.
33 */
34
35 #if HAVE_STDIO_H
36 # include <stdio.h>
37 #endif
38 #if HAVE_STRING_H
39 # include <string.h>
40 #endif
41 #if HAVE_STDLIB_H
42 # include <stdlib.h>
43 #endif
44 #if HAVE_MATH_H
45 # include <math.h>
46 #endif
47 #if HAVE_ASSERT_H
48 # include <assert.h>
49 #endif
50
51 #include <ctype.h>
52 #include "farg.h"
53 #include "globalp.h"
54 #include "message.h"
55 #include "base.h"
56 #include "macdecls.h"
57 #include "armci.h"
58 #include "ga-papi.h"
59 #include "ga-wapi.h"
60 #include "thread-safe.h"
61
62 static int calc_maplen(int handle);
63
64 #ifdef PROFILE_OLD
65 #include "ga_profile.h"
66 #endif
67 /*#define AVOID_MA_STORAGE 1*/
68 #define DEBUG 0
69 #define USE_MALLOC 1
70 #define INVALID_MA_HANDLE -1
71 #define NEAR_INT(x) (x)< 0.0 ? ceil( (x) - 0.5) : floor((x) + 0.5)
72
73 #define FLEN 80 /* length of Fortran strings */
74
75 /*uncomment line below to verify consistency of MA in every sync */
76 /*#define CHECK_MA yes */
77
/*uncomment line below to verify if MA base address is aligned wrt datatype*/
79 #if !(defined(LINUX) || defined(CRAY) || defined(CYGWIN))
80 #define CHECK_MA_ALGN 1
81 #endif
82
83 /*uncomment line below to initialize arrays in ga_create/duplicate */
84 /*#define GA_CREATE_INDEF yes */
85
/*uncomment line below to introduce padding between the shared memory regions
  of a GA when the region spans more than one process within an SMP node */
88 #define GA_ELEM_PADDING yes
89
90 #define OLD_DISTRIBUTION 1
91 #if OLD_DISTRIBUTION
92 extern void ddb_h2(Integer ndims, Integer dims[], Integer npes,
93 double threshold, Integer bias, Integer blk[],
94 Integer pedims[]);
95 #else
96 extern void ddb(Integer ndims, Integer dims[], Integer npes,
97 Integer blk[], Integer pedims[]);
98 #endif
99
100 global_array_t *_ga_main_data_structure;
101 global_array_t *GA;
102 proc_list_t *_proc_list_main_data_structure;
103 proc_list_t *PGRP_LIST;
104 static int GAinitialized = 0;
105 static int ARMCIinitialized = 0;
106 int _ga_sync_begin = 1;
107 int _ga_sync_end = 1;
108 int _max_global_array = MAX_ARRAYS;
109 int GA_World_Proc_Group = -1;
110 int GA_Default_Proc_Group = -1;
111 int ga_armci_world_group=0;
112 int GA_Init_Proc_Group = -2;
113 Integer GA_Debug_flag = 0;
114
115 /* MA addressing */
116 DoubleComplex *DCPL_MB; /* double precision complex base address */
117 SingleComplex *SCPL_MB; /* single precision complex base address */
118 DoublePrecision *DBL_MB; /* double precision base address */
119 Integer *INT_MB; /* integer base address */
120 float *FLT_MB; /* float base address */
121 int** GA_Update_Flags;
122 int* GA_Update_Signal;
123
124 typedef struct {
125 long id;
126 long type;
127 long size;
128 long dummy;
129 } getmem_t;
130
131 /*\
132 * Copy of GA's internal communicator
133 \*/
134 #ifdef MSG_COMMS_MPI
135 MPI_Comm GA_MPI_World_comm_dup;
136 #endif
137
138 /* set total limit (bytes) for memory usage per processor to "unlimited" */
139 static Integer GA_total_memory = -1;
140 static Integer GA_memory_limited = 0;
141 struct ga_stat_t GAstat;
142 struct ga_bytes_t GAbytes ={0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.};
143 long *GAstat_arr;
144 static Integer GA_memory_limit=0;
145 Integer GAme, GAnproc;
146 static Integer MPme;
147 Integer *mapALL;
148
149 static Integer _mirror_gop_grp;
150
151 /* Function prototypes */
152 int gai_getmem(char* name, char **ptr_arr, C_Long bytes, int type, long *id,
153 int grp_id);
154 int gai_get_devmem(char *name, char **ptr_arr, C_Long bytes, int type, long *adj,
155 int grp_id, int dev_flag, const char *device);
156 #ifdef ENABLE_CHECKPOINT
157 static int ga_group_is_for_ft=0;
158 int ga_spare_procs;
159 #endif
160
161
162 /*************************************************************************/
163
164 /*\ This macro computes index (place in ordered set) for the element
165 * identified by _subscript in ndim- dimensional array of dimensions _dim[]
166 * assume that first subscript component changes first
167 \*/
168 #define ga_ComputeIndexM(_index, _ndim, _subscript, _dims) \
169 { \
170 Integer _i, _factor=1; \
171 __CRAYX1_PRAGMA("_CRI novector"); \
172 for(_i=0,*(_index)=0; _i<_ndim; _i++){ \
173 *(_index) += _subscript[_i]*_factor; \
174 if(_i<_ndim-1)_factor *= _dims[_i]; \
175 } \
176 }
177
178
179 /*\ updates subscript corresponding to next element in a patch <lo[]:hi[]>
180 \*/
181 #define ga_UpdateSubscriptM(_ndim, _subscript, _lo, _hi, _dims)\
182 { \
183 Integer _i; \
184 __CRAYX1_PRAGMA("_CRI novector"); \
185 for(_i=0; _i<_ndim; _i++){ \
186 if(_subscript[_i] < _hi[_i]) { _subscript[_i]++; break;} \
187 _subscript[_i] = _lo[_i]; \
188 } \
189 }
190
191
192 /*\ Initialize n-dimensional loop by counting elements and setting subscript=lo
193 \*/
194 #define ga_InitLoopM(_elems, _ndim, _subscript, _lo, _hi, _dims)\
195 { \
196 Integer _i; \
197 *_elems = 1; \
198 __CRAYX1_PRAGMA("_CRI novector"); \
199 for(_i=0; _i<_ndim; _i++){ \
200 *_elems *= _hi[_i]-_lo[_i] +1; \
201 _subscript[_i] = _lo[_i]; \
202 } \
203 }
204
205
/**
 * Return the size in bytes of one element of the given GA data type.
 * Unrecognized type codes yield 0.
 */
Integer GAsizeof(Integer type)
{
    if (type == C_INT)      return sizeof(int);
    if (type == C_LONG)     return sizeof(long);
    if (type == C_LONGLONG) return sizeof(long long);
    if (type == C_FLOAT)    return sizeof(float);
    if (type == C_DBL)      return sizeof(double);
    if (type == C_SCPL)     return sizeof(SingleComplex);
    if (type == C_DCPL)     return sizeof(DoubleComplex);
    return 0; /* unknown type */
}
219
220
221 /*\ Register process list
222 * process list can be used to:
223 * 1. permute process ids w.r.t. message-passing ids (set PERMUTE_PIDS), or
224 * 2. change logical mapping of array blocks to processes
225 \*/
/* Fortran-callable stub kept only for API compatibility; the process-list
 * registration mechanism is no longer used, so this is a no-op. */
void ga_register_proclist_(Integer *list, Integer* np)
{
    /* no longer used */
}
230
231
/* C-callable stub kept only for API compatibility; the process-list
 * registration mechanism is no longer used, so this is a no-op. */
void GA_Register_proclist(int *list, int np)
{
    /* no longer used */
}
236
237
238 /*\ FINAL CLEANUP of shmem when terminating
239 \*/
/* Final cleanup at termination: delegate release of ARMCI-held resources
 * (e.g. shared-memory segments) to ARMCI_Cleanup. */
void ga_clean_resources()
{
    ARMCI_Cleanup();
}
244
245
246 /*\ CHECK GA HANDLE and if it's wrong TERMINATE
247 * C version
248 \*/
249 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
250 # pragma weak wnga_check_handle = pnga_check_handle
251 #endif
/* C version of the GA-handle validity check: expands ga_check_handleM,
 * which terminates the program (via error call) when g_a is not a valid
 * handle; `string` identifies the caller in the error message. */
void pnga_check_handle(Integer g_a, char * string)
{
  ga_check_handleM(g_a, string);
}
256
257
258 /*\ Initialize MA-like addressing:
259 * get addressees for the base arrays for double, complex and int types
260 \*/
/* nonzero once the MA base addresses below have been captured */
static int ma_address_init=0;
/* Initialize MA-like addressing: capture the base addresses of the MA
 * arrays for each supported data type and, when CHECK_MA_ALGN is defined,
 * verify that each base address is suitably aligned for its type. */
void gai_ma_address_init()
{
#ifdef CHECK_MA_ALGN
  Integer off_dbl, off_int, off_dcpl, off_flt,off_scpl;
#endif
  ma_address_init=1;
  INT_MB = (Integer*)MA_get_mbase(MT_F_INT);
  DBL_MB = (DoublePrecision*)MA_get_mbase(MT_F_DBL);
  DCPL_MB= (DoubleComplex*)MA_get_mbase(MT_F_DCPL);
  SCPL_MB= (SingleComplex*)MA_get_mbase(MT_F_SCPL);
  FLT_MB = (float*)MA_get_mbase(MT_F_REAL);
# ifdef CHECK_MA_ALGN
  /* each off_* is nonzero when the base address is misaligned for the type;
     the complex types are checked against their component type
     (DoublePrecision / float), since component alignment suffices */
  off_dbl = 0 != ((long)DBL_MB)%sizeof(DoublePrecision);
  off_int = 0 != ((long)INT_MB)%sizeof(Integer);
  off_dcpl= 0 != ((long)DCPL_MB)%sizeof(DoublePrecision);
  off_scpl= 0 != ((long)SCPL_MB)%sizeof(float);
  off_flt = 0 != ((long)FLT_MB)%sizeof(float);
  if(off_dbl)
    pnga_error("GA initialize: MA DBL_MB not alligned", (Integer)DBL_MB);

  if(off_int)
    pnga_error("GA initialize: INT_MB not alligned", (Integer)INT_MB);

  if(off_dcpl)
    pnga_error("GA initialize: DCPL_MB not alligned", (Integer)DCPL_MB);

  if(off_scpl)
    pnga_error("GA initialize: SCPL_MB not alligned", (Integer)SCPL_MB);

  if(off_flt)
    pnga_error("GA initialize: FLT_MB not alligned", (Integer)FLT_MB);

# endif

  if(DEBUG) printf("%d INT_MB=%p DBL_MB=%p DCPL_MB=%p FLT_MB=%p SCPL_MB=%p\n",
                   (int)GAme, (void*)INT_MB, (void*)DBL_MB, (void*)DCPL_MB, (void*)FLT_MB, (void*)SCPL_MB);
}
299
300
301
302 extern int *_ga_argc;
303 extern char ***_ga_argv;
304 extern int _ga_initialize_args;
305 extern int _ga_initialize_c;
306 extern int _ga_initialize_f;
307
308 /**
309 * Initialize library structures in Global Arrays.
310 * either ga_initialize_ltd or ga_initialize must be the first
311 * GA routine called (except ga_uses_ma)
312 */
313 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
314 # pragma weak wnga_initialize = pnga_initialize
315 #endif
316
pnga_initialize()317 void pnga_initialize()
318 {
319 Integer i, j,nproc, nnode, zero;
320 int bytes;
321 GA_Internal_Threadsafe_Lock();
322 #ifdef MSG_COMMS_MPI
323 MPI_Comm comm;
324 #endif
325
326 if(GAinitialized)
327 {
328 GA_Internal_Threadsafe_Unlock();
329 return;
330 }
331
332 #if HAVE_ARMCI_INITIALIZED_FUNCTION
333 if (!ARMCI_Initialized())
334 #else
335 if (!ARMCIinitialized)
336 #endif
337 {
338 /* assure that GA will not alocate more shared memory than specified */
339 if(GA_memory_limited) ARMCI_Set_shm_limit(GA_total_memory);
340 if (_ga_initialize_c) {
341 if (_ga_initialize_args) {
342 ARMCI_Init_args(_ga_argc, _ga_argv);
343 }
344 else {
345 ARMCI_Init();
346 }
347 }
348 else if (_ga_initialize_f) {
349 _ga_argc = malloc(sizeof(int));
350 _ga_argv = malloc(sizeof(char**));
351 if (!_ga_argc) pnga_error("malloc argc failed",1);
352 ga_f2c_get_cmd_args(_ga_argc, _ga_argv);
353 ARMCI_Init_args(_ga_argc, _ga_argv);
354 }
355 else {
356 pnga_error("pnga_initialize called outside of C or F APIs",1);
357 }
358 ARMCIinitialized = 1;
359 }
360
361 GA_Default_Proc_Group = -1;
362 /* zero in pointers in GA array */
363 _ga_main_data_structure
364 = (global_array_t *)malloc(sizeof(global_array_t)*MAX_ARRAYS);
365 _proc_list_main_data_structure
366 = (proc_list_t *)malloc(sizeof(proc_list_t)*MAX_ARRAYS);
367 if(!_ga_main_data_structure)
368 pnga_error("ga_init:malloc ga failed",0);
369 if(!_proc_list_main_data_structure)
370 pnga_error("ga_init:malloc proc_list failed",0);
371 GA = _ga_main_data_structure;
372 PGRP_LIST = _proc_list_main_data_structure;
373 for(i=0;i<MAX_ARRAYS; i++) {
374 GA[i].ptr = (char**)0;
375 GA[i].mapc = (C_Integer*)0;
376 GA[i].rstrctd_list = (C_Integer*)0;
377 GA[i].rank_rstrctd = (C_Integer*)0;
378 GA[i].property = NO_PROPERTY;
379 GA[i].mem_dev_set = 0;
380 #ifdef ENABLE_CHECKPOINT
381 GA[i].record_id = 0;
382 #endif
383 GA[i].actv = 0;
384 GA[i].p_handle = GA_Init_Proc_Group;
385 GA[i].overlay = 0;
386 PGRP_LIST[i].map_proc_list = (int*)0;
387 PGRP_LIST[i].inv_map_proc_list = (int*)0;
388 PGRP_LIST[i].actv = 0;
389 }
390
391 bzero(&GAstat,sizeof(GAstat));
392
393 /* initialize some data structures used in non-blocking communication */
394 gai_nb_init();
395
396 GAnproc = (Integer)armci_msg_nproc();
397
398 /* Allocate arrays used by library */
399 mapALL = (Integer*)malloc((GAnproc+MAXDIM-1)*sizeof(Integer*));
400
401 GAme = (Integer)armci_msg_me();
402 if(GAme<0 || GAme>GAnproc)
403 pnga_error("ga_init:message-passing initialization problem: my ID=",GAme);
404
405 MPme= (Integer)armci_msg_me();
406
407 gai_init_onesided();
408
409 /* set activity status for all arrays to inactive */
410 for(i=0;i<_max_global_array;i++)GA[i].actv=0;
411 for(i=0;i<_max_global_array;i++)GA[i].actv_handle=0;
412
413 /* Create proc list for mirrored arrays */
414 PGRP_LIST[0].map_proc_list = (int*)malloc(GAnproc*sizeof(int)*2);
415 PGRP_LIST[0].inv_map_proc_list = PGRP_LIST[0].map_proc_list + GAnproc;
416 for (i=0; i<GAnproc; i++) PGRP_LIST[0].map_proc_list[i] = -1;
417 for (i=0; i<GAnproc; i++) PGRP_LIST[0].inv_map_proc_list[i] = -1;
418 nnode = pnga_cluster_nodeid();
419 nproc = pnga_cluster_nprocs(nnode);
420 zero = 0;
421 j = pnga_cluster_procid(nnode, zero);
422 PGRP_LIST[0].parent = -1;
423 PGRP_LIST[0].actv = 1;
424 PGRP_LIST[0].map_nproc = nproc;
425 PGRP_LIST[0].mirrored = 1;
426 for (i=0; i<nproc; i++) {
427 PGRP_LIST[0].map_proc_list[i+j] = i;
428 PGRP_LIST[0].inv_map_proc_list[i] = i+j;
429 }
430
431 /* Set up group for doing cluster merge. Start by checking if
432 * number of procs per node is the same for all nodes */
433 i = nproc;
434 pnga_pgroup_gop(GA_World_Proc_Group,pnga_type_f2c(MT_F_INT),&i,1,"max");
435 j = nproc;
436 pnga_pgroup_gop(GA_World_Proc_Group,pnga_type_f2c(MT_F_INT),&j,1,"min");
437 if (i == j) {
438 /* construct a group that containing all processors with the same local
439 * proc ID across all nodes in the system */
440 Integer numnodes = pnga_cluster_nnodes();
441 Integer *nodelist = (Integer*)malloc(numnodes*sizeof(Integer));
442 Integer myproc = GAme-pnga_cluster_procid(nnode,zero);
443 for (i=0; i<numnodes; i++) {
444 nodelist[i] = i*nproc+myproc;
445 }
446 _mirror_gop_grp = pnga_pgroup_create(nodelist,numnodes);
447 free(nodelist);
448 } else {
449 _mirror_gop_grp = GA_World_Proc_Group;
450 }
451
452
453
454 /* Allocate memory for update flags and signal*/
455 bytes = 2*MAXDIM*sizeof(int);
456 GA_Update_Flags = (int**)malloc(GAnproc*sizeof(void*));
457 if (!GA_Update_Flags)
458 pnga_error("ga_init: Failed to initialize GA_Update_Flags",(int)GAme);
459 if (ARMCI_Malloc((void**)GA_Update_Flags, (armci_size_t) bytes))
460 pnga_error("ga_init:Failed to initialize memory for update flags",GAme);
461 if(GA_Update_Flags[GAme]==NULL)pnga_error("ga_init:ARMCIMalloc failed",GAme);
462
463 bytes = sizeof(int);
464 GA_Update_Signal = ARMCI_Malloc_local((armci_size_t) bytes);
465
466 /* Zero update flags */
467 for (i=0; i<2*MAXDIM; i++) {
468 GA_Update_Flags[GAme][i] = 0;
469 }
470
471 /* set MA error function */
472 MA_set_error_callback(ARMCI_Error);
473
474 GAinitialized = 1;
475
476 #ifdef PROFILE_OLD
477 ga_profile_init();
478 #endif
479 #ifdef ENABLE_CHECKPOINT
480 {
481 Integer tmplist[1000];
482 Integer tmpcount;
483 tmpcount = GAnproc-ga_spare_procs;
484 for(i=0;i<tmpcount;i++)
485 tmplist[i]=i;
486 ga_group_is_for_ft=1;
487 GA_Default_Proc_Group = pnga_pgroup_create(tmplist,tmpcount);
488 ga_group_is_for_ft=0;
489 if(GAme>=tmpcount)
490 ga_irecover(0);
491 printf("\n%d:here done with initialize\n",GAme);
492
493 }
494 #endif
495 /* create duplicate of world communicator */
496 #ifdef MSG_COMMS_MPI
497 comm = GA_MPI_Comm_pgroup(-1);
498 MPI_Comm_dup(comm, &GA_MPI_World_comm_dup);
499 #endif
500 GA_Internal_Threadsafe_Unlock();
501 }
502
503
504 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
505 # pragma weak wnga_initialized = pnga_initialized
506 #endif
pnga_initialized()507 int pnga_initialized()
508 {
509 return GAinitialized;
510 }
511
512 #if ENABLE_CHECKPOINT
/* Mark whether subsequently created process groups are intended for the
 * fault-tolerance (checkpoint) machinery rather than normal use. */
void set_ga_group_is_for_ft(int val)
{
    ga_group_is_for_ft = val;
}
517 #endif
518
519 /**
520 * Is MA used for allocation of GA memory?
521 */
522 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
523 # pragma weak wnga_uses_ma = pnga_uses_ma
524 #endif
525
pnga_uses_ma()526 logical pnga_uses_ma()
527 {
528 #ifdef AVOID_MA_STORAGE
529 return FALSE;
530 #else
531 if(!GAinitialized) return FALSE;
532
533 if(ARMCI_Uses_shm()) return FALSE;
534 else return TRUE;
535 #endif
536 }
537
538 /**
539 * Is memory limit set
540 */
541 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
542 # pragma weak wnga_memory_limited = pnga_memory_limited
543 #endif
544
pnga_memory_limited()545 logical pnga_memory_limited()
546 {
547 if(GA_memory_limited) return TRUE;
548 else return FALSE;
549 }
550
551 /**
552 * Returns the amount of memory on each processor used in active Global Arrays
553 */
554 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
555 # pragma weak wnga_inquire_memory = pnga_inquire_memory
556 #endif
557
pnga_inquire_memory()558 Integer pnga_inquire_memory()
559 {
560 Integer i, sum=0;
561 for(i=0; i<_max_global_array; i++)
562 if(GA[i].actv) sum += (Integer)GA[i].size;
563 return(sum);
564 }
565
566 /**
567 * Returns the amount of memory available on the calling processor
568 */
569 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
570 # pragma weak wnga_memory_avail = pnga_memory_avail
571 #endif
572
pnga_memory_avail()573 Integer pnga_memory_avail()
574 {
575 if(!pnga_uses_ma()) return(GA_total_memory);
576 else{
577 Integer ma_limit = MA_inquire_avail(MT_F_BYTE);
578
579 if ( GA_memory_limited ) return( GA_MIN(GA_total_memory, ma_limit) );
580 else return( ma_limit );
581 }
582 }
583
584
585
586 /**
587 * (re)set limit on GA memory usage
588 */
589 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
590 # pragma weak wnga_set_memory_limit = pnga_set_memory_limit
591 #endif
592
/**
 * (Re)set the per-process limit on GA memory usage.
 * mem_limit >= 0 sets/adjusts the limit (bytes); mem_limit < 0 resets
 * the limit to "unlimited".  GA_total_memory tracks the *remaining*
 * budget, so changing an existing limit adjusts it by the delta.
 */
void pnga_set_memory_limit(Integer mem_limit)
{
  if (GA_memory_limited) {

    /* a limit was already in effect: adjust the remaining budget */
    if (mem_limit >= 0) {
      /* shift the remaining budget by the difference between old and new limit */
      GA_total_memory += (mem_limit - GA_memory_limit);
      /* BUGFIX: remember the new limit; previously GA_memory_limit kept its
         old value, so a subsequent call adjusted against a stale limit and
         corrupted the remaining-budget accounting */
      GA_memory_limit = mem_limit;
    } else {

      /* negative values reset limit to "unlimited" */
      GA_memory_limited = 0;
      GA_total_memory = -1;
    }

  } else {

    /* no limit was in effect: install the new one */
    GA_total_memory = GA_memory_limit = mem_limit;
    if (mem_limit >= 0) GA_memory_limited = 1;
  }
}
614
615 /**
616 * Initialize Global Array library structures and set a limit on memory
617 * usage by GA.
618 * the byte limit is per processor (even for shared memory)
619 * either ga_initialize_ltd or ga_initialize must be the first
620 * GA routine called (except ga_uses_ma)
621 * ga_initialize is another version of ga_initialize_ltd, except
622 * without memory control
623 * mem_limit < 0 means "memory unlimited"
624 */
625 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
626 # pragma weak wnga_initialize_ltd = pnga_initialize_ltd
627 #endif
628
/**
 * Initialize GA with a per-process memory limit (bytes).
 * mem_limit < 0 means "memory unlimited".  Must be the first GA call
 * (except ga_uses_ma), like pnga_initialize.
 */
void pnga_initialize_ltd(Integer mem_limit)
{
  /* record the budget before the main initialization consults it */
  GA_memory_limit = mem_limit;
  GA_total_memory = mem_limit;
  if (mem_limit >= 0) {
    GA_memory_limited = 1;
  }
  pnga_initialize();
}
635
636 /* #define gam_checktype(_type)\ */
637 /* if(_type != C_DBL && _type != C_INT && \ */
638 /* _type != C_DCPL && _type != C_SCPL && _type != C_FLOAT && \ */
639 /* _type != C_LONG &&_type != C_LONGLONG)\ */
640 /* pnga_error("ttype not yet supported ", _type) */
641
642 #define gam_checktype(_type) if(!GAvalidtypeM(_type))pnga_error("type not yet supported", (_type))
643
644 #define gam_checkdim(ndim, dims)\
645 {\
646 int _d;\
647 if(ndim<1||ndim>MAXDIM) pnga_error("unsupported number of dimensions",ndim);\
648 __CRAYX1_PRAGMA("_CRI novector"); \
649 for(_d=0; _d<ndim; _d++)\
650 if(dims[_d]<1)pnga_error("wrong dimension specified",dims[_d]);\
651 }
652
653 /**
654 * Utility function to tell whether or not an array is mirrored
655 */
656 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
657 # pragma weak wnga_is_mirrored = pnga_is_mirrored
658 #endif
659
pnga_is_mirrored(Integer g_a)660 logical pnga_is_mirrored(Integer g_a)
661 {
662 Integer ret = FALSE;
663 Integer handle = GA_OFFSET + g_a;
664 Integer p_handle = (Integer)GA[handle].p_handle;
665 if (p_handle >= 0) {
666 if (PGRP_LIST[p_handle].mirrored) ret = TRUE;
667 }
668 return ret;
669 }
670
671 /**
672 * map_ij: pointer to map array containing axis partitions
673 * n: number of blocks along axis
674 * scale: factor for coming up with an initial guess
675 * elem: array element index that we are trying to find
676 * block: index of block containing elem
677 */
678 #define findblock(map_ij,n,scale,elem, block)\
679 {\
680 int candidate, found, b; \
681 C_Integer *map= (map_ij);\
682 \
683 candidate = (int)(scale*(elem));\
684 found = 0;\
685 if(map[candidate] <= (elem)){ /* search downward */\
686 b= candidate;\
687 while(b<(n)-1){ \
688 found = (map[b+1]>(elem));\
689 if(found)break;\
690 b++;\
691 } \
692 }else{ /* search upward */\
693 b= candidate-1;\
694 while(b>=0){\
695 found = (map[b]<=(elem));\
696 if(found)break;\
697 b--;\
698 }\
699 }\
700 if(!found)b=(n)-1;\
701 *(block) = b;\
702 }
703
704 /*\
705 * Find indices of block containing the array element at the location
706 * in subscript.
707 \*/
708 #define gam_find_block_indices_from_subscript(handle,subscript,index)\
709 { \
710 int _type = GA[handle].distr_type; \
711 Integer *_mapc = GA[handle].mapc; \
712 Integer _offset; \
713 int _i; \
714 int _ndim = GA[handle].ndim; \
715 if (_type == REGULAR) { \
716 for (_i=0, _offset=0; _i<_ndim; _i++) { \
717 findblock(_mapc+_offset, GA[handle].nblock[_i], \
718 GA[handle].scale[_i],subscript[_i],&index[_i]); \
719 _offset += GA[handle].nblock[_i]; \
720 } \
721 } else if (_type == TILED_IRREG) { \
722 for (_i=0, _offset=0; _i<_ndim; _i++) { \
723 findblock(_mapc+_offset, GA[handle].num_blocks[_i], \
724 GA[handle].scale[_i],subscript[_i],&index[_i]); \
725 _offset += GA[handle].num_blocks[_i]; \
726 } \
727 } else { \
728 for (_i=0; _i<_ndim; _i++) { \
729 index[_i] = (subscript[_i]-1)/GA[handle].block_dims[_i]; \
730 } \
731 } \
732 }
733
734
735 /**
736 * Locate the owner of an element of a Global Array specified by the array
737 * subscript
738 */
739 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
740 # pragma weak wnga_locate = pnga_locate
741 #endif
742
/* Locate the owner of the array element given by `subscript` (1-based).
 * Returns FALSE when a subscript component is out of range (REGULAR
 * distributions only); otherwise sets *owner and returns TRUE.  For
 * block-cyclic/tiled distributions *owner is the block index. */
logical pnga_locate(Integer g_a, Integer* subscript, Integer* owner)
{
Integer d, proc, dpos, ndim, ga_handle = GA_OFFSET + g_a, proc_s[MAXDIM];

   ga_check_handleM(g_a, "nga_locate");
   ndim = GA[ga_handle].ndim;

   if (GA[ga_handle].distr_type == REGULAR) {
     /* range-check every subscript component; *owner stays -1 on failure */
     for(d=0, *owner=-1; d< ndim; d++)
       if(subscript[d]< 1 || subscript[d]>GA[ga_handle].dims[d]) return FALSE;

     /* find the block index along each dimension via the mapc partition */
     for(d = 0, dpos = 0; d< ndim; d++){
       findblock(GA[ga_handle].mapc + dpos, GA[ga_handle].nblock[d],
                 GA[ga_handle].scale[d], subscript[d], &proc_s[d]);
       dpos += GA[ga_handle].nblock[d];
     }

     /* fold the per-dimension block indices into a linear process index */
     ga_ComputeIndexM(&proc, ndim, proc_s, GA[ga_handle].nblock);

     *owner = proc;
     /* restricted arrays: translate to the actual owning process */
     if (GA[ga_handle].num_rstrctd > 0) {
       *owner = GA[ga_handle].rstrctd_list[*owner];
     }
   } else {
     /* non-regular distribution: owner is the linearized block index */
     Integer i;
     Integer index[MAXDIM];
     gam_find_block_indices_from_subscript(ga_handle,subscript,index);
     gam_find_block_from_indices(ga_handle,i,index);
     *owner = i;
   }

   return TRUE;
}
776
777
778
779 /*\ UTILITY FUNCTION TO LOCATE THE BOUNDING INDICES OF A CONTIGUOUS CHUNK OF
780 * SHARED MEMORY FOR A MIRRORED ARRAY
781 \*/
/* For a mirrored array, compute the bounding indices (GA[handle].first /
 * GA[handle].last) of the contiguous chunk of shared memory owned by this
 * node, and the segment length GA[handle].shm_length.  For non-mirrored
 * arrays the fields are set to sentinel values (first=0, last=-1, len=-1). */
void ngai_get_first_last_indices( Integer g_a) /* array handle (input) */
{

  Integer lo[MAXDIM], hi[MAXDIM];
  Integer nelems, nnodes, inode, nproc;
  Integer ifirst, ilast, nfirst, nlast, icnt, np;
  Integer i, j, itmp, ndim, map_offset[MAXDIM];
  /* Integer icheck; */
  Integer index[MAXDIM], subscript[MAXDIM];
  Integer handle = GA_OFFSET + g_a;
  Integer type, size=0, id;
  /* Integer grp_id; */
  int Save_default_group;
  char *fptr, *lptr;

  /* find total number of elements */
  ndim = GA[handle].ndim;
  nelems = 1;
  for (i=0; i<ndim; i++) nelems *= GA[handle].dims[i];

  /* If array is mirrored, evaluate first and last indices */
  if (pnga_is_mirrored(g_a)) {
    /* If default group is not world group, change default group to world group
       temporarily */
    Save_default_group = GA_Default_Proc_Group;
    GA_Default_Proc_Group = -1;
    nnodes = pnga_cluster_nnodes();
    inode = pnga_cluster_nodeid();
    nproc = pnga_cluster_nprocs(inode);
    /* grp_id = GA[handle].p_handle; */
    /* divide the element count evenly over the nodes; this node covers
       offsets [ifirst, ilast] of the flattened array */
    ifirst = (Integer)((double)(inode*nelems)/((double)nnodes));
    if (inode != nnodes-1) {
      ilast = (Integer)((double)((inode+1)*nelems)/((double)nnodes))-1;
    } else {
      ilast = nelems-1;
    }
    /* ifirst and ilast correspond to offsets in shared memory. Find the
       actual indices of the data elements corresponding to these offsets.
       The map_offset array provides a convenient mechanism for extracting
       the first indices on each processor along each coordinate dimension
       from the mapc array. */
    for (i = 0; i<ndim; i++) {
      map_offset[i] = 0;
      for (j = 0; j<i; j++) {
        map_offset[i] += GA[handle].nblock[j];
      }
    }
    /* scan the node-local processors, accumulating element counts, to find
       which local blocks (nfirst..nlast) contain offsets ifirst and ilast */
    icnt = 0;
    nfirst = -1;
    nlast = -1;
    for (i = 0; i<nproc; i++) {
      /* find block indices corresponding to proc i */
      pnga_proc_topology(g_a, i, index);
      nelems = 1;
      for (j = 0; j<ndim; j++) {
        if (index[j] < GA[handle].nblock[j]-1) {
          /* interior block: extent from consecutive mapc entries */
          itmp = ((Integer)GA[handle].mapc[map_offset[j]+index[j]+1]
               - (Integer)GA[handle].mapc[map_offset[j]+index[j]]);
          nelems *= itmp;
        } else {
          /* last block along this dimension: extent runs to the array edge */
          itmp = ((Integer)GA[handle].dims[j]
               - (Integer)GA[handle].mapc[map_offset[j]+index[j]] + 1);
          nelems *= itmp;
        }
      }
      icnt += nelems;
      if (icnt-1 >= ifirst && nfirst < 0) {
        nfirst = i;
      }
      if (ilast <= icnt-1 && nfirst >= 0 && nlast < 0) {
        nlast = i;
      }
    }
    /* Adjust indices corresponding to start and end of block of
       shared memory so that it can be decomposed into large
       rectangular blocks of the global array. Start by
       adusting the lower index */
    icnt = 0;
    for (i = 0; i<nfirst; i++) {
      pnga_distribution(g_a, i, lo, hi);
      nelems = 1;
      for (j = 0; j<ndim; j++) {
        if (hi[j] >= lo[j]) {
          nelems *= (hi[j] - lo[j] + 1);
        } else {
          nelems = 0;
        }
      }
      icnt += nelems;
    }
    /* calculate offset in local block of memory */
    ifirst = ifirst - icnt;
    /* find dimensions of data on block nfirst */
    np = nfirst;
    pnga_distribution(g_a, np, lo, hi);
    /* nelems here is the size of one "slab" (all dims except the last);
       round ifirst up to a whole-slab boundary along the last dimension */
    nelems = 1;
    for (i=0; i<ndim-1; i++) {
      nelems *= (hi[i] - lo[i] + 1);
    }
    if (ifirst%nelems == 0) {
      ifirst = ifirst/nelems;
    } else {
      ifirst = (ifirst-ifirst%nelems)/nelems;
      ifirst++;
    }
    if (ifirst > GA[handle].dims[ndim-1]-1) ifirst=GA[handle].dims[ndim-1]-1;
    /* adjust value of ifirst */
    pnga_proc_topology(g_a, nfirst, index);
    subscript[ndim-1] = ifirst;
    for (i=0; i<ndim-1; i++) {
      subscript[i] = 0;
    }
    /* Finally, evaluate absolute indices of first data point */
    for (i=0; i<ndim; i++) {
      GA[handle].first[i] = GA[handle].mapc[map_offset[i]+index[i]]
          + (C_Integer)subscript[i];
    }
    /* adjust upper bound. If nlast = nfirst, just use old value of icnt */
    if (nlast > nfirst) {
      icnt = 0;
      for (i = 0; i<nlast; i++) {
        pnga_distribution(g_a, i, lo, hi);
        nelems = 1;
        for (j = 0; j<ndim; j++) {
          if (hi[j] >= lo[j]) {
            nelems *= (hi[j] - lo[j] + 1);
          } else {
            nelems = 0;
          }
        }
        icnt += nelems;
      }
    }
    ilast = ilast - icnt;
    /* find dimensions of data on block nlast */
    np = nlast;
    pnga_distribution(g_a, np, lo, hi);
    /* round ilast down to a whole-slab boundary along the last dimension */
    nelems = 1;
    for (i=0; i<ndim-1; i++) {
      nelems *= (hi[i] - lo[i] + 1);
    }
    ilast = (ilast-ilast%nelems)/nelems;
    /* adjust value of ilast */
    subscript[ndim-1] = ilast;
    for (i=0; i<ndim-1; i++) {
      subscript[i] = (hi[i] - lo[i]);
    }
    pnga_proc_topology(g_a, nlast, index);
    /*
    icheck = 1;
    for (i=1; i<ndim; i++) {
      if (index[i] < GA[handle].nblock[i]-1) {
        itmp = (Integer)GA[handle].mapc[map_offset[i]+index[i]+1]
             - (Integer)GA[handle].mapc[map_offset[i]+index[i]];
      } else {
        itmp = (Integer)GA[handle].dims[i]
             - (Integer)GA[handle].mapc[map_offset[i]+index[i]] + 1;
      }
      if (subscript[i] < itmp-1) icheck = 0;
      subscript[i] = itmp-1;
    }
    if (!icheck) {
      subscript[0]--;
    } */
    /* Finally, evaluate absolute indices of last data point */
    for (i=0; i<ndim; i++) {
      GA[handle].last[i] = GA[handle].mapc[map_offset[i]+index[i]]
          + (C_Integer)subscript[i];
      if (GA[handle].last[i] > GA[handle].dims[i]) {
        GA[handle].last[i] = GA[handle].dims[i];
      }
    }
    /* find length of shared memory segment owned by this node. Adjust
     * length, if necessary, to account for gaps in memory between
     * processors */
    type = GA[handle].type;
    switch(type) {
      case C_FLOAT: size = sizeof(float); break;
      case C_DBL: size = sizeof(double); break;
      case C_LONG: size = sizeof(long); break;
      case C_LONGLONG: size = sizeof(long long); break;
      case C_INT: size = sizeof(int); break;
      case C_SCPL: size = 2*sizeof(float); break;
      case C_DCPL: size = 2*sizeof(double); break;
      default: pnga_error("type not supported",type);
    }
    /* segment length = distance between the addresses of the first and
       last owned elements, plus one element */
    for (i=0; i<ndim; i++) index[i] = (Integer)GA[handle].first[i];
    i = (int)pnga_locate(g_a, index, &id);
    gam_Loc_ptr(id, handle, (Integer)GA[handle].first, &fptr);

    for (i=0; i<ndim; i++) index[i] = (Integer)GA[handle].last[i];
    i = (int)pnga_locate(g_a, index, &id);
    gam_Loc_ptr(id, handle, (Integer)GA[handle].last, &lptr);

    GA[handle].shm_length = (C_Long)(lptr - fptr + size);
    GA_Default_Proc_Group = Save_default_group;
  } else {
    /* not mirrored: mark the fields as unused
       (NOTE(review): shm_length is redundantly reassigned each loop pass) */
    for (i=0; i<ndim; i++) {
      GA[handle].first[i] = 0;
      GA[handle].last[i] = -1;
      GA[handle].shm_length = -1;
    }
  }
}
987
988 /*\ print subscript of ndim dimensional array with two strings before and after
989 \*/
/**
 * Print an ndim-dimensional subscript as "pre [s0,s1,...,sN] post"
 * to stdout.  With ndim == 0 only "pre [" is printed, matching the
 * historical behavior.
 */
void gai_print_subscript(char *pre,int ndim, Integer subscript[], char* post)
{
    int d;

    printf("%s [",pre);
    for (d = 0; d < ndim; d++) {
        printf("%ld", (long)subscript[d]);
        if (d == ndim-1) {
            printf("] %s", post);   /* close bracket after the last component */
        } else {
            printf(",");
        }
    }
}
1001
gai_init_struct(int handle)1002 void gai_init_struct(int handle)
1003 {
1004 if(!GA[handle].ptr){
1005 int len = (int)GAnproc;
1006 GA[handle].ptr = (char**)malloc(len*sizeof(char**));
1007 }
1008 if(!GA[handle].ptr)pnga_error("malloc failed: ptr:",0);
1009 GA[handle].ndim = -1;
1010 }
1011
1012 /**
1013 * Function to set default processor group
1014 */
1015 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1016 # pragma weak wnga_pgroup_set_default = pnga_pgroup_set_default
1017 #endif
1018
/* Set the default processor group to `grp` and propagate the choice to
 * ARMCI; also clears any pending sync masking. */
void pnga_pgroup_set_default(Integer grp)
{
#if 0
  int local_sync_begin,local_sync_end;

  local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
#endif
  _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous sync masking*/

  /* force a hang if default group is not being set correctly */
#if 0
  if (local_sync_begin || local_sync_end) pnga_pgroup_sync(grp);
#endif
  GA_Default_Proc_Group = (int)(grp);

#ifdef MSG_COMMS_MPI
  {
     ARMCI_Group parent_grp;
     /* only handles > 0 index user-created groups in PGRP_LIST
        (handle 0 is the mirrored-array list); anything else maps to
        the ARMCI world group */
     if(GA_Default_Proc_Group > 0)
       parent_grp = PGRP_LIST[GA_Default_Proc_Group].group;
     else
       ARMCI_Group_get_world(&parent_grp);
     ARMCI_Group_set_default(&parent_grp);
  }
#endif
}
1045
1046 /**
1047 * Create a new processor group containing count processors with
1048 * process IDs (in the default group) in list. Return process group
1049 * handle.
1050 */
1051 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1052 # pragma weak wnga_pgroup_create = pnga_pgroup_create
1053 #endif
1054
/* Create a new processor group from `count` process IDs (ranks relative to
 * the current default group) in `list`, and return its handle.
 *
 * The routine (1) grabs the first inactive PGRP_LIST slot, (2) validates
 * the list, (3) builds two rank maps — map_proc_list: world rank -> group
 * rank, inv_map_proc_list: group rank -> world rank — and (4) creates the
 * matching ARMCI group (MPI builds). Statement order matters (the ARMCI
 * default group set by pnga_pgroup_set_default is consulted by
 * ARMCI_Group_create), so the code is documented rather than restyled. */
Integer pnga_pgroup_create(Integer *list, Integer count)
{
    Integer pgrp_handle, i, j, nprocs, itmp;
    Integer parent;
    int tmp_count;
    Integer *tmp_list;      /* absolute (world) ranks of the members */
    int *tmp2_list;         /* sorted copy of the caller's ranks */
#ifdef MSG_COMMS_MPI
    ARMCI_Group *tmpgrp;    /* only consulted under ENABLE_CHECKPOINT */
#endif


    /* Allocate temporary arrays */
    tmp_list = (Integer*)malloc(GAnproc*sizeof(Integer));
    tmp2_list = (int*)malloc(GAnproc*sizeof(int));

    /*** Get next free process group handle ***/
    pgrp_handle =-1; i=0;
    do{
        if(!PGRP_LIST[i].actv) pgrp_handle=i;
        i++;
    }while(i<_max_global_array && pgrp_handle==-1);
    if( pgrp_handle == -1)
        pnga_error(" Too many process groups ", (Integer)_max_global_array);

    /* Check list for validity (no duplicates and no out of range entries) */
    nprocs = GAnproc;
    for (i=0; i<count; i++) {
        if (list[i] <0 || list[i] >= nprocs)
            pnga_error(" invalid element in list ", list[i]);
        for (j=i+1; j<count; j++) {
            if (list[i] == list[j])
                pnga_error(" Duplicate elements in list ", list[i]);
        }
    }

    /* Allocate memory for arrays containing processor maps and initialize
       values (both maps share one allocation; -1 marks "not a member") */
    PGRP_LIST[pgrp_handle].map_proc_list
        = (int*)malloc(GAnproc*sizeof(int)*2);
    PGRP_LIST[pgrp_handle].inv_map_proc_list
        = PGRP_LIST[pgrp_handle].map_proc_list + GAnproc;
    for (i=0; i<GAnproc; i++)
        PGRP_LIST[pgrp_handle].map_proc_list[i] = -1;
    for (i=0; i<GAnproc; i++)
        PGRP_LIST[pgrp_handle].inv_map_proc_list[i] = -1;

    for (i=0; i<count; i++) {
        tmp2_list[i] = (int)list[i];
    }

    /* use a simple (insertion) sort routine to reorder list into
       ascending order */
    for (j=1; j<count; j++) {
        itmp = tmp2_list[j];
        i = j-1;
        while(i>=0 && tmp2_list[i] > itmp) {
            tmp2_list[i+1] = tmp2_list[i];
            i--;
        }
        tmp2_list[i+1] = itmp;
    }

    /* Remap elements in list to absolute processor indices (if necessary)*/
    if (GA_Default_Proc_Group != -1) {
        parent = GA_Default_Proc_Group;
        for (i=0; i<count; i++) {
            tmp_list[i] = (int)PGRP_LIST[parent].inv_map_proc_list[tmp2_list[i]];
        }
    } else {
        for (i=0; i<count; i++) {
            tmp_list[i] = (int)tmp2_list[i];
        }
    }

    tmp_count = (int)(count);
    /* Create proc list maps (world rank j <-> group rank i) */
    for (i=0; i<count; i++) {
        j = tmp_list[i];
        PGRP_LIST[pgrp_handle].map_proc_list[j] = i;
        PGRP_LIST[pgrp_handle].inv_map_proc_list[i] = j;
    }
    PGRP_LIST[pgrp_handle].actv = 1;
    PGRP_LIST[pgrp_handle].parent = GA_Default_Proc_Group;
    PGRP_LIST[pgrp_handle].mirrored = 0;
    PGRP_LIST[pgrp_handle].map_nproc = tmp_count;
#ifdef MSG_COMMS_MPI
    tmpgrp = &PGRP_LIST[pgrp_handle].group;
#if ENABLE_CHECKPOINT
    if(ga_group_is_for_ft)
        tmpgrp = ARMCI_Get_ft_group();
    else
#endif
    /* NOTE(review): ARMCI gets the *sorted default-group-relative* ranks
     * (tmp2_list), not the remapped absolute ranks (tmp_list) — presumably
     * correct because ARMCI's default group was set to the same parent;
     * confirm against ARMCI_Group_create semantics. */
    ARMCI_Group_create(tmp_count, tmp2_list, &PGRP_LIST[pgrp_handle].group);
#endif

    /* Clean up temporary arrays */
    free(tmp_list);
    free(tmp2_list);

#ifdef MSG_COMMS_MPI
    return pgrp_handle;
#else
    /* without MPI there is only one group: return the default handle */
    return pnga_pgroup_get_default();
#endif
}
1160
1161 /**
 * Duplicate an existing processor group
1163 */
1164 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1165 # pragma weak wnga_pgroup_duplicate = pnga_pgroup_duplicate
1166 #endif
1167
/**
 * Duplicate an existing processor group and return the new handle.
 *
 * @param grp handle of the group to copy; -1 duplicates the world group
 * @return handle of the new group (MPI builds), or the default group handle
 *         in serial builds
 */
Integer pnga_pgroup_duplicate(Integer grp)
{
  Integer pgrp_handle, i;
  int tmp_count;
  int *tmp_list, *tmp2_list;
#ifdef MSG_COMMS_MPI
  ARMCI_Group *tmpgrp;    /* only consulted under ENABLE_CHECKPOINT */
#endif
  Integer save_grp, parent;
  if (grp != -1 && !PGRP_LIST[grp].actv) {
    pnga_error(" Group is not active ", grp);
  }

  /*** Get next free process group handle ***/
  pgrp_handle = -1;
  for (i=0; i<_max_global_array; i++) {
    if (!PGRP_LIST[i].actv) { pgrp_handle = i; break; }
  }
  if (pgrp_handle == -1)
    pnga_error(" Too many process groups ", (Integer)_max_global_array);

  /* Allocate memory for arrays containing processor maps and initialize
     values (-1 marks "not a member") */
  PGRP_LIST[pgrp_handle].map_proc_list
      = (int*)malloc(GAnproc*sizeof(int)*2);
  PGRP_LIST[pgrp_handle].inv_map_proc_list
      = PGRP_LIST[pgrp_handle].map_proc_list + GAnproc;
  for (i=0; i<GAnproc; i++)
    PGRP_LIST[pgrp_handle].map_proc_list[i] = -1;
  for (i=0; i<GAnproc; i++)
    PGRP_LIST[pgrp_handle].inv_map_proc_list[i] = -1;
  if (grp != -1) {
    /* copy both rank maps from the source group */
    for (i=0; i<GAnproc; i++) {
      PGRP_LIST[pgrp_handle].map_proc_list[i]
          = PGRP_LIST[grp].map_proc_list[i];
      PGRP_LIST[pgrp_handle].inv_map_proc_list[i]
          = PGRP_LIST[grp].inv_map_proc_list[i];
    }
  } else {
    /* world group: identity maps */
    for (i=0; i<GAnproc; i++) {
      PGRP_LIST[pgrp_handle].map_proc_list[i] = i;
      PGRP_LIST[pgrp_handle].inv_map_proc_list[i] = i;
    }
  }

  /* BUG FIX: the original read PGRP_LIST[grp].map_nproc, .parent and
   * .inv_map_proc_list unconditionally, indexing PGRP_LIST[-1] (out of
   * bounds) when duplicating the world group. */
  if (grp != -1) {
    tmp_count = PGRP_LIST[grp].map_nproc;
    parent = PGRP_LIST[grp].parent;
    tmp2_list = PGRP_LIST[grp].inv_map_proc_list;
  } else {
    tmp_count = (int)GAnproc;
    parent = -1;
    tmp2_list = NULL;   /* unused on the world-group path below */
  }

  tmp_list = (int*)malloc(GAnproc*sizeof(int));
  /* temporarily adopt the source group's parent as the default so the
   * ARMCI group is created in the right context */
  save_grp = GA_Default_Proc_Group;
  GA_Default_Proc_Group = parent;
  if (grp != -1 && GA_Default_Proc_Group != -1) {
    /* translate member ranks via the parent group's map */
    int pgrp = (int)GA_Default_Proc_Group;
    for (i=0; i<tmp_count; i++) {
      tmp_list[i] = (int)PGRP_LIST[pgrp].map_proc_list[tmp2_list[i]];
    }
  } else if (grp != -1) {
    /* parent is the world group: use the source group's own map */
    for (i=0; i<tmp_count; i++) {
      tmp_list[i] = (int)PGRP_LIST[grp].map_proc_list[tmp2_list[i]];
    }
  } else {
    /* world group: everyone, in order */
    for (i=0; i<GAnproc; i++) {
      tmp_list[i] = (int)i;
    }
  }

  PGRP_LIST[pgrp_handle].map_nproc = tmp_count;
  PGRP_LIST[pgrp_handle].actv = 1;
  PGRP_LIST[pgrp_handle].parent = parent;
  PGRP_LIST[pgrp_handle].mirrored = 0;
#ifdef MSG_COMMS_MPI
  tmpgrp = &PGRP_LIST[pgrp_handle].group;
#if ENABLE_CHECKPOINT
  if(ga_group_is_for_ft)
    tmpgrp = ARMCI_Get_ft_group();
  else
#endif
  ARMCI_Group_create(tmp_count, tmp_list, &PGRP_LIST[pgrp_handle].group);
#endif

  GA_Default_Proc_Group = save_grp;
  /* Clean up temporary arrays */
  free(tmp_list);

#ifdef MSG_COMMS_MPI
  return pgrp_handle;
#else
  return pnga_pgroup_get_default();
#endif
}
1261
1262 /**
1263 * Create a duplicate of the group with only the calling processor in it
1264 */
1265 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1266 # pragma weak wnga_pgroup_self = pnga_pgroup_self
1267 #endif
1268
pnga_pgroup_self()1269 Integer pnga_pgroup_self()
1270 {
1271 Integer one = 1;
1272 Integer me = pnga_nodeid();
1273 return pnga_pgroup_create(&me,one);
1274 }
1275
1276 /**
1277 * Free up processor group handle for reuse
1278 */
1279 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1280 # pragma weak wnga_pgroup_destroy = pnga_pgroup_destroy
1281 #endif
1282
pnga_pgroup_destroy(Integer grp_id)1283 logical pnga_pgroup_destroy(Integer grp_id)
1284 {
1285 logical ret = TRUE;
1286 int i, ok;
1287
1288 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous sync masking*/
1289
1290 #ifdef MSG_COMMS_MPI
1291 ARMCI_Group_free(&PGRP_LIST[grp_id].group);
1292 #endif
1293 /* check to make sure there are no GAs that depend on this process group */
1294 i=0;
1295 ok = 1;
1296 do{
1297 if (GA[i].actv) {
1298 if(GA[i].p_handle == (int)grp_id && GA[i].actv) ok = 0;
1299 }
1300 i++;
1301 }while(i<_max_global_array && ok);
1302 if (!ok) pnga_error("Attempt to destroy process group with attached GAs",grp_id);
1303
1304 if (PGRP_LIST[grp_id].actv == 0) {
1305 ret = FALSE;
1306 }
1307 PGRP_LIST[grp_id].actv = 0;
1308
1309 /* Deallocate memory for lists */
1310 free(PGRP_LIST[grp_id].map_proc_list);
1311 return ret;
1312 }
1313
1314 /**
1315 * Simple function to recover handle of current default processor group
1316 */
1317 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1318 # pragma weak wnga_pgroup_get_default = pnga_pgroup_get_default
1319 #endif
1320
/* Return the handle of the current default processor group
 * (set by pnga_pgroup_set_default; -1 denotes the world group). */
Integer pnga_pgroup_get_default()
{
  return GA_Default_Proc_Group;
}
1325
1326 /**
1327 * Simple function to recover handle of mirror group
1328 */
1329 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1330 # pragma weak wnga_pgroup_get_mirror = pnga_pgroup_get_mirror
1331 #endif
1332
/* Return the handle of the mirror group (always slot 0 by convention). */
Integer pnga_pgroup_get_mirror()
{
  return 0;
}
1337
1338 /**
1339 * Simple function to recover handle of world group
1340 */
1341 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1342 # pragma weak wnga_pgroup_get_world = pnga_pgroup_get_world
1343 #endif
1344
/* Return the handle of the world group (-1 by convention). */
Integer pnga_pgroup_get_world()
{
  return -1;
}
1349
1350 /**
1351 * Create new process groups by splitting the group grp into grp_num new
1352 * groups. If mod(size(grp),grp_num) != 0, then one group consists of a smaller
1353 * number of processes than the others. The new group handle is returned by
1354 * the call.
1355 */
1356 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1357 # pragma weak wnga_pgroup_split = pnga_pgroup_split
1358 #endif
1359
/**
 * Split group grp into grp_num new groups of (nearly) equal size and
 * return the handle of the group containing the calling process. When
 * size(grp) is not divisible by grp_num, the last group is smaller.
 *
 * @param grp     group to split
 * @param grp_num number of new groups (0 returns grp unchanged; <0 errors)
 * @return handle of the calling process's new group
 */
Integer pnga_pgroup_split(Integer grp, Integer grp_num)
{
  Integer nprocs, me, default_grp;
  Integer ratio, start, end, grp_size;
  Integer i, icnt;
  Integer *nodes;
  Integer grp_id, ret=-1;

  /* BUG FIX: validate arguments *before* allocating — the original
   * malloc'ed `nodes` first and leaked it on the grp_num==0 early return. */
  if(grp_num<0) pnga_error("Invalid argument (number of groups < 0)",grp_num);
  if(grp_num==0) return grp;

  /* Allocate temporary array */
  nodes = (Integer*)malloc(GAnproc*sizeof(Integer));

  default_grp = pnga_pgroup_get_default();
  pnga_pgroup_set_default(grp);

#if 0 /* This is wrong. Should split only default group and not world group */
  world_grp = pnga_pgroup_get_world();
  pnga_pgroup_set_default(world_grp);
#endif
  nprocs = pnga_nnodes();
  me = pnga_nodeid();
  /* Figure out how big groups are (round up so grp_num groups suffice) */
  grp_size = nprocs/grp_num;
  if (nprocs > grp_size*grp_num) grp_size++;
  /* Figure out what procs are in my group */
  ratio = me/grp_size;
  start = ratio*grp_size;
  end = (ratio+1)*grp_size-1;
  end = GA_MIN(end,nprocs-1);
  if (end<start)
    pnga_error("Invalid proc range encountered",0);
  /* Every process must create *all* the groups (collective), keeping only
   * the handle of the one that contains it. */
  icnt = 0;
  for (i= 0; i<nprocs; i++) {
    if (icnt%grp_size == 0 && i>0) {
      grp_id = pnga_pgroup_create(nodes, grp_size);
      if (i == end + 1) {
        ret = grp_id;
      }
      icnt = 0;
    }
    nodes[icnt] = i;
    icnt++;
  }
  /* last (possibly short) group */
  grp_id = pnga_pgroup_create(nodes, icnt);
  if (end == nprocs-1) {
    ret = grp_id;
  }
  pnga_pgroup_set_default(default_grp);
  if(ret==-1) pnga_error("ga_pgroup_split failed",ret);
  /* Free temporary array */
  free(nodes);
  return ret;
}
1415
1416 /**
1417 * Split grp into multiple groups based on the color in mycolor. All processes
1418 * in grp with the same color are assigned to the same group.
1419 */
1420 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1421 # pragma weak wnga_pgroup_split_irreg = pnga_pgroup_split_irreg
1422 #endif
1423
/* Split grp by color: every process in grp that passes the same mycolor
 * value ends up in the same new group. Collective over grp; returns the
 * handle of the calling process's new group. */
Integer pnga_pgroup_split_irreg(Integer grp, Integer mycolor)
{
  Integer nprocs, my_rank, prev_default, new_grp;
  Integer p, nmembers;
  Integer *members, *colors;

  /* Allocate temporary arrays */
  members = (Integer*)malloc(GAnproc*sizeof(Integer));
  colors = (Integer*)malloc(GAnproc*sizeof(Integer));

  if(mycolor<0) pnga_error("Invalid argument (color < 0)",mycolor);

  prev_default = pnga_pgroup_get_default();
  pnga_pgroup_set_default(grp);
  nprocs = pnga_nnodes();
  my_rank = pnga_nodeid();

  /* Gather every rank's color: each process contributes its own slot,
   * then a global sum makes the full color vector visible everywhere. */
  for(p=0; p<nprocs; p++) colors[p] = 0;
  colors[my_rank] = mycolor;
  pnga_gop(pnga_type_f2c(MT_F_INT), colors, nprocs, "+");

  /* Collect the ranks that share my color */
  nmembers = 0;
  for(p=0; p<nprocs; p++) {
    if(colors[p] == mycolor) {
      members[nmembers] = p;
      nmembers++;
    }
  }

  new_grp = pnga_pgroup_create(members, nmembers);

  pnga_pgroup_set_default(prev_default);

  /* Free temporary arrays */
  free(members);
  free(colors);

  return new_grp;
}
1465
1466 #ifdef MSG_COMMS_MPI
/* Return a pointer to the ARMCI group backing GA process group grp_id.
 * MPI builds only (enclosing #ifdef MSG_COMMS_MPI). */
ARMCI_Group* ga_get_armci_group_(int grp_id)
{
  return &PGRP_LIST[grp_id].group;
}
1471 #endif
1472
1473 /**
1474 * Return a new global array handle
1475 */
1476 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1477 # pragma weak wnga_create_handle = pnga_create_handle
1478 #endif
1479
pnga_create_handle()1480 Integer pnga_create_handle()
1481 {
1482 Integer ga_handle, i, g_a;
1483 /*** Get next free global array handle ***/
1484 ga_handle =-1; i=0;
1485 do{
1486 if(!GA[i].actv_handle) ga_handle=i;
1487 i++;
1488 }while(i<_max_global_array && ga_handle==-1);
1489 if( ga_handle == -1)
1490 pnga_error(" too many arrays ", (Integer)_max_global_array);
1491 g_a = (Integer)ga_handle - GA_OFFSET;
1492
1493 /*** fill in Global Info Record for g_a ***/
1494 gai_init_struct(ga_handle);
1495 GA[ga_handle].p_handle = GA_Init_Proc_Group;
1496 GA[ga_handle].ndim = -1;
1497 GA[ga_handle].name[0] = '\0';
1498 GA[ga_handle].mapc = NULL;
1499 GA[ga_handle].irreg = 0;
1500 GA[ga_handle].ghosts = 0;
1501 GA[ga_handle].corner_flag = -1;
1502 GA[ga_handle].cache = NULL;
1503 GA[ga_handle].distr_type = REGULAR;
1504 GA[ga_handle].block_total = -1;
1505 GA[ga_handle].rstrctd_list = NULL;
1506 GA[ga_handle].rank_rstrctd = NULL;
1507 GA[ga_handle].num_rstrctd = 0; /* This is also used as a flag for */
1508 /* restricted arrays. If it is zero, */
1509 /* then array is not restricted. */
1510 GA[ga_handle].actv_handle = 1;
1511 GA[ga_handle].has_data = 1;
1512 GA[ga_handle].property = NO_PROPERTY;
1513 return g_a;
1514 }
1515
1516 /**
1517 * Set the dimensions and data type on a new global array handle
1518 */
1519 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1520 # pragma weak wnga_set_data = pnga_set_data
1521 #endif
1522
/* Record the rank, extents and element type of a not-yet-allocated
 * global array; chunk and ghost widths are reset to defaults. */
void pnga_set_data(Integer g_a, Integer ndim, Integer *dims, Integer type)
{
  Integer d;
  Integer handle = g_a + GA_OFFSET;
  int c_type;

  if (GA[handle].actv == 1)
    pnga_error("Cannot set data on array that has been allocated",0);
  gam_checkdim(ndim, dims);
  /* convert the Fortran type code once, validate it, then store it */
  c_type = pnga_type_f2c((int)(type));
  gam_checktype(c_type);

  GA[handle].type = c_type;
  GA[handle].elemsize = GAsizeofM(GA[handle].type);

  for (d = 0; d < ndim; d++) {
    GA[handle].dims[d] = (C_Integer)dims[d];
    GA[handle].chunk[d] = 0;   /* default chunking */
    GA[handle].width[d] = 0;   /* no ghost cells */
  }
  GA[handle].ndim = (int)(ndim);
}
1542
1543 /**
1544 * Set the chunk array on a new global array
1545 */
1546 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1547 # pragma weak wnga_set_chunk = pnga_set_chunk
1548 #endif
1549
/* Record the minimum block (chunk) sizes for a not-yet-allocated array.
 * A NULL chunk pointer leaves the defaults in place. */
void pnga_set_chunk(Integer g_a, Integer *chunk)
{
  Integer handle = g_a + GA_OFFSET;

  if (GA[handle].actv == 1)
    pnga_error("Cannot set chunk on array that has been allocated",0);
  if (GA[handle].ndim < 1)
    pnga_error("Dimensions must be set before chunk array is specified",0);
  if (chunk != NULL) {
    Integer d;
    for (d = 0; d < GA[handle].ndim; d++)
      GA[handle].chunk[d] = (C_Integer)chunk[d];
  }
}
1564
1565 /**
1566 * Set the array name on a new global array
1567 */
1568 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1569 # pragma weak wnga_set_array_name = pnga_set_array_name
1570 #endif
1571
/* Store a user-supplied name on a not-yet-allocated array.
 * Names longer than FNAM characters are rejected. */
void pnga_set_array_name(Integer g_a, char *array_name)
{
  Integer handle = g_a + GA_OFFSET;

  if (GA[handle].actv == 1)
    pnga_error("Cannot set array name on array that has been allocated",0);
  if (strlen(array_name) > FNAM)
    pnga_error("Array name exceeds maximum array name length",FNAM);
  /* fits (<= FNAM chars + NUL) in the fixed-size name field */
  strcpy(GA[handle].name, array_name);
}
1581
1582 /**
1583 * Get the array name of a global array
1584 */
1585 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1586 # pragma weak wnga_get_array_name = pnga_get_array_name
1587 #endif
1588
/**
 * Copy the stored name of global array g_a into the caller's buffer.
 *
 * @param g_a        global array handle
 * @param array_name output buffer; must hold at least FNAM+1 bytes
 */
void pnga_get_array_name(Integer g_a, char *array_name)
{
  Integer ga_handle = g_a + GA_OFFSET;
  /* NOTE(review): condition and message disagree — this errors when actv==1
   * (allocated) while the message says "has not been allocated". Preserved
   * as-is since callers may rely on it; confirm intended semantics. */
  if (GA[ga_handle].actv == 1)
    pnga_error("Cannot get array name on array that has not been allocated",0);
  /* BUG FIX: the original measured strlen(array_name) — the caller's
   * possibly-uninitialized *output* buffer — instead of the stored name. */
  if (strlen(GA[ga_handle].name) > FNAM)
    pnga_error("Array name exceeds maximum array name length",FNAM);
  strcpy(array_name, GA[ga_handle].name);
}
1598
1599 /**
1600 * Set the processor group on a new global array
1601 */
1602 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1603 # pragma weak wnga_set_pgroup = pnga_set_pgroup
1604 #endif
1605
/* Attach a processor group to a not-yet-allocated array. The group must
 * be the world group or an active entry in PGRP_LIST. */
void pnga_set_pgroup(Integer g_a, Integer p_handle)
{
  Integer handle = g_a + GA_OFFSET;
  int group_ok;

  if (GA[handle].actv == 1)
    pnga_error("Cannot set processor configuration on array that has been allocated",0);
  /* short-circuit keeps the world group from indexing PGRP_LIST */
  group_ok = (p_handle == GA_World_Proc_Group)
          || (PGRP_LIST[p_handle].actv == 1);
  if (!group_ok)
    pnga_error("Processor group does not exist",0);
  GA[handle].p_handle = (int)(p_handle);
}
1617
1618 /**
1619 * Get the processor group handle associated with g_a
1620 */
1621 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1622 # pragma weak wnga_get_pgroup = pnga_get_pgroup
1623 #endif
1624
/* Return the processor group handle attached to global array g_a. */
Integer pnga_get_pgroup(Integer g_a)
{
  Integer ga_handle = g_a + GA_OFFSET;
  return (Integer)GA[ga_handle].p_handle;
}
1630
1631 /**
1632 * Return the number of processors associated with a processor group
1633 */
1634 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1635 # pragma weak wnga_get_pgroup_size = pnga_get_pgroup_size
1636 #endif
1637
/* Return the number of processes in group grp_id.
 * NOTE(review): handles <= 0 (world group -1, and slot 0) fall through to
 * the full machine size — presumably slot 0 is the reserved mirror group;
 * confirm. */
Integer pnga_get_pgroup_size(Integer grp_id)
{
  int h = (int)(grp_id);
  return (h > 0) ? (Integer)PGRP_LIST[h].map_nproc : GAnproc;
}
1647
1648 /**
1649 * Add ghost cells to a new global array
1650 */
1651 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1652 # pragma weak wnga_set_ghosts = pnga_set_ghosts
1653 #endif
1654
/* Record ghost-cell widths for a not-yet-allocated array. All widths are
 * validated (0 <= width[d] <= dims[d]) before any state is modified; any
 * positive width turns the ghosts flag on. */
void pnga_set_ghosts(Integer g_a, Integer *width)
{
  Integer d;
  Integer handle = g_a + GA_OFFSET;
  int rank;

  if (GA[handle].actv == 1)
    pnga_error("Cannot set ghost widths on array that has been allocated",0);
  if (GA[handle].ndim < 1)
    pnga_error("Dimensions must be set before array widths are specified",0);
  rank = GA[handle].ndim;

  /* validate everything before mutating anything */
  for (d = 0; d < rank; d++) {
    if ((C_Integer)width[d] > GA[handle].dims[d])
      pnga_error("Boundary width must be <= corresponding dimension",d);
    if ((C_Integer)width[d] < 0)
      pnga_error("Boundary width must be >= 0",d);
  }

  for (d = 0; d < rank; d++) {
    GA[handle].width[d] = (C_Integer)width[d];
    if (width[d] > 0) GA[handle].ghosts = 1;
  }
}
1674
1675 /**
1676 * Set irregular distribution in a new global array
1677 */
1678 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1679 # pragma weak wnga_set_irreg_distr = pnga_set_irreg_distr
1680 #endif
1681
/**
 * Record an irregular block distribution for a not-yet-allocated array.
 *
 * @param g_a    global array handle
 * @param mapc   concatenated per-dimension lists of block starting indices
 *               (1-based, monotonic non-decreasing within each dimension)
 * @param nblock number of blocks along each dimension
 */
void pnga_set_irreg_distr(Integer g_a, Integer *mapc, Integer *nblock)
{
  Integer i, j, ichk, maplen;
  Integer ga_handle = g_a + GA_OFFSET;
  if (GA[ga_handle].actv == 1)
    pnga_error("Cannot set irregular data distribution on array that has been allocated",0);
  if (GA[ga_handle].ndim < 1)
    pnga_error("Dimensions must be set before irregular distribution is specified",0);
  for (i=0; i<GA[ga_handle].ndim; i++)
    if ((C_Integer)nblock[i] > GA[ga_handle].dims[i])
      pnga_error("number of blocks must be <= corresponding dimension",i);
  /* Check that mapc is sensible: every entry in [1, dims[i]] and each
   * dimension's entries monotonically non-decreasing */
  maplen = 0;
  for (i=0; i<GA[ga_handle].ndim; i++) {
    ichk = mapc[maplen];
    if (ichk < 1 || ichk > GA[ga_handle].dims[i])
      pnga_error("Mapc entry outside array dimension limits",ichk);
    maplen++;
    for (j=1; j<nblock[i]; j++) {
      if (mapc[maplen] < ichk) {
        pnga_error("Mapc entries are not properly monotonic",ichk);
      }
      ichk = mapc[maplen];
      if (ichk < 1 || ichk > GA[ga_handle].dims[i])
        pnga_error("Mapc entry outside array dimension limits",ichk);
      maplen++;
    }
  }

  /* store the block counts and copy the map, -1 terminated */
  maplen = 0;
  for (i=0; i<GA[ga_handle].ndim; i++) {
    maplen += nblock[i];
    GA[ga_handle].nblock[i] = (C_Integer)nblock[i];
  }
  /* BUG FIX: was sizeof(C_Integer*) — the elements are C_Integer values,
   * not pointers (harmless over-allocation on LP64, still wrong). */
  GA[ga_handle].mapc = (C_Integer*)malloc((maplen+1)*sizeof(C_Integer));
  for (i=0; i<maplen; i++) {
    GA[ga_handle].mapc[i] = (C_Integer)mapc[i];
  }
  GA[ga_handle].mapc[maplen] = -1;
  GA[ga_handle].irreg = 1;
}
1723
1724 /**
 * Override the irregular data distribution flag on a new global array
1726 */
1727 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1728 # pragma weak wnga_set_irreg_flag = pnga_set_irreg_flag
1729 #endif
1730
/* Force the irregular-distribution flag on array g_a to the given value
 * (no validation; overrides whatever earlier set_* calls recorded). */
void pnga_set_irreg_flag(Integer g_a, logical flag)
{
  Integer ga_handle = g_a + GA_OFFSET;
  GA[ga_handle].irreg = (int)(flag);
}
1736
1737 /**
1738 * Get dimension on a new global array
1739 */
1740 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1741 # pragma weak wnga_get_dimension = pnga_get_dimension
1742 #endif
1743
/* Return the rank (number of dimensions) of array g_a; -1 if the
 * dimensions have not been set yet. */
Integer pnga_get_dimension(Integer g_a)
{
  Integer ga_handle = g_a + GA_OFFSET;
  return (Integer)GA[ga_handle].ndim;
}
1749
1750 /**
1751 * Use a simple block-cyclic data distribution for array
1752 */
1753 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1754 # pragma weak wnga_set_block_cyclic = pnga_set_block_cyclic
1755 #endif
1756
/* Select a simple block-cyclic distribution with the given block sizes
 * for a not-yet-allocated array, and precompute the per-dimension and
 * total block counts. */
void pnga_set_block_cyclic(Integer g_a, Integer *dims)
{
  Integer d, nblk, total;
  Integer handle = g_a + GA_OFFSET;

  if (GA[handle].actv == 1)
    pnga_error("Cannot set block-cyclic data distribution on array that has been allocated",0);
  if (!(GA[handle].ndim > 0))
    pnga_error("Cannot set block-cyclic data distribution if array size not set",0);
  if (GA[handle].distr_type != REGULAR)
    pnga_error("Cannot reset block-cyclic data distribution on array that has been set",0);
  GA[handle].distr_type = BLOCK_CYCLIC;

  /* per-dimension block count = ceil(dims/block), accumulating the total */
  total = 1;
  for (d = 0; d < GA[handle].ndim; d++) {
    if (dims[d] < 1)
      pnga_error("Block dimensions must all be greater than zero",0);
    GA[handle].block_dims[d] = dims[d];
    nblk = GA[handle].dims[d]/dims[d];
    if (GA[handle].dims[d]%dims[d] != 0) nblk++;
    GA[handle].num_blocks[d] = nblk;
    total *= nblk;
  }
  GA[handle].block_total = total;
}
1783
1784 /**
1785 * Use block-cyclic data distribution with ScaLAPACK-type proc grid for array
1786 */
1787 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1788 # pragma weak wnga_set_block_cyclic_proc_grid = pnga_set_block_cyclic_proc_grid
1789 #endif
1790
/* Select a ScaLAPACK-style block-cyclic distribution: block sizes plus an
 * explicit processor grid whose volume must equal the machine size. */
void pnga_set_block_cyclic_proc_grid(Integer g_a, Integer *dims, Integer *proc_grid)
{
  Integer d, nblk, total, grid_size;
  Integer handle = g_a + GA_OFFSET;

  if (GA[handle].actv == 1)
    pnga_error("Cannot set block-cyclic data distribution on array that has been allocated",0);
  if (!(GA[handle].ndim > 0))
    pnga_error("Cannot set block-cyclic data distribution if array size not set",0);
  if (GA[handle].distr_type != REGULAR)
    pnga_error("Cannot reset block-cyclic data distribution on array that has been set",0);
  GA[handle].distr_type = SCALAPACK;

  /* the processor grid volume must match the number of processors */
  grid_size = 1;
  for (d = 0; d < GA[handle].ndim; d++) {
    if (proc_grid[d] < 1)
      pnga_error("Processor grid dimensions must all be greater than zero",0);
    GA[handle].nblock[d] = proc_grid[d];
    grid_size *= proc_grid[d];
  }
  if (grid_size != GAnproc)
    pnga_error("Number of processors in processor grid must equal available processors",0);

  /* per-dimension block count = ceil(dims/block), accumulating the total */
  total = 1;
  for (d = 0; d < GA[handle].ndim; d++) {
    if (dims[d] < 1)
      pnga_error("Block dimensions must all be greater than zero",0);
    GA[handle].block_dims[d] = dims[d];
    nblk = GA[handle].dims[d]/dims[d];
    if (GA[handle].dims[d]%dims[d] != 0) nblk++;
    GA[handle].num_blocks[d] = nblk;
    total *= nblk;
  }
  GA[handle].block_total = total;
}
1827
1828 /**
1829 * Use block-cyclic data distribution with ScaLAPACK-type proc grid for array
1830 */
1831 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1832 # pragma weak wnga_set_tiled_proc_grid = pnga_set_tiled_proc_grid
1833 #endif
1834
/* Select a tiled distribution: block (tile) sizes plus an explicit
 * processor grid whose volume must equal the machine size. */
void pnga_set_tiled_proc_grid(Integer g_a, Integer *dims, Integer *proc_grid)
{
  Integer d, nblk, total, grid_size;
  Integer handle = g_a + GA_OFFSET;

  if (GA[handle].actv == 1)
    pnga_error("Cannot set tiled data distribution on array that has been allocated",0);
  if (!(GA[handle].ndim > 0))
    pnga_error("Cannot set tiled data distribution if array size not set",0);
  if (GA[handle].distr_type != REGULAR)
    pnga_error("Cannot reset tiled data distribution on array that has been set",0);
  GA[handle].distr_type = TILED;

  /* the processor grid volume must match the number of processors */
  grid_size = 1;
  for (d = 0; d < GA[handle].ndim; d++) {
    if (proc_grid[d] < 1)
      pnga_error("Processor grid dimensions must all be greater than zero",0);
    GA[handle].nblock[d] = proc_grid[d];
    grid_size *= proc_grid[d];
  }
  if (grid_size != GAnproc)
    pnga_error("Number of processors in processor grid must equal available processors",0);

  /* per-dimension block count = ceil(dims/block), accumulating the total */
  total = 1;
  for (d = 0; d < GA[handle].ndim; d++) {
    if (dims[d] < 1)
      pnga_error("Block dimensions must all be greater than zero",0);
    GA[handle].block_dims[d] = dims[d];
    nblk = GA[handle].dims[d]/dims[d];
    if (GA[handle].dims[d]%dims[d] != 0) nblk++;
    GA[handle].num_blocks[d] = nblk;
    total *= nblk;
  }
  GA[handle].block_total = total;
}
1871
1872 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1873 # pragma weak wnga_set_tiled_irreg_proc_grid = pnga_set_tiled_irreg_proc_grid
1874 #endif
1875
/**
 * Record an irregular tiled distribution: a concatenated block-start map
 * (as in pnga_set_irreg_distr) plus an explicit processor grid whose
 * volume must equal the machine size.
 *
 * @param g_a       global array handle (must not be allocated yet)
 * @param mapc      concatenated per-dimension lists of block starting
 *                  indices (1-based, monotonic non-decreasing)
 * @param nblocks   number of blocks along each dimension
 * @param proc_grid processor grid extents
 */
void pnga_set_tiled_irreg_proc_grid(Integer g_a, Integer *mapc, Integer *nblocks,
                                    Integer *proc_grid)
{
  Integer i, j, ichk, maplen, tot, jsize;
  Integer ga_handle = g_a + GA_OFFSET;
  if (GA[ga_handle].actv == 1)
    pnga_error("Cannot set irregular tiled data distribution on array"
        " that has been allocated",0);
  if (!(GA[ga_handle].ndim > 0))
    pnga_error("Cannot set irregular tiled data distribution if array size not set",0);
  if (GA[ga_handle].ndim < 1)
    pnga_error("Dimensions must be set before irregular distribution is specified",0);
  for (i=0; i<GA[ga_handle].ndim; i++)
    if ((C_Integer)nblocks[i] > GA[ga_handle].dims[i])
      pnga_error("number of blocks must be <= corresponding dimension",i);
  if (GA[ga_handle].distr_type != REGULAR)
    pnga_error("Cannot reset irregular tiled data distribution on array that has been set",0);
  GA[ga_handle].distr_type = TILED_IRREG;
  /* Check that mapc is sensible: every entry in [1, dims[i]] and each
   * dimension's entries monotonically non-decreasing */
  maplen = 0;
  for (i=0; i<GA[ga_handle].ndim; i++) {
    ichk = mapc[maplen];
    if (ichk < 1 || ichk > GA[ga_handle].dims[i])
      pnga_error("Mapc entry outside array dimension limits",ichk);
    maplen++;
    for (j=1; j<nblocks[i]; j++) {
      if (mapc[maplen] < ichk) {
        pnga_error("Mapc entries are not properly monotonic",ichk);
      }
      ichk = mapc[maplen];
      if (ichk < 1 || ichk > GA[ga_handle].dims[i])
        pnga_error("Mapc entry outside array dimension limits",ichk);
      maplen++;
    }
  }

  /* store the block counts and copy the map, -1 terminated */
  maplen = 0;
  for (i=0; i<GA[ga_handle].ndim; i++) {
    maplen += nblocks[i];
    GA[ga_handle].num_blocks[i] = (C_Integer)nblocks[i];
  }
  /* BUG FIX: was sizeof(C_Integer*) — the elements are C_Integer values,
   * not pointers (harmless over-allocation on LP64, still wrong). */
  GA[ga_handle].mapc = (C_Integer*)malloc((maplen+1)*sizeof(C_Integer));
  for (i=0; i<maplen; i++) {
    GA[ga_handle].mapc[i] = (C_Integer)mapc[i];
  }
  GA[ga_handle].mapc[maplen] = -1;
  GA[ga_handle].irreg = 1;

  /* the processor grid volume must match the number of processors */
  tot = 1;
  for (i=0; i<GA[ga_handle].ndim; i++) {
    if (proc_grid[i] < 1)
      pnga_error("Processor grid dimensions must all be greater than zero",0);
    GA[ga_handle].nblock[i] = proc_grid[i];
    tot *= proc_grid[i];
  }
  if (tot != GAnproc)
    pnga_error("Number of processors in processor grid must equal available processors",0);
  /* Find total number of blocks */
  jsize = 1;
  for (i=0; i<GA[ga_handle].ndim; i++) {
    jsize *= GA[ga_handle].num_blocks[i];
  }
  GA[ga_handle].block_total = jsize;
}
1941
1942 /**
1943 * Restrict processors that actually contain data in the global array. Can also
1944 * be used to rearrange the distribution of data amongst processors
1945 */
1946 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1947 # pragma weak wnga_set_restricted = pnga_set_restricted
1948 #endif
1949
/**
 * Restrict the processors that hold data for g_a (or rearrange data among
 * them): `list` gives the `size` group-relative ranks that will own data.
 * Builds rstrctd_list (slot -> rank) and rank_rstrctd (rank -> slot, -1
 * for non-members), and records whether the calling process owns data.
 *
 * @param g_a  global array handle
 * @param list ranks (relative to the array's group) that hold data
 * @param size number of entries in list
 */
void pnga_set_restricted(Integer g_a, Integer *list, Integer size)
{
  Integer i, ig, id=0, me, p_handle, has_data, nproc;
  Integer ga_handle = g_a + GA_OFFSET;
  /* resolve which group the ranks in list are relative to */
  p_handle = GA[ga_handle].p_handle;
  if (p_handle == -2) p_handle = pnga_pgroup_get_default();
  if (p_handle > 0) {
    me = PGRP_LIST[p_handle].map_proc_list[GAme];
    nproc = PGRP_LIST[p_handle].map_nproc;
  } else {
    me = GAme;
    nproc = GAnproc;
  }

  /* FIX: validate the whole list *before* recording any of it (the
   * original stored entries and matched `me` before the range check). */
  for (i=0; i<size; i++) {
    if (list[i] < 0 || list[i] >= nproc)
      pnga_error("Invalid processor in list",list[i]);
  }

  GA[ga_handle].num_rstrctd = size;
  GA[ga_handle].rstrctd_list = (Integer*)malloc(size*sizeof(Integer));
  GA[ga_handle].rank_rstrctd = (Integer*)malloc((GAnproc)*sizeof(Integer));
  has_data = 0;

  for (i=0; i<GAnproc; i++) {
    GA[ga_handle].rank_rstrctd[i] = -1;
  }

  for (i=0; i<size; i++) {
    GA[ga_handle].rstrctd_list[i] = list[i];
    /* does the calling process own data, and at which slot? */
    if (me == list[i]) {
      has_data = 1;
      id = i;
    }
    ig = list[i];
    GA[ga_handle].rank_rstrctd[ig] = i;
  }
  GA[ga_handle].has_data = has_data;
  GA[ga_handle].rstrctd_id = id;
}
1988
1989 /**
1990 * Restrict processors that actually contain data in the global array
1991 * by specifying a range of processors
1992 */
1993 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
1994 # pragma weak wnga_set_restricted_range = pnga_set_restricted_range
1995 #endif
1996
/**
 * Restrict the processors that hold data for g_a to the contiguous range
 * [lo_proc, hi_proc] of group-relative ranks.
 *
 * @param g_a     global array handle
 * @param lo_proc first rank holding data (inclusive)
 * @param hi_proc last rank holding data (inclusive, >= lo_proc)
 */
void pnga_set_restricted_range(Integer g_a, Integer lo_proc, Integer hi_proc)
{
  Integer i, id=0, me, p_handle, has_data, icnt, nproc, size;
  Integer ga_handle = g_a + GA_OFFSET;
  /* resolve which group the ranks are relative to */
  p_handle = GA[ga_handle].p_handle;
  if (p_handle == -2) p_handle = pnga_pgroup_get_default();
  if (p_handle > 0) {
    me = PGRP_LIST[p_handle].map_proc_list[GAme];
    nproc = PGRP_LIST[p_handle].map_nproc;
  } else {
    me = GAme;
    nproc = GAnproc;
  }

  /* FIX: validate the range before sizing the allocation (the original
   * never checked lo_proc <= hi_proc, so an inverted range produced a
   * negative size passed to malloc). */
  if (lo_proc < 0 || hi_proc >= nproc || lo_proc > hi_proc)
    pnga_error("Invalid processor range",lo_proc);

  size = hi_proc - lo_proc + 1;
  GA[ga_handle].num_rstrctd = size;
  GA[ga_handle].rstrctd_list = (Integer*)malloc((size)*sizeof(Integer));
  GA[ga_handle].rank_rstrctd = (Integer*)malloc((GAnproc)*sizeof(Integer));
  has_data = 0;

  for (i=0; i<GAnproc; i++) {
    GA[ga_handle].rank_rstrctd[i] = -1;
  }

  icnt = 0;
  for (i=lo_proc; i<=hi_proc; i++) {
    GA[ga_handle].rstrctd_list[icnt] = i;
    /* does the calling process own data, and at which slot? */
    if (me == i) {
      has_data = 1;
      id = icnt;
    }
    GA[ga_handle].rank_rstrctd[i] = icnt;
    icnt++;
  }
  GA[ga_handle].has_data = has_data;
  GA[ga_handle].rstrctd_id = id;
}
2038
2039 /**
2040 * Set the property on a global array.
2041 */
2042 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2043 # pragma weak wnga_set_property = pnga_set_property
2044 #endif
void pnga_set_property(Integer g_a, char* property) {
  /* Attach a named property to global array g_a.  Collective call.
   * Supported property strings: "read_only" (redistribute the array so the
   * data is held on a per-node processor group) and "read_cache" (enable a
   * local read cache).  Errors out if a property is already set or the
   * property name is unknown. */
  Integer ga_handle = g_a + GA_OFFSET;
  _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous sync masking*/
  pnga_pgroup_sync(GA[ga_handle].p_handle);
  /* Check to see if property conflicts with properties already set on the
   * global array. This check may need more refinement as additional properties
   * are added. */
  if (GA[ga_handle].property != NO_PROPERTY) {
    pnga_error("Cannot set property on an array that already has property set",0);
  }
  if (strcmp(property,"read_only")==0) {
    /* TODO: copy global array to new configuration */
    int i, d, ndim, btot, chk;
    Integer nprocs, nodeid, origin_id, dflt_grp, handle, maplen;
    Integer nelem, mem_size, status, grp_me, g_tmp, me_local;
    Integer *list;
    Integer blk[MAXDIM], dims[MAXDIM], pe[MAXDIM], chunk[MAXDIM];
    Integer lo[MAXDIM], hi[MAXDIM], ld[MAXDIM];
    Integer *pmap[MAXDIM], *map;
    /* NOTE(review): me_local is never used, and buf is only referenced by
     * the disabled #else branch of the copy-back section below. */
    char *buf;
    /* READ_ONLY is only implemented for regular distributions on the
     * world group */
    if (GA[ga_handle].distr_type != REGULAR) {
      pnga_error("Block-cyclic arrays not supported for READ_ONLY",0);
    }
    if (GA[ga_handle].p_handle != pnga_pgroup_get_world()) {
      pnga_error("Arrays on subgroups not supported for READ_ONLY",0);
    }
    /* Save the current distribution parameters (nblock, lo, chunk, mapc)
     * so pnga_unset_property can restore them later. */
    ndim = (int)GA[ga_handle].ndim;
    btot = 0;
    for (i=0; i<ndim; i++) {
      GA[ga_handle].old_nblock[i] = GA[ga_handle].nblock[i];
      GA[ga_handle].old_lo[i] = GA[ga_handle].lo[i];
      GA[ga_handle].old_chunk[i] = GA[ga_handle].chunk[i];
      btot += GA[ga_handle].nblock[i];
    }
    GA[ga_handle].old_mapc = (Integer*)malloc((btot+1)*sizeof(Integer));
    for (i=0; i<btot+1; i++) {
      GA[ga_handle].old_mapc[i] = GA[ga_handle].mapc[i];
    }
    /* Make a temporary copy of GA */
    g_tmp = pnga_create_handle();
    pnga_set_data(g_tmp,ndim,GA[ga_handle].dims,GA[ga_handle].type);
    pnga_set_pgroup(g_tmp,GA[ga_handle].p_handle);
    if (!pnga_allocate(g_tmp)) {
      pnga_error("Failed to allocate temporary array",0);
    }
    pnga_copy(g_a, g_tmp);
    /* Create a group containing all the processors on this node */
    GA[ga_handle].old_handle = GA[ga_handle].p_handle;
    nodeid = pnga_cluster_nodeid();
    nprocs = pnga_cluster_nprocs(nodeid);
    origin_id = pnga_cluster_procid(nodeid,0);
    dflt_grp = pnga_pgroup_get_default();
    /* NOTE(review): the default group is switched to the world group here
     * but never restored to dflt_grp before returning — confirm whether
     * that is intentional. */
    pnga_pgroup_set_default(pnga_pgroup_get_world());
    list = (Integer*)malloc(nprocs*sizeof(Integer));
    for (i=0; i<nprocs; i++) {
      list[i] = origin_id+i;
    }
    handle = pnga_pgroup_create(list, nprocs);
    free(list);
    GA[ga_handle].p_handle = handle;
    GA[ga_handle].property = READ_ONLY;

    /* Ignore hints on data distribution (chunk) and just go with default
     * distribution on the node, except if chunk dimension is same as array
     * dimension (no partitioning on that axis) */
    for (i=0; i<ndim; i++) {
      /* eliminate dimension=1 from analysis, otherwise set blk to -1*/
      if (GA[ga_handle].chunk[i] == GA[ga_handle].dims[i]) {
        chunk[i] = GA[ga_handle].chunk[i];
      } else {
        chunk[i] = -1;
      }
      dims[i] = GA[ga_handle].dims[i];
    }
    if (chunk[0] != 0)
      for (d=0; d<ndim; d++) blk[d]=(Integer)GA_MIN(chunk[d],dims[d]);
    else
      for (d=0; d<ndim; d++) blk[d]=-1;
    for (d=0; d<ndim; d++) if (dims[d]==1) blk[d]=1;
    /* Compute a processor grid pe[] for the node group */
    ddb_h2(ndim, dims, PGRP_LIST[handle].map_nproc,0.0,(Integer)0,blk,pe);
    /* Build the block boundary map (mapALL) dimension by dimension */
    for(d=0, map=mapALL; d< ndim; d++){
      Integer nblock;
      Integer pcut; /* # procs that get full blk[] elements; the rest gets less*/
      int p;

      pmap[d] = map;

      /* RJH ... don't leave some nodes without data if possible
       * but respect the users block size */

      if (chunk[d] > 1) {
        Integer ddim = ((dims[d]-1)/GA_MIN(chunk[d],dims[d]) + 1);
        pcut = (ddim -(blk[d]-1)*pe[d]) ;
      } else {
        pcut = (dims[d]-(blk[d]-1)*pe[d]) ;
      }

      for (nblock=i=p=0; (p<pe[d]) && (i<dims[d]); p++, nblock++) {
        Integer b = blk[d];
        if (p >= pcut)
          b = b-1;
        map[nblock] = i+1;
        if (chunk[d]>1) b *= GA_MIN(chunk[d],dims[d]);
        i += b;
      }

      pe[d] = GA_MIN(pe[d],nblock);
      map += pe[d];
    }
    /* Install the new map in place of the old one */
    maplen = 0;
    for( i = 0; i< ndim; i++){
      GA[ga_handle].nblock[i] = pe[i];
      maplen += pe[i];
    }
    free(GA[ga_handle].mapc);
    GA[ga_handle].mapc = (Integer*)malloc((maplen+1)*sizeof(Integer));
    for(i = 0; i< maplen; i++) {
      GA[ga_handle].mapc[i] = (C_Integer)mapALL[i];
    }
    GA[ga_handle].mapc[maplen] = -1;
    /* Set remaining paramters and determine memory size if regular data
     * distribution is being used */

    for( i = 0; i< ndim; i++){
      GA[ga_handle].scale[i] = (double)GA[ga_handle].nblock[i]
                             / (double)GA[ga_handle].dims[i];
    }

    /*** determine which portion of the array I am supposed
     * to hold ***/
    pnga_distribution(g_a, GAme, GA[ga_handle].lo, hi);
    chk = 1;
    for( i = 0, nelem=1; i< ndim; i++){
      if (hi[i]-(Integer)GA[ga_handle].lo[i]+1 <= 0) chk = 0;
      nelem *= (hi[i]-(Integer)GA[ga_handle].lo[i]+1);
    }
    mem_size = nelem * GA[ga_handle].elemsize;
    if (!chk) mem_size = 0;       /* empty patch on this process */
    grp_me = pnga_pgroup_nodeid(handle);
    /* Clean up old memory first */
#ifndef AVOID_MA_STORAGE
    if(gai_uses_shm((int)handle)){
#endif
      /* make sure that we free original (before address alignment)
       * pointer */
#ifdef MSG_COMMS_MPI
      if (GA[ga_handle].old_handle > 0){
        ARMCI_Free_group(
            GA[ga_handle].ptr[pnga_pgroup_nodeid(GA[ga_handle].old_handle)]
            - GA[ga_handle].id,
            &PGRP_LIST[GA[ga_handle].old_handle].group);
      }
      else
#endif
      {
        ARMCI_Free(
            GA[ga_handle].ptr[pnga_pgroup_nodeid(GA[ga_handle].old_handle)]
            - GA[ga_handle].id);
      }
#ifndef AVOID_MA_STORAGE
    }else{
      if(GA[ga_handle].id != INVALID_MA_HANDLE) MA_free_heap(GA[ga_handle].id);
    }
#endif

    /* Update the memory accounting: credit the old size, debit the new */
    if(GA_memory_limited) GA_total_memory += GA[ga_handle].size;
    GAstat.curmem -= GA[ga_handle].size;
    /* if requested, enforce limits on memory consumption */
    if(GA_memory_limited) GA_total_memory -= mem_size;
    GAstat.curmem += mem_size;
    /* check if everybody has enough memory left */
    if(GA_memory_limited){
      status = (GA_total_memory >= 0) ? 1 : 0;
      pnga_pgroup_gop(handle,pnga_type_f2c(MT_F_INT), &status, 1, "&&");
    } else status = 1;
    /* allocate memory */
    if (status) {
      /* Allocate new memory */
      if (GA[ga_handle].mem_dev_set) {
        status = !gai_get_devmem(GA[ga_handle].name, GA[ga_handle].ptr,mem_size,
            GA[ga_handle].type, &GA[ga_handle].id, handle,
            GA[ga_handle].mem_dev_set,GA[ga_handle].mem_dev);
      } else {
        status = !gai_getmem(GA[ga_handle].name, GA[ga_handle].ptr,mem_size,
            GA[ga_handle].type, &GA[ga_handle].id, handle);
      }
    } else {
      GA[ga_handle].ptr[grp_me]=NULL;
    }
    GA[ga_handle].size = (C_Long)mem_size;
    if (!status) {
      pnga_error("Memory failure when unsetting READ_ONLY",0);
    }
    /* Copy data from copy of old GA to new GA and then get rid of copy*/
    pnga_distribution(g_a,GAme,lo,hi);
    chk = 1;
    nelem = 1;
    for (i=0; i<ndim; i++) {
      /*
      GA[ga_handle].chunk[i] = ((C_Integer)hi[i]-GA[ga_handle].lo[i]+1);
      */
      ld[i] = hi[i] - lo[i] + 1;
      nelem *= ld[i];
      if (hi[i] < lo[i]) chk = 0;
    }
    if (chk) {
#if 1
      pnga_get(g_tmp,lo,hi,GA[ga_handle].ptr[grp_me],ld);
#else
      /* MPI RMA does not allow you to use memory assigned to one window as
       * local buffer for another buffer. Create a local buffer to get around
       * this problem */
      buf = (char*)malloc(nelem*GA[ga_handle].elemsize);
      pnga_get(g_tmp,lo,hi,buf,ld);
      memcpy(GA[ga_handle].ptr[grp_me],buf,nelem*GA[ga_handle].elemsize);
      free(buf);
#endif
    }
    pnga_destroy(g_tmp);
  } else if (strcmp(property, "read_cache") == 0) {
    /* Cache list is built lazily; start with an empty list */
    GA[ga_handle].property = READ_CACHE;
    GA[ga_handle].cache_head = NULL; /* (cache_struct_t *)malloc(sizeof(cache_struct_t)) */
  } else {
    pnga_error("Trying to set unknown property",0);
  }
}
2271
2272 /**
2273 * Set the memory device on a global array.
2274 */
2275 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2276 # pragma weak wnga_set_memory_dev = pnga_set_memory_dev
2277 #endif
void pnga_set_memory_dev(Integer g_a, char *device) {
  /* Record the memory device name for global array g_a.  The name is
   * normalized in place (lower-cased, blanks removed) and then copied
   * into GA[].mem_dev.  Errors out if the name is longer than FNAM. */
  Integer ga_handle = g_a + GA_OFFSET;
  int len = strlen(device);
  int i, ilen;
  if (len>FNAM) {
    pnga_error("Illegal memory device name specified. Device name exceeds length: ",
               FNAM);
  }
  /* convert device name to lower case; cast through unsigned char because
   * tolower() is undefined for negative char values */
  for (i=0; i<len; i++) {
    device[i] = tolower((unsigned char)device[i]);
  }
  /* remove blanks by compacting the string in place */
  ilen = 0;
  for (i=0; i<len; i++) {
    if (device[i] != ' ') {
      device[ilen] = device[i];
      ilen++;
    }
  }
  /* BUG FIX: if any blanks were removed (ilen < len) the compacted string
   * must be re-terminated, otherwise stale characters from the original
   * name remain at the end.  The previous test (ilen > len) could never
   * be true since ilen counts a subset of the original characters. */
  if (ilen < len) device[ilen] = '\0';
  GA[ga_handle].mem_dev_set = 1;
  strcpy(GA[ga_handle].mem_dev,device);
}
2302
2303 /**
2304 * Clear property from global array.
2305 */
2306 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2307 # pragma weak wnga_unset_property = pnga_unset_property
2308 #endif
void pnga_unset_property(Integer g_a) {
  /* Remove any property previously attached to g_a with pnga_set_property.
   * For READ_ONLY this restores the original processor group and data
   * distribution (saved in the old_* fields) and copies the data back;
   * for READ_CACHE it frees the cache list; otherwise it simply clears
   * the property flag. */
  Integer ga_handle = g_a + GA_OFFSET;
  if (GA[ga_handle].property == READ_ONLY) {
    /* TODO: Copy global array to original configuration */
    int i, d, ndim, btot, chk;
    Integer g_tmp, grp_me;
    Integer nprocs, nodeid, origin_id, dflt_grp, handle, maplen;
    Integer nelem, mem_size, status;
    Integer *list;
    Integer dims[MAXDIM], chunk[MAXDIM];
    Integer lo[MAXDIM], hi[MAXDIM], ld[MAXDIM];
    /* NOTE(review): buf is only referenced by the disabled #else branch of
     * the copy-back section below; several locals (nprocs, nodeid,
     * origin_id, dflt_grp, list, d, chunk, dims after the first loop) are
     * unused in the active path. */
    void *ptr, *buf;

    ndim = (int)GA[ga_handle].ndim;
    /* Start by making a copy of the GA */
    for (i=0; i<ndim; i++) {
      chunk[i] = GA[ga_handle].chunk[i];
      dims[i] = GA[ga_handle].dims[i];
    }
    /* Make a temporary copy of GA on the ORIGINAL (pre-READ_ONLY) group */
    g_tmp = pnga_create_handle();
    pnga_set_data(g_tmp,ndim,GA[ga_handle].dims,GA[ga_handle].type);
    pnga_set_pgroup(g_tmp,GA[ga_handle].old_handle);
    if (!pnga_allocate(g_tmp)) {
      pnga_error("Failed to allocate temporary array",0);
    }
    /* Copy portion of global array to locally held portion of tmp array */
    grp_me = pnga_pgroup_nodeid(GA[ga_handle].old_handle);
    pnga_distribution(g_tmp,grp_me,lo,hi);
    chk = 1;
    for (i=0; i<ndim; i++) {
      ld[i] = hi[i]-lo[i]+1;
      if (hi[i] < lo[i]) chk = 0;   /* empty patch on this process */
    }
    if (chk) {
      pnga_access_ptr(g_tmp,lo,hi,&ptr,ld);
      pnga_get(g_a,lo,hi,ptr,ld);
    }

    /* Get rid of current memory allocation */
#ifndef AVOID_MA_STORAGE
    if(gai_uses_shm((int)GA[ga_handle].p_handle)){
#endif
      /* make sure that we free original (before address alignment)
       * pointer */
#ifdef MSG_COMMS_MPI
      if (GA[ga_handle].p_handle > 0){
        ARMCI_Free_group(
            GA[ga_handle].ptr[pnga_pgroup_nodeid(GA[ga_handle].p_handle)]
            - GA[ga_handle].id,
            &PGRP_LIST[GA[ga_handle].p_handle].group);
      }
      else
#endif
      {
        ARMCI_Free(
            GA[ga_handle].ptr[pnga_pgroup_nodeid(GA[ga_handle].p_handle)]
            - GA[ga_handle].id);
      }
#ifndef AVOID_MA_STORAGE
    }else{
      if(GA[ga_handle].id != INVALID_MA_HANDLE) MA_free_heap(GA[ga_handle].id);
    }
#endif

    /* Reset distribution parameters back to original values */
    btot = 0;
    for (i=0; i<ndim; i++) {
      GA[ga_handle].nblock[i] = GA[ga_handle].old_nblock[i];
      GA[ga_handle].lo[i] = GA[ga_handle].old_lo[i];
      GA[ga_handle].chunk[i] = GA[ga_handle].old_chunk[i];
      btot += GA[ga_handle].nblock[i];
    }
    free(GA[ga_handle].mapc);
    GA[ga_handle].mapc = (Integer*)malloc((btot+1)*sizeof(Integer));
    for (i=0; i<btot+1; i++) {
      GA[ga_handle].mapc[i] = GA[ga_handle].old_mapc[i];
    }
    free(GA[ga_handle].old_mapc);

    /* Determine my patch under the restored distribution.
     * NOTE(review): grp_me here is the rank in the OLD group; confirm this
     * is the intended process index for pnga_distribution on g_a. */
    pnga_distribution(g_a, grp_me, GA[ga_handle].lo, hi);
    chk = 1;
    for( i = 0, nelem=1; i< ndim; i++){
      if (hi[i]-(Integer)GA[ga_handle].lo[i]+1 <= 0) chk = 0;
      nelem *= (hi[i]-(Integer)GA[ga_handle].lo[i]+1);
    }
    mem_size = nelem * GA[ga_handle].elemsize;
    if (!chk) mem_size = 0;

    /* Memory accounting: credit the READ_ONLY size, debit the restored size */
    if(GA_memory_limited) GA_total_memory += GA[ga_handle].size;
    GAstat.curmem -= GA[ga_handle].size;
    /* if requested, enforce limits on memory consumption */
    if(GA_memory_limited) GA_total_memory -= mem_size;
    /* check if everybody has enough memory left */
    if(GA_memory_limited){
      status = (GA_total_memory >= 0) ? 1 : 0;
      pnga_pgroup_gop(GA[ga_handle].old_handle,pnga_type_f2c(MT_F_INT),
          &status, 1, "&&");
    } else status = 1;
    /* Swap the array back onto its original processor group */
    handle = (Integer)GA[ga_handle].p_handle;
    GA[ga_handle].p_handle = GA[ga_handle].old_handle;
    if (status) {
      /* Allocate new memory */
      if (GA[ga_handle].mem_dev_set) {
        status = !gai_get_devmem(GA[ga_handle].name, GA[ga_handle].ptr,mem_size,
            GA[ga_handle].type, &GA[ga_handle].id, GA[ga_handle].p_handle,
            GA[ga_handle].mem_dev_set,GA[ga_handle].mem_dev);
      } else {
        status = !gai_getmem(GA[ga_handle].name, GA[ga_handle].ptr,mem_size,
            GA[ga_handle].type, &GA[ga_handle].id, GA[ga_handle].p_handle);
      }
    } else {
      GA[ga_handle].ptr[grp_me]=NULL;
    }
    GA[ga_handle].size = (C_Long)mem_size;
    if (!status) {
      pnga_error("Memory failure when setting READ_ONLY",0);
    }
    /* Get rid of read-only group */
    pnga_pgroup_destroy(handle);
    /* Generate parameters for new memory allocation. Distribution function
     * should work, so we can use that to find out how much data is on this
     * processor */
    GA[ga_handle].property = NO_PROPERTY;
    pnga_distribution(g_a,GAme,lo,hi);
    chk = 1;
    nelem = 1;
    for (i=0; i<ndim; i++) {
      ld[i] = hi[i]-lo[i]+1;
      nelem *= ld[i];
      if (hi[i] < lo[i]) chk = 0;
    }
    if (chk) {
#if 1
      pnga_access_ptr(g_a,lo,hi,&ptr,ld);
      pnga_get(g_tmp,lo,hi,ptr,ld);
#else
      /* MPI RMA does not allow you to use memory assigned to one window as
       * local buffer for another buffer. Create a local buffer to get around
       * this problem */
      buf = (void*)malloc(nelem*GA[ga_handle].elemsize);
      pnga_get(g_tmp,lo,hi,buf,ld);
      pnga_access_ptr(g_a,lo,hi,&ptr,ld);
      memcpy(ptr,buf,nelem*GA[ga_handle].elemsize);
      free(buf);
#endif
    }
    pnga_destroy(g_tmp);
  } else if (GA[ga_handle].property == READ_CACHE) {
    /* Walk the cache list, freeing each buffer and node */
    if (GA[ga_handle].cache_head != NULL) {
      cache_struct_t *next;
      next = GA[ga_handle].cache_head->next;
      if (GA[ga_handle].cache_head->cache_buf)
        free(GA[ga_handle].cache_head->cache_buf);
      free(GA[ga_handle].cache_head);
      while (next) {
        GA[ga_handle].cache_head = next;
        next = next->next;
        if (GA[ga_handle].cache_head->cache_buf)
          free(GA[ga_handle].cache_head->cache_buf);
        free(GA[ga_handle].cache_head);
      }
    }
    GA[ga_handle].cache_head = NULL;
  } else {
    GA[ga_handle].property = NO_PROPERTY;
  }
}
2477
2478 /**
2479 * Allocate memory and complete setup of global array
2480 */
2481 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2482 # pragma weak wnga_allocate = pnga_allocate
2483 #endif
2484
pnga_allocate(Integer g_a)2485 logical pnga_allocate(Integer g_a)
2486 {
2487
2488 Integer hi[MAXDIM];
2489 Integer ga_handle = g_a + GA_OFFSET;
2490 Integer d, width[MAXDIM], ndim;
2491 Integer mem_size, nelem;
2492 Integer i, status, maplen=0, p_handle;
2493 Integer dims[MAXDIM], chunk[MAXDIM];
2494 Integer pe[MAXDIM], *pmap[MAXDIM], *map;
2495 Integer blk[MAXDIM];
2496 Integer grp_me=GAme, grp_nproc=GAnproc;
2497 Integer block_size = 0;
2498
2499 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous sync masking*/
2500 if (GA[ga_handle].ndim == -1)
2501 pnga_error("Insufficient data to create global array",0);
2502
2503 p_handle = (Integer)GA[ga_handle].p_handle;
2504 if (p_handle == (Integer)GA_Init_Proc_Group) {
2505 GA[ga_handle].p_handle = GA_Default_Proc_Group;
2506 p_handle = GA_Default_Proc_Group;
2507 }
2508 pnga_pgroup_sync(p_handle);
2509
2510 if (p_handle > 0) {
2511 grp_nproc = PGRP_LIST[p_handle].map_nproc;
2512 grp_me = PGRP_LIST[p_handle].map_proc_list[GAme];
2513 }
2514
2515 if(!GAinitialized) pnga_error("GA not initialized ", 0);
2516 if(!ma_address_init) gai_ma_address_init();
2517
2518 ndim = GA[ga_handle].ndim;
2519 for (i=0; i<ndim; i++) width[i] = (C_Integer)GA[ga_handle].width[i];
2520
2521 /* The data distribution has not been specified by the user. Create
2522 default distribution */
2523 if (GA[ga_handle].mapc == NULL && GA[ga_handle].distr_type == REGULAR) {
2524 for (d=0; d<ndim; d++) {
2525 dims[d] = (Integer)GA[ga_handle].dims[d];
2526 chunk[d] = (Integer)GA[ga_handle].chunk[d];
2527 }
2528 if(chunk[0]!=0) /* for chunk[0]=0 compute all */
2529 for(d=0; d< ndim; d++) blk[d]=(Integer)GA_MIN(chunk[d],dims[d]);
2530 else
2531 for(d=0; d< ndim; d++) blk[d]=-1;
2532
2533 /* eliminate dimensions =1 from ddb analysis */
2534 for(d=0; d<ndim; d++)if(dims[d]==1)blk[d]=1;
2535
2536 if (GAme==0 && DEBUG )
2537 for (d=0;d<ndim;d++) fprintf(stderr,"b[%ld]=%ld\n",(long)d,(long)blk[d]);
2538 pnga_pgroup_sync(p_handle);
2539
2540 /* ddb(ndim, dims, GAnproc, blk, pe);*/
2541 if(p_handle == 0) /* for mirrored arrays */
2542 #if OLD_DISTRIBUTION
2543 ddb_h2(ndim, dims, PGRP_LIST[p_handle].map_nproc,0.0,(Integer)0, blk, pe);
2544 #else
2545 ddb(ndim, dims, PGRP_LIST[p_handle].map_nproc, blk, pe);
2546 #endif
2547 else
2548 if (GA[ga_handle].num_rstrctd == 0) {
2549 /* Data is normally distributed on processors */
2550 #if OLD_DISTRIBUTION
2551 ddb_h2(ndim, dims, grp_nproc,0.0,(Integer)0, blk, pe);
2552 #else
2553 ddb(ndim, dims, grp_nproc, blk, pe);
2554 #endif
2555 } else {
2556 /* Data is only distributed on subset of processors */
2557 #if OLD_DISTRIBUTION
2558 ddb_h2(ndim, dims, GA[ga_handle].num_rstrctd, 0.0, (Integer)0, blk, pe);
2559 #else
2560 ddb(ndim, dims, GA[ga_handle].num_rstrctd, blk, pe);
2561 #endif
2562 }
2563
2564 for(d=0, map=mapALL; d< ndim; d++){
2565 Integer nblock;
2566 Integer pcut; /* # procs that get full blk[] elements; the rest gets less*/
2567 int p;
2568
2569 pmap[d] = map;
2570
2571 /* RJH ... don't leave some nodes without data if possible
2572 but respect the users block size */
2573
2574 if (chunk[d] > 1) {
2575 Integer ddim = ((dims[d]-1)/GA_MIN(chunk[d],dims[d]) + 1);
2576 pcut = (ddim -(blk[d]-1)*pe[d]) ;
2577 }
2578 else {
2579 pcut = (dims[d]-(blk[d]-1)*pe[d]) ;
2580 }
2581
2582 for (nblock=i=p=0; (p<pe[d]) && (i<dims[d]); p++, nblock++) {
2583 Integer b = blk[d];
2584 if (p >= pcut)
2585 b = b-1;
2586 map[nblock] = i+1;
2587 if (chunk[d]>1) b *= GA_MIN(chunk[d],dims[d]);
2588 i += b;
2589 }
2590
2591 pe[d] = GA_MIN(pe[d],nblock);
2592 map += pe[d];
2593 }
2594 if(GAme==0&& DEBUG){
2595 gai_print_subscript("pe ",(int)ndim, pe,"\n");
2596 gai_print_subscript("blocks ",(int)ndim, blk,"\n");
2597 printf("decomposition map\n");
2598 for(d=0; d< ndim; d++){
2599 printf("dim=%ld: ",(long)d);
2600 for (i=0;i<pe[d];i++)printf("%ld ",(long)pmap[d][i]);
2601 printf("\n");
2602 }
2603 fflush(stdout);
2604 }
2605 maplen = 0;
2606 for( i = 0; i< ndim; i++){
2607 GA[ga_handle].nblock[i] = pe[i];
2608 maplen += pe[i];
2609 }
2610 GA[ga_handle].mapc = (C_Integer*)malloc((maplen+1)*sizeof(C_Integer*));
2611 for(i = 0; i< maplen; i++) {
2612 GA[ga_handle].mapc[i] = (C_Integer)mapALL[i];
2613 }
2614 GA[ga_handle].mapc[maplen] = -1;
2615 } else if (GA[ga_handle].distr_type == BLOCK_CYCLIC) {
2616 /* Regular block-cyclic data distribution has been specified. Figure
2617 out how much memory is needed by each processor to store blocks */
2618 Integer nblocks = GA[ga_handle].block_total;
2619 Integer tsize, j;
2620 Integer lo[MAXDIM];
2621 block_size = 0;
2622 for (i=GAme; i<nblocks; i +=GAnproc) {
2623 ga_ownsM(ga_handle,i,lo,hi);
2624 tsize = 1;
2625 for (j=0; j<ndim; j++) {
2626 tsize *= (hi[j] - lo[j] + 1);
2627 }
2628 block_size += tsize;
2629 }
2630 } else if (GA[ga_handle].distr_type == SCALAPACK) {
2631 /* ScaLAPACK block-cyclic data distribution has been specified. Figure
2632 out how much memory is needed by each processor to store blocks */
2633 Integer j, jtot, skip, imin, imax;
2634 Integer index[MAXDIM];
2635 gam_find_proc_indices(ga_handle,GAme,index);
2636 block_size = 1;
2637 for (i=0; i<ndim; i++) {
2638 skip = GA[ga_handle].nblock[i];
2639 jtot = 0;
2640 for (j=index[i]; j<GA[ga_handle].num_blocks[i]; j += skip) {
2641 imin = j*GA[ga_handle].block_dims[i] + 1;
2642 imax = (j+1)*GA[ga_handle].block_dims[i];
2643 if (imax > GA[ga_handle].dims[i]) imax = GA[ga_handle].dims[i];
2644 jtot += (imax-imin+1);
2645 }
2646 block_size *= jtot;
2647 }
2648 } else if (GA[ga_handle].distr_type == TILED) {
2649 /* Tiled data distribution has been specified. Figure
2650 out how much memory is needed by each processor to store blocks */
2651 Integer j, jtot, skip, imin, imax;
2652 Integer index[MAXDIM];
2653 gam_find_tile_proc_indices(ga_handle,GAme,index);
2654 block_size = 1;
2655 for (i=0; i<ndim; i++) {
2656 skip = GA[ga_handle].nblock[i];
2657 jtot = 0;
2658 for (j=index[i]; j<GA[ga_handle].num_blocks[i]; j += skip) {
2659 imin = j*GA[ga_handle].block_dims[i] + 1;
2660 imax = (j+1)*GA[ga_handle].block_dims[i];
2661 if (imax > GA[ga_handle].dims[i]) imax = GA[ga_handle].dims[i];
2662 jtot += (imax-imin+1);
2663 }
2664 block_size *= jtot;
2665 }
2666 } else if (GA[ga_handle].distr_type == TILED_IRREG) {
2667 /* Tiled data distribution has been specified. Figure
2668 out how much memory is needed by each processor to store blocks */
2669 Integer j, jtot, skip, imin, imax;
2670 Integer index[MAXDIM];
2671 Integer offset = 0;
2672 gam_find_tile_proc_indices(ga_handle,GAme,index);
2673 block_size = 1;
2674 for (i=0; i<ndim; i++) {
2675 skip = GA[ga_handle].nblock[i];
2676 jtot = 0;
2677 for (j=index[i]; j<GA[ga_handle].num_blocks[i]; j += skip) {
2678 imin = GA[ga_handle].mapc[offset+j];
2679 if (j<GA[ga_handle].num_blocks[i]-1) {
2680 imax = GA[ga_handle].mapc[offset+j+1]-1;
2681 } else {
2682 imax = GA[ga_handle].dims[i];
2683 }
2684 jtot += (imax-imin+1);
2685 }
2686 block_size *= jtot;
2687 offset += GA[ga_handle].num_blocks[i];
2688 }
2689 }
2690
2691 GAstat.numcre ++;
2692
2693 GA[ga_handle].actv = 1;
2694 /* If only one node is being used and array is mirrored,
2695 * set proc list to world group */
2696 if (pnga_cluster_nnodes() == 1 && GA[ga_handle].p_handle == 0) {
2697 GA[ga_handle].p_handle = pnga_pgroup_get_world();
2698 }
2699
2700 /* Set remaining parameters and determine memory size if regular data
2701 * distribution is being used */
2702 if (GA[ga_handle].distr_type == REGULAR) {
2703 /* set corner flag, if it has not already been set and set up message
2704 passing data */
2705 if (GA[ga_handle].corner_flag == -1) {
2706 i = 1;
2707 } else {
2708 i = GA[ga_handle].corner_flag;
2709 }
2710 for( i = 0; i< ndim; i++){
2711 GA[ga_handle].scale[i] = (double)GA[ga_handle].nblock[i]
2712 / (double)GA[ga_handle].dims[i];
2713 }
2714 pnga_set_ghost_corner_flag(g_a, i);
2715
2716 /*** determine which portion of the array I am supposed to hold ***/
2717 if (p_handle == 0) { /* for mirrored arrays */
2718 Integer me_local = (Integer)PGRP_LIST[p_handle].map_proc_list[GAme];
2719 pnga_distribution(g_a, me_local, GA[ga_handle].lo, hi);
2720 } else {
2721 pnga_distribution(g_a, grp_me, GA[ga_handle].lo, hi);
2722 }
2723 if (GA[ga_handle].num_rstrctd == 0 || GA[ga_handle].has_data == 1) {
2724 for( i = 0, nelem=1; i< ndim; i++){
2725 /*
2726 GA[ga_handle].chunk[i] = ((C_Integer)hi[i]-GA[ga_handle].lo[i]+1);
2727 */
2728 nelem *= (hi[i]-(Integer)GA[ga_handle].lo[i]+1+2*width[i]);
2729 }
2730 } else {
2731 nelem = 0;
2732 }
2733 mem_size = nelem * GA[ga_handle].elemsize;
2734 } else {
2735 mem_size = block_size * GA[ga_handle].elemsize;
2736 }
2737 GA[ga_handle].id = INVALID_MA_HANDLE;
2738 GA[ga_handle].size = (C_Long)mem_size;
2739 /* if requested, enforce limits on memory consumption */
2740 if(GA_memory_limited) GA_total_memory -= mem_size;
2741 /* check if everybody has enough memory left */
2742 if(GA_memory_limited){
2743 status = (GA_total_memory >= 0) ? 1 : 0;
2744 if (p_handle > 0) {
2745 /* pnga_pgroup_gop(p_handle,pnga_type_f2c(MT_F_INT), &status, 1, "*"); */
2746 pnga_pgroup_gop(p_handle,pnga_type_f2c(MT_F_INT), &status, 1, "&&");
2747 } else {
2748 /* pnga_gop(pnga_type_f2c(MT_F_INT), &status, 1, "*"); */
2749 pnga_gop(pnga_type_f2c(MT_F_INT), &status, 1, "&&");
2750 }
2751 }else status = 1;
2752
2753 if (status) {
2754 if (GA[ga_handle].mem_dev_set) {
2755 status = !gai_get_devmem(GA[ga_handle].name, GA[ga_handle].ptr,mem_size,
2756 GA[ga_handle].type, &GA[ga_handle].id, p_handle,
2757 GA[ga_handle].mem_dev_set, GA[ga_handle].mem_dev);
2758 } else {
2759 status = !gai_getmem(GA[ga_handle].name, GA[ga_handle].ptr,mem_size,
2760 GA[ga_handle].type, &GA[ga_handle].id, p_handle);
2761 }
2762 } else {
2763 GA[ga_handle].ptr[grp_me]=NULL;
2764 }
2765
2766 if (GA[ga_handle].distr_type == REGULAR) {
2767 /* Finish setting up information for ghost cell updates */
2768 if (GA[ga_handle].ghosts == 1) {
2769 if (!pnga_set_ghost_info(g_a))
2770 pnga_error("Could not allocate update information for ghost cells",0);
2771 }
2772 /* If array is mirrored, evaluate first and last indices */
2773 /* ngai_get_first_last_indices(&g_a); */
2774 }
2775
2776 pnga_pgroup_sync(p_handle);
2777 if (status) {
2778 GAstat.curmem += (long)GA[ga_handle].size;
2779 GAstat.maxmem = (long)GA_MAX(GAstat.maxmem, GAstat.curmem);
2780 status = TRUE;
2781 } else {
2782 if(GA_memory_limited) GA_total_memory += mem_size;
2783 pnga_destroy(g_a);
2784 status = FALSE;
2785 }
2786 return status;
2787 }
2788
2789 /**
2790 * Use memory from another GA and complete setup of global array
2791 */
2792 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
2793 # pragma weak wnga_overlay = pnga_overlay
2794 #endif
2795
pnga_overlay(Integer g_a,Integer g_parent)2796 logical pnga_overlay(Integer g_a, Integer g_parent)
2797 {
2798
2799 Integer hi[MAXDIM];
2800 Integer ga_handle = g_a + GA_OFFSET;
2801 Integer g_p = g_parent + GA_OFFSET;
2802 Integer d, width[MAXDIM], ndim;
2803 Integer mem_size, nelem;
2804 Integer i, status, maplen=0, p_handle;
2805 Integer dims[MAXDIM], chunk[MAXDIM];
2806 Integer pe[MAXDIM], *pmap[MAXDIM], *map;
2807 Integer blk[MAXDIM];
2808 Integer grp_me=GAme, grp_nproc=GAnproc;
2809 Integer block_size = 0;
2810
2811 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous sync masking*/
2812 if (GA[ga_handle].ndim == -1)
2813 pnga_error("Insufficient data to create global array",0);
2814
2815 p_handle = (Integer)GA[ga_handle].p_handle;
2816 if (p_handle == (Integer)GA_Init_Proc_Group) {
2817 GA[ga_handle].p_handle = GA_Default_Proc_Group;
2818 p_handle = GA_Default_Proc_Group;
2819 }
2820 pnga_pgroup_sync(p_handle);
2821 if (p_handle != (Integer)GA[g_p].p_handle) {
2822 pnga_error("Parent and overlay global array must be on same procesor group",0);
2823 }
2824
2825 if (p_handle > 0) {
2826 grp_nproc = PGRP_LIST[p_handle].map_nproc;
2827 grp_me = PGRP_LIST[p_handle].map_proc_list[GAme];
2828 }
2829
2830 if(!GAinitialized) pnga_error("GA not initialized ", 0);
2831 if(!ma_address_init) gai_ma_address_init();
2832
2833 ndim = GA[ga_handle].ndim;
2834 for (i=0; i<ndim; i++) width[i] = (C_Integer)GA[ga_handle].width[i];
2835
2836 /* The data distribution has not been specified by the user. Create
2837 default distribution */
2838 if (GA[ga_handle].mapc == NULL && GA[ga_handle].distr_type == REGULAR) {
2839 for (d=0; d<ndim; d++) {
2840 dims[d] = (Integer)GA[ga_handle].dims[d];
2841 chunk[d] = (Integer)GA[ga_handle].chunk[d];
2842 }
2843 if(chunk[0]!=0) /* for chunk[0]=0 compute all */
2844 for(d=0; d< ndim; d++) blk[d]=(Integer)GA_MIN(chunk[d],dims[d]);
2845 else
2846 for(d=0; d< ndim; d++) blk[d]=-1;
2847
2848 /* eliminate dimensions =1 from ddb analysis */
2849 for(d=0; d<ndim; d++)if(dims[d]==1)blk[d]=1;
2850
2851 if (GAme==0 && DEBUG )
2852 for (d=0;d<ndim;d++) fprintf(stderr,"b[%ld]=%ld\n",(long)d,(long)blk[d]);
2853 pnga_pgroup_sync(p_handle);
2854
2855 /* ddb(ndim, dims, GAnproc, blk, pe);*/
2856 if(p_handle == 0) /* for mirrored arrays */
2857 #if OLD_DISTRIBUTION
2858 ddb_h2(ndim, dims, PGRP_LIST[p_handle].map_nproc,0.0,(Integer)0, blk, pe);
2859 #else
2860 ddb(ndim, dims, PGRP_LIST[p_handle].map_nproc, blk, pe);
2861 #endif
2862 else
2863 if (GA[ga_handle].num_rstrctd == 0) {
2864 /* Data is normally distributed on processors */
2865 #if OLD_DISTRIBUTION
2866 ddb_h2(ndim, dims, grp_nproc,0.0,(Integer)0, blk, pe);
2867 #else
2868 ddb(ndim, dims, grp_nproc, blk, pe);
2869 #endif
2870 } else {
2871 /* Data is only distributed on subset of processors */
2872 #if OLD_DISTRIBUTION
2873 ddb_h2(ndim, dims, GA[ga_handle].num_rstrctd, 0.0, (Integer)0, blk, pe);
2874 #else
2875 ddb(ndim, dims, GA[ga_handle].num_rstrctd, blk, pe);
2876 #endif
2877 }
2878
2879 for(d=0, map=mapALL; d< ndim; d++){
2880 Integer nblock;
2881 Integer pcut; /* # procs that get full blk[] elements; the rest gets less*/
2882 int p;
2883
2884 pmap[d] = map;
2885
2886 /* RJH ... don't leave some nodes without data if possible
2887 but respect the users block size */
2888
2889 if (chunk[d] > 1) {
2890 Integer ddim = ((dims[d]-1)/GA_MIN(chunk[d],dims[d]) + 1);
2891 pcut = (ddim -(blk[d]-1)*pe[d]) ;
2892 }
2893 else {
2894 pcut = (dims[d]-(blk[d]-1)*pe[d]) ;
2895 }
2896
2897 for (nblock=i=p=0; (p<pe[d]) && (i<dims[d]); p++, nblock++) {
2898 Integer b = blk[d];
2899 if (p >= pcut)
2900 b = b-1;
2901 map[nblock] = i+1;
2902 if (chunk[d]>1) b *= GA_MIN(chunk[d],dims[d]);
2903 i += b;
2904 }
2905
2906 pe[d] = GA_MIN(pe[d],nblock);
2907 map += pe[d];
2908 }
2909 if(GAme==0&& DEBUG){
2910 gai_print_subscript("pe ",(int)ndim, pe,"\n");
2911 gai_print_subscript("blocks ",(int)ndim, blk,"\n");
2912 printf("decomposition map\n");
2913 for(d=0; d< ndim; d++){
2914 printf("dim=%ld: ",(long)d);
2915 for (i=0;i<pe[d];i++)printf("%ld ",(long)pmap[d][i]);
2916 printf("\n");
2917 }
2918 fflush(stdout);
2919 }
2920 maplen = 0;
2921 for( i = 0; i< ndim; i++){
2922 GA[ga_handle].nblock[i] = pe[i];
2923 maplen += pe[i];
2924 }
2925 GA[ga_handle].mapc = (C_Integer*)malloc((maplen+1)*sizeof(C_Integer*));
2926 for(i = 0; i< maplen; i++) {
2927 GA[ga_handle].mapc[i] = (C_Integer)mapALL[i];
2928 }
2929 GA[ga_handle].mapc[maplen] = -1;
2930 } else if (GA[ga_handle].distr_type == BLOCK_CYCLIC) {
2931 /* Regular block-cyclic data distribution has been specified. Figure
2932 out how much memory is needed by each processor to store blocks */
2933 Integer nblocks = GA[ga_handle].block_total;
2934 Integer tsize, j;
2935 Integer lo[MAXDIM];
2936 block_size = 0;
2937 for (i=GAme; i<nblocks; i +=GAnproc) {
2938 ga_ownsM(ga_handle,i,lo,hi);
2939 tsize = 1;
2940 for (j=0; j<ndim; j++) {
2941 tsize *= (hi[j] - lo[j] + 1);
2942 }
2943 block_size += tsize;
2944 }
2945 } else if (GA[ga_handle].distr_type == SCALAPACK) {
2946 /* ScaLAPACK block-cyclic data distribution has been specified. Figure
2947 out how much memory is needed by each processor to store blocks */
2948 Integer j, jtot, skip, imin, imax;
2949 Integer index[MAXDIM];
2950 gam_find_proc_indices(ga_handle,GAme,index);
2951 block_size = 1;
2952 for (i=0; i<ndim; i++) {
2953 skip = GA[ga_handle].nblock[i];
2954 jtot = 0;
2955 for (j=index[i]; j<GA[ga_handle].num_blocks[i]; j += skip) {
2956 imin = j*GA[ga_handle].block_dims[i] + 1;
2957 imax = (j+1)*GA[ga_handle].block_dims[i];
2958 if (imax > GA[ga_handle].dims[i]) imax = GA[ga_handle].dims[i];
2959 jtot += (imax-imin+1);
2960 }
2961 block_size *= jtot;
2962 }
2963 } else if (GA[ga_handle].distr_type == TILED) {
2964 /* Tiled data distribution has been specified. Figure
2965 out how much memory is needed by each processor to store blocks */
2966 Integer j, jtot, skip, imin, imax;
2967 Integer index[MAXDIM];
2968 gam_find_tile_proc_indices(ga_handle,GAme,index);
2969 block_size = 1;
2970 for (i=0; i<ndim; i++) {
2971 skip = GA[ga_handle].nblock[i];
2972 jtot = 0;
2973 for (j=index[i]; j<GA[ga_handle].num_blocks[i]; j += skip) {
2974 imin = j*GA[ga_handle].block_dims[i] + 1;
2975 imax = (j+1)*GA[ga_handle].block_dims[i];
2976 if (imax > GA[ga_handle].dims[i]) imax = GA[ga_handle].dims[i];
2977 jtot += (imax-imin+1);
2978 }
2979 block_size *= jtot;
2980 }
2981 } else if (GA[ga_handle].distr_type == TILED_IRREG) {
2982 /* Tiled data distribution has been specified. Figure
2983 out how much memory is needed by each processor to store blocks */
2984 Integer j, jtot, skip, imin, imax;
2985 Integer index[MAXDIM];
2986 Integer offset = 0;
2987 gam_find_tile_proc_indices(ga_handle,GAme,index);
2988 block_size = 1;
2989 for (i=0; i<ndim; i++) {
2990 skip = GA[ga_handle].nblock[i];
2991 jtot = 0;
2992 for (j=index[i]; j<GA[ga_handle].num_blocks[i]; j += skip) {
2993 imin = GA[ga_handle].mapc[offset+j];
2994 if (j<GA[ga_handle].num_blocks[i]-1) {
2995 imax = GA[ga_handle].mapc[offset+j+1]-1;
2996 } else {
2997 imax = GA[ga_handle].dims[i];
2998 }
2999 jtot += (imax-imin+1);
3000 }
3001 block_size *= jtot;
3002 offset += GA[ga_handle].num_blocks[i];
3003 }
3004 }
3005
3006 GAstat.numcre ++;
3007
3008 GA[ga_handle].actv = 1;
3009 /* If only one node is being used and array is mirrored,
3010 * set proc list to world group */
3011 if (pnga_cluster_nnodes() == 1 && GA[ga_handle].p_handle == 0) {
3012 GA[ga_handle].p_handle = pnga_pgroup_get_world();
3013 }
3014
3015 /* Set remaining parameters and determine memory size if regular data
3016 * distribution is being used */
3017 if (GA[ga_handle].distr_type == REGULAR) {
3018 /* set corner flag, if it has not already been set and set up message
3019 passing data */
3020 if (GA[ga_handle].corner_flag == -1) {
3021 i = 1;
3022 } else {
3023 i = GA[ga_handle].corner_flag;
3024 }
3025 for( i = 0; i< ndim; i++){
3026 GA[ga_handle].scale[i] = (double)GA[ga_handle].nblock[i]
3027 / (double)GA[ga_handle].dims[i];
3028 }
3029 pnga_set_ghost_corner_flag(g_a, i);
3030
3031 /*** determine which portion of the array I am supposed to hold ***/
3032 if (p_handle == 0) { /* for mirrored arrays */
3033 Integer me_local = (Integer)PGRP_LIST[p_handle].map_proc_list[GAme];
3034 pnga_distribution(g_a, me_local, GA[ga_handle].lo, hi);
3035 } else {
3036 pnga_distribution(g_a, grp_me, GA[ga_handle].lo, hi);
3037 }
3038 if (GA[ga_handle].num_rstrctd == 0 || GA[ga_handle].has_data == 1) {
3039 for( i = 0, nelem=1; i< ndim; i++){
3040 /*
3041 GA[ga_handle].chunk[i] = ((C_Integer)hi[i]-GA[ga_handle].lo[i]+1);
3042 */
3043 nelem *= (hi[i]-(Integer)GA[ga_handle].lo[i]+1+2*width[i]);
3044 }
3045 } else {
3046 nelem = 0;
3047 }
3048 mem_size = nelem * GA[ga_handle].elemsize;
3049 } else {
3050 mem_size = block_size * GA[ga_handle].elemsize;
3051 }
3052 GA[ga_handle].id = INVALID_MA_HANDLE;
3053 GA[ga_handle].size = (C_Long)mem_size;
3054 /* check if everybody has enough memory to fit overlay GA */
3055 status = (GA[ga_handle].size <= GA[g_p].size) ? 1 : 0;
3056 if (p_handle > 0) {
3057 pnga_pgroup_gop(p_handle,pnga_type_f2c(MT_F_INT), &status, 1, "&&");
3058 } else {
3059 pnga_gop(pnga_type_f2c(MT_F_INT), &status, 1, "&&");
3060 }
3061
3062 if (status) {
3063 GA[ga_handle].overlay = 1;
3064 GA[ga_handle].id = GA[g_p].id;
3065 for (i=0; i<grp_nproc; i++) {
3066 GA[ga_handle].ptr[i] = GA[g_p].ptr[i];
3067 }
3068 }
3069
3070 if (GA[ga_handle].distr_type == REGULAR) {
3071 /* Finish setting up information for ghost cell updates */
3072 if (GA[ga_handle].ghosts == 1) {
3073 if (!pnga_set_ghost_info(g_a))
3074 pnga_error("Could not allocate update information for ghost cells",0);
3075 }
3076 /* If array is mirrored, evaluate first and last indices */
3077 /* ngai_get_first_last_indices(&g_a); */
3078 }
3079
3080 pnga_pgroup_sync(p_handle);
3081 if (status) {
3082 status = TRUE;
3083 } else {
3084 pnga_destroy(g_a);
3085 status = FALSE;
3086 }
3087 return status;
3088 }
3089
3090 /**
3091 * Create an N-dimensional Global Array with ghost cells using an
3092 * irregular distribution on a user-specified process group.
3093 * This is the master routine. All other creation routines are derived
3094 * from this one.
3095 */
3096 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3097 # pragma weak wnga_create_ghosts_irreg_config = pnga_create_ghosts_irreg_config
3098 #endif
3099
pnga_create_ghosts_irreg_config(Integer type,Integer ndim,Integer * dims,Integer * width,char * array_name,Integer * map,Integer * nblock,Integer p_handle,Integer * g_a)3100 logical pnga_create_ghosts_irreg_config(
3101 Integer type, /* MA type */
3102 Integer ndim, /* number of dimensions */
3103 Integer *dims, /* array of dimensions */
3104 Integer *width, /* width of boundary cells for each dimension */
3105 char *array_name, /* array name */
3106 Integer *map, /* decomposition map array */
3107 Integer *nblock, /* number of blocks for each dimension in map */
3108 Integer p_handle, /* processor list handle */
3109 Integer *g_a) /* array handle (output) */
3110 {
3111 logical status;
3112 Integer g_A;
3113
3114 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous sync masking*/
3115 pnga_pgroup_sync(p_handle);
3116
3117 g_A = pnga_create_handle();
3118 *g_a = g_A;
3119 pnga_set_data(g_A,ndim,dims,type);
3120 pnga_set_ghosts(g_A,width);
3121 pnga_set_array_name(g_A,array_name);
3122 pnga_set_irreg_distr(g_A,map,nblock);
3123 pnga_set_pgroup(g_A,p_handle);
3124 status = pnga_allocate(g_A);
3125
3126 return status;
3127 }
3128
3129 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3130 # pragma weak wnga_create_ghosts_irreg = pnga_create_ghosts_irreg
3131 #endif
3132
pnga_create_ghosts_irreg(Integer type,Integer ndim,Integer dims[],Integer width[],char * array_name,Integer map[],Integer nblock[],Integer * g_a)3133 logical pnga_create_ghosts_irreg(
3134 Integer type, /* MA type */
3135 Integer ndim, /* number of dimensions */
3136 Integer dims[], /* array of dimensions */
3137 Integer width[], /* width of boundary cells for each dimension */
3138 char *array_name, /* array name */
3139 Integer map[], /* decomposition map array */
3140 Integer nblock[], /* number of blocks for each dimension in map */
3141 Integer *g_a) /* array handle (output) */
3142 {
3143 Integer p_handle = pnga_pgroup_get_default();
3144 return pnga_create_ghosts_irreg_config(type, ndim, dims, width,
3145 array_name, map, nblock, p_handle, g_a);
3146 }
3147
3148
3149 /** Create an N-dimensional Global Array on user-specified process group.
3150 * Allow machine to choose location of array boundaries on individual
3151 * processors
3152 */
3153 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3154 # pragma weak wnga_create_config = pnga_create_config
3155 #endif
3156
pnga_create_config(Integer type,Integer ndim,Integer dims[],char * array_name,Integer * chunk,Integer p_handle,Integer * g_a)3157 logical pnga_create_config(Integer type,
3158 Integer ndim,
3159 Integer dims[],
3160 char* array_name,
3161 Integer *chunk,
3162 Integer p_handle,
3163 Integer *g_a)
3164 {
3165 logical status;
3166 Integer g_A;
3167 g_A = pnga_create_handle();
3168 *g_a = g_A;
3169 pnga_set_data(g_A,ndim,dims,type);
3170 pnga_set_array_name(g_A,array_name);
3171 pnga_set_chunk(g_A,chunk);
3172 pnga_set_pgroup(g_A,p_handle);
3173 status = pnga_allocate(g_A);
3174 return status;
3175 }
3176
3177 /** Create an N-dimensional Global Array on default processor group.
3178 * Allow machine to choose location of array boundaries on individual
3179 * processors
3180 */
3181 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3182 # pragma weak wnga_create = pnga_create
3183 #endif
3184
pnga_create(Integer type,Integer ndim,Integer dims[],char * array_name,Integer * chunk,Integer * g_a)3185 logical pnga_create(Integer type,
3186 Integer ndim,
3187 Integer dims[],
3188 char* array_name,
3189 Integer *chunk,
3190 Integer *g_a)
3191 {
3192 GA_Internal_Threadsafe_Lock();
3193 Integer p_handle = pnga_pgroup_get_default();
3194 logical result = pnga_create_config(type, ndim, dims, array_name, chunk, p_handle, g_a);
3195 GA_Internal_Threadsafe_Unlock();
3196 return result;
3197 }
3198
3199
3200 /*\ CREATE AN N-DIMENSIONAL GLOBAL ARRAY WITH GHOST CELLS
3201 * Allow machine to choose location of array boundaries on individual
3202 * processors.
3203 \*/
3204 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3205 # pragma weak wnga_create_ghosts_config = pnga_create_ghosts_config
3206 #endif
3207
pnga_create_ghosts_config(Integer type,Integer ndim,Integer dims[],Integer width[],char * array_name,Integer chunk[],Integer p_handle,Integer * g_a)3208 logical pnga_create_ghosts_config(Integer type,
3209 Integer ndim,
3210 Integer dims[],
3211 Integer width[],
3212 char* array_name,
3213 Integer chunk[],
3214 Integer p_handle,
3215 Integer *g_a)
3216 {
3217 logical status;
3218 Integer g_A;
3219 g_A = pnga_create_handle();
3220 *g_a = g_A;
3221 pnga_set_data(g_A,ndim,dims,type);
3222 pnga_set_ghosts(g_A,width);
3223 pnga_set_array_name(g_A,array_name);
3224 pnga_set_chunk(g_A,chunk);
3225 pnga_set_pgroup(g_A,p_handle);
3226 status = pnga_allocate(g_A);
3227 return status;
3228 }
3229
3230 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3231 # pragma weak wnga_create_ghosts = pnga_create_ghosts
3232 #endif
3233
pnga_create_ghosts(Integer type,Integer ndim,Integer dims[],Integer width[],char * array_name,Integer chunk[],Integer * g_a)3234 logical pnga_create_ghosts(Integer type,
3235 Integer ndim,
3236 Integer dims[],
3237 Integer width[],
3238 char* array_name,
3239 Integer chunk[],
3240 Integer *g_a)
3241 {
3242 Integer p_handle = pnga_pgroup_get_default();
3243 return pnga_create_ghosts_config(type, ndim, dims, width, array_name,
3244 chunk, p_handle, g_a);
3245 }
3246
3247 /**
3248 * Create a Global Array with an irregular distribution and a user-specified
3249 * process group. The user can specify location of array boundaries on
3250 * individual processors.
3251 */
3252 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3253 # pragma weak wnga_create_irreg_config = pnga_create_irreg_config
3254 #endif
3255
pnga_create_irreg_config(Integer type,Integer ndim,Integer dims[],char * array_name,Integer map[],Integer nblock[],Integer p_handle,Integer * g_a)3256 logical pnga_create_irreg_config(
3257 Integer type, /* MA type */
3258 Integer ndim, /* number of dimensions */
3259 Integer dims[], /* array of dimensions */
3260 char *array_name, /* array name */
3261 Integer map[], /* decomposition map array */
3262 Integer nblock[], /* number of blocks for each dimension in map */
3263 Integer p_handle, /* processor list hande */
3264 Integer *g_a) /* array handle (output) */
3265 {
3266 Integer d,width[MAXDIM];
3267 logical status;
3268
3269 for (d=0; d<ndim; d++) width[d] = 0;
3270 status = pnga_create_ghosts_irreg_config(type, ndim, dims, width,
3271 array_name, map, nblock, p_handle, g_a);
3272
3273 return status;
3274 }
3275
3276
3277 /**
3278 * Create a Global Array with an irregular distribution. The user can specify
3279 * location of array boundaries on individual
3280 * processors.
3281 */
3282 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3283 # pragma weak wnga_create_irreg = pnga_create_irreg
3284 #endif
3285
pnga_create_irreg(Integer type,Integer ndim,Integer dims[],char * array_name,Integer map[],Integer nblock[],Integer * g_a)3286 logical pnga_create_irreg(
3287 Integer type, /* MA type */
3288 Integer ndim, /* number of dimensions */
3289 Integer dims[], /* array of dimensions */
3290 char *array_name, /* array name */
3291 Integer map[], /* decomposition map array */
3292 Integer nblock[], /* number of blocks for each dimension in map */
3293 Integer *g_a) /* array handle (output) */
3294 {
3295
3296 Integer d,width[MAXDIM];
3297 logical status;
3298
3299 for (d=0; d<ndim; d++) width[d] = 0;
3300 status = pnga_create_ghosts_irreg(type, ndim, dims, width,
3301 array_name, map, nblock, g_a);
3302
3303 return status;
3304 }
3305
/*\ get memory aligned w.r.t. MA base
 *  required on Linux as g77 ignores natural data alignment in common blocks
\*/
int gai_get_shmem(char **ptr_arr, C_Long bytes, int type, long *adj,
                  int grp_id)
{
    int status=0;
    /* grp_me indexes ptr_arr on every build variant (see the NULL check
     * after the ARMCI allocation), so it must be declared outside the
     * _CHECK_MA_ALGN guard; previously it lived inside the guard and the
     * _CHECK_MA_ALGN build did not compile */
    int grp_me = GAme;
#ifndef _CHECK_MA_ALGN
    char *base;
    long diff, item_size;
    Integer *adjust;
    int i, nproc;
#endif

    if (grp_id > 0) grp_me = PGRP_LIST[grp_id].map_proc_list[GAme];

#ifndef _CHECK_MA_ALGN
    nproc = (grp_id > 0) ? (int)PGRP_LIST[grp_id].map_nproc : (int)GAnproc;

    /* need to enforce proper, natural alignment (on size boundary) */
    switch (pnga_type_c2f(type)){
      case MT_F_DBL:   base =  (char *) DBL_MB; break;
      case MT_F_INT:   base =  (char *) INT_MB; break;
      case MT_F_DCPL:  base =  (char *) DCPL_MB; break;
      case MT_F_SCPL:  base =  (char *) SCPL_MB; break;
      case MT_F_REAL:  base =  (char *) FLT_MB; break;
      default:        base = (char*)0;
    }

    item_size = GAsizeofM(type);
#   ifdef GA_ELEM_PADDING
       /* pad by one element so the alignment shift applied below can never
        * push the usable region past the end of the allocation */
       bytes += (C_Long)item_size;
#   endif
#endif

    *adj = 0;

    /* use ARMCI_Malloc_group for groups if proc group is not world group
       or mirror group */
#ifdef MSG_COMMS_MPI
    if (grp_id > 0) {
       status = ARMCI_Malloc_group((void**)ptr_arr, (armci_size_t)bytes,
                                   &PGRP_LIST[grp_id].group);
    } else {
#endif
      status = ARMCI_Malloc((void**)ptr_arr, (armci_size_t)bytes);
#ifdef MSG_COMMS_MPI
    }
#endif

    if(bytes!=0 && ptr_arr[grp_me]==NULL)
       pnga_error("gai_get_shmem: ARMCI Malloc failed", GAme);
    if(status) return status;

#ifndef _CHECK_MA_ALGN

    /* adjust all addresses if they are not aligned on corresponding nodes */

    /* we need storage for GAnproc*sizeof(Integer); malloc/free a fresh
     * buffer so the shared _ga_map scratch area cannot be clobbered while
     * this call is still using it */
    adjust = (Integer*)malloc(GAnproc*sizeof(Integer));
    if (adjust == NULL)
       pnga_error("gai_get_shmem: malloc of adjust array failed", GAnproc);

    diff = (GA_ABS( base - (char *) ptr_arr[grp_me])) % item_size;
    for(i=0;i<nproc;i++)adjust[i]=0;
    adjust[grp_me] = (diff > 0) ? item_size - diff : 0;
    *adj = adjust[grp_me];

    /* sum-reduce so every process learns every other process's shift */
    if (grp_id > 0)
       pnga_pgroup_gop(grp_id, pnga_type_f2c(MT_F_INT), adjust, nproc, "+");
    else
       pnga_gop(pnga_type_f2c(MT_F_INT), adjust, nproc, "+");

    for(i=0;i<nproc;i++){
       ptr_arr[i] = adjust[i] + (char*)ptr_arr[i];
    }
    free(adjust);

#endif
    return status;
}
3389
int gai_uses_shm(int grp_id)
{
#ifdef MSG_COMMS_MPI
    /* process sub-groups must be queried through their ARMCI group object */
    if (grp_id > 0) {
        return ARMCI_Uses_shm_grp(&PGRP_LIST[grp_id].group);
    }
#endif
    return ARMCI_Uses_shm();
}
3398
int gai_getmem(char* name, char **ptr_arr, C_Long bytes, int type, long *id,
               int grp_id)
{
#ifdef AVOID_MA_STORAGE
   return gai_get_shmem(ptr_arr, bytes, type, id, grp_id);
#else
Integer handle = INVALID_MA_HANDLE, index;
Integer nproc=GAnproc, grp_me=GAme, item_size = GAsizeofM(type);
C_Long nelem;
char *ptr = (char*)0;

   if (grp_id > 0) {
      nproc  = PGRP_LIST[grp_id].map_nproc;
      grp_me = PGRP_LIST[grp_id].map_proc_list[GAme];
   }

   /* shared-memory configurations go straight to the ARMCI allocator;
    * otherwise carve the local piece out of MA storage */
   if(gai_uses_shm(grp_id)) return gai_get_shmem(ptr_arr, bytes, type, id, grp_id);
   else{
     nelem = bytes/((C_Long)item_size) + 1;
     if(bytes) {
        if(MA_alloc_get(type, nelem, name, &handle, &index)) {
           MA_get_pointer(handle, &ptr);
        }
     }
     /* the MA handle (or INVALID_MA_HANDLE) is returned through *id */
     *id   = (long)handle;

     /* clear the whole table, then publish only the local pointer; the
      * remote entries are filled in by the address exchange below
      * (memset replaces the legacy bzero call, removed in POSIX.1-2008) */
     memset(ptr_arr, 0, (size_t)nproc*sizeof(char*));
     ptr_arr[grp_me] = ptr;

#    ifndef _CHECK_MA_ALGN  /* align the local pointer on an element boundary */
     {
        long diff, adjust;
        diff = ((unsigned long)ptr_arr[grp_me]) % item_size;
        adjust = (diff > 0) ? item_size - diff : 0;
        ptr_arr[grp_me] = adjust + (char*)ptr_arr[grp_me];
     }
#    endif

#    ifdef MSG_COMMS_MPI
     if (grp_id > 0) {
        armci_exchange_address_grp((void**)ptr_arr,(int)nproc,
                                   &PGRP_LIST[grp_id].group);
     } else
#    endif
       armci_exchange_address((void**)ptr_arr,(int)nproc);

     /* non-zero request that yielded no MA pointer is a failure */
     if(bytes && !ptr) return 1;
     else return 0;
   }
#endif /* AVOID_MA_STORAGE */
}
3452
/*\ get device memory aligned w.r.t. MA base
 *  required on Linux as g77 ignores natural data alignment in common blocks
\*/
int gai_get_devmem(char *name, char **ptr_arr, C_Long bytes, int type, long *adj,
                   int grp_id, int dev_flag, const char *device)
{
    int status=0;
    /* grp_me indexes ptr_arr on every build variant, so it must be
     * declared outside the _CHECK_MA_ALGN guard (it previously lived
     * inside the guard, breaking the _CHECK_MA_ALGN build) */
    int grp_me = GAme;
#ifndef _CHECK_MA_ALGN
    char *base;
    long diff, item_size;
    Integer *adjust;
    int i, nproc;
#endif

    if (grp_id > 0) grp_me = PGRP_LIST[grp_id].map_proc_list[GAme];

#ifndef _CHECK_MA_ALGN
    nproc = (grp_id > 0) ? (int)PGRP_LIST[grp_id].map_nproc : (int)GAnproc;

    /* need to enforce proper, natural alignment (on size boundary) */
    switch (pnga_type_c2f(type)){
      case MT_F_DBL:   base =  (char *) DBL_MB; break;
      case MT_F_INT:   base =  (char *) INT_MB; break;
      case MT_F_DCPL:  base =  (char *) DCPL_MB; break;
      case MT_F_SCPL:  base =  (char *) SCPL_MB; break;
      case MT_F_REAL:  base =  (char *) FLT_MB; break;
      default:        base = (char*)0;
    }

    item_size = GAsizeofM(type);
#   ifdef GA_ELEM_PADDING
       /* one extra element so the alignment shift below stays in bounds */
       bytes += (C_Long)item_size;
#   endif
#endif

    *adj = 0;

    /* use ARMCI_Malloc_group for groups if proc group is not world group
       or mirror group; dev_flag selects device-resident memory */
#ifdef MSG_COMMS_MPI
    if (grp_id > 0) {
      if (dev_flag) {
        status = ARMCI_Malloc_group_memdev((void**)ptr_arr, (armci_size_t)bytes,
                                           &PGRP_LIST[grp_id].group, device);
      } else {
        status = ARMCI_Malloc_group((void**)ptr_arr, (armci_size_t)bytes,
                                    &PGRP_LIST[grp_id].group);
      }
    } else {
#endif
      if (dev_flag) {
        status = ARMCI_Malloc_memdev((void**)ptr_arr, (armci_size_t)bytes, device);
      } else {
        status = ARMCI_Malloc((void**)ptr_arr, (armci_size_t)bytes);
      }
#ifdef MSG_COMMS_MPI
    }
#endif

    /* report the failure under this function's own name
     * (the message previously said "gai_get_shmem") */
    if(bytes!=0 && ptr_arr[grp_me]==NULL)
      pnga_error("gai_get_devmem: ARMCI Malloc failed", GAme);
    if(status) return status;

#ifndef _CHECK_MA_ALGN

    /* adjust all addresses if they are not aligned on corresponding nodes */

    /* we need storage for GAnproc*sizeof(Integer); malloc/free a fresh
     * buffer so the shared _ga_map scratch area cannot be reused early */
    adjust = (Integer*)malloc(GAnproc*sizeof(Integer));
    if (adjust == NULL)
      pnga_error("gai_get_devmem: malloc of adjust array failed", GAnproc);

    diff = (GA_ABS( base - (char *) ptr_arr[grp_me])) % item_size;
    for(i=0;i<nproc;i++)adjust[i]=0;
    adjust[grp_me] = (diff > 0) ? item_size - diff : 0;
    *adj = adjust[grp_me];

    /* sum-reduce so every process learns every other process's shift */
    if (grp_id > 0)
      pnga_pgroup_gop(grp_id, pnga_type_f2c(MT_F_INT), adjust, nproc, "+");
    else
      pnga_gop(pnga_type_f2c(MT_F_INT), adjust, nproc, "+");

    for(i=0;i<nproc;i++){
      ptr_arr[i] = adjust[i] + (char*)ptr_arr[i];
    }
    free(adjust);

#endif
    return status;
}
3545
3546 /*\ externalized version of gai_getmem to facilitate two-step array creation
3547 \*/
GA_Getmem(int type,int nelem,int grp_id)3548 void *GA_Getmem(int type, int nelem, int grp_id)
3549 {
3550 char **ptr_arr=(char**)0;
3551 int rc,i;
3552 long id;
3553 int bytes;
3554 int extra=sizeof(getmem_t)+GAnproc*sizeof(char*);
3555 char *myptr;
3556 Integer status;
3557 type = pnga_type_f2c(type);
3558 bytes = nelem * GAsizeofM(type);
3559 if(GA_memory_limited){
3560 GA_total_memory -= bytes+extra;
3561 status = (GA_total_memory >= 0) ? 1 : 0;
3562 /* pnga_gop(pnga_type_f2c(MT_F_INT), &status, 1, "*"); */
3563 pnga_gop(pnga_type_f2c(MT_F_INT), &status, 1, "&&");
3564 if(!status)GA_total_memory +=bytes+extra;
3565 }else status = 1;
3566
3567 ptr_arr=malloc(GAnproc*sizeof(char**));
3568 rc= gai_getmem("ga_getmem", ptr_arr,(Integer)bytes+extra, type, &id, grp_id);
3569 if(rc)pnga_error("ga_getmem: failed to allocate memory",bytes+extra);
3570
3571 myptr = ptr_arr[GAme];
3572
3573 /* make sure that remote memory addresses point to user memory */
3574 for(i=0; i<GAnproc; i++)ptr_arr[i] += extra;
3575
3576 #ifndef AVOID_MA_STORAGE
3577 if(ARMCI_Uses_shm())
3578 #endif
3579 id += extra; /* id is used to store offset */
3580
3581 /* stuff the type and id info at the beginning */
3582 ((getmem_t*)myptr)->id = id;
3583 ((getmem_t*)myptr)->type = type;
3584 ((getmem_t*)myptr)->size = bytes+extra;
3585
3586 /* add ptr info */
3587 memcpy(myptr+sizeof(getmem_t),ptr_arr,(size_t)GAnproc*sizeof(char**));
3588 free(ptr_arr);
3589
3590 return (void*)(myptr+extra);
3591 }
3592
3593
void GA_Freemem(void *ptr)
{
    /* Release a region obtained from GA_Getmem. The user pointer sits
     * 'extra' bytes past the hidden getmem_t header and per-process
     * pointer table that GA_Getmem prepended, so step back to recover
     * the bookkeeping record. */
    int extra = sizeof(getmem_t)+GAnproc*sizeof(char*);
    getmem_t *info = (getmem_t *)((char*)ptr - extra);
    char **ptr_arr = (char**)(info+1);  /* pointer table follows the header */

#ifndef AVOID_MA_STORAGE
    if(ARMCI_Uses_shm()){
#endif
      /* make sure that we free original (before address alignment) pointer */
      /* in the shared-memory path info->id holds the byte offset applied
       * by GA_Getmem, not an MA handle */
      ARMCI_Free(ptr_arr[GAme] - info->id);
#ifndef AVOID_MA_STORAGE
    }else{
      /* memory came from MA; here info->id is the MA heap handle */
      if(info->id != INVALID_MA_HANDLE) MA_free_heap(info->id);
    }
#endif

    /* return the accounted bytes to the global memory budget */
    if(GA_memory_limited) GA_total_memory += info->size;
}
3613
3614 /**
3615 * Return coordinates of a GA patch associated with processor proc
3616 */
3617 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3618 # pragma weak wnga_distribution = pnga_distribution
3619 #endif
3620
void pnga_distribution(Integer g_a, Integer proc, Integer *lo, Integer * hi)
{
   Integer handle;
   Integer owner;

   ga_check_handleM(g_a, "nga_distribution");
   handle = GA_OFFSET + g_a;

   /* restricted arrays: translate the caller's rank to the rank that
    * actually holds data */
   owner = proc;
   if (GA[handle].num_rstrctd > 0) {
      owner = GA[handle].rank_rstrctd[owner];
   }

   /* This currently assumes that read-only property can only be applied to
    * processors on the world group */
   if (GA[handle].property == READ_ONLY) {
      Integer node = pnga_cluster_proc_nodeid(proc);
      Integer procs_on_node = pnga_cluster_nprocs(node);
      owner = proc % procs_on_node;
   }

   ga_ownsM(handle, owner, lo, hi);
}
3641
3642 /**
3643 * Check to see if array has ghost cells.
3644 */
3645 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3646 # pragma weak wnga_has_ghosts = pnga_has_ghosts
3647 #endif
3648
pnga_has_ghosts(Integer g_a)3649 logical pnga_has_ghosts(Integer g_a)
3650 {
3651 int h_a = (int)g_a + GA_OFFSET;
3652 return GA[h_a].ghosts;
3653 }
3654 /**
3655 * Return the dimension of a Global Array
3656 */
3657 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3658 # pragma weak wnga_ndim = pnga_ndim
3659 #endif
3660
Integer pnga_ndim(Integer g_a)
{
   /* validate the handle, then return the array's dimensionality */
   ga_check_handleM(g_a,"ga_ndim");
   return GA[GA_OFFSET + g_a].ndim;
}
3666
3667
3668
3669 /**
3670 * Duplicate an existing global array
3671 * -- new array g_b will have properties of g_a
3672 * array_name - a character string [input]
3673 * g_a - Integer handle for reference array [input]
3674 * g_b - Integer handle for new array [output]
3675 \*/
3676 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3677 # pragma weak wnga_duplicate = pnga_duplicate
3678 #endif
3679
pnga_duplicate(Integer g_a,Integer * g_b,char * array_name)3680 logical pnga_duplicate(Integer g_a, Integer *g_b, char* array_name)
3681 {
3682 char **save_ptr;
3683 C_Long mem_size, mem_size_proc;
3684 Integer i, ga_handle, status;
3685 int local_sync_begin,local_sync_end;
3686 Integer grp_id, grp_me=GAme;
3687 /* Integer grp_nproc=GAnproc; */
3688 int maplen;
3689
3690 local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
3691 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
3692 grp_id = pnga_get_pgroup(g_a);
3693 if(local_sync_begin)pnga_pgroup_sync(grp_id);
3694
3695 if (grp_id > 0) {
3696 /* grp_nproc = PGRP_LIST[grp_id].map_nproc; */
3697 grp_me = PGRP_LIST[grp_id].map_proc_list[GAme];
3698 }
3699
3700 GAstat.numcre ++;
3701
3702 ga_check_handleM(g_a,"ga_duplicate");
3703
3704 /* find a free global_array handle for g_b */
3705 ga_handle =-1; i=0;
3706 do{
3707 if(!GA[i].actv_handle) ga_handle=i;
3708 i++;
3709 }while(i<_max_global_array && ga_handle==-1);
3710 if( ga_handle == -1)
3711 pnga_error("ga_duplicate: too many arrays", (Integer)_max_global_array);
3712 *g_b = (Integer)ga_handle - GA_OFFSET;
3713 GA[ga_handle].actv_handle = 1;
3714
3715 gai_init_struct(ga_handle);
3716
3717 /*** copy content of the data structure ***/
3718 save_ptr = GA[ga_handle].ptr;
3719 GA[ga_handle] = GA[GA_OFFSET + g_a]; /* <--- shallow copy */
3720 strcpy(GA[ga_handle].name, array_name);
3721 GA[ga_handle].ptr = save_ptr;
3722 GA[ga_handle].distr_type = GA[GA_OFFSET + g_a].distr_type;
3723 maplen = calc_maplen(GA_OFFSET + g_a);
3724 if (maplen > 0) {
3725 GA[ga_handle].mapc = (C_Integer*)malloc((maplen+1)*sizeof(C_Integer*));
3726 for(i=0;i<maplen; i++)GA[ga_handle].mapc[i] = GA[GA_OFFSET+ g_a].mapc[i];
3727 GA[ga_handle].mapc[maplen] = -1;
3728 }
3729
3730 /*** if ghost cells are used, initialize ghost cache data ***/
3731 GA[ga_handle].cache = NULL;
3732 pnga_set_ghost_info(*g_b);
3733
3734 /*** initialize and copy info for restricted arrays, if relevant ***/
3735 GA[ga_handle].rstrctd_list = NULL;
3736 GA[ga_handle].rank_rstrctd = NULL;
3737 GA[ga_handle].num_rstrctd = 0;
3738 if (GA[GA_OFFSET + g_a].num_rstrctd > 0) {
3739 GA[ga_handle].num_rstrctd = GA[GA_OFFSET + g_a].num_rstrctd;
3740 pnga_set_restricted(*g_b, GA[GA_OFFSET + g_a].rstrctd_list,
3741 GA[GA_OFFSET + g_a].num_rstrctd);
3742 }
3743
3744 /*** Memory Allocation & Initialization of GA Addressing Space ***/
3745 mem_size = mem_size_proc = GA[ga_handle].size;
3746 GA[ga_handle].id = INVALID_MA_HANDLE;
3747 /* if requested, enforce limits on memory consumption */
3748 if(GA_memory_limited) GA_total_memory -= mem_size_proc;
3749
3750 /* check if everybody has enough memory left */
3751 if(GA_memory_limited){
3752 status = (GA_total_memory >= 0) ? 1 : 0;
3753 if (grp_id > 0) {
3754 /* pnga_pgroup_gop(grp_id, pnga_type_f2c(MT_F_INT), &status, 1, "*"); */
3755 pnga_pgroup_gop(grp_id, pnga_type_f2c(MT_F_INT), &status, 1, "&&");
3756 status = (Integer)status;
3757 } else {
3758 /* pnga_gop(pnga_type_f2c(MT_F_INT), &status, 1, "*"); */
3759 pnga_gop(pnga_type_f2c(MT_F_INT), &status, 1, "&&");
3760 }
3761 }else status = 1;
3762
3763 if(status)
3764 {
3765 if (GA[ga_handle].mem_dev_set) {
3766 status = !gai_get_devmem(array_name, GA[ga_handle].ptr,mem_size,
3767 (int)GA[ga_handle].type, &GA[ga_handle].id,
3768 (int)grp_id,GA[ga_handle].mem_dev_set,GA[ga_handle].mem_dev);
3769 } else {
3770 status = !gai_getmem(array_name, GA[ga_handle].ptr,mem_size,
3771 (int)GA[ga_handle].type, &GA[ga_handle].id,
3772 (int)grp_id);
3773 }
3774 }
3775 else{
3776 GA[ga_handle].ptr[grp_me]=NULL;
3777 }
3778
3779 if(local_sync_end)pnga_pgroup_sync(grp_id);
3780
3781 # ifdef GA_CREATE_INDEF
3782 /* This code is incorrect. It needs to fixed if INDEF is ever used */
3783 if(status){
3784 Integer one = 1;
3785 Integer dim1 =(Integer)GA[ga_handle].dims[1], dim2=(Integer)GA[ga_handle].dims[2];
3786 if(GAme==0)fprintf(stderr,"duplicate:initializing GA array%ld\n",g_b);
3787 if(GA[ga_handle].type == C_DBL) {
3788 double bad = (double) DBL_MAX;
3789 ga_fill_patch_(g_b, &one, &dim1, &one, &dim2, &bad);
3790 } else if (GA[ga_handle].type == C_INT) {
3791 int bad = (int) INT_MAX;
3792 ga_fill_patch_(g_b, &one, &dim1, &one, &dim2, &bad);
3793 } else if (GA[ga_handle].type == C_LONG) {
3794 long bad = LONG_MAX;
3795 ga_fill_patch_(g_b, &one, &dim1, &one, &dim2, &bad);
3796 } else if (GA[ga_handle].type == C_LONGLONG) {
3797 long long bad = LONG_MAX;
3798 ga_fill_patch_(g_b, &one, &dim1, &one, &dim2, &bad);
3799 } else if (GA[ga_handle].type == C_DCPL) {
3800 DoubleComplex bad = {DBL_MAX, DBL_MAX};
3801 ga_fill_patch_(g_b, &one, &dim1, &one, &dim2, &bad);
3802 } else if (GA[ga_handle].type == C_SCPL) {
3803 SingleComplex bad = {FLT_MAX, FLT_MAX};
3804 ga_fill_patch_(g_b, &one, &dim1, &one, &dim2, &bad);
3805 } else if (GA[ga_handle].type == C_FLOAT) {
3806 float bad = FLT_MAX;
3807 ga_fill_patch_(g_b, &one, &dim1, &one, &dim2, &bad);
3808 } else {
3809 pnga_error("ga_duplicate: type not supported ",GA[ga_handle].type);
3810 }
3811 }
3812 # endif
3813
3814 if(status){
3815 GAstat.curmem += (long)GA[ga_handle].size;
3816 GAstat.maxmem = (long)GA_MAX(GAstat.maxmem, GAstat.curmem);
3817 return(TRUE);
3818 }else{
3819 if (GA_memory_limited) GA_total_memory += mem_size_proc;
3820 pnga_destroy(*g_b);
3821 return(FALSE);
3822 }
3823 }
3824
3825 /*\ DUPLICATE A GLOBAL ARRAY -- memory comes from user
3826 * -- new array g_b will have properties of g_a
3827 \*/
GA_Assemble_duplicate(int g_a,char * array_name,void * ptr)3828 int GA_Assemble_duplicate(int g_a, char* array_name, void* ptr)
3829 {
3830 char **save_ptr;
3831 int i, ga_handle;
3832 int extra = sizeof(getmem_t)+GAnproc*sizeof(char*);
3833 getmem_t *info = (getmem_t *)((char*)ptr - extra);
3834 char **ptr_arr = (char**)(info+1);
3835 int g_b;
3836 int maplen = calc_maplen(GA_OFFSET + g_a);
3837
3838
3839 pnga_sync();
3840
3841 GAstat.numcre ++;
3842
3843 ga_check_handleM(g_a,"ga_assemble_duplicate");
3844
3845 /* find a free global_array handle for g_b */
3846 ga_handle =-1; i=0;
3847 do{
3848 if(!GA[i].actv_handle) ga_handle=i;
3849 i++;
3850 }while(i<_max_global_array && ga_handle==-1);
3851 if( ga_handle == -1)
3852 pnga_error("ga_assemble_duplicate: too many arrays ",
3853 (Integer)_max_global_array);
3854 g_b = ga_handle - GA_OFFSET;
3855
3856 gai_init_struct(ga_handle);
3857 GA[ga_handle].actv_handle = 1;
3858
3859 /*** copy content of the data structure ***/
3860 save_ptr = GA[ga_handle].ptr;
3861 GA[ga_handle] = GA[GA_OFFSET + g_a];
3862 strcpy(GA[ga_handle].name, array_name);
3863 GA[ga_handle].ptr = save_ptr;
3864 if (maplen > 0) {
3865 GA[ga_handle].mapc = (C_Integer*)malloc((maplen+1)*sizeof(C_Integer*));
3866 for(i=0;i<maplen; i++)GA[ga_handle].mapc[i] = GA[GA_OFFSET+ g_a].mapc[i];
3867 GA[ga_handle].mapc[maplen] = -1;
3868 }
3869
3870 /* get ptrs and datatype from user memory */
3871 gam_checktype(pnga_type_f2c(info->type));
3872 GA[ga_handle].type = pnga_type_f2c(info->type);
3873 GA[ga_handle].size = (C_Long)info->size;
3874 GA[ga_handle].id = info->id;
3875 memcpy(GA[ga_handle].ptr,ptr_arr,(size_t)GAnproc*sizeof(char**));
3876
3877 GAstat.curmem += (long)GA[ga_handle].size;
3878 GAstat.maxmem = (long)GA_MAX(GAstat.maxmem, GAstat.curmem);
3879
3880 pnga_sync();
3881
3882 return(g_b);
3883 }
3884
3885 /**
3886 * Destroy a Global Array and clean up memory
3887 */
3888 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
3889 # pragma weak wnga_destroy = pnga_destroy
3890 #endif
3891
pnga_destroy(Integer g_a)3892 logical pnga_destroy(Integer g_a)
3893 {
3894 Integer ga_handle = GA_OFFSET + g_a, grp_id, grp_me=GAme;
3895 int local_sync_begin,local_sync_end;
3896
3897 local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
3898 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
3899 grp_id = (Integer)GA[ga_handle].p_handle;
3900 if(local_sync_begin)pnga_pgroup_sync(grp_id);
3901
3902 if (grp_id > 0) grp_me = PGRP_LIST[grp_id].map_proc_list[GAme];
3903 else grp_me=GAme;
3904
3905 GAstat.numdes ++; /*regardless of array status we count this call */
3906 /* fails if handle is out of range or array not active */
3907 if(ga_handle < 0 || ga_handle >= _max_global_array){
3908 return FALSE;
3909 }
3910 if(GA[ga_handle].actv==0){
3911 return FALSE;
3912 }
3913 if (GA[ga_handle].cache)
3914 free(GA[ga_handle].cache);
3915 GA[ga_handle].cache = NULL;
3916 GA[ga_handle].actv = 0;
3917 GA[ga_handle].actv_handle = 0;
3918 GA[ga_handle].mem_dev_set = 0;
3919
3920 if (GA[ga_handle].num_rstrctd > 0) {
3921 GA[ga_handle].num_rstrctd = 0;
3922 if (GA[ga_handle].rstrctd_list)
3923 free(GA[ga_handle].rstrctd_list);
3924 GA[ga_handle].rstrctd_list = NULL;
3925
3926 if (GA[ga_handle].rank_rstrctd)
3927 free(GA[ga_handle].rank_rstrctd);
3928 GA[ga_handle].rank_rstrctd = NULL;
3929 }
3930
3931 if(GA[ga_handle].mapc != NULL){
3932 free(GA[ga_handle].mapc);
3933 GA[ga_handle].mapc = NULL;
3934 }
3935
3936 if (GA[ga_handle].property == READ_CACHE) {
3937 if (GA[ga_handle].cache_head != NULL) {
3938 cache_struct_t *next;
3939 next = GA[ga_handle].cache_head->next;
3940 if (GA[ga_handle].cache_head->cache_buf)
3941 free(GA[ga_handle].cache_head->cache_buf);
3942 free(GA[ga_handle].cache_head);
3943 while (next) {
3944 GA[ga_handle].cache_head = next;
3945 next = next->next;
3946 if (GA[ga_handle].cache_head->cache_buf)
3947 free(GA[ga_handle].cache_head->cache_buf);
3948 free(GA[ga_handle].cache_head);
3949 }
3950 }
3951 }
3952 GA[ga_handle].cache_head = NULL;
3953
3954 if (GA[ga_handle].property == READ_ONLY) {
3955 free(GA[ga_handle].old_mapc);
3956 pnga_pgroup_destroy(GA[ga_handle].p_handle);
3957 }
3958
3959 if(GA[ga_handle].ptr[grp_me]==NULL){
3960 return TRUE;
3961 }
3962 if (!GA[ga_handle].overlay) {
3963 #ifndef AVOID_MA_STORAGE
3964 if(gai_uses_shm((int)grp_id)){
3965 #endif
3966 /* make sure that we free original (before address allignment) pointer */
3967 #ifdef MSG_COMMS_MPI
3968 if (grp_id > 0){
3969 ARMCI_Free_group(GA[ga_handle].ptr[grp_me] - GA[ga_handle].id,
3970 &PGRP_LIST[grp_id].group);
3971 }
3972 else
3973 #endif
3974 if (GA[ga_handle].mem_dev_set) {
3975 ARMCI_Free_memdev(GA[ga_handle].ptr[GAme]-GA[ga_handle].id);
3976 } else {
3977 ARMCI_Free(GA[ga_handle].ptr[GAme] - GA[ga_handle].id);
3978 }
3979 #ifndef AVOID_MA_STORAGE
3980 }else{
3981 if(GA[ga_handle].id != INVALID_MA_HANDLE) MA_free_heap(GA[ga_handle].id);
3982 }
3983 #endif
3984 if(GA_memory_limited) GA_total_memory += GA[ga_handle].size;
3985 GAstat.curmem -= GA[ga_handle].size;
3986 } else {
3987 GA[ga_handle].overlay = 0;
3988 }
3989
3990
3991 if(local_sync_end)pnga_pgroup_sync(grp_id);
3992 return(TRUE);
3993 }
3994
3995
3996
3997 /**
3998 * Terminate Global Array structures
3999 *
 * All GA arrays are destroyed and shared memory is deallocated.
4001 * GA routines (except for ga_initialize) should not be called thereafter
4002 */
4003 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4004 # pragma weak wnga_terminate = pnga_terminate
4005 #endif
4006
pnga_terminate()4007 void pnga_terminate()
4008 {
4009 //GA_Internal_Threadsafe_Lock();
4010 Integer i, handle;
4011
4012 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
4013 if(!GAinitialized)
4014 {
4015 //GA_Internal_Threadsafe_Unlock();
4016 return;
4017 }
4018
4019 #ifdef PROFILE_OLD
4020 ga_profile_terminate();
4021 #endif
4022 for (i=0;i<_max_global_array;i++){
4023 handle = i - GA_OFFSET ;
4024 if(GA[i].actv) pnga_destroy(handle);
4025 if(GA[i].ptr) free(GA[i].ptr);
4026 if(GA[i].mapc) free(GA[i].mapc);
4027 }
4028 /* don't free groups list until all arrays destroyed */
4029 for (i=0;i<_max_global_array;i++){
4030 if(PGRP_LIST[i].actv) free(PGRP_LIST[i].map_proc_list);
4031 }
4032 pnga_sync();
4033
4034 GA_total_memory = -1; /* restore "unlimited" memory usage status */
4035 GA_memory_limited = 0;
4036 gai_finalize_onesided();
4037 free(mapALL);
4038 free(_ga_main_data_structure);
4039 free(_proc_list_main_data_structure);
4040 ARMCI_Free(GA_Update_Flags[GAme]);
4041 free(GA_Update_Flags);
4042 ARMCI_Free_local(GA_Update_Signal);
4043
4044 pnga_sync();
4045 ARMCI_Finalize();
4046 ARMCIinitialized = 0;
4047 GAinitialized = 0;
4048 //GA_Internal_Threadsafe_Unlock();
4049 }
4050
4051
4052 /**
4053 * Is array active or inactive
4054 */
4055 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4056 # pragma weak wnga_verify_handle = pnga_verify_handle
4057 #endif
4058
/**
 * Report whether g_a refers to an active array.
 * @return 1 if the handle is in range and the array is active, else 0
 */
Integer pnga_verify_handle(Integer g_a)
{
    Integer h = g_a + GA_OFFSET;
    if (h < 0 || h >= _max_global_array) {
        return (Integer)0;
    }
    return (Integer)(GA[h].actv != 0);
}
4065
4066
4067
4068 /**
4069 * Fill array with random values in [0,val)
4070 */
4071 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4072 # pragma weak wnga_randomize = pnga_randomize
4073 #endif
4074
/**
 * Fill the local portion of g_a with pseudo-random values drawn from
 * [0, *val), where val points to a scalar of the array's type.
 * Supported types: C_INT, C_LONG, C_LONGLONG, C_FLOAT, C_DBL.
 * Collective on the array's process group.
 *
 * @param g_a global array handle
 * @param val pointer to the exclusive upper bound (array's type)
 */
void pnga_randomize(Integer g_a, void* val)
{
  C_Long i;  /* was int: truncated/overflowed for > 2^31 local elements */
  int handle=GA_OFFSET + (int)g_a;
  char *ptr;
  int local_sync_begin,local_sync_end;
  C_Long elems;
  Integer grp_id;
  Integer num_blocks;
  double scale;

  local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
  _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous sync masking*/
  grp_id = pnga_get_pgroup(g_a);
  if(local_sync_begin)pnga_pgroup_sync(grp_id);

  ga_check_handleM(g_a, "ga_randomize");
  gam_checktype(GA[handle].type);
  elems = GA[handle].size/((C_Long)GA[handle].elemsize);
  num_blocks = GA[handle].block_total;

  /* NOTE: the integer cases below previously computed val*rand()/RAND_MAX
   * in integer arithmetic, which (a) overflows (undefined behavior) for
   * all but tiny bounds and (b) could yield *val itself.  All cases now
   * scale by rand()/(RAND_MAX+1.0), a double in [0,1), keeping results
   * strictly below *val. */

  if (num_blocks < 0) {
    /* regular distribution: operate directly on the local data block */
    if (grp_id >= 0){
      Integer grp_me = PGRP_LIST[GA[handle].p_handle].map_proc_list[GAme];
      ptr = GA[handle].ptr[grp_me];
    }
    else ptr = GA[handle].ptr[GAme];

    switch (GA[handle].type){
      case C_DBL:
        for(i=0; i<elems;i++){
          scale = ((double)rand())/((double)RAND_MAX+1.0);
          ((double*)ptr)[i] = *(double*)val * scale;
        }
        break;
      case C_INT:
        for(i=0; i<elems;i++){
          scale = ((double)rand())/((double)RAND_MAX+1.0);
          ((int*)ptr)[i] = (int)(*(int*)val * scale);
        }
        break;
      case C_FLOAT:
        for(i=0; i<elems;i++){
          scale = ((double)rand())/((double)RAND_MAX+1.0);
          ((float*)ptr)[i] = (float)(*(float*)val * scale);
        }
        break;
      case C_LONG:
        for(i=0; i<elems;i++){
          scale = ((double)rand())/((double)RAND_MAX+1.0);
          ((long*)ptr)[i] = (long)(*(long*)val * scale);
        }
        break;
      case C_LONGLONG:
        for(i=0; i<elems;i++){
          scale = ((double)rand())/((double)RAND_MAX+1.0);
          ((long long*)ptr)[i] = (long long)(*(long long*)val * scale);
        }
        break;
      default:
        pnga_error("type not supported",GA[handle].type);
    }
  } else {
    /* block-cyclic distribution: go through the block-segment accessor */
    Integer I_elems = (Integer)elems;
    pnga_access_block_segment_ptr(g_a,GAme,&ptr,&I_elems);
    elems = (C_Long)I_elems;
    switch (GA[handle].type){
      case C_DBL:
        for(i=0; i<elems;i++){
          scale = ((double)rand())/((double)RAND_MAX+1.0);
          ((double*)ptr)[i] = *(double*)val * scale;
        }
        break;
      case C_INT:
        for(i=0; i<elems;i++){
          scale = ((double)rand())/((double)RAND_MAX+1.0);
          ((int*)ptr)[i] = (int)(*(int*)val * scale);
        }
        break;
      case C_FLOAT:
        for(i=0; i<elems;i++){
          scale = ((double)rand())/((double)RAND_MAX+1.0);
          ((float*)ptr)[i] = (float)(*(float*)val * scale);
        }
        break;
      case C_LONG:
        for(i=0; i<elems;i++){
          scale = ((double)rand())/((double)RAND_MAX+1.0);
          ((long*)ptr)[i] = (long)(*(long*)val * scale);
        }
        break;
      case C_LONGLONG:
        for(i=0; i<elems;i++){
          scale = ((double)rand())/((double)RAND_MAX+1.0);
          ((long long*)ptr)[i] = (long long)(*(long long*)val * scale);
        }
        break;
      default:
        pnga_error("type not supported",GA[handle].type);
    }
    pnga_release_block_segment(g_a,GAme);
  }

  if(local_sync_end)pnga_pgroup_sync(grp_id);

}
4168
4169 /**
4170 * Fill array with value
4171 */
4172 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4173 # pragma weak wnga_fill = pnga_fill
4174 #endif
4175
/**
 * Assign the value *val (interpreted as the array's type) to every
 * locally-held element of g_a.  Collective on the array's process group.
 *
 * @param g_a global array handle
 * @param val pointer to the fill value, of the array's type
 */
void pnga_fill(Integer g_a, void* val)
{
  C_Long i;  /* was int: truncated/overflowed for > 2^31 local elements */
  int handle=GA_OFFSET + (int)g_a;
  char *ptr;
  int local_sync_begin,local_sync_end;
  C_Long elems;
  Integer grp_id;
  Integer num_blocks;


  local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
  _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous sync masking*/
  grp_id = pnga_get_pgroup(g_a);
  if(local_sync_begin)pnga_pgroup_sync(grp_id);


  ga_check_handleM(g_a, "ga_fill");
  gam_checktype(GA[handle].type);
  elems = GA[handle].size/((C_Long)GA[handle].elemsize);
  num_blocks = GA[handle].block_total;

  if (num_blocks < 0) {
    /* regular distribution: fill the local data block directly */
    if (grp_id >= 0){
      Integer grp_me = PGRP_LIST[GA[handle].p_handle].map_proc_list[GAme];
      ptr = GA[handle].ptr[grp_me];
    }
    else ptr = GA[handle].ptr[GAme];

    switch (GA[handle].type){
      case C_DCPL:
        for(i=0; i<elems;i++)((DoubleComplex*)ptr)[i]=*(DoubleComplex*)val;
        break;
      case C_SCPL:
        for(i=0; i<elems;i++)((SingleComplex*)ptr)[i]=*(SingleComplex*)val;
        break;
      case C_DBL:
        for(i=0; i<elems;i++)((double*)ptr)[i]=*(double*)val;
        break;
      case C_INT:
        for(i=0; i<elems;i++)((int*)ptr)[i]=*(int*)val;
        break;
      case C_FLOAT:
        for(i=0; i<elems;i++)((float*)ptr)[i]=*(float*)val;
        break;
      case C_LONG:
        for(i=0; i<elems;i++)((long*)ptr)[i]=*(long*)val;
        break;
      case C_LONGLONG:
        for(i=0; i<elems;i++)((long long*)ptr)[i]=*( long long*)val;
        break;
      default:
        pnga_error("type not supported",GA[handle].type);
    }
  } else {
    /* block-cyclic distribution: go through the block-segment accessor */
    Integer I_elems = (Integer)elems;
    pnga_access_block_segment_ptr(g_a,GAme,&ptr,&I_elems);
    elems = (C_Long)I_elems;
    switch (GA[handle].type){
      case C_DCPL:
        for(i=0; i<elems;i++)((DoubleComplex*)ptr)[i]=*(DoubleComplex*)val;
        break;
      case C_SCPL:
        for(i=0; i<elems;i++)((SingleComplex*)ptr)[i]=*(SingleComplex*)val;
        break;
      case C_DBL:
        for(i=0; i<elems;i++)((double*)ptr)[i]=*(double*)val;
        break;
      case C_INT:
        for(i=0; i<elems;i++)((int*)ptr)[i]=*(int*)val;
        break;
      case C_FLOAT:
        for(i=0; i<elems;i++)((float*)ptr)[i]=*(float*)val;
        break;
      case C_LONG:
        for(i=0; i<elems;i++)((long*)ptr)[i]=*(long*)val;
        break;
      case C_LONGLONG:
        for(i=0; i<elems;i++)((long long*)ptr)[i]=*(long long*)val;
        break;
      default:
        pnga_error("type not supported",GA[handle].type);
    }
    pnga_release_block_segment(g_a,GAme);
  }

  if(local_sync_end)pnga_pgroup_sync(grp_id);

}
4265
4266 /**
4267 * Get properties of Global Array. Note that type variable will be
4268 * using C conventions
4269 */
4270 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4271 # pragma weak wnga_inquire = pnga_inquire
4272 #endif
4273
/**
 * Return the C-convention type, dimensionality, and per-dimension extents
 * of a global array.
 *
 * @param g_a  [input]  global array handle
 * @param type [output] element type (C convention)
 * @param ndim [output] number of dimensions
 * @param dims [output] extent of each dimension (ndim entries)
 */
void pnga_inquire(Integer g_a, Integer *type, Integer *ndim, Integer *dims)
{
    Integer h = GA_OFFSET + g_a;
    Integer d;

    ga_check_handleM(g_a, "nga_inquire");
    *type = GA[h].type;
    *ndim = GA[h].ndim;
    for (d = 0; d < GA[h].ndim; d++) {
        dims[d] = (Integer)GA[h].dims[d];
    }
}
4282
4283 /**
4284 * Get type of Global Array. Note that type variable will be
4285 * using C conventions
4286 */
4287 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4288 # pragma weak wnga_inquire_type = pnga_inquire_type
4289 #endif
4290
/**
 * Return the element type of a global array (C conventions).
 *
 * @param g_a  [input]  global array handle
 * @param type [output] element type
 */
void pnga_inquire_type(Integer g_a, Integer *type)
{
    Integer handle = GA_OFFSET + g_a;
    /* label fixed: previously reported errors as "nga_inquire" (copy-paste) */
    ga_check_handleM(g_a, "nga_inquire_type");
    *type = GA[handle].type;
}
4297
4298 /**
4299 * Inquire name of Global Array
4300 */
4301 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4302 # pragma weak wnga_inquire_name = pnga_inquire_name
4303 #endif
4304
/**
 * Return a pointer to the array's name stored in the GA table.
 * The string is owned by GA; the caller must not free or modify it.
 */
void pnga_inquire_name(Integer g_a, char **array_name)
{
    ga_check_handleM(g_a, "ga_inquire_name");
    *array_name = GA[g_a + GA_OFFSET].name;
}
4310
4311 /**
4312 * Return processor coordinates in processor grid
4313 */
4314 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4315 # pragma weak wnga_proc_topology = pnga_proc_topology
4316 #endif
4317
/**
 * Convert a linear process id into coordinates in the processor grid
 * associated with g_a (mixed-radix decomposition over nblock[]).
 *
 * @param g_a       [input]  global array handle
 * @param proc      [input]  linear process id
 * @param subscript [output] process coordinates, one per dimension
 */
void pnga_proc_topology(Integer g_a, Integer proc, Integer* subscript)
{
    Integer dim, remainder, h = GA_OFFSET + g_a;

    ga_check_handleM(g_a, "nga_proc_topology");

    remainder = proc;
    for (dim = 0; dim < GA[h].ndim; dim++) {
        Integer nb = GA[h].nblock[dim];
        subscript[dim] = remainder % nb;
        remainder /= nb;
    }
}
4332
4333 /**
4334 * Return dimensions of processor grid associate with a Global Array
4335 */
4336 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4337 # pragma weak wnga_get_proc_grid = pnga_get_proc_grid
4338 #endif
4339
/**
 * Return the processor-grid dimensions associated with g_a.
 *
 * @param g_a  [input]  global array handle
 * @param dims [output] processes per dimension (ndim entries)
 */
void pnga_get_proc_grid(Integer g_a, Integer *dims)
{
    Integer d, h = GA_OFFSET + g_a;

    ga_check_handleM(g_a, "ga_get_proc_grid");
    for (d = 0; d < GA[h].ndim; d++) {
        dims[d] = GA[h].nblock[d];
    }
}
4349
#if 0
/* Disabled (dead code): maps a multi-dimensional block index to the id of
 * the process owning that block.
 * NOTE(review): in the proc-grid branch, proc_id[] is computed but never
 * used, and the index-folding order differs between the two branches --
 * verify correctness before re-enabling. */
static void gai_get_proc_from_block_index_(Integer g_a, Integer *index, Integer *proc)
{
  Integer ga_handle = GA_OFFSET + g_a;
  Integer ndim = GA[ga_handle].ndim;
  Integer i, ld;
  if (pnga_uses_proc_grid(g_a)) {
    /* array is distributed over an explicit processor grid */
    int *proc_grid = GA[ga_handle].nblock;
    Integer proc_id[MAXDIM];   /* computed but unused -- see NOTE above */
    for (i=0; i<ndim; i++) {
      proc_id[i] = index[i]%proc_grid[i];
    }
    /* fold the index coordinates into a single linear id */
    ld = 1;
    *proc = index[0];
    for (i=1; i<ndim; i++) {
      ld *= proc_grid[i];
      *proc *= ld;
      *proc += index[i];
    }
  } else {
    /* fold using per-dimension block counts, highest dimension first */
    C_Integer *block_grid = GA[ga_handle].num_blocks;
    *proc = index[ndim-1];
    ld = 1;
    for (i=ndim-2; i >= 0; i--) {
      ld *= block_grid[i];
      *proc *= ld;
      *proc += index[i];
    }
    /* wrap the folded block id onto the available processes */
    *proc = *proc%pnga_nnodes();
  }
}
#endif
4382
4383 /*\
4384 * RETURN HOW MANY PROCESSORS/OWNERS THERE ARE FOR THE SPECIFIED PATCH OF A
4385 * GLOBAL ARRAY
4386 \*/
4387 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4388 # pragma weak wnga_locate_nnodes = pnga_locate_nnodes
4389 #endif
pnga_locate_nnodes(Integer g_a,Integer * lo,Integer * hi,Integer * np)4390 logical pnga_locate_nnodes( Integer g_a,
4391 Integer *lo,
4392 Integer *hi,
4393 Integer *np)
4394 /* g_a [input] global array handle
4395 lo [input] lower indices of patch in global array
4396 hi [input] upper indices of patch in global array
4397 np [output] total number of processors containing a portion
4398 of the patch
4399
4400 For a block cyclic data distribution, this function returns a list of
4401 blocks that cover the region along with the lower and upper indices of
4402 each block.
4403 */
4404 {
4405 int procT[MAXDIM], procB[MAXDIM], proc_subscript[MAXDIM];
4406 Integer proc, i, ga_handle;
4407 Integer d, dpos, ndim, elems, use_blocks;
4408 /* Integer p_handle; */
4409
4410 ga_check_handleM(g_a, "nga_locate_nnodes");
4411
4412 ga_handle = GA_OFFSET + g_a;
4413 #ifdef __crayx1
4414 #pragma _CRI novector
4415 #endif
4416 for(d = 0; d< GA[ga_handle].ndim; d++)
4417 if((lo[d]<1 || hi[d]>GA[ga_handle].dims[d]) ||(lo[d]>hi[d]))return FALSE;
4418
4419 ndim = GA[ga_handle].ndim;
4420
4421 if (GA[ga_handle].distr_type == REGULAR) {
4422 /* find "processor coordinates" for the top left corner and store them
4423 * in ProcT */
4424 #ifdef __crayx1
4425 #pragma _CRI novector
4426 #endif
4427 for(d = 0, dpos = 0; d< GA[ga_handle].ndim; d++){
4428 findblock(GA[ga_handle].mapc + dpos, GA[ga_handle].nblock[d],
4429 GA[ga_handle].scale[d], lo[d], &procT[d]);
4430 dpos += GA[ga_handle].nblock[d];
4431 }
4432
4433 /* find "processor coordinates" for the right bottom corner and store
4434 * them in procB */
4435 #ifdef __crayx1
4436 #pragma _CRI novector
4437 #endif
4438 for(d = 0, dpos = 0; d< GA[ga_handle].ndim; d++){
4439 findblock(GA[ga_handle].mapc + dpos, GA[ga_handle].nblock[d],
4440 GA[ga_handle].scale[d], hi[d], &procB[d]);
4441 dpos += GA[ga_handle].nblock[d];
4442 }
4443
4444 *np = 0;
4445
4446 /* Find total number of processors containing data and return the
4447 * result in elems. Also find the lowest "processor coordinates" of the
4448 * processor block containing data and return these in proc_subscript.
4449 */
4450 ga_InitLoopM(&elems, ndim, proc_subscript, procT,procB,GA[ga_handle].nblock);
4451
4452 /* p_handle = (Integer)GA[ga_handle].p_handle; */
4453 for(i= 0; i< elems; i++){
4454 Integer _lo[MAXDIM], _hi[MAXDIM];
4455
4456 /* convert i to owner processor id using the current values in
4457 proc_subscript */
4458 ga_ComputeIndexM(&proc, ndim, proc_subscript, GA[ga_handle].nblock);
4459 /* get range of global array indices that are owned by owner */
4460 ga_ownsM(ga_handle, proc, _lo, _hi);
4461
4462 /* Update to proc_subscript so that it corresponds to the next
4463 * processor in the block of processors containing the patch */
4464 ga_UpdateSubscriptM(ndim,proc_subscript,procT,procB,GA[ga_handle].nblock);
4465 (*np)++;
4466 }
4467 } else {
4468 Integer nblocks = GA[ga_handle].block_total;
4469 Integer chk, j, tlo[MAXDIM], thi[MAXDIM], cnt;
4470 cnt = 0;
4471 for (i=0; i<nblocks; i++) {
4472 /* check to see if this block overlaps with requested block
4473 * defined by lo and hi */
4474 chk = 1;
4475 /* get limits on block i */
4476 pnga_distribution(g_a,i,tlo,thi);
4477 for (j=0; j<ndim && chk==1; j++) {
4478 /* check to see if at least one end point of the interval
4479 * represented by blo and bhi falls in the interval
4480 * represented by lo and hi */
4481 if (!((tlo[j] >= lo[j] && tlo[j] <= hi[j]) ||
4482 (thi[j] >= lo[j] && thi[j] <= hi[j]))) {
4483 chk = 0;
4484 }
4485 }
4486 /* store blocks that overlap request region in proclist */
4487 if (chk) {
4488 cnt++;
4489 }
4490 }
4491 *np = cnt;
4492 }
4493 return(TRUE);
4494 }
4495 #ifdef __crayx1
4496 #pragma _CRI inline nga_locate_nnodes_
4497 #endif
4498
4499
4500 /**
4501 * Locate individual patches and their owner of specified patch of a
4502 * Global Array
4503 */
4504 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4505 # pragma weak wnga_locate_region = pnga_locate_region
4506 #endif
4507
pnga_locate_region(Integer g_a,Integer * lo,Integer * hi,Integer * map,Integer * proclist,Integer * np)4508 logical pnga_locate_region( Integer g_a,
4509 Integer *lo,
4510 Integer *hi,
4511 Integer *map,
4512 Integer *proclist,
4513 Integer *np)
4514 /* g_a [input] global array handle
4515 lo [input] lower indices of patch in global array
4516 hi [input] upper indices of patch in global array
4517 map [output] list of lower and upper indices for portion of
4518 patch that exists on each processor containing a
4519 portion of the patch. The map is constructed so
4520 that for a D dimensional global array, the first
4521 D elements are the lower indices on the first
4522 processor in proclist, the next D elements are
4523 the upper indices of the first processor in
4524 proclist, the next D elements are the lower
4525 indices for the second processor in proclist, and
4526 so on.
4527 proclist [output] list of processors containing some portion of the
4528 patch
4529 np [output] total number of processors containing a portion
4530 of the patch
4531
4532 For a block cyclic data distribution, this function returns a list of
4533 blocks that cover the region along with the lower and upper indices of
4534 each block.
4535 */
4536 {
4537 int procT[MAXDIM], procB[MAXDIM], proc_subscript[MAXDIM];
4538 Integer proc, owner, i, ga_handle;
4539 Integer d, dpos, ndim, elems, use_blocks;
4540 /* Integer p_handle; */
4541
4542 ga_check_handleM(g_a, "nga_locate_region");
4543
4544 ga_handle = GA_OFFSET + g_a;
4545 #ifdef __crayx1
4546 #pragma _CRI novector
4547 #endif
4548 for(d = 0; d< GA[ga_handle].ndim; d++)
4549 if((lo[d]<1 || hi[d]>GA[ga_handle].dims[d]) ||(lo[d]>hi[d]))return FALSE;
4550
4551 ndim = GA[ga_handle].ndim;
4552
4553 if (GA[ga_handle].distr_type == REGULAR) {
4554 /* find "processor coordinates" for the top left corner and store them
4555 * in ProcT */
4556 #ifdef __crayx1
4557 #pragma _CRI novector
4558 #endif
4559 for(d = 0, dpos = 0; d< GA[ga_handle].ndim; d++){
4560 findblock(GA[ga_handle].mapc + dpos, GA[ga_handle].nblock[d],
4561 GA[ga_handle].scale[d], lo[d], &procT[d]);
4562 dpos += GA[ga_handle].nblock[d];
4563 }
4564
4565 /* find "processor coordinates" for the right bottom corner and store
4566 * them in procB */
4567 #ifdef __crayx1
4568 #pragma _CRI novector
4569 #endif
4570 for(d = 0, dpos = 0; d< GA[ga_handle].ndim; d++){
4571 findblock(GA[ga_handle].mapc + dpos, GA[ga_handle].nblock[d],
4572 GA[ga_handle].scale[d], hi[d], &procB[d]);
4573 dpos += GA[ga_handle].nblock[d];
4574 }
4575
4576 *np = 0;
4577
4578 /* Find total number of processors containing data and return the
4579 * result in elems. Also find the lowest "processor coordinates" of the
4580 * processor block containing data and return these in proc_subscript.
4581 */
4582 ga_InitLoopM(&elems, ndim, proc_subscript, procT,procB,GA[ga_handle].nblock);
4583
4584 /* p_handle = (Integer)GA[ga_handle].p_handle; */
4585 for(i= 0; i< elems; i++){
4586 Integer _lo[MAXDIM], _hi[MAXDIM];
4587 Integer offset;
4588
4589 /* convert i to owner processor id using the current values in
4590 proc_subscript */
4591 ga_ComputeIndexM(&proc, ndim, proc_subscript, GA[ga_handle].nblock);
4592 /* get range of global array indices that are owned by owner */
4593 ga_ownsM(ga_handle, proc, _lo, _hi);
4594
4595 offset = *np *(ndim*2); /* location in map to put patch range */
4596
4597 #ifdef __crayx1
4598 #pragma _CRI novector
4599 #endif
4600 for(d = 0; d< ndim; d++)
4601 map[d + offset ] = lo[d] < _lo[d] ? _lo[d] : lo[d];
4602 #ifdef __crayx1
4603 #pragma _CRI novector
4604 #endif
4605 for(d = 0; d< ndim; d++)
4606 map[ndim + d + offset ] = hi[d] > _hi[d] ? _hi[d] : hi[d];
4607
4608 owner = proc;
4609 if (GA[ga_handle].num_rstrctd == 0) {
4610 proclist[i] = owner;
4611 } else {
4612 proclist[i] = GA[ga_handle].rstrctd_list[owner];
4613 }
4614 /* Update to proc_subscript so that it corresponds to the next
4615 * processor in the block of processors containing the patch */
4616 ga_UpdateSubscriptM(ndim,proc_subscript,procT,procB,GA[ga_handle].nblock);
4617 (*np)++;
4618 }
4619 } else {
4620 Integer nblocks = GA[ga_handle].block_total;
4621 Integer chk, j, tlo[MAXDIM], thi[MAXDIM], cnt;
4622 Integer offset;
4623 cnt = 0;
4624 for (i=0; i<nblocks; i++) {
4625 /* check to see if this block overlaps with requested block
4626 * defined by lo and hi */
4627 chk = 1;
4628 /* get limits on block i */
4629 pnga_distribution(g_a,i,tlo,thi);
4630 for (j=0; j<ndim && chk==1; j++) {
4631 /* check to see if at least one end point of the interval
4632 * represented by blo and bhi falls in the interval
4633 * represented by lo and hi */
4634 if (!((tlo[j] >= lo[j] && tlo[j] <= hi[j]) ||
4635 (thi[j] >= lo[j] && thi[j] <= hi[j]))) {
4636 chk = 0;
4637 }
4638 }
4639 /* store blocks that overlap request region in proclist */
4640 if (chk) {
4641 proclist[cnt] = i;
4642 cnt++;
4643 }
4644 }
4645 *np = cnt;
4646
4647 /* fill map array with block coordinates */
4648 for (i=0; i<cnt; i++) {
4649 offset = i*2*ndim;
4650 j = proclist[i];
4651 pnga_distribution(g_a,j,tlo,thi);
4652 for (j=0; j<ndim; j++) {
4653 map[offset + j] = lo[j] < tlo[j] ? tlo[j] : lo[j];
4654 map[offset + ndim + j] = hi[j] > thi[j] ? thi[j] : hi[j];
4655 }
4656 }
4657 }
4658 return(TRUE);
4659 }
4660 #ifdef __crayx1
4661 #pragma _CRI inline pnga_locate_region
4662 #endif
4663
4664 /**
4665 * Returns the processor grid for the global array
4666 */
4667 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4668 # pragma weak wnga_nblock = pnga_nblock
4669 #endif
4670
/**
 * Return the per-dimension block counts for g_a: the processor grid for a
 * regular distribution, or the block grid for a block-cyclic one.
 *
 * @param g_a    [input]  global array handle
 * @param nblock [output] blocks per dimension (ndim entries)
 */
void pnga_nblock(Integer g_a, Integer *nblock)
{
    Integer h = GA_OFFSET + g_a;
    int d, ndims;

    ga_check_handleM(g_a, "ga_nblock");

    ndims = GA[h].ndim;
    if (GA[h].distr_type == REGULAR) {
        for (d = 0; d < ndims; d++) {
            nblock[d] = (Integer)GA[h].nblock[d];
        }
    } else {
        for (d = 0; d < ndims; d++) {
            nblock[d] = (Integer)GA[h].num_blocks[d];
        }
    }
}
4686
4687 /**
4688 * Returns a character string describing internal data distribution
4689 */
4690 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4691 # pragma weak wnga_get_distribution_type = pnga_get_distribution_type
4692 #endif
4693
/**
 * Copy a short string naming the internal data distribution of g_a into
 * the caller-supplied buffer (must hold at least 12 bytes).
 */
void pnga_get_distribution_type(Integer g_a, char *type)
{
    const char *label;

    switch (GA[GA_OFFSET + g_a].distr_type) {
        case REGULAR:      label = "regular";      break;
        case BLOCK_CYCLIC: label = "block_cyclic"; break;
        case SCALAPACK:    label = "scalapack";    break;
        case TILED:        label = "tiled";        break;
        case TILED_IRREG:  label = "tiled_irreg";  break;
        default:           label = "unknown";      break;
    }
    strcpy(type, label);
}
4712
4713 /**
4714 * Return ID of calling process in default group
4715 */
4716 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4717 # pragma weak wnga_nodeid = pnga_nodeid
4718 #endif
4719
pnga_nodeid()4720 Integer pnga_nodeid()
4721 {
4722 if (GA_Default_Proc_Group > 0) {
4723 return (Integer)PGRP_LIST[GA_Default_Proc_Group].map_proc_list[GAme];
4724 } else {
4725 return ((Integer)GAme);
4726 }
4727 }
4728
4729 /**
4730 * Return ID of calling process in group grp
4731 */
4732 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4733 # pragma weak wnga_pgroup_nodeid = pnga_pgroup_nodeid
4734 #endif
4735
/**
 * Return the id of the calling process within group grp; a negative grp
 * denotes the world group.
 */
Integer pnga_pgroup_nodeid(Integer grp)
{
    if (grp < 0) {
        return GAme;
    }
    return (Integer)PGRP_LIST[(int)grp].map_proc_list[GAme];
}
4744
4745 /**
4746 * Return number of nodes in default group
4747 */
4748 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4749 # pragma weak wnga_nnodes = pnga_nnodes
4750 #endif
4751
pnga_nnodes()4752 Integer pnga_nnodes()
4753 {
4754 if (GA_Default_Proc_Group > 0) {
4755 return (Integer)PGRP_LIST[GA_Default_Proc_Group].map_nproc;
4756 } else {
4757 return ((Integer)GAnproc);
4758 }
4759 }
4760
4761 /**
4762 * Return number of nodes in group grp
4763 */
4764 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4765 # pragma weak wnga_pgroup_nnodes = pnga_pgroup_nnodes
4766 #endif
4767
/**
 * Return the number of processes in group grp; a negative grp denotes
 * the world group.
 */
Integer pnga_pgroup_nnodes(Integer grp)
{
    if (grp < 0) {
        return (Integer)GAnproc;
    }
    return (Integer)PGRP_LIST[(int)grp].map_nproc;
}
4775
4776 /**
4777 * Compare distributions of two global arrays
4778 */
4779 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4780 # pragma weak wnga_compare_distr = pnga_compare_distr
4781 #endif
4782
/**
 * Compare the data distributions of two global arrays.  Returns TRUE only
 * if both arrays have the same dimensionality and extents, the same
 * distribution type, matching distribution parameters for that type, and
 * identical restricted-process lists.
 */
logical pnga_compare_distr(Integer g_a, Integer g_b)
{
int h_a =(int)g_a + GA_OFFSET;
int h_b =(int)g_b + GA_OFFSET;
int h_a_maplen = calc_maplen(h_a);
int h_b_maplen = calc_maplen(h_b);
int i;

   _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
   ga_check_handleM(g_a, "distribution a");
   ga_check_handleM(g_b, "distribution b");


   /* dimensionality and extents must agree */
   if(GA[h_a].ndim != GA[h_b].ndim) return FALSE;

   for(i=0; i <GA[h_a].ndim; i++)
     if(GA[h_a].dims[i] != GA[h_b].dims[i]) return FALSE;

   if (GA[h_a].distr_type != GA[h_b].distr_type) return FALSE;
   if (GA[h_a].distr_type == REGULAR) {
     /* regular distributions match when their map arrays (terminated
      * by -1) are element-wise identical */
     if (h_a_maplen != h_b_maplen) return FALSE;
     for(i=0; i <h_a_maplen; i++){
       if(GA[h_a].mapc[i] != GA[h_b].mapc[i]) return FALSE;
       if(GA[h_a].mapc[i] == -1) break;
     }
   } else if (GA[h_a].distr_type == BLOCK_CYCLIC ||
       GA[h_a].distr_type == SCALAPACK || GA[h_a].distr_type == TILED) {
     /* block-cyclic variants: block dimensions and block counts must match */
     for (i=0; i<GA[h_a].ndim; i++) {
       if (GA[h_a].block_dims[i] != GA[h_b].block_dims[i]) return FALSE;
     }
     for (i=0; i<GA[h_a].ndim; i++) {
       if (GA[h_a].num_blocks[i] != GA[h_b].num_blocks[i]) return FALSE;
     }
     if (GA[h_a].distr_type == SCALAPACK || GA[h_a].distr_type == TILED) {
       /* these also carry an explicit processor grid */
       for (i=0; i<GA[h_a].ndim; i++) {
         if (GA[h_a].nblock[i] != GA[h_b].nblock[i]) return FALSE;
       }
     }
   } else if (GA[h_a].distr_type == TILED_IRREG) {
     /* irregular tiling: compare map arrays and processor grids */
     if (h_a_maplen != h_b_maplen) return FALSE;
     for(i=0; i <h_a_maplen; i++){
       if(GA[h_a].mapc[i] != GA[h_b].mapc[i]) return FALSE;
       if(GA[h_a].mapc[i] == -1) break;
     }
     for (i=0; i<GA[h_a].ndim; i++) {
       if (GA[h_a].nblock[i] != GA[h_b].nblock[i]) return FALSE;
     }
   }
   /* restricted-process lists must agree element-wise */
   if (GA[h_a].num_rstrctd == GA[h_b].num_rstrctd) {
     if (GA[h_a].num_rstrctd > 0) {
       for (i=0; i<GA[h_a].num_rstrctd; i++) {
         if (GA[h_a].rstrctd_list[i] != GA[h_b].rstrctd_list[i]) return FALSE;
       }
     }
   } else {
     return FALSE;
   }
   return TRUE;
}
4842
4843
static int num_mutexes=0; /* total mutexes created by pnga_create_mutexes (0 = none) */
static int chunk_mutex;   /* mutexes hosted per process: ceil(num/GAnproc) */
4846 /**
4847 * Create a set of mutexes
4848 */
4849 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4850 # pragma weak wnga_create_mutexes = pnga_create_mutexes
4851 #endif
4852
pnga_create_mutexes(Integer num)4853 logical pnga_create_mutexes(Integer num)
4854 {
4855 int myshare;
4856
4857 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
4858 if (num <= 0 || num > MAX_MUTEXES) return(FALSE);
4859 if(num_mutexes) pnga_error("mutexes already created",num_mutexes);
4860
4861 num_mutexes= (int)num;
4862
4863 if(GAnproc == 1){
4864 return(TRUE);
4865 }
4866 chunk_mutex = (int)((num + GAnproc-1)/GAnproc);
4867 if(GAme * chunk_mutex >= num)myshare =0;
4868 else myshare=chunk_mutex;
4869
4870 /* need work here to use permutation */
4871 if(ARMCI_Create_mutexes(myshare)){
4872 return FALSE;
4873 }
4874 return TRUE;
4875 }
4876
4877
4878 /**
4879 * Lock an object defined by the mutex number
4880 */
4881 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4882 # pragma weak wnga_lock = pnga_lock
4883 #endif
4884
/**
 * Acquire the mutex identified by the global id 'mutex' (0-based).
 * Mutex k lives on process k/chunk_mutex as local mutex k%chunk_mutex.
 */
void pnga_lock(Integer mutex)
{
int m,p;

   if(GAnproc == 1) return;
   /* valid ids are 0..num_mutexes-1; the original test let
    * mutex == num_mutexes slip through */
   if(mutex < 0 || mutex >= num_mutexes)pnga_error("invalid mutex",mutex);

   /* map the global mutex id to (local index, owner process); the
    * original code derived both from num_mutexes instead of mutex,
    * so every id aliased the same ARMCI lock */
   p = (int)(mutex/chunk_mutex);
   m = (int)(mutex%chunk_mutex);

   ARMCI_Lock(m,p);
}
4897
4898 /**
4899 * Unlock a mutex
4900 */
4901 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4902 # pragma weak wnga_unlock = pnga_unlock
4903 #endif
4904
/**
 * Release the mutex identified by the global id 'mutex' (0-based), using
 * the same id-to-(local index, owner) mapping as pnga_lock.
 */
void pnga_unlock(Integer mutex)
{
int m,p;

   if(GAnproc == 1) return;
   /* valid ids are 0..num_mutexes-1; the original test let
    * mutex == num_mutexes slip through */
   if(mutex < 0 || mutex >= num_mutexes)pnga_error("invalid mutex",mutex);

   /* map the global mutex id to (local index, owner process); the
    * original code derived both from num_mutexes instead of mutex,
    * so every id aliased the same ARMCI lock */
   p = (int)(mutex/chunk_mutex);
   m = (int)(mutex%chunk_mutex);

   ARMCI_Unlock(m,p);
}
4917
4918
4919 /**
4920 * Destroy mutexes
4921 */
4922 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4923 # pragma weak wnga_destroy_mutexes = pnga_destroy_mutexes
4924 #endif
4925
pnga_destroy_mutexes()4926 logical pnga_destroy_mutexes()
4927 {
4928 _ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
4929 if(num_mutexes<1) pnga_error("mutexes destroyed",0);
4930
4931 num_mutexes= 0;
4932 if(GAnproc == 1){
4933 return TRUE;
4934 }
4935 if(ARMCI_Destroy_mutexes()){
4936 return FALSE;
4937 }
4938 return TRUE;
4939 }
4940
4941 /**
4942 * Return a list that maps GA process IDs to message-passing process IDs
4943 */
4944 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4945 # pragma weak wnga_list_nodeid = pnga_list_nodeid
4946 #endif
4947
void pnga_list_nodeid(Integer *list, Integer num_procs)
{
   /* the GA-to-message-passing process mapping is the identity here */
   Integer p = 0;
   while (p < num_procs) {
      list[p] = p;
      p++;
   }
}
4954
4955 /**
4956 * Returns true/false depending on validity of the handle
4957 */
4958 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4959 # pragma weak wnga_valid_handle = pnga_valid_handle
4960 #endif
4961
pnga_valid_handle(Integer g_a)4962 logical pnga_valid_handle(Integer g_a)
4963 {
4964 if(GA_OFFSET+ (g_a) < 0 || GA_OFFSET+(g_a) >= _max_global_array ||
4965 ! (GA[GA_OFFSET+(g_a)].actv) ) return FALSE;
4966 else return TRUE;
4967 }
4968
4969
4970 /**
4971 * A function that helps users avoid syncs inside a collective call
4972 * that they think are unnecessary
4973 *
4974 * Mask flags have to be reset in every collective call. Even if that
4975 * collective call doesnt do any sync at all.
4976 * If masking only the beginning sync is possible, make sure to
4977 * clear even the _sync_end mask to avoid a mask intended for this
4978 * collective_function_call to be carried to next collective_function_call
4979 * or to a collective function called by this function.
4980 * Similarly, make sure to use two copy mask values to local variables
4981 * and reset the global mask variables to avoid carring the mask to a
4982 * collective call inside the current collective call.
4983 */
4984 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
4985 # pragma weak wnga_mask_sync = pnga_mask_sync
4986 #endif
4987
void pnga_mask_sync(Integer begin, Integer end)
{
   /* normalize the caller's flags to 0/1 and install them as the sync
      masks consumed (and reset) by the next collective call */
   _ga_sync_begin = begin ? 1 : 0;
   _ga_sync_end   = end   ? 1 : 0;
}
4996
4997 /**
4998 * Merge all copies of a mirrored array by adding them together
4999 */
5000 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5001 # pragma weak wnga_merge_mirrored = pnga_merge_mirrored
5002 #endif
5003
/**
 * Merge all copies of a mirrored array by adding them together, so that
 * afterwards every node-local copy holds the global sum.  No-op when g_a
 * is not mirrored.
 *
 * g_a: handle of the (mirrored) global array
 */
void pnga_merge_mirrored(Integer g_a)
{
  Integer handle = GA_OFFSET + g_a;
  Integer inode, nprocs, nnodes, zero, zproc, nblocks;
  int *blocks;
  C_Integer *map, *dims, *width;
  Integer i, j, index[MAXDIM], itmp, ndim;
  Integer lo[MAXDIM], hi[MAXDIM], ld[MAXDIM];
  Integer nelem, count, type, atype=ARMCI_INT;
  char *zptr=NULL, *bptr=NULL, *nptr=NULL;
  Integer bytes, total;
  int local_sync_begin, local_sync_end;
  long bigint;     /* max element count per gop call (len argument is an int) */
  int chk = 1;
  void *ptr_a;

  /* copy and reset the sync masks so they are not carried into nested
     collective calls */
  local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
  _ga_sync_begin = 1; _ga_sync_end = 1; /*remove any previous masking */
  if (local_sync_begin) pnga_sync();
  /* don't perform update if node is not mirrored */
  if (!pnga_is_mirrored(g_a)) return;

  inode = pnga_cluster_nodeid();
  nnodes = pnga_cluster_nnodes();
  nprocs = pnga_cluster_nprocs(inode);
  zero = 0;

  /* zproc is the master process on this node; zptr is the base of the
     node-local copy of the array data */
  zproc = pnga_cluster_procid(inode, zero);
  zptr = GA[handle].ptr[zproc];
  map = GA[handle].mapc;
  blocks = GA[handle].nblock;
  dims = GA[handle].dims;
  width = GA[handle].width;
  type = GA[handle].type;
  ndim = GA[handle].ndim;
  /* INT_MAX divided by the element size: largest chunk one gop can carry */
  bigint = 2147483647L/GAsizeof(type);

#ifdef OPENIB
  /* Check whether or not all nodes contain the same number
     of processors. */
  if (nnodes*nprocs == pnga_nnodes()) {
    /* check to see if there is any buffer space between the data
       associated with each processor that needs to be zeroed out
       before performing the merge */
    if (zproc == GAme) {
      /* the use of nblocks instead of nprocs is designed to support a peculiar
         coding style in which the dimensions of the block array are all set to
         1 and all the data is restricted to the master processor on the node */
      nblocks = 1;
      for (i=0; i<ndim; i++) {
        nblocks *= blocks[i];
      }
      for (i=0; i<nblocks; i++) {
        /* Find out from mapc data how many elements are supposed to be located
           on this processor. Start by converting processor number to indices */
        itmp = i;
        for (j=0; j<ndim; j++) {
          index[j] = itmp%(Integer)blocks[j];
          itmp = (itmp - index[j])/(Integer)blocks[j];
        }

        /* element count for block i: product over dimensions of the block
           extent plus ghost-cell widths on both sides */
        nelem = 1;
        count = 0;
        for (j=0; j<ndim; j++) {
          if (index[j] < (Integer)blocks[j]-1) {
            nelem *= (Integer)(map[index[j]+1+count] - map[index[j]+count]
                   + 2*width[j]);
          } else {
            nelem *= (Integer)(dims[j] - map[index[j]+count] + 1 + 2*width[j]);
          }
          count += (Integer)blocks[j];
        }
        /* We now have the total number of elements located on this processor.
           Find out if the location of the end of this data set matches the
           origin of the data on the next processor. If not, then zero data in
           the gap. */
        nelem *= GAsizeof(type);
        bptr = GA[handle].ptr[pnga_cluster_procid(inode, i)];
        bptr += nelem;
        if (i<nblocks-1) {
          j = i+1;
          nptr = GA[handle].ptr[pnga_cluster_procid(inode, j)];
          if (bptr != nptr) {
            /* NOTE(review): pointer difference via long casts; bzero is a
               legacy BSD call (memset is the portable spelling) — confirm
               platform support */
            bytes = (long)nptr - (long)bptr;
            /* BJP printf("p[%d] Gap on proc %d is %d\n",GAme,i,bytes); */
            bzero(bptr, bytes);
          }
        }
      }
      /* find total number of bytes containing global array */
      total = (long)bptr - (long)zptr;
      total /= GAsizeof(type);
      /*convert from C data type to ARMCI type */
      /* NOTE(review): complex types are summed as their real base type, but
         "total" counts complex elements, not real components — looks like the
         gop may cover only half the complex data; verify against the ARMCI
         gop length convention */
      switch(type) {
        case C_FLOAT: atype=ARMCI_FLOAT; break;
        case C_DBL: atype=ARMCI_DOUBLE; break;
        case C_LONG: atype=ARMCI_LONG; break;
        case C_INT: atype=ARMCI_INT; break;
        case C_DCPL: atype=ARMCI_DOUBLE; break;
        case C_SCPL: atype=ARMCI_FLOAT; break;
        default: pnga_error("type not supported",type);
      }
      /* now that gap data has been zeroed, do a global sum on data */
      {
        /* sum the whole node image among node masters in chunks of at most
           bigint elements, since the gop length argument is an int */
        int i, len;   /* note: shadows the outer Integer i */
        int nsteps = (int) ceil(((double)total)/((double)bigint));
        long istart=0;
        /* printf("%ld total %ld bigint %ld nsteps %d \n",GAme,total,bigint,nsteps); */
        for (i=0; i < nsteps; i++){

          len=bigint;
          if (istart+len > total) len=((long)(total - istart));
          /* printf("%ld step %d of %d len= %d total=%ld istart= %ld\n",GAme,(i+1),nsteps,len,total,istart); */
          armci_msg_gop_scope(SCOPE_MASTERS, zptr+istart*GAsizeof(type), len, "+", atype);
          istart+=len;
        }
      }
    }
  } else {
    Integer _ga_tmp;
    Integer idims[MAXDIM], iwidth[MAXDIM], ichunk[MAXDIM];
    void *one = NULL;
    double d_one = 1.0;
    int i_one = 1;
    float f_one = 1.0;
    long l_one = 1;
    double c_one[2];
    float cf_one[2];
    c_one[0] = 1.0;
    c_one[1] = 0.0;

    cf_one[0] = 1.0;
    cf_one[1] = 0.0;

    /* choose one as scaling factor in accumulate */
    switch (type) {
      case C_FLOAT: one = &f_one; break;
      case C_DBL: one = &d_one; break;
      case C_LONG: one = &l_one; break;
      case C_INT: one = &i_one; break;
      case C_DCPL: one = &c_one; break;
      case C_SCPL: one = &cf_one; break;
      default: pnga_error("type not supported",type);
    }

    /* Nodes contain a mixed number of processors. Create a temporary GA to
       complete merge operation. */
    count = 0;
    for (i=0; i<ndim; i++) {
      idims[i] = (Integer)dims[i];
      iwidth[i] = (Integer)width[i];
      ichunk[i] = 0;
    }
    if (!pnga_create_ghosts(type, ndim, idims,
        iwidth, "temporary", ichunk, &_ga_tmp))
      pnga_error("Unable to create work array for merge",GAme);
    pnga_zero(_ga_tmp);
    /* Find data on this processor and accumulate in temporary global array */
    inode = GAme - zproc;    /* rank of this process within its node */
    pnga_distribution(g_a,inode,lo,hi);

    /* Check to make sure processor has data */
    chk = 1;
    for (i=0; i<ndim; i++) {
      if (hi[i]<lo[i]) {
        chk = 0;
      }
    }
    /* accumulate local mirrored patch into the distributed temporary:
       the sum over all mirrored copies forms there */
    if (chk) {
      pnga_access_ptr(g_a, lo, hi, &ptr_a, ld);
      pnga_acc(_ga_tmp, lo, hi, ptr_a, ld, one);
    }
    /* copy data back to original global array */
    pnga_sync();
    if (chk) {
      pnga_get(_ga_tmp, lo, hi, ptr_a, ld);
    }
    pnga_destroy(_ga_tmp);
  }
#else
  /* non-OPENIB path: sum the local patch directly over the mirror group */
  inode = GAme - zproc;    /* rank of this process within its node */
  pnga_distribution(g_a,inode,lo,hi);
  /* Check to make sure processor has data */
  chk = 1;
  nelem = 1;
  for (i=0; i<ndim; i++) {
    nelem *= (hi[i]-lo[i]+1);
    if (hi[i]<lo[i]) {
      chk = 0;
    }
  }
  if (chk) {
    pnga_access_ptr(g_a, lo, hi, &ptr_a, ld);
    pnga_pgroup_gop(_mirror_gop_grp,type,ptr_a,nelem,"+");
  }
#endif
  if (local_sync_end) pnga_sync();
}
5202
5203 /**
5204 * Merge all copies of a patch of a mirrored array into a patch in a
5205 * distributed array
5206 */
5207 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5208 # pragma weak wnga_merge_distr_patch = pnga_merge_distr_patch
5209 #endif
5210
/**
 * Merge all copies of a patch of a mirrored array into a patch of a
 * distributed array: the destination patch is zeroed, then every process
 * accumulates its locally held piece of the mirrored patch, so the
 * distributed patch ends up holding the sum over all mirrored copies.
 *
 * g_a:      handle to mirrored array
 * alo, ahi: indices of lower/upper corner of mirrored array patch
 * g_b:      handle to distributed array
 * blo, bhi: indices of lower/upper corner of distributed array patch
 */
void pnga_merge_distr_patch(Integer g_a, Integer *alo, Integer *ahi,
                            Integer g_b, Integer *blo, Integer *bhi)
{
  Integer local_sync_begin, local_sync_end;
  Integer a_handle, b_handle, adim, bdim;
  Integer mlo[MAXDIM], mhi[MAXDIM], mld[MAXDIM];
  Integer dlo[MAXDIM], dhi[MAXDIM];
  char trans[2];
  double d_one;
  Integer type, i_one;
  double z_one[2];
  float c_one[2];
  float f_one;
  long l_one;
  void *src_data_ptr;
  void *one = NULL;
  Integer i, idim, intersect, p_handle;

  /* copy and reset sync masks so they are not carried into nested calls */
  local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
  _ga_sync_begin = 1; _ga_sync_end = 1; /*remove any previous masking */
  if (local_sync_begin) pnga_sync();
  pnga_check_handle(g_a, "nga_merge_distr_patch");
  pnga_check_handle(g_b, "nga_merge_distr_patch");

  /* check to make sure that both patches lie within global arrays and
     that patches are the same dimensions */
  a_handle = GA_OFFSET + g_a;
  b_handle = GA_OFFSET + g_b;

  if (!pnga_is_mirrored(g_a)) {
    if (pnga_cluster_nnodes() > 1) {
      pnga_error("Handle to a non-mirrored array passed",0);
    } else {
      /* on a single node a mirrored array degenerates to an ordinary
         array, so the merge reduces to a straight patch copy */
      trans[0] = 'N';
      trans[1] = '\0';
      pnga_copy_patch(trans, g_a, alo, ahi, g_b, blo, bhi);
      return;
    }
  }

  /* BUG FIX: the previous test was "pnga_cluster_nnodes()" with no
     comparison; the node count is always >= 1, so any mirrored g_b aborted
     even on a single node, where a mirrored array is equivalent to a
     distributed one.  Use the same "> 1" criterion as the g_a check above. */
  if (pnga_is_mirrored(g_b) && pnga_cluster_nnodes() > 1)
    pnga_error("Distributed array is mirrored",0);

  adim = GA[a_handle].ndim;
  bdim = GA[b_handle].ndim;

  p_handle = GA[a_handle].p_handle;

  if (adim != bdim)
    pnga_error("Global arrays must have same dimension",0);

  type = GA[a_handle].type;
  if (type != GA[b_handle].type)
    pnga_error("Global arrays must be of same type",0);

  /* validate both patches against their array bounds
     (NOTE(review): bounds treated as 0-based here — confirm against the
     indexing convention of the callers) */
  for (i=0; i<adim; i++) {
    idim = (Integer)GA[a_handle].dims[i];
    if (alo[i] < 0 || alo[i] >= idim || ahi[i] < 0 || ahi[i] >= idim ||
        alo[i] > ahi[i])
      pnga_error("Invalid patch index on mirrored GA",0);
  }
  for (i=0; i<bdim; i++) {
    idim = GA[b_handle].dims[i];
    if (blo[i] < 0 || blo[i] >= idim || bhi[i] < 0 || bhi[i] >= idim ||
        blo[i] > bhi[i])
      pnga_error("Invalid patch index on distributed GA",0);
  }
  /* both patches must have identical extents in every dimension */
  for (i=0; i<bdim; i++) {
    idim = (Integer)GA[b_handle].dims[i];
    if (ahi[i] - alo[i] != bhi[i] - blo[i])
      pnga_error("Patch dimensions do not match for index ",i);
  }
  /* clear destination so the accumulates below produce a pure sum */
  pnga_zero_patch(g_b, blo, bhi);

  /* Find coordinates of mirrored array patch that I own */
  i = PGRP_LIST[p_handle].map_proc_list[GAme];
  pnga_distribution(g_a, i, mlo, mhi);
  /* Check to see if mirrored array patch intersects my portion of
     mirrored array */
  intersect = 1;
  for (i=0; i<adim; i++) {
    if (mhi[i] < alo[i]) intersect = 0;
    if (mlo[i] > ahi[i]) intersect = 0;
  }
  if (intersect) {
    /* get portion of mirrored array patch that actually resides on this
       processor */
    for (i=0; i<adim; i++) {
      mlo[i] = GA_MAX(alo[i],mlo[i]);
      mhi[i] = GA_MIN(ahi[i],mhi[i]);
    }

    /* get pointer to locally held distribution */
    pnga_access_ptr(g_a, mlo, mhi, &src_data_ptr, mld);

    /* find indices in distributed array corresponding to this patch */
    for (i=0; i<adim; i++) {
      dlo[i] = blo[i] + mlo[i]-alo[i];
      dhi[i] = blo[i] + mhi[i]-alo[i];
    }

    /* perform accumulate with a type-matched scale factor of one */
    if (type == C_DBL) {
      d_one = 1.0;
      one = &d_one;
    } else if (type == C_DCPL) {
      z_one[0] = 1.0;
      z_one[1] = 0.0;
      one = &z_one;
    } else if (type == C_SCPL) {
      c_one[0] = 1.0;
      c_one[1] = 0.0;
      one = &c_one;
    } else if (type == C_FLOAT) {
      f_one = 1.0;
      one = &f_one;
    } else if (type == C_INT) {
      i_one = 1;
      one = &i_one;
    } else if (type == C_LONG) {
      l_one = 1;
      one = &l_one;
    } else {
      pnga_error("Type not supported",type);
    }
    pnga_acc(g_b, dlo, dhi, src_data_ptr, mld, one);
  }
  if (local_sync_end) pnga_sync();
}
5347
5348 /**
5349 * Return the total number of blocks in a region (if any)
5350 */
5351 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5352 # pragma weak wnga_locate_num_blocks = pnga_locate_num_blocks
5353 #endif
5354
/**
 * Return the total number of blocks that touch the region [lo,hi] for a
 * block-cyclic (non-REGULAR) distribution; returns -1 for a REGULAR
 * distribution.  Aborts if the requested region lies outside the array.
 */
Integer pnga_locate_num_blocks(Integer g_a, Integer *lo, Integer *hi)
{
  Integer ga_handle = GA_OFFSET + g_a;
  Integer ndim = GA[ga_handle].ndim;
  Integer ret = -1, d;
  Integer cnt;

  /* validate the requested region (1-based, inclusive bounds) */
  for(d = 0; d< ndim; d++)
    if((lo[d]<1 || hi[d]>GA[ga_handle].dims[d]) ||(lo[d]>hi[d]))
      pnga_error("Requested region out of bounds",0);

  if (GA[ga_handle].distr_type != REGULAR) {
    Integer nblocks = GA[ga_handle].block_total;
    Integer chk, i, j, tlo[MAXDIM], thi[MAXDIM];
    cnt = 0;
    for (i=0; i<nblocks; i++) {
      /* check to see if this block overlaps with requested block
       * defined by lo and hi */
      chk = 1;
      /* get limits on block i */
      pnga_distribution(g_a,i,tlo,thi);
      for (j=0; j<ndim && chk==1; j++) {
        /* BUG FIX: the old test only checked whether an endpoint of
         * [tlo,thi] fell inside [lo,hi], which misses blocks that strictly
         * contain the requested region (e.g. block 1..10 vs request 3..5).
         * Two closed intervals intersect iff tlo <= hi and thi >= lo; this
         * accepts everything the old test did, plus the missed case. */
        if (!(tlo[j] <= hi[j] && thi[j] >= lo[j])) {
          chk = 0;
        }
      }

      if (chk) {
        cnt++;
      }
    }
    ret = cnt;
  }

  return ret;
}
5395
5396 /**
5397 * Return the total number of blocks in a Global Array (if any). Only returns
5398 * non-zero values for block-cyclic data distributions.
5399 */
5400 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5401 # pragma weak wnga_total_blocks = pnga_total_blocks
5402 #endif
5403
Integer pnga_total_blocks(Integer g_a)
{
  /* block_total is maintained per array; meaningful only for block-cyclic
     distributions */
  return GA[GA_OFFSET + g_a].block_total;
}
5409
5410 /**
5411 * Return true if GA uses SCALPACK data distribution
5412 */
5413 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5414 # pragma weak wnga_uses_proc_grid = pnga_uses_proc_grid
5415 #endif
5416
pnga_uses_proc_grid(Integer g_a)5417 logical pnga_uses_proc_grid(Integer g_a)
5418 {
5419 Integer ga_handle = GA_OFFSET + g_a;
5420 return (logical)(GA[ga_handle].distr_type == SCALAPACK
5421 || GA[ga_handle].distr_type == TILED ||
5422 GA[ga_handle].distr_type == TILED_IRREG);
5423 }
5424
5425 /**
5426 * Return the index of a processor based on the block partition associated
5427 * with a particular Global Array, assuming GA uses some sort of block-cyclic
5428 * data distribution based on an underlying processor grid. (e.g. a
5429 * ScaLAPACK data distribution)
5430 */
5431 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5432 # pragma weak wnga_get_proc_index = pnga_get_proc_index
5433 #endif
5434
void pnga_get_proc_index(Integer g_a, Integer iproc, Integer *index)
{
  Integer ga_handle = GA_OFFSET + g_a;

  /* dispatch on distribution kind; only grid-based layouts are valid */
  switch (GA[ga_handle].distr_type) {
    case SCALAPACK:
      gam_find_proc_indices(ga_handle, iproc, index);
      break;
    case TILED:
    case TILED_IRREG:
      gam_find_tile_proc_indices(ga_handle, iproc, index);
      break;
    default:
      pnga_error("Global array does not use ScaLAPACK data distribution",0);
      break;
  }
}
5448
5449 /**
5450 * Return proc grid dimension and block dimension for a particular
5451 * Global Array
5452 */
5453 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5454 # pragma weak wnga_get_block_info = pnga_get_block_info
5455 #endif
5456
/**
 * Return the processor grid dimensions (num_blocks) and the block
 * dimensions (block_dims) for g_a.  For SCALAPACK/TILED layouts the stored
 * values are returned directly; for a simple block-cyclic layout the block
 * counts are derived from the array and block dimensions.  Aborts for
 * irregular tiled arrays, which have no uniform block shape.
 */
void pnga_get_block_info(Integer g_a, Integer *num_blocks, Integer *block_dims)
{
  Integer ga_handle = GA_OFFSET + g_a;
  Integer i, ndim;
  ndim = GA[ga_handle].ndim;
  if (GA[ga_handle].distr_type == SCALAPACK ||
      GA[ga_handle].distr_type == TILED) {
    for (i=0; i<ndim; i++) {
      num_blocks[i] = GA[ga_handle].num_blocks[i];
      block_dims[i] = GA[ga_handle].block_dims[i];
    }
  } else if (GA[ga_handle].distr_type == TILED_IRREG) {
    /* not sure what to do here */
    /* BUG FIX: error message typo "get_block_infor" corrected */
    pnga_error("Don't know how to respond to get_block_info for"
               " irregular tiled array",0);
  } else {
    Integer dim, bsize;
    for (i=0; i<ndim; i++) {
      dim = GA[ga_handle].dims[i];
      bsize = GA[ga_handle].block_dims[i];
      if (bsize > 0) {
        /* number of blocks along i is ceil(dim/bsize) */
        num_blocks[i] = (dim + bsize - 1)/bsize;
      } else {
        num_blocks[i] = 0;
      }
      block_dims[i] = GA[ga_handle].block_dims[i];
    }
  }
  return;
}
5491
5492 /**
5493 * Set the value of internal debug flag
5494 */
5495 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5496 # pragma weak wnga_set_debug = pnga_set_debug
5497 #endif
5498
void pnga_set_debug(logical flag)
{
  /* store the caller's flag in the library-wide debug setting */
  GA_Debug_flag = (Integer)flag;
}
5503
5504 /**
5505 * Get current value of internal debug flag
5506 */
5507 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5508 # pragma weak wnga_get_debug = pnga_get_debug
5509 #endif
5510
pnga_get_debug()5511 logical pnga_get_debug()
5512 {
5513 return (logical)GA_Debug_flag;
5514 }
5515
5516 #ifdef ENABLE_CHECKPOINT
/**
 * Checkpoint the listed global arrays (first handle initializes the
 * checkpoint record if one does not already exist).
 *
 * gas: array of GA handles
 * num: number of handles in gas
 */
void ga_checkpoint_arrays(Integer *gas,int num)
{
    int ga = *(gas+0);
    int hdl = GA_OFFSET + ga;
    /* BUG FIX: "num" is a plain int, so the old "*num" argument did not
       compile; Integer arguments are also cast to match the %d specifiers */
    printf("\n%d:in checkpoint %d %d %d\n",(int)GAme,ga,(int)*(gas+1),num);
    fflush(stdout);
    if(GA[hdl].record_id==0)
       ga_icheckpoint_init(gas,num);
    ga_icheckpoint(gas,num);
}
5526
/**
 * Recover the listed global arrays from their checkpoint records.
 * Handles with no record (record_id == 0) are skipped.
 *
 * gas: array of GA handles
 * num: number of handles in gas
 * Returns 0.
 */
int ga_recover_arrays(Integer *gas, int num)
{
    int i;
    for(i=0;i<num;i++){
       int ga = *(gas+i);
       int hdl = GA_OFFSET + ga;
       if(GA[hdl].record_id!=0)
          ga_irecover(ga);
    }
    /* BUG FIX: function is declared int but had no return statement
       (undefined behavior if the caller uses the value) */
    return 0;
}
5537 #endif
5538 /**
5539 * Return the world group ID of the pid in process group grp
5540 */
5541 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5542 # pragma weak wnga_pgroup_absolute_id = pnga_pgroup_absolute_id
5543 #endif
5544
Integer pnga_pgroup_absolute_id(Integer grp, Integer pid)
{
#ifdef MSG_COMMS_MPI
  /* ids in the world group (GA_World_Proc_Group, a.k.a. -1) are already
     absolute; anything else is translated through ARMCI */
  if (grp != GA_World_Proc_Group)
    return ARMCI_Absolute_id(&PGRP_LIST[grp].group, pid);
  return pid;
#else
  pnga_error("ga_pgroup_absolute_id(): Defined only when using MPI groups",0);
  return -1;
#endif
}
5557
calc_maplen(int handle)5558 static int calc_maplen(int handle)
5559 {
5560 if (GA[handle].mapc != NULL) {
5561 int i,len=0;
5562 if (GA[handle].distr_type != TILED_IRREG) {
5563 for (i=0; i<GA[handle].ndim; i++) {
5564 len += GA[handle].nblock[i];
5565 }
5566 } else {
5567 for (i=0; i<GA[handle].ndim; i++) {
5568 len += GA[handle].num_blocks[i];
5569 }
5570 }
5571 return len;
5572 }
5573 return 0;
5574 }
5575
5576
5577 /***************************************************************
5578 *
5579 * GA types related functions
5580 *
5581 ***************************************************************/
5582
/* Run-time table of GA data types, indexed by (type id - MT_BASE).
 * Each entry is {active, size-in-bytes}; the first GA_TYPES_RESERVED
 * slots are built-in and never deregistered (see pnga_register_type /
 * pnga_deregister_type below).
 * NOTE(review): the entry order presumably mirrors the C_*/F77 type-id
 * constants in macdecls.h — confirm against that header before reordering. */
ga_typeinfo_t ga_types[GA_TYPES_MAX] = {
  {1, sizeof(char)},
  {1, sizeof(int)},
  {1, sizeof(long)},
  {1, sizeof(float)},
  {1, sizeof(double)},
  {1, sizeof(long double)},
  {1, sizeof(SingleComplex)},
  {1, sizeof(DoubleComplex)},
  {1, -1 /*sizeof(LongDoubleComplex)*/},  /* long double complex unsupported */
  {1, sizeof(char)},
  {1, sizeof(Integer)},
  {1, sizeof(logical)},
  {1, sizeof(Real)},
  {1, sizeof(DoublePrecision)},
  {1, sizeof(SingleComplex)},
  {1, sizeof(DoubleComplex)},
  {1, sizeof(long long)},
};
5602
5603 /* #define GAsizeofM(_type) ga_types[_type-MT_BASE] */
5604
5605 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5606 # pragma weak wnga_register_type = pnga_register_type
5607 #endif
/* Register a user-defined type of the given size.  Returns the new type id
   (slot index + MT_BASE), or -1 when the type table is full. */
int pnga_register_type(size_t bytes) {
  int slot = GA_TYPES_RESERVED;

  /* scan past the reserved slots for the first inactive entry */
  while (slot < GA_TYPES_MAX && ga_types[slot].active == 1) {
    slot++;
  }
  if (slot == GA_TYPES_MAX) {
    return -1;
  }
  ga_types[slot].active = 1;
  ga_types[slot].size = bytes;
  return slot + MT_BASE;
}
5618
5619 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5620 # pragma weak wnga_deregister_type = pnga_deregister_type
5621 #endif
pnga_deregister_type(int type)5622 int pnga_deregister_type(int type) {
5623 int tp = type-MT_BASE;
5624 if(tp<GA_TYPES_RESERVED) {
5625 return -1;
5626 }
5627 if(ga_types[tp].active==0) {
5628 return -2;
5629 }
5630 ga_types[tp].active = 0;
5631 ga_types[tp].size = 0;
5632 return 0;
5633 }
5634
5635 #if HAVE_SYS_WEAK_ALIAS_PRAGMA
5636 # pragma weak wnga_version = pnga_version
5637 #endif
/* Report the GA library version as major/minor/patch numbers. */
void pnga_version(Integer *major_version, Integer *minor_version, Integer *patch)
{
  *patch         = GA_VERSION_PATCH;
  *minor_version = GA_VERSION_MINOR;
  *major_version = GA_VERSION_MAJOR;
}
5644