/* $Id: iterator.c,v 1.80.2.18 2007/12/18 22:22:27 d3g293 Exp $ */
/*
 * module: iterator.c
 * author: Jarek Nieplocha
 * description: implements an iterator that can be used for looping over blocks
 *              in a GA operation. This functionality is designed to hide
 *              the details of the data layout from the operation
 *
 * DISCLAIMER
 *
 * This material was prepared as an account of work sponsored by an
 * agency of the United States Government.  Neither the United States
 * Government nor the United States Department of Energy, nor Battelle,
 * nor any of their employees, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
 * ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
 * COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
 * SOFTWARE, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT
 * INFRINGE PRIVATELY OWNED RIGHTS.
 *
 *
 * ACKNOWLEDGMENT
 *
 * This software and its documentation were produced with United States
 * Government support under Contract Number DE-AC06-76RLO-1830 awarded by
 * the United States Department of Energy.  The United States Government
 * retains a paid-up non-exclusive, irrevocable worldwide license to
 * reproduce, prepare derivative works, perform publicly and display
 * publicly by or for the US Government, including the right to
 * distribute to other US Government contractors.
 */
#if HAVE_CONFIG_H
#   include "config.h"
#endif

#define MAX_INT_VALUE 2147483648

#define LARGE_BLOCK_REQ

#if HAVE_STDIO_H
#   include <stdio.h>
#endif
#if HAVE_STRING_H
#   include <string.h>
#endif
#if HAVE_STDLIB_H
#   include <stdlib.h>
#endif
#if HAVE_STDINT_H
#   include <stdint.h>
#endif
#if HAVE_MATH_H
#   include <math.h>
#endif
#if HAVE_ASSERT_H
#   include <assert.h>
#endif
#if HAVE_STDDEF_H
#   include <stddef.h>
#endif

#include "global.h"
#include "globalp.h"
#include "base.h"
#include "ga_iterator.h"
#include "armci.h"
#include "macdecls.h"
#include "ga-papi.h"
#include "ga-wapi.h"
#include "thread-safe.h"

#define DEBUG 0
#ifdef PROFILE_OLD
#include "ga_profile.h"
#endif

/*\ prepare permuted list of processes for remote ops
\*/
#define gaPermuteProcList(nproc,ProcListPerm)                         \
{                                                                     \
  if((nproc) ==1) ProcListPerm[0]=0;                                  \
  else{                                                               \
    int _i, iswap, temp;                                              \
    if((nproc) > GAnproc) pnga_error("permute_proc: error ", (nproc));\
                                                                      \
    /* every process generates different random sequence */          \
    (void)srand((unsigned)GAme);                                      \
                                                                      \
    /* initialize list */                                             \
    for(_i=0; _i< (nproc); _i++) ProcListPerm[_i]=_i;                 \
                                                                      \
    /* list permutation generated by random swapping */               \
    for(_i=0; _i< (nproc); _i++){                                     \
      iswap = (int)(rand() % (nproc));                                \
      temp = ProcListPerm[iswap];                                     \
      ProcListPerm[iswap] = ProcListPerm[_i];                         \
      ProcListPerm[_i] = temp;                                        \
    }                                                                 \
  }                                                                   \
}
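
#if 0
/* A minimal usage sketch of gaPermuteProcList (illustrative only, not
 * compiled). The macro fills perm with a random permutation of 0..nproc-1;
 * because the seed is the process id GAme, each process visits remote
 * owners in a different order. */
static void example_permute(int nproc, int perm[])
{
  gaPermuteProcList(nproc, perm);
  /* for nproc == 4, perm might now hold e.g. {2, 0, 3, 1} */
}
#endif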

#define gam_GetRangeFromMap(p, ndim, plo, phi){  \
  Integer   _mloc = p* ndim *2;                  \
            *plo  = hdl->map + _mloc;            \
            *phi  = *plo + ndim;                 \
}

/*\ Return pointer (ptr_loc) to location in memory of element with subscripts
 *  (subscript). Also return physical dimensions of array in memory in ld.
\*/
#define gam_Location(proc, g_handle,  subscript, ptr_loc, ld)              \
{                                                                          \
  Integer _offset=0, _d, _w, _factor=1, _last=GA[g_handle].ndim-1;         \
  Integer _lo[MAXDIM], _hi[MAXDIM], _pinv, _p_handle;                      \
                                                                           \
  ga_ownsM(g_handle, proc, _lo, _hi);                                      \
  gaCheckSubscriptM(subscript, _lo, _hi, GA[g_handle].ndim);               \
  if(_last==0) ld[0]=_hi[0]- _lo[0]+1+2*(Integer)GA[g_handle].width[0];    \
  __CRAYX1_PRAGMA("_CRI shortloop");                                       \
  for(_d=0; _d < _last; _d++)            {                                 \
    _w = (Integer)GA[g_handle].width[_d];                                  \
    _offset += (subscript[_d]-_lo[_d]+_w) * _factor;                       \
    ld[_d] = _hi[_d] - _lo[_d] + 1 + 2*_w;                                 \
    _factor *= ld[_d];                                                     \
  }                                                                        \
  _offset += (subscript[_last]-_lo[_last]                                  \
      + (Integer)GA[g_handle].width[_last])                                \
  * _factor;                                                               \
  _p_handle = GA[g_handle].p_handle;                                       \
  if (_p_handle != 0) {                                                    \
    if (GA[g_handle].num_rstrctd == 0) {                                   \
      _pinv = proc;                                                        \
    } else {                                                               \
      _pinv = GA[g_handle].rstrctd_list[proc];                             \
    }                                                                      \
  } else {                                                                 \
    _pinv = PGRP_LIST[_p_handle].inv_map_proc_list[proc];                  \
  }                                                                        \
  *(ptr_loc) =  GA[g_handle].ptr[_pinv]+_offset*GA[g_handle].elemsize;     \
}

void gam_LocationF(int proc, Integer g_handle,  Integer subscript[],
    char **ptr_loc, Integer ld[])
{
  Integer _offset=0, _d, _w, _factor=1, _last=GA[g_handle].ndim-1;
  Integer _lo[MAXDIM], _hi[MAXDIM], _pinv, _p_handle;

  ga_ownsM(g_handle, proc, _lo, _hi);
  gaCheckSubscriptM(subscript, _lo, _hi, GA[g_handle].ndim);
  if(_last==0) ld[0]=_hi[0]- _lo[0]+1+2*(Integer)GA[g_handle].width[0];
  __CRAYX1_PRAGMA("_CRI shortloop");
  for(_d=0; _d < _last; _d++)            {
    _w = (Integer)GA[g_handle].width[_d];
    _offset += (subscript[_d]-_lo[_d]+_w) * _factor;
    ld[_d] = _hi[_d] - _lo[_d] + 1 + 2*_w;
    _factor *= ld[_d];
  }
  _offset += (subscript[_last]-_lo[_last]
      + (Integer)GA[g_handle].width[_last])
  * _factor;
  _p_handle = GA[g_handle].p_handle;
  if (_p_handle != 0) {
    if (GA[g_handle].num_rstrctd == 0) {
      _pinv = proc;
    } else {
      _pinv = GA[g_handle].rstrctd_list[proc];
    }
  } else {
    _pinv = PGRP_LIST[_p_handle].inv_map_proc_list[proc];
  }
  *(ptr_loc) =  GA[g_handle].ptr[_pinv]+_offset*GA[g_handle].elemsize;
}
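
#if 0
/* A sketch of how gam_LocationF is typically used (illustrative only, not
 * compiled; g_handle is an internal handle, i.e. GA_OFFSET + g_a, and
 * subscript holds global indices owned by rank proc -- both hypothetical
 * inputs). The routine returns a pointer to that element in the owner's
 * ghost-padded local buffer and fills ld[] with the padded leading
 * dimensions (_hi[d]-_lo[d]+1+2*width[d]) used to stride through it. */
static void example_location(int proc, Integer g_handle, Integer subscript[])
{
  char *ptr;
  Integer ld[MAXDIM];
  gam_LocationF(proc, g_handle, subscript, &ptr, ld);
  /* consecutive elements along dimension 0 are elemsize bytes apart;
   * stepping dimension 1 advances the pointer by ld[0]*elemsize bytes */
}
#endif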

#define gam_GetBlockPatch(plo,phi,lo,hi,blo,bhi,ndim) {                    \
  Integer _d;                                                              \
  for (_d=0; _d<ndim; _d++) {                                              \
    if (lo[_d] <= phi[_d] && lo[_d] >= plo[_d]) blo[_d] = lo[_d];          \
    else blo[_d] = plo[_d];                                                \
    if (hi[_d] <= phi[_d] && hi[_d] >= plo[_d]) bhi[_d] = hi[_d];          \
    else bhi[_d] = phi[_d];                                                \
  }                                                                        \
}
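
#if 0
/* Illustrative sketch only (not compiled): gam_GetBlockPatch clips the
 * requested patch [lo,hi] against a block's extent [plo,phi] and returns
 * the overlap in [blo,bhi]. The values below are hypothetical. */
static void example_block_patch(void)
{
  Integer plo[1] = {11}, phi[1] = {20};  /* block holds global indices 11..20 */
  Integer lo[1]  = {15}, hi[1]  = {35};  /* caller requested 15..35 */
  Integer blo[1], bhi[1];
  gam_GetBlockPatch(plo, phi, lo, hi, blo, bhi, 1);
  /* blo[0] == 15 and bhi[0] == 20: the part of the request in this block */
}
#endif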

/**
 * Initialize an iterator handle
 * @param g_a global array handle
 * @param lo indices for lower corner of block in global array
 * @param hi indices for upper corner of block in global array
 * @param hdl handle for iterator
 */
void gai_iterator_init(Integer g_a, Integer lo[], Integer hi[],
                       _iterator_hdl *hdl)
{
  Integer handle = GA_OFFSET + g_a;
  Integer ndim = GA[handle].ndim;
  int grp = GA[handle].p_handle;
  int nproc = pnga_pgroup_nnodes(grp);
  Integer i;
  hdl->g_a = g_a;
  hdl->count = 0;
  hdl->oversize = 0;
  hdl->map = malloc((size_t)(nproc*2*MAXDIM+1)*sizeof(Integer));
  hdl->proclist = malloc(nproc*sizeof(Integer));
  hdl->proclistperm = malloc(nproc*sizeof(int));
  for (i=0; i<ndim; i++) {
    hdl->lo[i] = lo[i];
    hdl->hi[i] = hi[i];
  }
  /* Standard GA distribution */
  if (GA[handle].distr_type == REGULAR) {
    /* Locate the processors containing some portion of the patch
     * specified by lo and hi and return the results in hdl->map,
     * hdl->proclist, and hdl->nproc. hdl->proclist contains a list of
     * processors containing some portion of the patch, hdl->map contains
     * the lower and upper indices of the portion of the patch held
     * by a given processor, and hdl->nproc contains the total number of
     * processors that contain some portion of the patch.
     */
    if(!pnga_locate_region(g_a, lo, hi, hdl->map, hdl->proclist, &hdl->nproc ))
      ga_RegionError(pnga_ndim(g_a), lo, hi, g_a);

    gaPermuteProcList(hdl->nproc, hdl->proclistperm);

    /* Block-cyclic distribution */
  } else if (GA[handle].distr_type == BLOCK_CYCLIC) {
    /* GA uses simple block cyclic data distribution */
    hdl->iproc = 0;
    hdl->iblock = pnga_nodeid();
  } else if (GA[handle].distr_type == SCALAPACK)  {
    /* GA uses ScaLAPACK block cyclic data distribution */
    int j;
    C_Integer *block_dims;
    /* Scalapack-type block-cyclic data distribution */
    /* Calculate some properties associated with data distribution */
    int *proc_grid = GA[handle].nblock;
    /*num_blocks = GA[handle].num_blocks;*/
    block_dims = GA[handle].block_dims;
    for (j=0; j<ndim; j++)  {
      hdl->blk_size[j] = block_dims[j];
      hdl->blk_dim[j] = block_dims[j]*proc_grid[j];
      hdl->blk_num[j] = GA[handle].dims[j]/hdl->blk_dim[j];
      hdl->blk_inc[j] = GA[handle].dims[j]-hdl->blk_num[j]*hdl->blk_dim[j];
      hdl->blk_ld[j] = hdl->blk_num[j]*block_dims[j];
      hdl->hlf_blk[j] = hdl->blk_inc[j]/block_dims[j];
    }
    hdl->iproc = 0;
    hdl->offset = 0;
    /* Initialize proc_index and index arrays */
    gam_find_proc_indices(handle, hdl->iproc, hdl->proc_index);
    gam_find_proc_indices(handle, hdl->iproc, hdl->index);
  } else if (GA[handle].distr_type == TILED)  {
    int j;
    C_Integer *block_dims;
    hdl->iproc = 0;
    hdl->offset = 0;
    block_dims = GA[handle].block_dims;
    for (j=0; j<ndim; j++)  {
      hdl->blk_size[j] = block_dims[j];
    }
    /* Initialize proc_index and index arrays */
    gam_find_tile_proc_indices(handle, hdl->iproc, hdl->proc_index);
    gam_find_tile_proc_indices(handle, hdl->iproc, hdl->index);
  } else if (GA[handle].distr_type == TILED_IRREG)  {
    int j;
    hdl->iproc = 0;
    hdl->offset = 0;
    hdl->mapc =  GA[handle].mapc;
    /* Initialize proc_index and index arrays */
    gam_find_tile_proc_indices(handle, hdl->iproc, hdl->proc_index);
    gam_find_tile_proc_indices(handle, hdl->iproc, hdl->index);
  }
}
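
#if 0
/* A minimal sketch of the intended calling pattern for the remote-block
 * iterator (illustrative only, not compiled; g_a, lo, hi and the loop body
 * are hypothetical, and the iterator API is assumed to be declared in
 * ga_iterator.h). The iterator hides whether the array uses a regular,
 * block-cyclic, ScaLAPACK or tiled distribution: each call to
 * gai_iterator_next yields one remote patch [plo,phi], the rank that owns
 * it, a pointer to its data, and the leading dimensions needed to walk it. */
static void example_remote_iteration(Integer g_a, Integer lo[], Integer hi[])
{
  _iterator_hdl it;
  int proc;
  Integer *plo, *phi;
  char *prem;
  Integer ldrem[MAXDIM];
  gai_iterator_init(g_a, lo, hi, &it);
  while (gai_iterator_next(&it, &proc, &plo, &phi, &prem, ldrem)) {
    /* e.g. issue a one-sided get/put on the patch [plo,phi] held by proc,
     * using prem and ldrem to address the remote buffer */
  }
  gai_iterator_destroy(&it);
}
#endif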

/**
 * Reset an iterator back to the start
 * @param hdl handle for iterator
 */
void gai_iterator_reset(_iterator_hdl *hdl)
{
  Integer handle = GA_OFFSET + hdl->g_a;
  if (GA[handle].distr_type == REGULAR) {
    /* Regular data distribution */
    hdl->count = 0;
  } else if (GA[handle].distr_type == BLOCK_CYCLIC) {
    /* simple block cyclic data distribution */
    hdl->iproc = 0;
    hdl->iblock = 0;
    hdl->offset = 0;
  } else if (GA[handle].distr_type == SCALAPACK) {
    hdl->iproc = 0;
    hdl->offset = 0;
    /* Initialize proc_index and index arrays */
    gam_find_proc_indices(handle, hdl->iproc, hdl->proc_index);
    gam_find_proc_indices(handle, hdl->iproc, hdl->index);
  } else if (GA[handle].distr_type == TILED ||
      GA[handle].distr_type == TILED_IRREG)  {
    hdl->iproc = 0;
    hdl->offset = 0;
    /* Initialize proc_index and index arrays */
    gam_find_tile_proc_indices(handle, hdl->iproc, hdl->proc_index);
    gam_find_tile_proc_indices(handle, hdl->iproc, hdl->index);
  }
}

/**
 * Get the next sub-block from the larger block defined when the iterator was
 * initialized
 * @param hdl handle for iterator
 * @param proc processor on which the next block resides
 * @param plo indices for lower corner of remote block
 * @param phi indices for upper corner of remote block
 * @param prem pointer to remote buffer
 * @param ldrem array of strides for remote block
 * @return returns false if there is no new block, true otherwise
 */
int gai_iterator_next(_iterator_hdl *hdl, int *proc, Integer *plo[],
    Integer *phi[], char **prem, Integer ldrem[])
{
  Integer idx, i, p;
  Integer handle = GA_OFFSET + hdl->g_a;
  Integer p_handle = GA[handle].p_handle;
  Integer n_rstrctd = GA[handle].num_rstrctd;
  Integer *rank_rstrctd = GA[handle].rank_rstrctd;
  Integer elemsize = GA[handle].elemsize;
  int ndim;
  int grp = GA[handle].p_handle;
  int nproc = pnga_pgroup_nnodes(grp);
  int ok;
  ndim = GA[handle].ndim;
  if (GA[handle].distr_type == REGULAR) {
    Integer *blo, *bhi;
    Integer nelems;
    idx = hdl->count;

    /* Check to see if the range is valid (it may not be valid if the user
     * has created an irregular distribution in which some processors do not
     * have data). If invalid, skip this block and go to the next one.
     */
    ok = 0;
    while(!ok) {
      /* no blocks left, so return */
      if (idx>=hdl->nproc) return 0;
      p = (Integer)hdl->proclistperm[idx];
      *proc = (int)hdl->proclist[p];
      if (p_handle >= 0) {
        *proc = (int)PGRP_LIST[p_handle].inv_map_proc_list[*proc];
      }
      /* Find  visible portion of patch held by processor p and
       * return the result in plo and phi. Also get actual processor
       * index corresponding to p and store the result in proc.
       */
      gam_GetRangeFromMap(p, ndim, plo, phi);
      ok = 1;
      for (i=0; i<ndim; i++) {
        if ((*phi)[i]<(*plo)[i]) {
          ok = 0;
          break;
        }
      }
      if (ok) {
        *proc = (int)hdl->proclist[p];
        blo = *plo;
        bhi = *phi;
#ifdef LARGE_BLOCK_REQ
        /* Check to see if block size will overflow int values and initialize
         * counter over sub-blocks if the block is too big */
        if (!hdl->oversize) {
          nelems = 1;
          for (i=0; i<ndim; i++) nelems *= (bhi[i]-blo[i]+1);
          if (elemsize*nelems > MAX_INT_VALUE) {
            Integer maxint = 0;
            int maxidx = 0;
            hdl->oversize = 1;
            /* Figure out block dimensions that correspond to block sizes
             * that are beneath MAX_INT_VALUE */
            for (i=0; i<ndim; i++) {
              hdl->blk_size[i] = (bhi[i]-blo[i]+1);
            }
            while (elemsize*nelems > MAX_INT_VALUE) {
              /* halve the largest remaining dimension until the block fits */
              maxint = 0;
              for (i=0; i<ndim; i++) {
                if (hdl->blk_size[i] > maxint) {
                  maxidx = i;
                  maxint = hdl->blk_size[i];
                }
              }
              hdl->blk_size[maxidx] /= 2;
              nelems = 1;
              for (i=0; i<ndim; i++) nelems *= hdl->blk_size[i];
            }
            /* Calculate the number of blocks along each dimension */
            for (i=0; i<ndim; i++) {
              hdl->blk_dim[i] = (bhi[i]-blo[i]+1)/hdl->blk_size[i];
              if (hdl->blk_dim[i]*hdl->blk_size[i] < (bhi[i]-blo[i]+1))
                hdl->blk_dim[i]++;
            }
            /* initialize block counting */
            for (i=0; i<ndim; i++) hdl->blk_inc[i] = 0;
          }
        }

        /* Get sub-block bounding dimensions */
        if (hdl->oversize) {
          Integer tmp;
          for (i=0; i<ndim; i++) {
            hdl->lobuf[i] = blo[i];
            hdl->hibuf[i] = bhi[i];
          }
          *plo = hdl->lobuf;
          *phi = hdl->hibuf;
          blo = *plo;
          bhi = *phi;
          for (i=0; i<ndim; i++) {
            hdl->lobuf[i] += hdl->blk_inc[i]*hdl->blk_size[i];
            tmp = hdl->lobuf[i] + hdl->blk_size[i]-1;
            if (tmp < hdl->hibuf[i]) hdl->hibuf[i] = tmp;
          }
        }
#endif

        if (n_rstrctd == 0) {
          gam_Location(*proc, handle, blo, prem, ldrem);
        } else {
          gam_Location(rank_rstrctd[*proc], handle, blo, prem, ldrem);
        }
        if (p_handle >= 0) {
          *proc = (int)hdl->proclist[p];
          /* BJP */
          *proc = PGRP_LIST[p_handle].inv_map_proc_list[*proc];
        }
      }
#ifdef LARGE_BLOCK_REQ
      if (!hdl->oversize) {
#endif
        hdl->count++;
        idx = hdl->count;
#ifdef LARGE_BLOCK_REQ
      } else {
        /* update blk_inc array */
        hdl->blk_inc[0]++;
        for (i=0; i<ndim-1; i++) {
          if (hdl->blk_inc[i] >= hdl->blk_dim[i]) {
            hdl->blk_inc[i] = 0;
            hdl->blk_inc[i+1]++;
          }
        }
        if (hdl->blk_inc[ndim-1] >= hdl->blk_dim[ndim-1]) {
          hdl->count++;
          idx = hdl->count;
          hdl->oversize = 0;
        }
      }
#endif
    }
    return 1;
  } else {
    Integer offset, l_offset, last, pinv;
    Integer blk_tot = GA[handle].block_total;
    Integer blo[MAXDIM], bhi[MAXDIM];
    Integer idx, j, jtot, chk, iproc;
    int check1, check2;
    if (GA[handle].distr_type == BLOCK_CYCLIC) {
      /* Simple block-cyclic distribution */
      if (hdl->iproc >= nproc) return 0;
      /*if (hdl->iproc == nproc-1 && hdl->iblock >= blk_tot) return 0;*/
      if (hdl->iblock == hdl->iproc) hdl->offset = 0;
      chk = 0;
      /* loop over blocks until a block with data is found */
      while (!chk) {
        /* get the block corresponding to the current value of iblock */
        idx = hdl->iblock;
        ga_ownsM(handle,idx,blo,bhi);
        /* check to see if this block overlaps with requested block
         * defined by lo and hi */
        chk = 1;
        for (j=0; j<ndim; j++) {
          /* check to see if at least one end point of the interval
           * represented by blo and bhi falls in the interval
           * represented by lo and hi */
          check1 = ((blo[j] >= hdl->lo[j] && blo[j] <= hdl->hi[j]) ||
              (bhi[j] >= hdl->lo[j] && bhi[j] <= hdl->hi[j]));
          /* check to see if interval represented by lo and hi
           * falls entirely within interval represented by blo and bhi */
          check2 = ((hdl->lo[j] >= blo[j] && hdl->lo[j] <= bhi[j]) &&
              (hdl->hi[j] >= blo[j] && hdl->hi[j] <= bhi[j]));
          /* If there is some data, move to the next section of code,
           * otherwise, check next block */
          if (!check1 && !check2) {
            chk = 0;
          }
        }

        if (!chk) {
          /* evaluate new offset for block idx */
          jtot = 1;
          for (j=0; j<ndim; j++) {
            jtot *= bhi[j]-blo[j]+1;
          }
          hdl->offset += jtot;
          /* increment to next block */
          hdl->iblock += pnga_nnodes();
          if (hdl->iblock >= blk_tot) {
            hdl->offset = 0;
            hdl->iproc++;
            hdl->iblock = hdl->iproc;
            if (hdl->iproc >= nproc) return 0;
          }
        }
      }

      /* The block overlaps some data in lo,hi */
      if (chk) {
        Integer *clo, *chi;
        *plo = hdl->lobuf;
        *phi = hdl->hibuf;
        clo = *plo;
        chi = *phi;
        /* get the patch of block that overlaps requested region */
        gam_GetBlockPatch(blo,bhi,hdl->lo,hdl->hi,clo,chi,ndim);

        /* evaluate offset within block */
        last = ndim - 1;
        jtot = 1;
        if (last == 0) ldrem[0] = bhi[0] - blo[0] + 1;
        l_offset = 0;
        for (j=0; j<last; j++) {
          l_offset += (clo[j]-blo[j])*jtot;
          ldrem[j] = bhi[j]-blo[j]+1;
          jtot *= ldrem[j];
        }
        l_offset += (clo[last]-blo[last])*jtot;
        l_offset += hdl->offset;

        /* get pointer to data on remote block */
        pinv = idx%nproc;
        if (p_handle > 0) {
          pinv = PGRP_LIST[p_handle].inv_map_proc_list[pinv];
        }
        *prem =  GA[handle].ptr[pinv]+l_offset*GA[handle].elemsize;
        *proc = pinv;

        /* evaluate new offset for block idx */
        jtot = 1;
        for (j=0; j<ndim; j++) {
          jtot *= bhi[j]-blo[j]+1;
        }
        hdl->offset += jtot;

        hdl->iblock += pnga_nnodes();
        if (hdl->iblock >= blk_tot) {
          hdl->iproc++;
          hdl->iblock = hdl->iproc;
          hdl->offset = 0;
        }
      }
      return 1;
    } else if (GA[handle].distr_type == SCALAPACK ||
        GA[handle].distr_type == TILED ||
        GA[handle].distr_type == TILED_IRREG) {
      /* Scalapack-type data distribution */
      Integer proc_index[MAXDIM], index[MAXDIM];
      Integer itmp;
      Integer blk_jinc;
      /* Return false at the end of the iteration */
      if (hdl->iproc >= nproc) return 0;
      chk = 0;
      /* loop over blocks until a block with data is found */
      while (!chk) {
        /* get bounds for current block */
        if (GA[handle].distr_type == SCALAPACK ||
            GA[handle].distr_type == TILED) {
          for (j = 0; j < ndim; j++) {
            blo[j] = hdl->blk_size[j]*(hdl->index[j])+1;
            bhi[j] = hdl->blk_size[j]*(hdl->index[j]+1);
            if (bhi[j] > GA[handle].dims[j]) bhi[j] = GA[handle].dims[j];
          }
        } else {
          offset = 0;
          for (j = 0; j < ndim; j++) {
            blo[j] = GA[handle].mapc[offset+hdl->index[j]];
            if (hdl->index[j] == GA[handle].num_blocks[j]-1) {
              bhi[j] = GA[handle].dims[j];
            } else {
              bhi[j] = GA[handle].mapc[offset+hdl->index[j]+1]-1;
            }
            offset += GA[handle].num_blocks[j];
          }
        }
        /* check to see if this block overlaps with requested block
         * defined by lo and hi */
        chk = 1;
        for (j=0; j<ndim; j++) {
          /* check to see if at least one end point of the interval
           * represented by blo and bhi falls in the interval
           * represented by lo and hi */
          check1 = ((blo[j] >= hdl->lo[j] && blo[j] <= hdl->hi[j]) ||
              (bhi[j] >= hdl->lo[j] && bhi[j] <= hdl->hi[j]));
          /* check to see if interval represented by lo and hi
           * falls entirely within interval represented by blo and bhi */
          check2 = ((hdl->lo[j] >= blo[j] && hdl->lo[j] <= bhi[j]) &&
              (hdl->hi[j] >= blo[j] && hdl->hi[j] <= bhi[j]));
          /* If there is some data, move to the next section of code,
           * otherwise, check next block */
          if (!check1 && !check2) {
            chk = 0;
          }
        }

        if (!chk) {
          /* update offset for block */
          itmp = 1;
          for (j=0; j<ndim; j++) {
            itmp *= bhi[j]-blo[j]+1;
          }
          hdl->offset += itmp;

          /* increment to next block */
          hdl->index[0] += GA[handle].nblock[0];
          for (j=0; j<ndim; j++) {
            if (hdl->index[j] >= GA[handle].num_blocks[j] && j < ndim-1) {
              hdl->index[j] = hdl->proc_index[j];
              hdl->index[j+1] += GA[handle].nblock[j+1];
            }
          }
          if (hdl->index[ndim-1] >= GA[handle].num_blocks[ndim-1]) {
            /* last iteration has been completed on current processor. Go
             * to next processor */
            hdl->iproc++;
            if (hdl->iproc >= nproc) return 0;
            hdl->offset = 0;
            if (GA[handle].distr_type == TILED ||
                GA[handle].distr_type == TILED_IRREG) {
              gam_find_tile_proc_indices(handle, hdl->iproc, hdl->proc_index);
              gam_find_tile_proc_indices(handle, hdl->iproc, hdl->index);
            } else if (GA[handle].distr_type == SCALAPACK) {
              gam_find_proc_indices(handle, hdl->iproc, hdl->proc_index);
              gam_find_proc_indices(handle, hdl->iproc, hdl->index);
            }
          }
        }
      }
      if (chk) {
        Integer *clo, *chi;
        *plo = hdl->lobuf;
        *phi = hdl->hibuf;
        clo = *plo;
        chi = *phi;
        /* get the patch of block that overlaps requested region */
        gam_GetBlockPatch(blo,bhi,hdl->lo,hdl->hi,clo,chi,ndim);

        /* evaluate offset within block */
        last = ndim - 1;
        if (GA[handle].distr_type == TILED ||
            GA[handle].distr_type == TILED_IRREG) {
          jtot = 1;
          if (last == 0) ldrem[0] = bhi[0] - blo[0] + 1;
          l_offset = 0;
          for (j=0; j<last; j++) {
            l_offset += (clo[j]-blo[j])*jtot;
            ldrem[j] = bhi[j]-blo[j]+1;
            jtot *= ldrem[j];
          }
          l_offset += (clo[last]-blo[last])*jtot;
          l_offset += hdl->offset;
        } else if (GA[handle].distr_type == SCALAPACK) {
          l_offset = 0;
          jtot = 1;
          for (j=0; j<last; j++)  {
            ldrem[j] = hdl->blk_ld[j];
            blk_jinc = GA[handle].dims[j]%hdl->blk_size[j];
            if (hdl->blk_inc[j] > 0) {
              if (hdl->proc_index[j]<hdl->hlf_blk[j]) {
                blk_jinc = hdl->blk_size[j];
              } else if (hdl->proc_index[j] == hdl->hlf_blk[j]) {
                blk_jinc = hdl->blk_inc[j]%hdl->blk_size[j];
              } else {
                blk_jinc = 0;
              }
            }
            ldrem[j] += blk_jinc;
            l_offset += (clo[j]-blo[j]
                + ((blo[j]-1)/hdl->blk_dim[j])*hdl->blk_size[j])*jtot;
            jtot *= ldrem[j];
          }
          l_offset += (clo[last]-blo[last]
              + ((blo[last]-1)/hdl->blk_dim[last])*hdl->blk_size[last])*jtot;
        }
        /* get pointer to data on remote block */
        pinv = (hdl->iproc)%nproc;
        if (p_handle > 0) {
          pinv = PGRP_LIST[p_handle].inv_map_proc_list[pinv];
        }
        *prem =  GA[handle].ptr[pinv]+l_offset*GA[handle].elemsize;
        *proc = pinv;

        /* evaluate new offset for block */
        itmp = 1;
        for (j=0; j<ndim; j++) {
          itmp *= bhi[j]-blo[j]+1;
        }
        hdl->offset += itmp;
        /* increment to next block */
        hdl->index[0] += GA[handle].nblock[0];
        for (j=0; j<ndim; j++) {
          if (hdl->index[j] >= GA[handle].num_blocks[j] && j < ndim-1) {
            hdl->index[j] = hdl->proc_index[j];
            hdl->index[j+1] += GA[handle].nblock[j+1];
          }
        }
        if (hdl->index[ndim-1] >= GA[handle].num_blocks[ndim-1]) {
          hdl->iproc++;
          hdl->offset = 0;
          if (GA[handle].distr_type == TILED ||
              GA[handle].distr_type == TILED_IRREG) {
            gam_find_tile_proc_indices(handle, hdl->iproc, hdl->proc_index);
            gam_find_tile_proc_indices(handle, hdl->iproc, hdl->index);
          } else if (GA[handle].distr_type == SCALAPACK) {
            gam_find_proc_indices(handle, hdl->iproc, hdl->proc_index);
            gam_find_proc_indices(handle, hdl->iproc, hdl->index);
          }
        }
      }
    }
    return 1;
  }
  return 0;
}
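
#if 0
/* The per-dimension overlap test used in gai_iterator_next, restated as a
 * standalone predicate (an equivalent sketch, not part of the GA API): the
 * block [blo,bhi] overlaps the request [lo,hi] if an endpoint of the block
 * lies inside the request, or if the request lies entirely inside the block.
 * A block is skipped as soon as any dimension fails this test. */
static int example_intervals_overlap(Integer blo, Integer bhi,
                                     Integer lo, Integer hi)
{
  int check1 = (blo >= lo && blo <= hi) || (bhi >= lo && bhi <= hi);
  int check2 = (lo >= blo && lo <= bhi) && (hi >= blo && hi <= bhi);
  return check1 || check2;
}
#endif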

/**
 * Return true if this is the last data packet for this iterator
 */
int gai_iterator_last(_iterator_hdl *hdl)
{
  Integer idx;
  Integer handle = GA_OFFSET + hdl->g_a;
  Integer p_handle = GA[handle].p_handle;
  Integer n_rstrctd = GA[handle].num_rstrctd;
  Integer *rank_rstrctd = GA[handle].rank_rstrctd;
  Integer elemsize = GA[handle].elemsize;
  int ndim;
  int grp = GA[handle].p_handle;
  int nproc = pnga_pgroup_nnodes(grp);
  ndim = GA[handle].ndim;
  if (GA[handle].distr_type == REGULAR) {
    idx = hdl->count;
    /* no blocks left after this iteration */
    if (idx>=hdl->nproc) return 1;
  } else {
    if (GA[handle].distr_type == BLOCK_CYCLIC) {
      /* Simple block-cyclic distribution */
      if (hdl->iproc >= nproc) return 1;
    } else if (GA[handle].distr_type == SCALAPACK ||
        GA[handle].distr_type == TILED ||
        GA[handle].distr_type == TILED_IRREG) {
      if (hdl->iproc >= nproc) return 1;
    }
  }
  return 0;
}

/**
 * Clean up iterator
 */
void gai_iterator_destroy(_iterator_hdl *hdl)
{
    free(hdl->map);
    free(hdl->proclist);
    free(hdl->proclistperm);
}

/**
 * Functions that iterate over locally held blocks in a global array. These
 * are used in some routines such as copy
 */

/**
 * Initialize a local iterator
 * @param g_a global array handle
 * @param hdl handle for iterator
 */
void pnga_local_iterator_init(Integer g_a, _iterator_hdl *hdl)
{
  Integer handle = GA_OFFSET + g_a;
  Integer ndim = GA[handle].ndim;
  Integer grp = (Integer)GA[handle].p_handle;
  hdl->g_a = g_a;
  hdl->count = 0;
  hdl->oversize = 0;
  /* If standard GA distribution then no additional action needs to be taken */
  if (GA[handle].distr_type == BLOCK_CYCLIC) {
    /* GA uses simple block cyclic data distribution */
    hdl->count = pnga_pgroup_nodeid(grp);
  } else if (GA[handle].distr_type == SCALAPACK) {
    /* GA uses ScaLAPACK block cyclic data distribution */
    int j;
    Integer me = pnga_pgroup_nodeid(grp);
    /* Calculate some properties associated with data distribution */
    for (j=0; j<ndim; j++)  {
      hdl->blk_size[j] = GA[handle].block_dims[j];
      hdl->blk_num[j] = GA[handle].num_blocks[j];
      hdl->blk_inc[j] = GA[handle].nblock[j];
      hdl->blk_dim[j] = GA[handle].dims[j];
    }
    /* Initialize proc_index and index arrays */
    gam_find_proc_indices(handle, me, hdl->proc_index);
    gam_find_proc_indices(handle, me, hdl->index);
  } else if (GA[handle].distr_type == TILED) {
    /* GA uses tiled distribution */
    int j;
    Integer me = pnga_pgroup_nodeid(grp);
    /* Calculate some properties associated with data distribution */
    for (j=0; j<ndim; j++)  {
      hdl->blk_size[j] = GA[handle].block_dims[j];
      hdl->blk_num[j] = GA[handle].num_blocks[j];
      hdl->blk_inc[j] = GA[handle].nblock[j];
      hdl->blk_dim[j] = GA[handle].dims[j];
    }
    /* Initialize proc_index and index arrays */
    gam_find_tile_proc_indices(handle, me, hdl->proc_index);
    gam_find_tile_proc_indices(handle, me, hdl->index);
  } else if (GA[handle].distr_type == TILED_IRREG) {
    /* GA uses irregular tiled distribution */
    int j;
    Integer me = pnga_pgroup_nodeid(grp);
    hdl->mapc = GA[handle].mapc;
    /* Calculate some properties associated with data distribution */
    for (j=0; j<ndim; j++)  {
      hdl->blk_num[j] = GA[handle].num_blocks[j];
      hdl->blk_inc[j] = GA[handle].nblock[j];
      hdl->blk_dim[j] = GA[handle].dims[j];
    }
    /* Initialize proc_index and index arrays */
    gam_find_tile_proc_indices(handle, me, hdl->proc_index);
    gam_find_tile_proc_indices(handle, me, hdl->index);
  }
}

/**
 * Get the next sub-block from local portion of global array
 * @param hdl handle for iterator
 * @param plo indices for lower corner of block
 * @param phi indices for upper corner of block
 * @param ptr pointer to local buffer
 * @param ld array of strides for local block
 * @return returns false if there is no new block, true otherwise
 */
int pnga_local_iterator_next(_iterator_hdl *hdl, Integer plo[],
    Integer phi[], char **ptr, Integer ld[])
{
  Integer i;
  Integer handle = GA_OFFSET + hdl->g_a;
  Integer grp = GA[handle].p_handle;
  Integer elemsize = GA[handle].elemsize;
  int ndim;
  int me = pnga_pgroup_nodeid(grp);
  ndim = GA[handle].ndim;
  if (GA[handle].distr_type == REGULAR) {
    Integer nelems;
    /* no blocks left, so return */
    if (hdl->count>0) return 0;

    /* Find  visible portion of patch held by this processor and
     * return the result in plo and phi. Return pointer to local
     * data as well
     */
    pnga_distribution(hdl->g_a, me, plo, phi);
    /* Check to see if this process has any data. Return 0 if
     * it does not */
    for (i=0; i<ndim; i++) {
      if (phi[i]<plo[i]) return 0;
    }
    pnga_access_ptr(hdl->g_a,plo,phi,ptr,ld);
    hdl->count++;
  } else if (GA[handle].distr_type == BLOCK_CYCLIC) {
    /* Simple block-cyclic distribution */
    if (hdl->count >= pnga_total_blocks(hdl->g_a)) return 0;
    pnga_distribution(hdl->g_a,hdl->count,plo,phi);
    pnga_access_block_ptr(hdl->g_a,hdl->count,ptr,ld);
    hdl->count += pnga_pgroup_nnodes(grp);
  } else if (GA[handle].distr_type == SCALAPACK ||
      GA[handle].distr_type == TILED) {
    /* Scalapack-type data distribution */
    if (hdl->index[ndim-1] >= hdl->blk_num[ndim-1]) return 0;
    /* Find coordinates of bounding block */
    for (i=0; i<ndim; i++) {
      plo[i] = hdl->index[i]*hdl->blk_size[i]+1;
      phi[i] = (hdl->index[i]+1)*hdl->blk_size[i];
      if (phi[i] > hdl->blk_dim[i]) phi[i] = hdl->blk_dim[i];
    }
    pnga_access_block_grid_ptr(hdl->g_a,hdl->index,ptr,ld);
    hdl->index[0] += hdl->blk_inc[0];
    for (i=0; i<ndim; i++) {
      if (hdl->index[i] >= hdl->blk_num[i] && i<ndim-1) {
        hdl->index[i] = hdl->proc_index[i];
        hdl->index[i+1] += hdl->blk_inc[i+1];
      }
    }
  } else if (GA[handle].distr_type == TILED_IRREG) {
    /* Irregular tiled data distribution */
    Integer offset = 0;
    if (hdl->index[ndim-1] >= hdl->blk_num[ndim-1]) return 0;
    /* Find coordinates of bounding block */
    for (i=0; i<ndim; i++) {
      plo[i] = hdl->mapc[offset+hdl->index[i]];
      if (hdl->index[i] < hdl->blk_num[i]-1) {
        phi[i] = hdl->mapc[offset+hdl->index[i]+1]-1;
      } else {
        phi[i] = GA[handle].dims[i];
      }
      offset += GA[handle].num_blocks[i];
    }
    pnga_access_block_grid_ptr(hdl->g_a,hdl->index,ptr,ld);
    hdl->index[0] += hdl->blk_inc[0];
    for (i=0; i<ndim; i++) {
      if (hdl->index[i] >= hdl->blk_num[i] && i<ndim-1) {
        hdl->index[i] = hdl->proc_index[i];
        hdl->index[i+1] += hdl->blk_inc[i+1];
      }
    }
  }
  return 1;
}
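
#if 0
/* A minimal sketch of the local-iterator calling pattern (illustrative only,
 * not compiled; g_a and the loop body are hypothetical). Each call to
 * pnga_local_iterator_next yields one locally held block together with a
 * pointer to its data and its leading dimensions, independent of the
 * distribution type. No cleanup call is needed: the local initializer above
 * does not allocate memory. */
static void example_local_iteration(Integer g_a)
{
  _iterator_hdl it;
  Integer lo[MAXDIM], hi[MAXDIM], ld[MAXDIM];
  char *ptr;
  pnga_local_iterator_init(g_a, &it);
  while (pnga_local_iterator_next(&it, lo, hi, &ptr, ld)) {
    /* operate directly on the local patch [lo,hi] through ptr and ld */
  }
}
#endif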