1 /* $Id: iterator.c,v 1.80.2.18 2007/12/18 22:22:27 d3g293 Exp $ */
2 /*
3 * module: iterator.c
4 * author: Jarek Nieplocha
5 * description: implements an iterator that can be used for looping over blocks
6 * in a GA operation. This functionality is designed to hide
7 * the details of the data layout from the operation
8 *
9 * DISCLAIMER
10 *
11 * This material was prepared as an account of work sponsored by an
12 * agency of the United States Government. Neither the United States
13 * Government nor the United States Department of Energy, nor Battelle,
14 * nor any of their employees, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
15 * ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
16 * COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
17 * SOFTWARE, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT
18 * INFRINGE PRIVATELY OWNED RIGHTS.
19 *
20 *
21 * ACKNOWLEDGMENT
22 *
23 * This software and its documentation were produced with United States
24 * Government support under Contract Number DE-AC06-76RLO-1830 awarded by
25 * the United States Department of Energy. The United States Government
26 * retains a paid-up non-exclusive, irrevocable worldwide license to
27 * reproduce, prepare derivative works, perform publicly and display
28 * publicly by or for the US Government, including the right to
29 * distribute to other US Government contractors.
30 */
31 #if HAVE_CONFIG_H
32 # include "config.h"
33 #endif
34
35 #define MAX_INT_VALUE 2147483648
36
37 #define LARGE_BLOCK_REQ
38
39 #if HAVE_STDIO_H
40 # include <stdio.h>
41 #endif
42 #if HAVE_STRING_H
43 # include <string.h>
44 #endif
45 #if HAVE_STDLIB_H
46 # include <stdlib.h>
47 #endif
48 #if HAVE_STDINT_H
49 # include <stdint.h>
50 #endif
51 #if HAVE_MATH_H
52 # include <math.h>
53 #endif
54 #if HAVE_ASSERT_H
55 # include <assert.h>
56 #endif
57 #if HAVE_STDDEF_H
58 #include <stddef.h>
59 #endif
60
61 #include "global.h"
62 #include "globalp.h"
63 #include "base.h"
64 #include "ga_iterator.h"
65 #include "armci.h"
66 #include "macdecls.h"
67 #include "ga-papi.h"
68 #include "ga-wapi.h"
69 #include "thread-safe.h"
70
71 #define DEBUG 0
72 #ifdef PROFILE_OLD
73 #include "ga_profile.h"
74 #endif
75
/*\ prepare permuted list of processes for remote ops
 *
 * Fills ProcListPerm[0..nproc-1] with a permutation of 0..nproc-1 produced
 * by random pairwise swaps. Each process seeds rand() with its own rank
 * (GAme), so different processes generate different permutations but any
 * single process always generates the same one (deterministic per rank).
 * Errors out if nproc exceeds the total process count GAnproc.
 *
 * NOTE(review): unlike _i, the locals iswap and temp are not
 * underscore-prefixed and could shadow/collide with identifiers at the
 * expansion site -- verify callers.
\*/
#define gaPermuteProcList(nproc,ProcListPerm) \
{ \
 if((nproc) ==1) ProcListPerm[0]=0; \
 else{ \
 int _i, iswap, temp; \
 if((nproc) > GAnproc) pnga_error("permute_proc: error ", (nproc));\
 \
 /* every process generates different random sequence */ \
 (void)srand((unsigned)GAme); \
 \
 /* initialize list */ \
 for(_i=0; _i< (nproc); _i++) ProcListPerm[_i]=_i; \
 \
 /* list permutation generated by random swapping */ \
 for(_i=0; _i< (nproc); _i++){ \
 iswap = (int)(rand() % (nproc)); \
 temp = ProcListPerm[iswap]; \
 ProcListPerm[iswap] = ProcListPerm[_i]; \
 ProcListPerm[_i] = temp; \
 } \
 } \
}
100
/* Extract the patch bounds for the p-th entry of the map array produced by
 * pnga_locate_region: *plo and *phi are set to point at the lower and upper
 * index vectors (each ndim long) stored contiguously inside hdl->map.
 * No data is copied; the pointers alias iterator-owned storage.
 * NOTE(review): this macro reads a variable named `hdl` from the expansion
 * site -- it is not passed as a parameter, so it can only be used inside
 * functions that declare a local _iterator_hdl *hdl. */
#define gam_GetRangeFromMap(p, ndim, plo, phi){ \
 Integer _mloc = p* ndim *2; \
 *plo = hdl->map + _mloc; \
 *phi = *plo + ndim; \
}
106
/*\ Return pointer (ptr_loc) to location in memory of element with subscripts
 * (subscript). Also return physical dimensions of array in memory in ld.
 *
 * The offset computation accounts for ghost-cell padding: each dimension's
 * physical extent is (hi-lo+1) plus 2*width ghost cells, and the subscript
 * is shifted by width so that index lo maps just past the low-side ghosts.
 * For restricted arrays (num_rstrctd != 0) the data-owner rank is remapped
 * through rstrctd_list.
 * NOTE(review): the _p_handle != 0 test treats group 0 specially (only that
 * case goes through PGRP_LIST inv_map_proc_list) -- presumably 0 is the
 * mirrored/world special group; confirm against base.h before changing.
\*/
#define gam_Location(proc, g_handle, subscript, ptr_loc, ld) \
{ \
Integer _offset=0, _d, _w, _factor=1, _last=GA[g_handle].ndim-1; \
Integer _lo[MAXDIM], _hi[MAXDIM], _pinv, _p_handle; \
 \
 ga_ownsM(g_handle, proc, _lo, _hi); \
 gaCheckSubscriptM(subscript, _lo, _hi, GA[g_handle].ndim); \
 if(_last==0) ld[0]=_hi[0]- _lo[0]+1+2*(Integer)GA[g_handle].width[0]; \
 __CRAYX1_PRAGMA("_CRI shortloop"); \
 for(_d=0; _d < _last; _d++) { \
 _w = (Integer)GA[g_handle].width[_d]; \
 _offset += (subscript[_d]-_lo[_d]+_w) * _factor; \
 ld[_d] = _hi[_d] - _lo[_d] + 1 + 2*_w; \
 _factor *= ld[_d]; \
 } \
 _offset += (subscript[_last]-_lo[_last] \
 + (Integer)GA[g_handle].width[_last]) \
 * _factor; \
 _p_handle = GA[g_handle].p_handle; \
 if (_p_handle != 0) { \
 if (GA[g_handle].num_rstrctd == 0) { \
 _pinv = proc; \
 } else { \
 _pinv = GA[g_handle].rstrctd_list[proc]; \
 } \
 } else { \
 _pinv = PGRP_LIST[_p_handle].inv_map_proc_list[proc]; \
 } \
 *(ptr_loc) = GA[g_handle].ptr[_pinv]+_offset*GA[g_handle].elemsize; \
}
140
/* Function form of the gam_Location macro: compute the address (*ptr_loc)
 * of the element with global indices subscript[] within the data held by
 * proc, and fill ld[] with the physical (ghost-padded) leading dimensions.
 * Behavior is identical to gam_Location. */
void gam_LocationF(int proc, Integer g_handle, Integer subscript[],
                   char **ptr_loc, Integer ld[])
{
  Integer lo[MAXDIM], hi[MAXDIM];
  Integer offset = 0, stride = 1;
  Integer dim, width, pinv, p_handle;
  Integer last = GA[g_handle].ndim - 1;

  /* bounds of the patch owned by proc */
  ga_ownsM(g_handle, proc, lo, hi);
  gaCheckSubscriptM(subscript, lo, hi, GA[g_handle].ndim);
  /* 1D case: loop below never runs, so set ld[0] here */
  if (last == 0) ld[0] = hi[0] - lo[0] + 1 + 2*(Integer)GA[g_handle].width[0];
  __CRAYX1_PRAGMA("_CRI shortloop");
  for (dim = 0; dim < last; dim++) {
    width = (Integer)GA[g_handle].width[dim];
    /* shift by ghost width so index lo lands past the low-side ghosts */
    offset += (subscript[dim] - lo[dim] + width) * stride;
    ld[dim] = hi[dim] - lo[dim] + 1 + 2*width;
    stride *= ld[dim];
  }
  offset += (subscript[last] - lo[last]
          + (Integer)GA[g_handle].width[last]) * stride;
  /* map proc to the rank that actually owns the data pointer */
  p_handle = GA[g_handle].p_handle;
  if (p_handle == 0) {
    pinv = PGRP_LIST[p_handle].inv_map_proc_list[proc];
  } else if (GA[g_handle].num_rstrctd == 0) {
    pinv = proc;
  } else {
    pinv = GA[g_handle].rstrctd_list[proc];
  }
  *(ptr_loc) = GA[g_handle].ptr[pinv] + offset*GA[g_handle].elemsize;
}
172
/* Compute the intersection of the requested patch [lo,hi] with the block
 * [plo,phi] and return it in [blo,bhi], dimension by dimension. Assumes the
 * caller has already verified that the two regions overlap in every
 * dimension; if they do not, the result is not a valid (lo<=hi) patch. */
#define gam_GetBlockPatch(plo,phi,lo,hi,blo,bhi,ndim) { \
 Integer _d; \
 for (_d=0; _d<ndim; _d++) { \
 if (lo[_d] <= phi[_d] && lo[_d] >= plo[_d]) blo[_d] = lo[_d]; \
 else blo[_d] = plo[_d]; \
 if (hi[_d] <= phi[_d] && hi[_d] >= plo[_d]) bhi[_d] = hi[_d]; \
 else bhi[_d] = phi[_d]; \
 } \
}
182
183 /**
184 * Initialize an iterator handle
185 * @param g_a global array handle
186 * @param lo indices for lower corner of block in global array
187 * @param hi indices for upper corner of block in global array
188 * @param hdl handle for iterator
189 */
gai_iterator_init(Integer g_a,Integer lo[],Integer hi[],_iterator_hdl * hdl)190 void gai_iterator_init(Integer g_a, Integer lo[], Integer hi[],
191 _iterator_hdl *hdl)
192 {
193 Integer handle = GA_OFFSET + g_a;
194 Integer ndim = GA[handle].ndim;
195 int grp = GA[handle].p_handle;
196 int nproc = pnga_pgroup_nnodes(grp);
197 Integer i;
198 hdl->g_a = g_a;
199 hdl->count = 0;
200 hdl->oversize = 0;
201 hdl->map = malloc((size_t)(nproc*2*MAXDIM+1)*sizeof(Integer));
202 hdl->proclist = malloc(nproc*sizeof(Integer));;
203 hdl->proclistperm = malloc(nproc*sizeof(int));
204 for (i=0; i<ndim; i++) {
205 hdl->lo[i] = lo[i];
206 hdl->hi[i] = hi[i];
207 }
208 /* Standard GA distribution */
209 if (GA[handle].distr_type == REGULAR) {
210 /* Locate the processors containing some portion of the patch
211 * specified by lo and hi and return the results in hdl->map,
212 * hdl->proclist, and np. hdl->proclist contains a list of processors
213 * containing some portion of the patch, hdl->map contains
214 * the lower and upper indices of the portion of the patch held
215 * by a given processor, and np contains the total number of
216 * processors that contain some portion of the patch.
217 */
218 if(!pnga_locate_region(g_a, lo, hi, hdl->map, hdl->proclist, &hdl->nproc ))
219 ga_RegionError(pnga_ndim(g_a), lo, hi, g_a);
220
221 gaPermuteProcList(hdl->nproc, hdl->proclistperm);
222
223 /* Block-cyclic distribution */
224 } else if (GA[handle].distr_type == BLOCK_CYCLIC) {
225 /* GA uses simple block cyclic data distribution */
226 hdl->iproc = 0;
227 hdl->iblock = pnga_nodeid();
228 } else if (GA[handle].distr_type == SCALAPACK) {
229 /* GA uses ScaLAPACK block cyclic data distribution */
230 int j;
231 C_Integer *block_dims;
232 /* Scalapack-type block-cyclic data distribution */
233 /* Calculate some properties associated with data distribution */
234 int *proc_grid = GA[handle].nblock;
235 /*num_blocks = GA[handle].num_blocks;*/
236 block_dims = GA[handle].block_dims;
237 for (j=0; j<ndim; j++) {
238 hdl->blk_size[j] = block_dims[j];
239 hdl->blk_dim[j] = block_dims[j]*proc_grid[j];
240 hdl->blk_num[j] = GA[handle].dims[j]/hdl->blk_dim[j];
241 hdl->blk_inc[j] = GA[handle].dims[j]-hdl->blk_num[j]*hdl->blk_dim[j];
242 hdl->blk_ld[j] = hdl->blk_num[j]*block_dims[j];
243 hdl->hlf_blk[j] = hdl->blk_inc[j]/block_dims[j];
244 }
245 hdl->iproc = 0;
246 hdl->offset = 0;
247 /* Initialize proc_index and index arrays */
248 gam_find_proc_indices(handle, hdl->iproc, hdl->proc_index);
249 gam_find_proc_indices(handle, hdl->iproc, hdl->index);
250 } else if (GA[handle].distr_type == TILED) {
251 int j;
252 C_Integer *block_dims;
253 hdl->iproc = 0;
254 hdl->offset = 0;
255 block_dims = GA[handle].block_dims;
256 for (j=0; j<ndim; j++) {
257 hdl->blk_size[j] = block_dims[j];
258 }
259 /* Initialize proc_index and index arrays */
260 gam_find_tile_proc_indices(handle, hdl->iproc, hdl->proc_index);
261 gam_find_tile_proc_indices(handle, hdl->iproc, hdl->index);
262 } else if (GA[handle].distr_type == TILED_IRREG) {
263 int j;
264 hdl->iproc = 0;
265 hdl->offset = 0;
266 hdl->mapc = GA[handle].mapc;
267 /* Initialize proc_index and index arrays */
268 gam_find_tile_proc_indices(handle, hdl->iproc, hdl->proc_index);
269 gam_find_tile_proc_indices(handle, hdl->iproc, hdl->index);
270 }
271 }
272
273 /**
274 * Reset an iterator back to the start
275 * @param hdl handle for iterator
276 */
gai_iterator_reset(_iterator_hdl * hdl)277 void gai_iterator_reset(_iterator_hdl *hdl)
278 {
279 Integer handle = GA_OFFSET + hdl->g_a;
280 if (GA[handle].distr_type == REGULAR) {
281 /* Regular data distribution */
282 hdl->count = 0;
283 } else if (GA[handle].distr_type == BLOCK_CYCLIC) {
284 /* simple block cyclic data distribution */
285 hdl->iproc = 0;
286 hdl->iblock = 0;
287 hdl->offset = 0;
288 } else if (GA[handle].distr_type == SCALAPACK) {
289 hdl->iproc = 0;
290 hdl->offset = 0;
291 /* Initialize proc_index and index arrays */
292 gam_find_proc_indices(handle, hdl->iproc, hdl->proc_index);
293 gam_find_proc_indices(handle, hdl->iproc, hdl->index);
294 } else if (GA[handle].distr_type == TILED ||
295 GA[handle].distr_type == TILED_IRREG) {
296 hdl->iproc = 0;
297 hdl->offset = 0;
298 /* Initialize proc_index and index arrays */
299 gam_find_tile_proc_indices(handle, hdl->iproc, hdl->proc_index);
300 gam_find_tile_proc_indices(handle, hdl->iproc, hdl->index);
301 }
302 }
303
304 /**
305 * Get the next sub-block from the larger block defined when the iterator was
306 * initialized
307 * @param hdl handle for iterator
308 * @param proc processor on which the next block resides
309 * @param plo indices for lower corner of remote block
310 * @param phi indices for upper corner of remote block
311 * @param prem pointer to remote buffer
312 * @return returns false if there is no new block, true otherwise
313 */
gai_iterator_next(_iterator_hdl * hdl,int * proc,Integer * plo[],Integer * phi[],char ** prem,Integer ldrem[])314 int gai_iterator_next(_iterator_hdl *hdl, int *proc, Integer *plo[],
315 Integer *phi[], char **prem, Integer ldrem[])
316 {
317 Integer idx, i, p;
318 Integer handle = GA_OFFSET + hdl->g_a;
319 Integer p_handle = GA[handle].p_handle;
320 Integer n_rstrctd = GA[handle].num_rstrctd;
321 Integer *rank_rstrctd = GA[handle].rank_rstrctd;
322 Integer elemsize = GA[handle].elemsize;
323 int ndim;
324 int grp = GA[handle].p_handle;
325 int nproc = pnga_pgroup_nnodes(grp);
326 int ok;
327 ndim = GA[handle].ndim;
328 if (GA[handle].distr_type == REGULAR) {
329 Integer *blo, *bhi;
330 Integer nelems;
331 idx = hdl->count;
332
333 /* Check to see if range is valid (it may not be valid if user has
334 * created and irregular distribution in which some processors do not have
335 * data). If invalid, skip this block and go to the next one
336 */
337 ok = 0;
338 while(!ok) {
339 /* no blocks left, so return */
340 if (idx>=hdl->nproc) return 0;
341 p = (Integer)hdl->proclistperm[idx];
342 *proc = (int)hdl->proclist[p];
343 if (p_handle >= 0) {
344 *proc = (int)PGRP_LIST[p_handle].inv_map_proc_list[*proc];
345 }
346 /* Find visible portion of patch held by processor p and
347 * return the result in plo and phi. Also get actual processor
348 * index corresponding to p and store the result in proc.
349 */
350 gam_GetRangeFromMap(p, ndim, plo, phi);
351 ok = 1;
352 for (i=0; i<ndim; i++) {
353 if ((*phi)[i]<(*plo)[i]) {
354 ok = 0;
355 break;
356 }
357 }
358 if (ok) {
359 *proc = (int)hdl->proclist[p];
360 blo = *plo;
361 bhi = *phi;
362 #ifdef LARGE_BLOCK_REQ
363 /* Check to see if block size will overflow int values and initialize
364 * counter over sub-blocks if the block is too big*/
365 if (!hdl->oversize) {
366 nelems = 1;
367 for (i=0; i<ndim; i++) nelems *= (bhi[i]-blo[i]+1);
368 if (elemsize*nelems > MAX_INT_VALUE) {
369 Integer maxint = 0;
370 int maxidx;
371 hdl->oversize = 1;
372 /* Figure out block dimensions that correspond to block sizes
373 * that are beneath MAX_INT_VALUE */
374 for (i=0; i<ndim; i++) {
375 hdl->blk_size[i] = (bhi[i]-blo[i]+1);
376 }
377 while (elemsize*nelems > MAX_INT_VALUE) {
378 for (i=0; i<ndim; i++) {
379 if (hdl->blk_size[i] > maxint) {
380 maxidx = i;
381 maxint = hdl->blk_size[i];
382 }
383 }
384 hdl->blk_size[maxidx] /= 2;
385 nelems = 1;
386 for (i=0; i<ndim; i++) nelems *= hdl->blk_size[i];
387 }
388 /* Calculate the number of blocks along each dimension */
389 for (i=0; i<ndim; i++) {
390 hdl->blk_dim[i] = (bhi[i]-blo[i]+1)/hdl->blk_size[i];
391 if (hdl->blk_dim[i]*hdl->blk_size[i] < (bhi[i]-blo[i]+1))
392 hdl->blk_dim[i]++;
393 }
394 /* initialize block counting */
395 for (i=0; i<ndim; i++) hdl->blk_inc[i] = 0;
396 }
397 }
398
399 /* Get sub-block bounding dimensions */
400 if (hdl->oversize) {
401 Integer tmp;
402 for (i=0; i<ndim; i++) {
403 hdl->lobuf[i] = blo[i];
404 hdl->hibuf[i] = bhi[i];
405 }
406 *plo = hdl->lobuf;
407 *phi = hdl->hibuf;
408 blo = *plo;
409 bhi = *phi;
410 for (i=0; i<ndim; i++) {
411 hdl->lobuf[i] += hdl->blk_inc[i]*hdl->blk_size[i];
412 tmp = hdl->lobuf[i] + hdl->blk_size[i]-1;
413 if (tmp < hdl->hibuf[i]) hdl->hibuf[i] = tmp;
414 }
415 }
416 #endif
417
418 if (n_rstrctd == 0) {
419 gam_Location(*proc, handle, blo, prem, ldrem);
420 } else {
421 gam_Location(rank_rstrctd[*proc], handle, blo, prem, ldrem);
422 }
423 if (p_handle >= 0) {
424 *proc = (int)hdl->proclist[p];
425 /* BJP */
426 *proc = PGRP_LIST[p_handle].inv_map_proc_list[*proc];
427 }
428 }
429 #ifdef LARGE_BLOCK_REQ
430 if (!hdl->oversize) {
431 #endif
432 hdl->count++;
433 idx = hdl->count;
434 #ifdef LARGE_BLOCK_REQ
435 } else {
436 /* update blk_inc array */
437 hdl->blk_inc[0]++;
438 for (i=0; i<ndim-1; i++) {
439 if (hdl->blk_inc[i] >= hdl->blk_dim[i]) {
440 hdl->blk_inc[i] = 0;
441 hdl->blk_inc[i+1]++;
442 }
443 }
444 if (hdl->blk_inc[ndim-1] >= hdl->blk_dim[ndim-1]) {
445 hdl->count++;
446 idx = hdl->count;
447 hdl->oversize = 0;
448 }
449 }
450 #endif
451 }
452 return 1;
453 } else {
454 Integer offset, l_offset, last, pinv;
455 Integer blk_tot = GA[handle].block_total;
456 Integer blo[MAXDIM], bhi[MAXDIM];
457 Integer idx, j, jtot, chk, iproc;
458 int check1, check2;
459 if (GA[handle].distr_type == BLOCK_CYCLIC) {
460 /* Simple block-cyclic distribution */
461 if (hdl->iproc >= nproc) return 0;
462 /*if (hdl->iproc == nproc-1 && hdl->iblock >= blk_tot) return 0;*/
463 if (hdl->iblock == hdl->iproc) hdl->offset = 0;
464 chk = 0;
465 /* loop over blocks until a block with data is found */
466 while (!chk) {
467 /* get the block corresponding to the current value of iblock */
468 idx = hdl->iblock;
469 ga_ownsM(handle,idx,blo,bhi);
470 /* check to see if this block overlaps with requested block
471 * defined by lo and hi */
472 chk = 1;
473 for (j=0; j<ndim; j++) {
474 /* check to see if at least one end point of the interval
475 * represented by blo and bhi falls in the interval
476 * represented by lo and hi */
477 check1 = ((blo[j] >= hdl->lo[j] && blo[j] <= hdl->hi[j]) ||
478 (bhi[j] >= hdl->lo[j] && bhi[j] <= hdl->hi[j]));
479 /* check to see if interval represented by lo and hi
480 * falls entirely within interval represented by blo and bhi */
481 check2 = ((hdl->lo[j] >= blo[j] && hdl->lo[j] <= bhi[j]) &&
482 (hdl->hi[j] >= blo[j] && hdl->hi[j] <= bhi[j]));
483 /* If there is some data, move to the next section of code,
484 * otherwise, check next block */
485 if (!check1 && !check2) {
486 chk = 0;
487 }
488 }
489
490 if (!chk) {
491 /* evaluate new offset for block idx */
492 jtot = 1;
493 for (j=0; j<ndim; j++) {
494 jtot *= bhi[j]-blo[j]+1;
495 }
496 hdl->offset += jtot;
497 /* increment to next block */
498 hdl->iblock += pnga_nnodes();
499 if (hdl->iblock >= blk_tot) {
500 hdl->offset = 0;
501 hdl->iproc++;
502 hdl->iblock = hdl->iproc;
503 if (hdl->iproc >= nproc) return 0;
504 }
505 }
506 }
507
508 /* The block overlaps some data in lo,hi */
509 if (chk) {
510 Integer *clo, *chi;
511 *plo = hdl->lobuf;
512 *phi = hdl->hibuf;
513 clo = *plo;
514 chi = *phi;
515 /* get the patch of block that overlaps requested region */
516 gam_GetBlockPatch(blo,bhi,hdl->lo,hdl->hi,clo,chi,ndim);
517
518 /* evaluate offset within block */
519 last = ndim - 1;
520 jtot = 1;
521 if (last == 0) ldrem[0] = bhi[0] - blo[0] + 1;
522 l_offset = 0;
523 for (j=0; j<last; j++) {
524 l_offset += (clo[j]-blo[j])*jtot;
525 ldrem[j] = bhi[j]-blo[j]+1;
526 jtot *= ldrem[j];
527 }
528 l_offset += (clo[last]-blo[last])*jtot;
529 l_offset += hdl->offset;
530
531 /* get pointer to data on remote block */
532 pinv = idx%nproc;
533 if (p_handle > 0) {
534 pinv = PGRP_LIST[p_handle].inv_map_proc_list[pinv];
535 }
536 *prem = GA[handle].ptr[pinv]+l_offset*GA[handle].elemsize;
537 *proc = pinv;
538
539 /* evaluate new offset for block idx */
540 jtot = 1;
541 for (j=0; j<ndim; j++) {
542 jtot *= bhi[j]-blo[j]+1;
543 }
544 hdl->offset += jtot;
545
546 hdl->iblock += pnga_nnodes();
547 if (hdl->iblock >= blk_tot) {
548 hdl->iproc++;
549 hdl->iblock = hdl->iproc;
550 hdl->offset = 0;
551 }
552 }
553 return 1;
554 } else if (GA[handle].distr_type == SCALAPACK ||
555 GA[handle].distr_type == TILED ||
556 GA[handle].distr_type == TILED_IRREG) {
557 /* Scalapack-type data distribution */
558 Integer proc_index[MAXDIM], index[MAXDIM];
559 Integer itmp;
560 Integer blk_jinc;
561 /* Return false at the end of the iteration */
562 if (hdl->iproc >= nproc) return 0;
563 chk = 0;
564 /* loop over blocks until a block with data is found */
565 while (!chk) {
566 /* get bounds for current block */
567 if (GA[handle].distr_type == SCALAPACK ||
568 GA[handle].distr_type == TILED) {
569 for (j = 0; j < ndim; j++) {
570 blo[j] = hdl->blk_size[j]*(hdl->index[j])+1;
571 bhi[j] = hdl->blk_size[j]*(hdl->index[j]+1);
572 if (bhi[j] > GA[handle].dims[j]) bhi[j] = GA[handle].dims[j];
573 }
574 } else {
575 offset = 0;
576 for (j = 0; j < ndim; j++) {
577 blo[j] = GA[handle].mapc[offset+hdl->index[j]];
578 if (hdl->index[j] == GA[handle].num_blocks[j]-1) {
579 bhi[j] = GA[handle].dims[j];
580 } else {
581 bhi[j] = GA[handle].mapc[offset+hdl->index[j]+1]-1;
582 }
583 offset += GA[handle].num_blocks[j];
584 }
585 }
586 /* check to see if this block overlaps with requested block
587 * defined by lo and hi */
588 chk = 1;
589 for (j=0; j<ndim; j++) {
590 /* check to see if at least one end point of the interval
591 * represented by blo and bhi falls in the interval
592 * represented by lo and hi */
593 check1 = ((blo[j] >= hdl->lo[j] && blo[j] <= hdl->hi[j]) ||
594 (bhi[j] >= hdl->lo[j] && bhi[j] <= hdl->hi[j]));
595 /* check to see if interval represented by lo and hi
596 * falls entirely within interval represented by blo and bhi */
597 check2 = ((hdl->lo[j] >= blo[j] && hdl->lo[j] <= bhi[j]) &&
598 (hdl->hi[j] >= blo[j] && hdl->hi[j] <= bhi[j]));
599 /* If there is some data, move to the next section of code,
600 * otherwise, check next block */
601 if (!check1 && !check2) {
602 chk = 0;
603 }
604 }
605
606 if (!chk) {
607 /* update offset for block */
608 itmp = 1;
609 for (j=0; j<ndim; j++) {
610 itmp *= bhi[j]-blo[j]+1;
611 }
612 hdl->offset += itmp;
613
614 /* increment to next block */
615 hdl->index[0] += GA[handle].nblock[0];
616 for (j=0; j<ndim; j++) {
617 if (hdl->index[j] >= GA[handle].num_blocks[j] && j < ndim-1) {
618 hdl->index[j] = hdl->proc_index[j];
619 hdl->index[j+1] += GA[handle].nblock[j+1];
620 }
621 }
622 if (hdl->index[ndim-1] >= GA[handle].num_blocks[ndim-1]) {
623 /* last iteration has been completed on current processor. Go
624 * to next processor */
625 hdl->iproc++;
626 if (hdl->iproc >= nproc) return 0;
627 hdl->offset = 0;
628 if (GA[handle].distr_type == TILED ||
629 GA[handle].distr_type == TILED_IRREG) {
630 gam_find_tile_proc_indices(handle, hdl->iproc, hdl->proc_index);
631 gam_find_tile_proc_indices(handle, hdl->iproc, hdl->index);
632 } else if (GA[handle].distr_type == SCALAPACK) {
633 gam_find_proc_indices(handle, hdl->iproc, hdl->proc_index);
634 gam_find_proc_indices(handle, hdl->iproc, hdl->index);
635 }
636 }
637 }
638 }
639 if (chk) {
640 Integer *clo, *chi;
641 *plo = hdl->lobuf;
642 *phi = hdl->hibuf;
643 clo = *plo;
644 chi = *phi;
645 /* get the patch of block that overlaps requested region */
646 gam_GetBlockPatch(blo,bhi,hdl->lo,hdl->hi,clo,chi,ndim);
647
648 /* evaluate offset within block */
649 last = ndim - 1;
650 if (GA[handle].distr_type == TILED ||
651 GA[handle].distr_type == TILED_IRREG) {
652 jtot = 1;
653 if (last == 0) ldrem[0] = bhi[0] - blo[0] + 1;
654 l_offset = 0;
655 for (j=0; j<last; j++) {
656 l_offset += (clo[j]-blo[j])*jtot;
657 ldrem[j] = bhi[j]-blo[j]+1;
658 jtot *= ldrem[j];
659 }
660 l_offset += (clo[last]-blo[last])*jtot;
661 l_offset += hdl->offset;
662 } else if (GA[handle].distr_type == SCALAPACK) {
663 l_offset = 0;
664 jtot = 1;
665 for (j=0; j<last; j++) {
666 ldrem[j] = hdl->blk_ld[j];
667 blk_jinc = GA[handle].dims[j]%hdl->blk_size[j];
668 if (hdl->blk_inc[j] > 0) {
669 if (hdl->proc_index[j]<hdl->hlf_blk[j]) {
670 blk_jinc = hdl->blk_size[j];
671 } else if (hdl->proc_index[j] == hdl->hlf_blk[j]) {
672 blk_jinc = hdl->blk_inc[j]%hdl->blk_size[j];
673 } else {
674 blk_jinc = 0;
675 }
676 }
677 ldrem[j] += blk_jinc;
678 l_offset += (clo[j]-blo[j]
679 + ((blo[j]-1)/hdl->blk_dim[j])*hdl->blk_size[j])*jtot;
680 jtot *= ldrem[j];
681 }
682 l_offset += (clo[last]-blo[last]
683 + ((blo[last]-1)/hdl->blk_dim[j])*hdl->blk_size[last])*jtot;
684 }
685 /* get pointer to data on remote block */
686 pinv = (hdl->iproc)%nproc;
687 if (p_handle > 0) {
688 pinv = PGRP_LIST[p_handle].inv_map_proc_list[pinv];
689 }
690 *prem = GA[handle].ptr[pinv]+l_offset*GA[handle].elemsize;
691 *proc = pinv;
692
693 /* evaluate new offset for block */
694 itmp = 1;
695 for (j=0; j<ndim; j++) {
696 itmp *= bhi[j]-blo[j]+1;
697 }
698 hdl->offset += itmp;
699 /* increment to next block */
700 hdl->index[0] += GA[handle].nblock[0];
701 for (j=0; j<ndim; j++) {
702 if (hdl->index[j] >= GA[handle].num_blocks[j] && j < ndim-1) {
703 hdl->index[j] = hdl->proc_index[j];
704 hdl->index[j+1] += GA[handle].nblock[j+1];
705 }
706 }
707 if (hdl->index[ndim-1] >= GA[handle].num_blocks[ndim-1]) {
708 hdl->iproc++;
709 hdl->offset = 0;
710 if (GA[handle].distr_type == TILED ||
711 GA[handle].distr_type == TILED_IRREG) {
712 gam_find_tile_proc_indices(handle, hdl->iproc, hdl->proc_index);
713 gam_find_tile_proc_indices(handle, hdl->iproc, hdl->index);
714 } else if (GA[handle].distr_type == SCALAPACK) {
715 gam_find_proc_indices(handle, hdl->iproc, hdl->proc_index);
716 gam_find_proc_indices(handle, hdl->iproc, hdl->index);
717 }
718 }
719 }
720 }
721 return 1;
722 }
723 return 0;
724 }
725
726 /**
727 * Return true if this is the last data packet for this iterator
728 */
gai_iterator_last(_iterator_hdl * hdl)729 int gai_iterator_last(_iterator_hdl *hdl)
730 {
731 Integer idx;
732 Integer handle = GA_OFFSET + hdl->g_a;
733 Integer p_handle = GA[handle].p_handle;
734 Integer n_rstrctd = GA[handle].num_rstrctd;
735 Integer *rank_rstrctd = GA[handle].rank_rstrctd;
736 Integer elemsize = GA[handle].elemsize;
737 int ndim;
738 int grp = GA[handle].p_handle;
739 int nproc = pnga_pgroup_nnodes(grp);
740 ndim = GA[handle].ndim;
741 if (GA[handle].distr_type == REGULAR) {
742 idx = hdl->count;
743 /* no blocks left after this iteration */
744 if (idx>=hdl->nproc) return 1;
745 } else {
746 if (GA[handle].distr_type == BLOCK_CYCLIC) {
747 /* Simple block-cyclic distribution */
748 if (hdl->iproc >= nproc) return 1;
749 } else if (GA[handle].distr_type == SCALAPACK ||
750 GA[handle].distr_type == TILED ||
751 GA[handle].distr_type == TILED_IRREG) {
752 if (hdl->iproc >= nproc) return 1;
753 }
754 }
755 return 0;
756 }
757
758 /**
759 * Clean up iterator
760 */
gai_iterator_destroy(_iterator_hdl * hdl)761 void gai_iterator_destroy(_iterator_hdl *hdl)
762 {
763 free(hdl->map);
764 free(hdl->proclist);
765 free(hdl->proclistperm);
766 }
767
768 /**
769 * Functions that iterate over locally held blocks in a global array. These
770 * are used in some routines such as copy
771 */
772
773 /**
774 * Initialize a local iterator
775 * @param g_a global array handle
776 * @param hdl handle for iterator
777 */
pnga_local_iterator_init(Integer g_a,_iterator_hdl * hdl)778 void pnga_local_iterator_init(Integer g_a, _iterator_hdl *hdl)
779 {
780 Integer handle = GA_OFFSET + g_a;
781 Integer ndim = GA[handle].ndim;
782 Integer grp = (Integer)GA[handle].p_handle;
783 hdl->g_a = g_a;
784 hdl->count = 0;
785 hdl->oversize = 0;
786 /* If standard GA distribution then no additional action needs to be taken */
787 if (GA[handle].distr_type == BLOCK_CYCLIC) {
788 /* GA uses simple block cyclic data distribution */
789 hdl->count = pnga_pgroup_nodeid(grp);
790 } else if (GA[handle].distr_type == SCALAPACK) {
791 /* GA uses ScaLAPACK block cyclic data distribution */
792 int j;
793 Integer me = pnga_pgroup_nodeid(grp);
794 /* Calculate some properties associated with data distribution */
795 for (j=0; j<ndim; j++) {
796 hdl->blk_size[j] = GA[handle].block_dims[j];
797 hdl->blk_num[j] = GA[handle].num_blocks[j];
798 hdl->blk_inc[j] = GA[handle].nblock[j];
799 hdl->blk_dim[j] = GA[handle].dims[j];
800 }
801 /* Initialize proc_index and index arrays */
802 gam_find_proc_indices(handle, me, hdl->proc_index);
803 gam_find_proc_indices(handle, me, hdl->index);
804 } else if (GA[handle].distr_type == TILED) {
805 /* GA uses tiled distribution */
806 int j;
807 Integer me = pnga_pgroup_nodeid(grp);
808 /* Calculate some properties associated with data distribution */
809 for (j=0; j<ndim; j++) {
810 hdl->blk_size[j] = GA[handle].block_dims[j];
811 hdl->blk_num[j] = GA[handle].num_blocks[j];
812 hdl->blk_inc[j] = GA[handle].nblock[j];
813 hdl->blk_dim[j] = GA[handle].dims[j];
814 }
815 /* Initialize proc_index and index arrays */
816 gam_find_tile_proc_indices(handle, me, hdl->proc_index);
817 gam_find_tile_proc_indices(handle, me, hdl->index);
818 } else if (GA[handle].distr_type == TILED_IRREG) {
819 /* GA uses irregular tiled distribution */
820 int j;
821 Integer me = pnga_pgroup_nodeid(grp);
822 hdl->mapc = GA[handle].mapc;
823 /* Calculate some properties associated with data distribution */
824 for (j=0; j<ndim; j++) {
825 hdl->blk_num[j] = GA[handle].num_blocks[j];
826 hdl->blk_inc[j] = GA[handle].nblock[j];
827 hdl->blk_dim[j] = GA[handle].dims[j];
828 }
829 /* Initialize proc_index and index arrays */
830 gam_find_tile_proc_indices(handle, me, hdl->proc_index);
831 gam_find_tile_proc_indices(handle, me, hdl->index);
832 }
833 }
834
835 /**
836 * Get the next sub-block from local portion of global array
837 * @param hdl handle for iterator
838 * @param plo indices for lower corner of block
839 * @param phi indices for upper corner of block
840 * @param ptr pointer to local buffer
841 * @param ld array of strides for local block
842 * @return returns false if there is no new block, true otherwise
843 */
pnga_local_iterator_next(_iterator_hdl * hdl,Integer plo[],Integer phi[],char ** ptr,Integer ld[])844 int pnga_local_iterator_next(_iterator_hdl *hdl, Integer plo[],
845 Integer phi[], char **ptr, Integer ld[])
846 {
847 Integer i;
848 Integer handle = GA_OFFSET + hdl->g_a;
849 Integer grp = GA[handle].p_handle;
850 Integer elemsize = GA[handle].elemsize;
851 int ndim;
852 int me = pnga_pgroup_nodeid(grp);
853 ndim = GA[handle].ndim;
854 if (GA[handle].distr_type == REGULAR) {
855 Integer nelems;
856 /* no blocks left, so return */
857 if (hdl->count>0) return 0;
858
859 /* Find visible portion of patch held by this processor and
860 * return the result in plo and phi. Return pointer to local
861 * data as well
862 */
863 pnga_distribution(hdl->g_a, me, plo, phi);
864 /* Check to see if this process has any data. Return 0 if
865 * it does not */
866 for (i=0; i<ndim; i++) {
867 if (phi[i]<plo[i]) return 0;
868 }
869 pnga_access_ptr(hdl->g_a,plo,phi,ptr,ld);
870 hdl->count++;
871 } else if (GA[handle].distr_type == BLOCK_CYCLIC) {
872 /* Simple block-cyclic distribution */
873 if (hdl->count >= pnga_total_blocks(hdl->g_a)) return 0;
874 pnga_distribution(hdl->g_a,hdl->count,plo,phi);
875 pnga_access_block_ptr(hdl->g_a,hdl->count,ptr,ld);
876 hdl->count += pnga_pgroup_nnodes(grp);
877 } else if (GA[handle].distr_type == SCALAPACK ||
878 GA[handle].distr_type == TILED) {
879 /* Scalapack-type data distribution */
880 if (hdl->index[ndim-1] >= hdl->blk_num[ndim-1]) return 0;
881 /* Find coordinates of bounding block */
882 for (i=0; i<ndim; i++) {
883 plo[i] = hdl->index[i]*hdl->blk_size[i]+1;
884 phi[i] = (hdl->index[i]+1)*hdl->blk_size[i];
885 if (phi[i] > hdl->blk_dim[i]) phi[i] = hdl->blk_dim[i];
886 }
887 pnga_access_block_grid_ptr(hdl->g_a,hdl->index,ptr,ld);
888 hdl->index[0] += hdl->blk_inc[0];
889 for (i=0; i<ndim; i++) {
890 if (hdl->index[i] >= hdl->blk_num[i] && i<ndim-1) {
891 hdl->index[i] = hdl->proc_index[i];
892 hdl->index[i+1] += hdl->blk_inc[i+1];
893 }
894 }
895 } else if (GA[handle].distr_type == TILED_IRREG) {
896 /* Irregular tiled data distribution */
897 Integer offset = 0;
898 if (hdl->index[ndim-1] >= hdl->blk_num[ndim-1]) return 0;
899 /* Find coordinates of bounding block */
900 for (i=0; i<ndim; i++) {
901 plo[i] = hdl->mapc[offset+hdl->index[i]];
902 if (hdl->index[i] < hdl->blk_num[i]-1) {
903 phi[i] = hdl->mapc[offset+hdl->index[i]+1]-1;
904 } else {
905 phi[i] = GA[handle].dims[i];
906 }
907 offset += GA[handle].num_blocks[i];
908 }
909 pnga_access_block_grid_ptr(hdl->g_a,hdl->index,ptr,ld);
910 hdl->index[0] += hdl->blk_inc[0];
911 for (i=0; i<ndim; i++) {
912 if (hdl->index[i] >= hdl->blk_num[i] && i<ndim-1) {
913 hdl->index[i] = hdl->proc_index[i];
914 hdl->index[i+1] += hdl->blk_inc[i+1];
915 }
916 }
917 }
918 return 1;
919 }
920