1 /* -*- Mode: C; c-basic-offset:4 ; -*- */
2 
3 /*
4  *  (C) 2001 by Argonne National Laboratory.
5  *      See COPYRIGHT in top-level directory.
6  */
7 
8 #include <stdio.h>
9 #include <stdlib.h>
10 
11 #include "./dataloop.h"
12 
13 #undef DLOOP_DEBUG_MANIPULATE
14 
15 #ifndef PREPEND_PREFIX
16 #error "You must explicitly include a header that sets the PREPEND_PREFIX and includes dataloop_parts.h"
17 #endif
18 
19 /* Notes on functions:
20  *
21  * There are a few different sets of functions here:
22  * - DLOOP_Segment_manipulate() - uses a "piece" function to perform operations
23  *   using segments (piece functions defined elsewhere)
24  * - PREPEND_PREFIX functions - these define the externally visible interface
25  *   to segment functionality
26  */
27 
28 static inline DLOOP_Count DLOOP_Stackelm_blocksize(struct DLOOP_Dataloop_stackelm *elmp);
29 static inline DLOOP_Offset DLOOP_Stackelm_offset(struct DLOOP_Dataloop_stackelm *elmp);
30 static inline void DLOOP_Stackelm_load(struct DLOOP_Dataloop_stackelm *elmp,
31 				       struct DLOOP_Dataloop *dlp,
32 				       int branch_flag);
33 /* Segment_init
34  *
35  * buf    - datatype buffer location
36  * count  - number of instances of the datatype in the buffer
37  * handle - handle for datatype (could be derived or not)
38  * segp   - pointer to previously allocated segment structure
39  * flag   - flag indicating which optimizations are valid
40  *          should be one of DLOOP_DATALOOP_HOMOGENEOUS, _HETEROGENEOUS,
41  *          of _ALL_BYTES.
42  *
43  * Notes:
44  * - Assumes that the segment has been allocated.
45  * - Older MPICH2 code may pass "0" to indicate HETEROGENEOUS or "1" to
46  *   indicate HETEROGENEOUS.
47  *
48  */
PREPEND_PREFIX(Segment_init)49 int PREPEND_PREFIX(Segment_init)(const DLOOP_Buffer buf,
50 				 DLOOP_Count count,
51 				 DLOOP_Handle handle,
52 				 struct DLOOP_Segment *segp,
53 				 int flag)
54 {
55     DLOOP_Offset elmsize = 0;
56     int i, depth = 0;
57     int branch_detected = 0;
58 
59     struct DLOOP_Dataloop_stackelm *elmp;
60     struct DLOOP_Dataloop *dlp = 0, *sblp = &segp->builtin_loop;
61 
62     DLOOP_Assert(flag == DLOOP_DATALOOP_HETEROGENEOUS ||
63 		 flag == DLOOP_DATALOOP_HOMOGENEOUS   ||
64 		 flag == DLOOP_DATALOOP_ALL_BYTES);
65 
66 #ifdef DLOOP_DEBUG_MANIPULATE
67     DLOOP_dbg_printf("DLOOP_Segment_init: count = %d, buf = %x\n",
68 		    count,
69 		    buf);
70 #endif
71 
72     if (!DLOOP_Handle_hasloop_macro(handle)) {
73 	/* simplest case; datatype has no loop (basic) */
74 
75 	DLOOP_Handle_get_size_macro(handle, elmsize);
76 
77 	sblp->kind = DLOOP_KIND_CONTIG | DLOOP_FINAL_MASK;
78 	sblp->loop_params.c_t.count = count;
79 	sblp->loop_params.c_t.dataloop = 0;
80 	sblp->el_size = elmsize;
81         DLOOP_Handle_get_basic_type_macro(handle, sblp->el_type);
82 	DLOOP_Handle_get_extent_macro(handle, sblp->el_extent);
83 
84 	dlp = sblp;
85 	depth = 1;
86     }
87     else if (count == 0) {
88 	/* only use the builtin */
89 	sblp->kind = DLOOP_KIND_CONTIG | DLOOP_FINAL_MASK;
90 	sblp->loop_params.c_t.count = 0;
91 	sblp->loop_params.c_t.dataloop = 0;
92 	sblp->el_size = 0;
93 	sblp->el_extent = 0;
94 
95 	dlp = sblp;
96 	depth = 1;
97     }
98     else if (count == 1) {
99 	/* don't use the builtin */
100 	DLOOP_Handle_get_loopptr_macro(handle, dlp, flag);
101 	DLOOP_Handle_get_loopdepth_macro(handle, depth, flag);
102     }
103     else {
104 	/* default: need to use builtin to handle contig; must check
105 	 * loop depth first
106 	 */
107 	DLOOP_Dataloop *oldloop; /* loop from original type, before new count */
108 	DLOOP_Offset type_size, type_extent;
109 	DLOOP_Type el_type;
110 
111 	DLOOP_Handle_get_loopdepth_macro(handle, depth, flag);
112 
113 	DLOOP_Handle_get_loopptr_macro(handle, oldloop, flag);
114 	DLOOP_Assert(oldloop != NULL);
115 	DLOOP_Handle_get_size_macro(handle, type_size);
116 	DLOOP_Handle_get_extent_macro(handle, type_extent);
117         DLOOP_Handle_get_basic_type_macro(handle, el_type);
118 
119 	if (depth == 1 && ((oldloop->kind & DLOOP_KIND_MASK) == DLOOP_KIND_CONTIG))
120 	{
121 	    if (type_size == type_extent)
122 	    {
123 		/* use a contig */
124 		sblp->kind                     = DLOOP_KIND_CONTIG | DLOOP_FINAL_MASK;
125 		sblp->loop_params.c_t.count    = count * oldloop->loop_params.c_t.count;
126 		sblp->loop_params.c_t.dataloop = NULL;
127 		sblp->el_size                  = oldloop->el_size;
128 		sblp->el_extent                = oldloop->el_extent;
129 		sblp->el_type                  = oldloop->el_type;
130 	    }
131 	    else
132 	    {
133 		/* use a vector, with extent of original type becoming the stride */
134 		sblp->kind                      = DLOOP_KIND_VECTOR | DLOOP_FINAL_MASK;
135 		sblp->loop_params.v_t.count     = count;
136 		sblp->loop_params.v_t.blocksize = oldloop->loop_params.c_t.count;
137 		sblp->loop_params.v_t.stride    = type_extent;
138 		sblp->loop_params.v_t.dataloop  = NULL;
139 		sblp->el_size                   = oldloop->el_size;
140 		sblp->el_extent                 = oldloop->el_extent;
141 		sblp->el_type                   = oldloop->el_type;
142 	    }
143 	}
144 	else
145 	{
146 	    /* general case */
147 	    sblp->kind                     = DLOOP_KIND_CONTIG;
148 	    sblp->loop_params.c_t.count    = count;
149 	    sblp->loop_params.c_t.dataloop = oldloop;
150 	    sblp->el_size                  = type_size;
151 	    sblp->el_extent                = type_extent;
152 	    sblp->el_type                  = el_type;
153 
154 	    depth++; /* we're adding to the depth with the builtin */
155             DLOOP_Assert(depth < (DLOOP_MAX_DATATYPE_DEPTH));
156 	}
157 
158 	dlp = sblp;
159     }
160 
161     /* assert instead of return b/c dtype/dloop errorhandling code is inconsistent */
162     DLOOP_Assert(depth < (DLOOP_MAX_DATATYPE_DEPTH));
163 
164     /* initialize the rest of the segment values */
165     segp->handle = handle;
166     segp->ptr = (DLOOP_Buffer) buf;
167     segp->stream_off = 0;
168     segp->cur_sp = 0;
169     segp->valid_sp = 0;
170 
171     /* initialize the first stackelm in its entirety */
172     elmp = &(segp->stackelm[0]);
173     DLOOP_Stackelm_load(elmp, dlp, 0);
174     branch_detected = elmp->may_require_reloading;
175 
176     /* Fill in parameters not set by DLOOP_Stackelm_load */
177     elmp->orig_offset = 0;
178     elmp->curblock    = elmp->orig_block;
179     /* DLOOP_Stackelm_offset assumes correct orig_count, curcount, loop_p */
180     elmp->curoffset   = /* elmp->orig_offset + */ DLOOP_Stackelm_offset(elmp);
181 
182     i = 1;
183     while(!(dlp->kind & DLOOP_FINAL_MASK))
184     {
185         /* get pointer to next dataloop */
186         switch (dlp->kind & DLOOP_KIND_MASK)
187         {
188             case DLOOP_KIND_CONTIG:
189             case DLOOP_KIND_VECTOR:
190             case DLOOP_KIND_BLOCKINDEXED:
191             case DLOOP_KIND_INDEXED:
192                 dlp = dlp->loop_params.cm_t.dataloop;
193                 break;
194             case DLOOP_KIND_STRUCT:
195                 dlp = dlp->loop_params.s_t.dataloop_array[0];
196                 break;
197             default:
198                 /* --BEGIN ERROR HANDLING-- */
199                 DLOOP_Assert(0);
200                 break;
201                 /* --END ERROR HANDLING-- */
202         }
203 
204         DLOOP_Assert(i < DLOOP_MAX_DATATYPE_DEPTH);
205 
206 	/* loop_p, orig_count, orig_block, and curcount are all filled by us now.
207 	 * the rest are filled in at processing time.
208 	 */
209 	elmp = &(segp->stackelm[i]);
210 
211 	DLOOP_Stackelm_load(elmp, dlp, branch_detected);
212 	branch_detected = elmp->may_require_reloading;
213         i++;
214 
215     }
216 
217     segp->valid_sp = depth-1;
218 
219     return 0;
220 }
221 
222 /* Segment_alloc
223  *
224  */
PREPEND_PREFIX(Segment_alloc)225 struct DLOOP_Segment * PREPEND_PREFIX(Segment_alloc)(void)
226 {
227     return (struct DLOOP_Segment *) DLOOP_Malloc(sizeof(struct DLOOP_Segment));
228 }
229 
230 /* Segment_free
231  *
232  * Input Parameters:
233  * segp - pointer to segment
234  */
PREPEND_PREFIX(Segment_free)235 void PREPEND_PREFIX(Segment_free)(struct DLOOP_Segment *segp)
236 {
237     DLOOP_Free(segp);
238     return;
239 }
240 
241 /* DLOOP_Segment_manipulate - do something to a segment
242  *
243  * If you think of all the data to be manipulated (packed, unpacked, whatever),
244  * as a stream of bytes, it's easier to understand how first and last fit in.
245  *
246  * This function does all the work, calling the piecefn passed in when it
247  * encounters a datatype element which falls into the range of first..(last-1).
248  *
249  * piecefn can be NULL, in which case this function doesn't do anything when it
250  * hits a region.  This is used internally for repositioning within this stream.
251  *
252  * last is a byte offset to the byte just past the last byte in the stream
253  * to operate on.  this makes the calculations all over MUCH cleaner.
254  *
255  * stream_off, stream_el_size, first, and last are all working in terms of the
256  * types and sizes for the stream, which might be different from the local sizes
257  * (in the heterogeneous case).
258  *
259  * This is a horribly long function.  Too bad; it's complicated :)! -- Rob
260  *
261  * NOTE: THIS IMPLEMENTATION CANNOT HANDLE STRUCT DATALOOPS.
262  */
/* Helper macros for Segment_manipulate.
 *
 * The DLOOP_SEGMENT_* macros operate on names that must exist at the point of
 * use: segp, lastp, last, cur_sp, valid_sp, stream_off, and cur_elmp.  Each
 * multi-statement macro is wrapped in do { } while (0) so that it behaves as
 * a single statement (safe in an unbraced if/else); always follow the
 * invocation with a semicolon.
 */

/* Write the local position back into the segment and report the stream
 * offset reached through *lastp.
 */
#define DLOOP_SEGMENT_SAVE_LOCAL_VALUES         \
    do {                                        \
        segp->cur_sp     = cur_sp;              \
        segp->valid_sp   = valid_sp;            \
        segp->stream_off = stream_off;          \
        *lastp           = stream_off;          \
    } while (0)

/* Load the local position from the segment and the requested end from
 * *lastp.
 */
#define DLOOP_SEGMENT_LOAD_LOCAL_VALUES             \
    do {                                            \
        last       = *lastp;                        \
        cur_sp     = segp->cur_sp;                  \
        valid_sp   = segp->valid_sp;                \
        stream_off = segp->stream_off;              \
        cur_elmp   = &(segp->stackelm[cur_sp]);     \
    } while (0)

/* Rewind the segment to stream offset 0 and restore the bottom stack element
 * to its starting count, block, and offset.  Only the segment fields and
 * cur_elmp are updated; the locals cur_sp and stream_off are not.
 */
#define DLOOP_SEGMENT_RESET_VALUES                                      \
    do {                                                                \
        segp->stream_off     = 0;                                       \
        segp->cur_sp         = 0;                                       \
        cur_elmp             = &(segp->stackelm[0]);                    \
        cur_elmp->curcount   = cur_elmp->orig_count;                    \
        cur_elmp->orig_block = DLOOP_Stackelm_blocksize(cur_elmp);      \
        cur_elmp->curblock   = cur_elmp->orig_block;                    \
        cur_elmp->curoffset  = cur_elmp->orig_offset +                  \
                               DLOOP_Stackelm_offset(cur_elmp);         \
    } while (0)

/* Pop one stack level.  If the stack becomes empty, save the local values
 * and return from the enclosing (void) function.
 */
#define DLOOP_SEGMENT_POP_AND_MAYBE_EXIT                        \
    do {                                                        \
        cur_sp--;                                               \
        if (cur_sp >= 0) cur_elmp = &segp->stackelm[cur_sp];    \
        else {                                                  \
            DLOOP_SEGMENT_SAVE_LOCAL_VALUES;                    \
            return;                                             \
        }                                                       \
    } while (0)

/* Move to the next (deeper) stack element. */
#define DLOOP_SEGMENT_PUSH                      \
    do {                                        \
        cur_sp++;                               \
        cur_elmp = &segp->stackelm[cur_sp];     \
    } while (0)

/* Accessors for the per-block arrays of the dataloop attached to a stack
 * element; curcount_ is the index into the array.
 */
#define DLOOP_STACKELM_BLOCKINDEXED_OFFSET(elmp_, curcount_) \
((elmp_)->loop_p->loop_params.bi_t.offset_array[(curcount_)])

#define DLOOP_STACKELM_INDEXED_OFFSET(elmp_, curcount_) \
((elmp_)->loop_p->loop_params.i_t.offset_array[(curcount_)])

#define DLOOP_STACKELM_INDEXED_BLOCKSIZE(elmp_, curcount_) \
((elmp_)->loop_p->loop_params.i_t.blocksize_array[(curcount_)])

#define DLOOP_STACKELM_STRUCT_OFFSET(elmp_, curcount_) \
((elmp_)->loop_p->loop_params.s_t.offset_array[(curcount_)])

#define DLOOP_STACKELM_STRUCT_BLOCKSIZE(elmp_, curcount_) \
((elmp_)->loop_p->loop_params.s_t.blocksize_array[(curcount_)])

#define DLOOP_STACKELM_STRUCT_EL_EXTENT(elmp_, curcount_) \
((elmp_)->loop_p->loop_params.s_t.el_extent_array[(curcount_)])

#define DLOOP_STACKELM_STRUCT_DATALOOP(elmp_, curcount_) \
((elmp_)->loop_p->loop_params.s_t.dataloop_array[(curcount_)])
328 
PREPEND_PREFIX(Segment_manipulate)329 void PREPEND_PREFIX(Segment_manipulate)(struct DLOOP_Segment *segp,
330 					DLOOP_Offset first,
331 					DLOOP_Offset *lastp,
332 					int (*contigfn) (DLOOP_Offset *blocks_p,
333 							 DLOOP_Type el_type,
334 							 DLOOP_Offset rel_off,
335 							 DLOOP_Buffer bufp,
336 							 void *v_paramp),
337 					int (*vectorfn) (DLOOP_Offset *blocks_p,
338 							 DLOOP_Count count,
339 							 DLOOP_Count blklen,
340 							 DLOOP_Offset stride,
341 							 DLOOP_Type el_type,
342 							 DLOOP_Offset rel_off,
343 							 DLOOP_Buffer bufp,
344 							 void *v_paramp),
345 					int (*blkidxfn) (DLOOP_Offset *blocks_p,
346 							 DLOOP_Count count,
347 							 DLOOP_Count blklen,
348 							 DLOOP_Offset *offsetarray,
349 							 DLOOP_Type el_type,
350 							 DLOOP_Offset rel_off,
351 							 DLOOP_Buffer bufp,
352 							 void *v_paramp),
353 					int (*indexfn) (DLOOP_Offset *blocks_p,
354 							DLOOP_Count count,
355 							DLOOP_Count *blockarray,
356 							DLOOP_Offset *offsetarray,
357 							DLOOP_Type el_type,
358 							DLOOP_Offset rel_off,
359 							DLOOP_Buffer bufp,
360 							void *v_paramp),
361 					DLOOP_Offset (*sizefn) (DLOOP_Type el_type),
362 					void *pieceparams)
363 {
364     /* these four are the "local values": cur_sp, valid_sp, last, stream_off */
365     int cur_sp, valid_sp;
366     DLOOP_Offset last, stream_off;
367 
368     struct DLOOP_Dataloop_stackelm *cur_elmp;
369     enum { PF_NULL, PF_CONTIG, PF_VECTOR, PF_BLOCKINDEXED, PF_INDEXED } piecefn_type = PF_NULL;
370 
371     DLOOP_SEGMENT_LOAD_LOCAL_VALUES;
372 
373     if (first == *lastp) {
374 	/* nothing to do */
375 	DLOOP_dbg_printf("dloop_segment_manipulate: warning: first == last (" DLOOP_OFFSET_FMT_DEC_SPEC ")\n", first);
376 	return;
377     }
378 
379     /* first we ensure that stream_off and first are in the same spot */
380     if (first != stream_off) {
381 #ifdef DLOOP_DEBUG_MANIPULATE
382 	DLOOP_dbg_printf("first=" DLOOP_OFFSET_FMT_DEC_SPEC "; stream_off=" DLOOP_OFFSET_FMT_DEC_SPEC "; resetting.\n",
383 			 first, stream_off);
384 #endif
385 
386 	if (first < stream_off) {
387 	    DLOOP_SEGMENT_RESET_VALUES;
388 	    stream_off = 0;
389 	}
390 
391 	if (first != stream_off) {
392 	    DLOOP_Offset tmp_last = first;
393 
394 	    /* use manipulate function with a NULL piecefn to advance
395 	     * stream offset
396 	     */
397 	    PREPEND_PREFIX(Segment_manipulate)(segp,
398 					       stream_off,
399 					       &tmp_last,
400 					       NULL, /* contig fn */
401 					       NULL, /* vector fn */
402 					       NULL, /* blkidx fn */
403 					       NULL, /* index fn */
404 					       sizefn,
405                                                NULL);
406 
407 	    /* --BEGIN ERROR HANDLING-- */
408 	    /* verify that we're in the right location */
409 	    DLOOP_Assert(tmp_last == first);
410 	    /* --END ERROR HANDLING-- */
411 	}
412 
413 	DLOOP_SEGMENT_LOAD_LOCAL_VALUES;
414 
415 #ifdef DLOOP_DEBUG_MANIPULATE
416 	DLOOP_dbg_printf("done repositioning stream_off; first=" DLOOP_OFFSET_FMT_DEC_SPEC ", stream_off=" DLOOP_OFFSET_FMT_DEC_SPEC ", last=" DLOOP_OFFSET_FMT_DEC_SPEC "\n",
417 		   first, stream_off, last);
418 #endif
419     }
420 
421     for (;;) {
422 #ifdef DLOOP_DEBUG_MANIPULATE
423 #if 0
424         DLOOP_dbg_printf("looptop; cur_sp=%d, cur_elmp=%x\n",
425 			 cur_sp, (unsigned) cur_elmp);
426 #endif
427 #endif
428 
429 	if (cur_elmp->loop_p->kind & DLOOP_FINAL_MASK) {
430 	    int piecefn_indicated_exit = -1;
431 	    DLOOP_Offset myblocks, local_el_size, stream_el_size;
432 	    DLOOP_Type el_type;
433 
434 	    /* structs are never finals (leaves) */
435 	    DLOOP_Assert((cur_elmp->loop_p->kind & DLOOP_KIND_MASK) !=
436 		   DLOOP_KIND_STRUCT);
437 
438 	    /* pop immediately on zero count */
439 	    if (cur_elmp->curcount == 0) DLOOP_SEGMENT_POP_AND_MAYBE_EXIT;
440 
441 	    /* size on this system of the int, double, etc. that is
442 	     * the elementary type.
443 	     */
444 	    local_el_size  = cur_elmp->loop_p->el_size;
445 	    el_type        = cur_elmp->loop_p->el_type;
446 	    stream_el_size = (sizefn) ? sizefn(el_type) : local_el_size;
447 
448 	    /* calculate number of elem. types to work on and function to use.
449 	     * default is to use the contig piecefn (if there is one).
450 	     */
451 	    myblocks = cur_elmp->curblock;
452 	    piecefn_type = (contigfn ? PF_CONTIG : PF_NULL);
453 
454 	    /* check for opportunities to use other piecefns */
455 	    switch (cur_elmp->loop_p->kind & DLOOP_KIND_MASK) {
456 		case DLOOP_KIND_CONTIG:
457 		    break;
458          	case DLOOP_KIND_BLOCKINDEXED:
459 		    /* only use blkidx piecefn if at start of blkidx type */
460 		    if (blkidxfn &&
461 			cur_elmp->orig_block == cur_elmp->curblock &&
462 			cur_elmp->orig_count == cur_elmp->curcount)
463 		    {
464 			/* TODO: RELAX CONSTRAINTS */
465 			myblocks = cur_elmp->curblock * cur_elmp->curcount;
466 			piecefn_type = PF_BLOCKINDEXED;
467 		    }
468 		    break;
469 		case DLOOP_KIND_INDEXED:
470 		    /* only use index piecefn if at start of the index type.
471 		     *   count test checks that we're on first block.
472 		     *   block test checks that we haven't made progress on first block.
473 		     */
474 		    if (indexfn &&
475 			cur_elmp->orig_count == cur_elmp->curcount &&
476 			cur_elmp->curblock == DLOOP_STACKELM_INDEXED_BLOCKSIZE(cur_elmp, 0))
477 		    {
478 			/* TODO: RELAX CONSTRAINT ON COUNT? */
479 			myblocks = cur_elmp->loop_p->loop_params.i_t.total_blocks;
480 			piecefn_type = PF_INDEXED;
481 		    }
482 		    break;
483 		case DLOOP_KIND_VECTOR:
484 		    /* only use the vector piecefn if at the start of a
485 		     * contiguous block.
486 		     */
487 		    if (vectorfn && cur_elmp->orig_block == cur_elmp->curblock)
488 		    {
489 			myblocks = cur_elmp->curblock * cur_elmp->curcount;
490 			piecefn_type = PF_VECTOR;
491 		    }
492 		    break;
493 		default:
494 		    /* --BEGIN ERROR HANDLING-- */
495 		    DLOOP_Assert(0);
496 		    break;
497 		    /* --END ERROR HANDLING-- */
498 	    }
499 
500 #ifdef DLOOP_DEBUG_MANIPULATE
501 	    DLOOP_dbg_printf("\thit leaf; cur_sp=%d, elmp=%x, piece_sz=" DLOOP_OFFSET_FMT_DEC_SPEC "\n",
502 			     cur_sp,
503 		             (unsigned) cur_elmp, myblocks * local_el_size);
504 #endif
505 
506 	    /* enforce the last parameter if necessary by reducing myblocks */
507 	    if (last != SEGMENT_IGNORE_LAST &&
508 		(stream_off + (myblocks * stream_el_size) > last))
509 	    {
510 		myblocks = ((last - stream_off) / stream_el_size);
511 #ifdef DLOOP_DEBUG_MANIPULATE
512 		DLOOP_dbg_printf("\tpartial block count=" DLOOP_OFFSET_FMT_DEC_SPEC " (" DLOOP_OFFSET_FMT_DEC_SPEC " bytes)\n",
513 				 myblocks,
514                                  myblocks * stream_el_size);
515 #endif
516 		if (myblocks == 0) {
517 		    DLOOP_SEGMENT_SAVE_LOCAL_VALUES;
518 		    return;
519 		}
520 	    }
521 
522 	    /* call piecefn to perform data manipulation */
523 	    switch (piecefn_type) {
524 		case PF_NULL:
525 		    piecefn_indicated_exit = 0;
526 #ifdef DLOOP_DEBUG_MANIPULATE
527 		    DLOOP_dbg_printf("\tNULL piecefn for this piece\n");
528 #endif
529 		    break;
530 		case PF_CONTIG:
531 		    DLOOP_Assert(myblocks <= cur_elmp->curblock);
532 		    piecefn_indicated_exit =
533 			contigfn(&myblocks,
534 				 el_type,
535 				 cur_elmp->curoffset, /* relative to segp->ptr */
536 				 segp->ptr, /* start of buffer (from segment) */
537 				 pieceparams);
538 		    break;
539 		case PF_VECTOR:
540 		    piecefn_indicated_exit =
541 			vectorfn(&myblocks,
542 				 cur_elmp->curcount,
543 				 cur_elmp->orig_block,
544 				 cur_elmp->loop_p->loop_params.v_t.stride,
545 				 el_type,
546 				 cur_elmp->curoffset,
547 				 segp->ptr,
548 				 pieceparams);
549 		    break;
550 		case PF_BLOCKINDEXED:
551 		    piecefn_indicated_exit =
552 			blkidxfn(&myblocks,
553 				 cur_elmp->curcount,
554 				 cur_elmp->orig_block,
555 				 cur_elmp->loop_p->loop_params.bi_t.offset_array,
556 				 el_type,
557 				 cur_elmp->orig_offset, /* blkidxfn adds offset */
558 				 segp->ptr,
559 				 pieceparams);
560 		    break;
561 		case PF_INDEXED:
562 		    piecefn_indicated_exit =
563 			indexfn(&myblocks,
564 				cur_elmp->curcount,
565 				cur_elmp->loop_p->loop_params.i_t.blocksize_array,
566 				cur_elmp->loop_p->loop_params.i_t.offset_array,
567 				el_type,
568 				cur_elmp->orig_offset, /* indexfn adds offset value */
569 				segp->ptr,
570 				pieceparams);
571 		    break;
572 	    }
573 
574 	    /* update local values based on piecefn returns (myblocks and
575 	     * piecefn_indicated_exit)
576 	     */
577 	    DLOOP_Assert(piecefn_indicated_exit >= 0);
578 	    DLOOP_Assert(myblocks >= 0);
579 	    stream_off += myblocks * stream_el_size;
580 
581 	    /* myblocks of 0 or less than cur_elmp->curblock indicates
582 	     * that we should stop processing and return.
583 	     */
584 	    if (myblocks == 0) {
585 		DLOOP_SEGMENT_SAVE_LOCAL_VALUES;
586 		return;
587 	    }
588 	    else if (myblocks < (DLOOP_Offset)(cur_elmp->curblock)) {
589 		cur_elmp->curoffset += myblocks * local_el_size;
590 		cur_elmp->curblock  -= myblocks;
591 
592 		DLOOP_SEGMENT_SAVE_LOCAL_VALUES;
593 		return;
594 	    }
595 	    else /* myblocks >= cur_elmp->curblock */ {
596 		int count_index = 0;
597 
598 		/* this assumes we're either *just* processing the last parts
599 		 * of the current block, or we're processing as many blocks as
600 		 * we like starting at the beginning of one.
601 		 */
602 
603 		switch (cur_elmp->loop_p->kind & DLOOP_KIND_MASK) {
604 		    case DLOOP_KIND_INDEXED:
605 			while (myblocks > 0 && myblocks >= (DLOOP_Offset)(cur_elmp->curblock)) {
606 			    myblocks -= (DLOOP_Offset)(cur_elmp->curblock);
607 			    cur_elmp->curcount--;
608 			    DLOOP_Assert(cur_elmp->curcount >= 0);
609 
610 			    count_index = cur_elmp->orig_count -
611 				cur_elmp->curcount;
612 			    cur_elmp->curblock =
613 				DLOOP_STACKELM_INDEXED_BLOCKSIZE(cur_elmp,
614 								 count_index);
615 			}
616 
617 			if (cur_elmp->curcount == 0) {
618 			    /* don't bother to fill in values; we're popping anyway */
619 			    DLOOP_Assert(myblocks == 0);
620 			    DLOOP_SEGMENT_POP_AND_MAYBE_EXIT;
621 			}
622 			else {
623 			    cur_elmp->orig_block = cur_elmp->curblock;
624 			    cur_elmp->curoffset  = cur_elmp->orig_offset +
625 				DLOOP_STACKELM_INDEXED_OFFSET(cur_elmp,
626 							      count_index);
627 
628 			    cur_elmp->curblock  -= myblocks;
629 			    cur_elmp->curoffset += myblocks * local_el_size;
630 			}
631 			break;
632 		    case DLOOP_KIND_VECTOR:
633 			/* this math relies on assertions at top of code block */
634 			cur_elmp->curcount -= myblocks / (DLOOP_Offset)(cur_elmp->curblock);
635 			if (cur_elmp->curcount == 0) {
636 			    DLOOP_Assert(myblocks % ((DLOOP_Offset)(cur_elmp->curblock)) == 0);
637 			    DLOOP_SEGMENT_POP_AND_MAYBE_EXIT;
638 			}
639 			else {
640 			    /* this math relies on assertions at top of code
641 			     * block
642 			     */
643 			    cur_elmp->curblock = cur_elmp->orig_block -
644 				(myblocks % (DLOOP_Offset)(cur_elmp->curblock));
645 			    /* new offset = original offset +
646 			     *              stride * whole blocks +
647 			     *              leftover bytes
648 			     */
649 			    cur_elmp->curoffset = cur_elmp->orig_offset +
650 				(((DLOOP_Offset)(cur_elmp->orig_count - cur_elmp->curcount)) *
651 				 cur_elmp->loop_p->loop_params.v_t.stride) +
652 				(((DLOOP_Offset)(cur_elmp->orig_block - cur_elmp->curblock)) *
653 				 local_el_size);
654 			}
655 			break;
656 		    case DLOOP_KIND_CONTIG:
657 			/* contigs that reach this point have always been
658 			 * completely processed
659 			 */
660 			DLOOP_Assert(myblocks == (DLOOP_Offset)(cur_elmp->curblock) &&
661 			       cur_elmp->curcount == 1);
662 			DLOOP_SEGMENT_POP_AND_MAYBE_EXIT;
663 			break;
664 		    case DLOOP_KIND_BLOCKINDEXED:
665 			while (myblocks > 0 && myblocks >= (DLOOP_Offset)(cur_elmp->curblock))
666 			{
667 			    myblocks -= (DLOOP_Offset)(cur_elmp->curblock);
668 			    cur_elmp->curcount--;
669 			    DLOOP_Assert(cur_elmp->curcount >= 0);
670 
671 			    count_index = cur_elmp->orig_count -
672 				cur_elmp->curcount;
673 			    cur_elmp->curblock = cur_elmp->orig_block;
674 			}
675 			if (cur_elmp->curcount == 0) {
676 			    /* popping */
677 			    DLOOP_Assert(myblocks == 0);
678 			    DLOOP_SEGMENT_POP_AND_MAYBE_EXIT;
679 			}
680 			else {
681 			    /* cur_elmp->orig_block = cur_elmp->curblock; */
682 			    cur_elmp->curoffset = cur_elmp->orig_offset +
683 				DLOOP_STACKELM_BLOCKINDEXED_OFFSET(cur_elmp,
684 								   count_index);
685 			    cur_elmp->curblock  -= myblocks;
686 			    cur_elmp->curoffset += myblocks * local_el_size;
687 			}
688 			break;
689 		}
690 	    }
691 
692 	    if (piecefn_indicated_exit) {
693 		/* piece function indicated that we should quit processing */
694 		DLOOP_SEGMENT_SAVE_LOCAL_VALUES;
695 		return;
696 	    }
697 	} /* end of if leaf */
698 	else if (cur_elmp->curblock == 0) {
699 #ifdef DLOOP_DEBUG_MANIPULATE
700 	    DLOOP_dbg_printf("\thit end of block; elmp=%x [%d]\n",
701 			    (unsigned) cur_elmp, cur_sp);
702 #endif
703 	    cur_elmp->curcount--;
704 
705 	    /* new block.  for indexed and struct reset orig_block.
706 	     * reset curblock for all types
707 	     */
708 	    switch (cur_elmp->loop_p->kind & DLOOP_KIND_MASK) {
709 		case DLOOP_KIND_CONTIG:
710 		case DLOOP_KIND_VECTOR:
711 		case DLOOP_KIND_BLOCKINDEXED:
712 		    break;
713 		case DLOOP_KIND_INDEXED:
714 		    cur_elmp->orig_block =
715 			DLOOP_STACKELM_INDEXED_BLOCKSIZE(cur_elmp, cur_elmp->curcount ? cur_elmp->orig_count - cur_elmp->curcount : 0);
716 		    break;
717 		case DLOOP_KIND_STRUCT:
718 		    cur_elmp->orig_block =
719 			DLOOP_STACKELM_STRUCT_BLOCKSIZE(cur_elmp, cur_elmp->curcount ? cur_elmp->orig_count - cur_elmp->curcount : 0);
720 		    break;
721 		default:
722 		    /* --BEGIN ERROR HANDLING-- */
723 		    DLOOP_Assert(0);
724 		    break;
725 		    /* --END ERROR HANDLING-- */
726 	    }
727 	    cur_elmp->curblock = cur_elmp->orig_block;
728 
729 	    if (cur_elmp->curcount == 0) {
730 #ifdef DLOOP_DEBUG_MANIPULATE
731 		DLOOP_dbg_printf("\talso hit end of count; elmp=%x [%d]\n",
732 				(unsigned) cur_elmp, cur_sp);
733 #endif
734 		DLOOP_SEGMENT_POP_AND_MAYBE_EXIT;
735 	    }
736 	}
737 	else /* push the stackelm */ {
738 	    DLOOP_Dataloop_stackelm *next_elmp;
739 	    int count_index, block_index;
740 
741 	    count_index = cur_elmp->orig_count - cur_elmp->curcount;
742 	    block_index = cur_elmp->orig_block - cur_elmp->curblock;
743 
744 	    /* reload the next stackelm if necessary */
745 	    next_elmp = &(segp->stackelm[cur_sp + 1]);
746 	    if (cur_elmp->may_require_reloading) {
747 		DLOOP_Dataloop *load_dlp = NULL;
748 		switch (cur_elmp->loop_p->kind & DLOOP_KIND_MASK) {
749 		    case DLOOP_KIND_CONTIG:
750 		    case DLOOP_KIND_VECTOR:
751 		    case DLOOP_KIND_BLOCKINDEXED:
752 		    case DLOOP_KIND_INDEXED:
753 			load_dlp = cur_elmp->loop_p->loop_params.cm_t.dataloop;
754 			break;
755 		    case DLOOP_KIND_STRUCT:
756 			load_dlp = DLOOP_STACKELM_STRUCT_DATALOOP(cur_elmp,
757 								  count_index);
758 			break;
759 		    default:
760 			/* --BEGIN ERROR HANDLING-- */
761 			DLOOP_Assert(0);
762 			break;
763 			/* --END ERROR HANDLING-- */
764 		}
765 
766 #ifdef DLOOP_DEBUG_MANIPULATE
767 		DLOOP_dbg_printf("\tloading dlp=%x, elmp=%x [%d]\n",
768 				 (unsigned) load_dlp,
769 				 (unsigned) next_elmp,
770 				 cur_sp+1);
771 #endif
772 
773 		DLOOP_Stackelm_load(next_elmp,
774 				    load_dlp,
775 				    1);
776 	    }
777 
778 #ifdef DLOOP_DEBUG_MANIPULATE
779 	    DLOOP_dbg_printf("\tpushing type, elmp=%x [%d], count=%d, block=%d\n",
780 			    (unsigned) cur_elmp, cur_sp, count_index,
781 			     block_index);
782 #endif
783 	    /* set orig_offset and all cur values for new stackelm.
784 	     * this is done in two steps: first set orig_offset based on
785 	     * current stackelm, then set cur values based on new stackelm.
786 	     */
787 	    switch (cur_elmp->loop_p->kind & DLOOP_KIND_MASK) {
788 		case DLOOP_KIND_CONTIG:
789 		    next_elmp->orig_offset = cur_elmp->curoffset +
790 			(DLOOP_Offset) block_index * cur_elmp->loop_p->el_extent;
791 		    break;
792 		case DLOOP_KIND_VECTOR:
793 		    /* note: stride is in bytes */
794 		    next_elmp->orig_offset = cur_elmp->orig_offset +
795 			(DLOOP_Offset) count_index * cur_elmp->loop_p->loop_params.v_t.stride +
796 			(DLOOP_Offset) block_index * cur_elmp->loop_p->el_extent;
797 		    break;
798 		case DLOOP_KIND_BLOCKINDEXED:
799 		    next_elmp->orig_offset = cur_elmp->orig_offset +
800 			(DLOOP_Offset) block_index * cur_elmp->loop_p->el_extent +
801 			DLOOP_STACKELM_BLOCKINDEXED_OFFSET(cur_elmp,
802 							   count_index);
803 		    break;
804 		case DLOOP_KIND_INDEXED:
805 		    next_elmp->orig_offset = cur_elmp->orig_offset +
806 			(DLOOP_Offset) block_index * cur_elmp->loop_p->el_extent +
807 			DLOOP_STACKELM_INDEXED_OFFSET(cur_elmp, count_index);
808 		    break;
809 		case DLOOP_KIND_STRUCT:
810 		    next_elmp->orig_offset = cur_elmp->orig_offset +
811 			(DLOOP_Offset) block_index * DLOOP_STACKELM_STRUCT_EL_EXTENT(cur_elmp, count_index) +
812 			DLOOP_STACKELM_STRUCT_OFFSET(cur_elmp, count_index);
813 		    break;
814 		default:
815 		    /* --BEGIN ERROR HANDLING-- */
816 		    DLOOP_Assert(0);
817 		    break;
818 		    /* --END ERROR HANDLING-- */
819 	    }
820 
821 #ifdef DLOOP_DEBUG_MANIPULATE
822 	    DLOOP_dbg_printf("\tstep 1: next orig_offset = " DLOOP_OFFSET_FMT_DEC_SPEC " (0x" DLOOP_OFFSET_FMT_HEX_SPEC ")\n",
823 			     next_elmp->orig_offset,
824 			     next_elmp->orig_offset);
825 #endif
826 
827 	    switch (next_elmp->loop_p->kind & DLOOP_KIND_MASK) {
828 		case DLOOP_KIND_CONTIG:
829 		case DLOOP_KIND_VECTOR:
830 		    next_elmp->curcount  = next_elmp->orig_count;
831 		    next_elmp->curblock  = next_elmp->orig_block;
832 		    next_elmp->curoffset = next_elmp->orig_offset;
833 		    break;
834 		case DLOOP_KIND_BLOCKINDEXED:
835 		    next_elmp->curcount  = next_elmp->orig_count;
836 		    next_elmp->curblock  = next_elmp->orig_block;
837 		    next_elmp->curoffset = next_elmp->orig_offset +
838 			DLOOP_STACKELM_BLOCKINDEXED_OFFSET(next_elmp, 0);
839 		    break;
840 		case DLOOP_KIND_INDEXED:
841 		    next_elmp->curcount  = next_elmp->orig_count;
842 		    next_elmp->curblock  =
843 			DLOOP_STACKELM_INDEXED_BLOCKSIZE(next_elmp, 0);
844 		    next_elmp->curoffset = next_elmp->orig_offset +
845 			DLOOP_STACKELM_INDEXED_OFFSET(next_elmp, 0);
846 		    break;
847 		case DLOOP_KIND_STRUCT:
848 		    next_elmp->curcount = next_elmp->orig_count;
849 		    next_elmp->curblock =
850 			DLOOP_STACKELM_STRUCT_BLOCKSIZE(next_elmp, 0);
851 		    next_elmp->curoffset = next_elmp->orig_offset +
852 			DLOOP_STACKELM_STRUCT_OFFSET(next_elmp, 0);
853 		    break;
854 		default:
855 		    /* --BEGIN ERROR HANDLING-- */
856 		    DLOOP_Assert(0);
857 		    break;
858 		    /* --END ERROR HANDLING-- */
859 	    }
860 
861 #ifdef DLOOP_DEBUG_MANIPULATE
862 	    DLOOP_dbg_printf("\tstep 2: next curoffset = " DLOOP_OFFSET_FMT_DEC_SPEC " (0x" DLOOP_OFFSET_FMT_HEX_SPEC ")\n",
863 			     next_elmp->curoffset,
864 			     next_elmp->curoffset);
865 #endif
866 
867 	    cur_elmp->curblock--;
868 	    DLOOP_SEGMENT_PUSH;
869 	} /* end of else push the stackelm */
870     } /* end of for (;;) */
871 
872 #ifdef DLOOP_DEBUG_MANIPULATE
873     DLOOP_dbg_printf("hit end of datatype\n");
874 #endif
875 
876     DLOOP_SEGMENT_SAVE_LOCAL_VALUES;
877     return;
878 }
879 
880 /* DLOOP_Stackelm_blocksize - returns block size for stackelm based on current
881  * count in stackelm.
882  *
883  * NOTE: loop_p, orig_count, and curcount members of stackelm MUST be correct
884  * before this is called!
885  *
886  */
DLOOP_Stackelm_blocksize(struct DLOOP_Dataloop_stackelm * elmp)887 static inline DLOOP_Count DLOOP_Stackelm_blocksize(struct DLOOP_Dataloop_stackelm *elmp)
888 {
889     struct DLOOP_Dataloop *dlp = elmp->loop_p;
890 
891     switch(dlp->kind & DLOOP_KIND_MASK) {
892 	case DLOOP_KIND_CONTIG:
893 	    /* NOTE: we're dropping the count into the
894 	     * blksize field for contigs, as described
895 	     * in the init call.
896 	     */
897 	    return dlp->loop_params.c_t.count;
898 	    break;
899 	case DLOOP_KIND_VECTOR:
900 	    return dlp->loop_params.v_t.blocksize;
901 	    break;
902 	case DLOOP_KIND_BLOCKINDEXED:
903 	    return dlp->loop_params.bi_t.blocksize;
904 	    break;
905 	case DLOOP_KIND_INDEXED:
906 	    return dlp->loop_params.i_t.blocksize_array[elmp->orig_count - elmp->curcount];
907 	    break;
908 	case DLOOP_KIND_STRUCT:
909 	    return dlp->loop_params.s_t.blocksize_array[elmp->orig_count - elmp->curcount];
910 	    break;
911 	default:
912 	    /* --BEGIN ERROR HANDLING-- */
913 	    DLOOP_Assert(0);
914 	    break;
915 	    /* --END ERROR HANDLING-- */
916     }
917     return -1;
918 }
919 
920 /* DLOOP_Stackelm_offset - returns starting offset (displacement) for stackelm
921  * based on current count in stackelm.
922  *
923  * NOTE: loop_p, orig_count, and curcount members of stackelm MUST be correct
924  * before this is called!
925  *
926  * also, this really is only good at init time for vectors and contigs
927  * (all the time for indexed) at the moment.
928  *
929  */
DLOOP_Stackelm_offset(struct DLOOP_Dataloop_stackelm * elmp)930 static inline DLOOP_Offset DLOOP_Stackelm_offset(struct DLOOP_Dataloop_stackelm *elmp)
931 {
932     struct DLOOP_Dataloop *dlp = elmp->loop_p;
933 
934     switch(dlp->kind & DLOOP_KIND_MASK) {
935 	case DLOOP_KIND_VECTOR:
936 	case DLOOP_KIND_CONTIG:
937 	    return 0;
938 	    break;
939 	case DLOOP_KIND_BLOCKINDEXED:
940 	    return dlp->loop_params.bi_t.offset_array[elmp->orig_count - elmp->curcount];
941 	    break;
942 	case DLOOP_KIND_INDEXED:
943 	    return dlp->loop_params.i_t.offset_array[elmp->orig_count - elmp->curcount];
944 	    break;
945 	case DLOOP_KIND_STRUCT:
946 	    return dlp->loop_params.s_t.offset_array[elmp->orig_count - elmp->curcount];
947 	    break;
948 	default:
949 	    /* --BEGIN ERROR HANDLING-- */
950 	    DLOOP_Assert(0);
951 	    break;
952 	    /* --END ERROR HANDLING-- */
953     }
954     return -1;
955 }
956 
957 /* DLOOP_Stackelm_load
958  * loop_p, orig_count, orig_block, and curcount are all filled by us now.
959  * the rest are filled in at processing time.
960  */
DLOOP_Stackelm_load(struct DLOOP_Dataloop_stackelm * elmp,struct DLOOP_Dataloop * dlp,int branch_flag)961 static inline void DLOOP_Stackelm_load(struct DLOOP_Dataloop_stackelm *elmp,
962 				       struct DLOOP_Dataloop *dlp,
963 				       int branch_flag)
964 {
965     elmp->loop_p = dlp;
966 
967     if ((dlp->kind & DLOOP_KIND_MASK) == DLOOP_KIND_CONTIG) {
968 	elmp->orig_count = 1; /* put in blocksize instead */
969     }
970     else {
971 	elmp->orig_count = dlp->loop_params.count;
972     }
973 
974     if (branch_flag || (dlp->kind & DLOOP_KIND_MASK) == DLOOP_KIND_STRUCT)
975     {
976 	elmp->may_require_reloading = 1;
977     }
978     else {
979 	elmp->may_require_reloading = 0;
980     }
981 
982     /* required by DLOOP_Stackelm_blocksize */
983     elmp->curcount = elmp->orig_count;
984 
985     elmp->orig_block = DLOOP_Stackelm_blocksize(elmp);
986     /* TODO: GO AHEAD AND FILL IN CURBLOCK? */
987 }
988 
989 /*
990  * Local variables:
991  * c-indent-tabs-mode: nil
992  * End:
993  */
994