1 /* -*- Mode: C; c-basic-offset:4 ; -*- */
2
3 /*
4 * (C) 2001 by Argonne National Laboratory.
5 * See COPYRIGHT in top-level directory.
6 */
7
8 #include <stdio.h>
9
10 #include "./dataloop.h"
11
12 static void DLOOP_Type_blockindexed_array_copy(DLOOP_Count count,
13 const void *disp_array,
14 DLOOP_Offset *out_disp_array,
15 int dispinbytes,
16 DLOOP_Offset old_extent);
17
18 /*@
19 Dataloop_create_blockindexed - create blockindexed dataloop
20
21 Arguments:
22 + int count
23 . void *displacement_array (array of either MPI_Aints or ints)
24 . int displacement_in_bytes (boolean)
25 . MPI_Datatype old_type
26 . DLOOP_Dataloop **output_dataloop_ptr
27 . int output_dataloop_size
28 . int output_dataloop_depth
29 - int flag
30
31 .N Errors
32 .N Returns 0 on success, -1 on failure.
33 @*/
PREPEND_PREFIX(Dataloop_create_blockindexed)34 int PREPEND_PREFIX(Dataloop_create_blockindexed)(int icount,
35 int iblklen,
36 const void *disp_array,
37 int dispinbytes,
38 DLOOP_Type oldtype,
39 DLOOP_Dataloop **dlp_p,
40 int *dlsz_p,
41 int *dldepth_p,
42 int flag)
43 {
44 int err, is_builtin, is_vectorizable = 1;
45 int i, new_loop_sz, old_loop_depth;
46
47 DLOOP_Count contig_count, count, blklen;
48 DLOOP_Offset old_extent, eff_disp0, eff_disp1, last_stride;
49 DLOOP_Dataloop *new_dlp;
50
51 count = (DLOOP_Count) icount; /* avoid subsequent casting */
52 blklen = (DLOOP_Count) iblklen;
53
54 /* if count or blklen are zero, handle with contig code, call it a int */
55 if (count == 0 || blklen == 0)
56 {
57 err = PREPEND_PREFIX(Dataloop_create_contiguous)(0,
58 MPI_INT,
59 dlp_p,
60 dlsz_p,
61 dldepth_p,
62 flag);
63 return err;
64 }
65
66 is_builtin = (DLOOP_Handle_hasloop_macro(oldtype)) ? 0 : 1;
67
68 if (is_builtin)
69 {
70 DLOOP_Handle_get_size_macro(oldtype, old_extent);
71 old_loop_depth = 0;
72 }
73 else
74 {
75 DLOOP_Handle_get_extent_macro(oldtype, old_extent);
76 DLOOP_Handle_get_loopdepth_macro(oldtype, old_loop_depth, flag);
77 }
78
79 contig_count = PREPEND_PREFIX(Type_blockindexed_count_contig)(count,
80 blklen,
81 disp_array,
82 dispinbytes,
83 old_extent);
84
85 /* optimization:
86 *
87 * if contig_count == 1 and block starts at displacement 0,
88 * store it as a contiguous rather than a blockindexed dataloop.
89 */
90 if ((contig_count == 1) &&
91 ((!dispinbytes && ((int *) disp_array)[0] == 0) ||
92 (dispinbytes && ((MPI_Aint *) disp_array)[0] == 0)))
93 {
94 err = PREPEND_PREFIX(Dataloop_create_contiguous)(icount * iblklen,
95 oldtype,
96 dlp_p,
97 dlsz_p,
98 dldepth_p,
99 flag);
100 return err;
101 }
102
103 /* optimization:
104 *
105 * if contig_count == 1 store it as a blockindexed with one
106 * element rather than as a lot of individual blocks.
107 */
108 if (contig_count == 1)
109 {
110 /* adjust count and blklen and drop through */
111 blklen *= count;
112 count = 1;
113 iblklen *= icount;
114 icount = 1;
115 }
116
117 /* optimization:
118 *
119 * if displacements start at zero and result in a fixed stride,
120 * store it as a vector rather than a blockindexed dataloop.
121 */
122 eff_disp0 = (dispinbytes) ? ((DLOOP_Offset) ((MPI_Aint *) disp_array)[0]) :
123 (((DLOOP_Offset) ((int *) disp_array)[0]) * old_extent);
124
125 if (count > 1 && eff_disp0 == (DLOOP_Offset) 0)
126 {
127 eff_disp1 = (dispinbytes) ?
128 ((DLOOP_Offset) ((MPI_Aint *) disp_array)[1]) :
129 (((DLOOP_Offset) ((int *) disp_array)[1]) * old_extent);
130 last_stride = eff_disp1 - eff_disp0;
131
132 for (i=2; i < count; i++) {
133 eff_disp0 = eff_disp1;
134 eff_disp1 = (dispinbytes) ?
135 ((DLOOP_Offset) ((MPI_Aint *) disp_array)[i]) :
136 (((DLOOP_Offset) ((int *) disp_array)[i]) * old_extent);
137 if (eff_disp1 - eff_disp0 != last_stride) {
138 is_vectorizable = 0;
139 break;
140 }
141 }
142 if (is_vectorizable)
143 {
144 err = PREPEND_PREFIX(Dataloop_create_vector)(count,
145 blklen,
146 last_stride,
147 1, /* strideinbytes */
148 oldtype,
149 dlp_p,
150 dlsz_p,
151 dldepth_p,
152 flag);
153 return err;
154 }
155 }
156
157 /* TODO: optimization:
158 *
159 * if displacements result in a fixed stride, but first displacement
160 * is not zero, store it as a blockindexed (blklen == 1) of a vector.
161 */
162
163 /* TODO: optimization:
164 *
165 * if a blockindexed of a contig, absorb the contig into the blocklen
166 * parameter and keep the same overall depth
167 */
168
169 /* otherwise storing as a blockindexed dataloop */
170
171 /* Q: HOW CAN WE TELL IF IT IS WORTH IT TO STORE AS AN
172 * INDEXED WITH FEWER CONTIG BLOCKS (IF CONTIG_COUNT IS SMALL)?
173 */
174
175 if (is_builtin)
176 {
177 PREPEND_PREFIX(Dataloop_alloc)(DLOOP_KIND_BLOCKINDEXED,
178 count,
179 &new_dlp,
180 &new_loop_sz);
181 /* --BEGIN ERROR HANDLING-- */
182 if (!new_dlp) return -1;
183 /* --END ERROR HANDLING-- */
184
185 new_dlp->kind = DLOOP_KIND_BLOCKINDEXED | DLOOP_FINAL_MASK;
186
187 if (flag == DLOOP_DATALOOP_ALL_BYTES)
188 {
189 blklen *= old_extent;
190 new_dlp->el_size = 1;
191 new_dlp->el_extent = 1;
192 new_dlp->el_type = MPI_BYTE;
193 }
194 else
195 {
196 new_dlp->el_size = old_extent;
197 new_dlp->el_extent = old_extent;
198 new_dlp->el_type = oldtype;
199 }
200 }
201 else
202 {
203 DLOOP_Dataloop *old_loop_ptr = NULL;
204 int old_loop_sz = 0;
205
206 DLOOP_Handle_get_loopptr_macro(oldtype, old_loop_ptr, flag);
207 DLOOP_Handle_get_loopsize_macro(oldtype, old_loop_sz, flag);
208
209 PREPEND_PREFIX(Dataloop_alloc_and_copy)(DLOOP_KIND_BLOCKINDEXED,
210 count,
211 old_loop_ptr,
212 old_loop_sz,
213 &new_dlp,
214 &new_loop_sz);
215 /* --BEGIN ERROR HANDLING-- */
216 if (!new_dlp) return -1;
217 /* --END ERROR HANDLING-- */
218
219 new_dlp->kind = DLOOP_KIND_BLOCKINDEXED;
220
221 DLOOP_Handle_get_size_macro(oldtype, new_dlp->el_size);
222 DLOOP_Handle_get_extent_macro(oldtype, new_dlp->el_extent);
223 DLOOP_Handle_get_basic_type_macro(oldtype, new_dlp->el_type);
224 }
225
226 new_dlp->loop_params.bi_t.count = count;
227 new_dlp->loop_params.bi_t.blocksize = blklen;
228
229 /* copy in displacement parameters
230 *
231 * regardless of dispinbytes, we store displacements in bytes in loop.
232 */
233 DLOOP_Type_blockindexed_array_copy(count,
234 disp_array,
235 new_dlp->loop_params.bi_t.offset_array,
236 dispinbytes,
237 old_extent);
238
239 *dlp_p = new_dlp;
240 *dlsz_p = new_loop_sz;
241 *dldepth_p = old_loop_depth + 1;
242
243 return 0;
244 }
245
246 /* DLOOP_Type_blockindexed_array_copy
247 *
248 * Unlike the indexed version, this one does not compact adjacent
249 * blocks, because that would really mess up the blockindexed type!
250 */
DLOOP_Type_blockindexed_array_copy(DLOOP_Count count,const void * in_disp_array,DLOOP_Offset * out_disp_array,int dispinbytes,DLOOP_Offset old_extent)251 static void DLOOP_Type_blockindexed_array_copy(DLOOP_Count count,
252 const void *in_disp_array,
253 DLOOP_Offset *out_disp_array,
254 int dispinbytes,
255 DLOOP_Offset old_extent)
256 {
257 int i;
258 if (!dispinbytes)
259 {
260 for (i=0; i < count; i++)
261 {
262 out_disp_array[i] =
263 ((DLOOP_Offset) ((int *) in_disp_array)[i]) * old_extent;
264 }
265 }
266 else
267 {
268 for (i=0; i < count; i++)
269 {
270 out_disp_array[i] =
271 ((DLOOP_Offset) ((MPI_Aint *) in_disp_array)[i]);
272 }
273 }
274 return;
275 }
276
PREPEND_PREFIX(Type_blockindexed_count_contig)277 DLOOP_Count PREPEND_PREFIX(Type_blockindexed_count_contig)(DLOOP_Count count,
278 DLOOP_Count blklen,
279 const void *disp_array,
280 int dispinbytes,
281 DLOOP_Offset old_extent)
282 {
283 int i, contig_count = 1;
284
285 if (!dispinbytes)
286 {
287 /* this is from the MPI type, is of type int */
288 DLOOP_Offset cur_tdisp = (DLOOP_Offset) ((int *) disp_array)[0];
289
290 for (i=1; i < count; i++)
291 {
292 DLOOP_Offset next_tdisp = (DLOOP_Offset) ((int *) disp_array)[i];
293
294 if (cur_tdisp + (DLOOP_Offset) blklen != next_tdisp)
295 {
296 contig_count++;
297 }
298 cur_tdisp = next_tdisp;
299 }
300 }
301 else
302 {
303 /* this is from the MPI type, is of type MPI_Aint */
304 DLOOP_Offset cur_bdisp = (DLOOP_Offset) ((MPI_Aint *) disp_array)[0];
305
306 for (i=1; i < count; i++)
307 {
308 DLOOP_Offset next_bdisp =
309 (DLOOP_Offset) ((MPI_Aint *) disp_array)[i];
310
311 if (cur_bdisp + (DLOOP_Offset) blklen * old_extent != next_bdisp)
312 {
313 contig_count++;
314 }
315 cur_bdisp = next_bdisp;
316 }
317 }
318 return contig_count;
319 }
320