1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
2 /*
3 *
4 * Copyright (C) 1997 University of Chicago.
5 * See COPYRIGHT notice in top-level directory.
6 */
7
8 #include <assert.h>
9 #include "adio.h"
10 #include "adio_extern.h"
11 #ifdef AGGREGATION_PROFILE
12 #include "mpe.h"
13 #endif
14
15 /*
16 #define DEBUG
17 #define DEBUG2
18 */
19
/* Message tags used by the point-to-point transfers in this file. */
#define COUNT_EXCH        0     /* amount_and_extra_data_t header messages */
#define BLOCK_LENS        1     /* flattened filetype blocklens arrays */
#define INDICES           2     /* flattened filetype indices arrays */
#define FPIND_DISP_OFF_SZ 3     /* not referenced in the visible part of this file */
24
25
26 typedef struct {
27 int count;
28 ADIO_Offset fp_ind;
29 ADIO_Offset disp;
30 ADIO_Offset byte_off;
31 ADIO_Offset sz;
32 ADIO_Offset ext;
33 ADIO_Offset type_sz;
34 } amount_and_extra_data_t;
35
36 /* Debugging function to print out an ADIOI_Flatlist_node. */
ADIOI_Print_flatlist_node(ADIOI_Flatlist_node * flatlist_node_p)37 void ADIOI_Print_flatlist_node(ADIOI_Flatlist_node *flatlist_node_p)
38 {
39 int i;
40 if (flatlist_node_p == NULL)
41 {
42 fprintf(stderr, "print flatlist node of NULL ptr\n");
43 return;
44 }
45 fprintf(stderr, "print flatlist node count = %d (idx,blocklen)\n",
46 (int)flatlist_node_p->count);
47 for (i = 0; i < flatlist_node_p->count; i++)
48 {
49 if (i % 5 == 0 && i != 0)
50 {
51 fprintf(stderr, "%d=(%lld,%lld)\n", i, (long long)flatlist_node_p->indices[i],
52 (long long)flatlist_node_p->blocklens[i]);
53 }
54 else
55 fprintf(stderr, "%d=(%lld,%lld) ", i, (long long)flatlist_node_p->indices[i],
56 (long long)flatlist_node_p->blocklens[i]);
57 }
58 fprintf(stderr, "\n");
59 }
60
61 /* Since ADIOI_Flatten_datatype won't add a contig datatype to the
62 * ADIOI_Flatlist, we can force it to do so with this function. */
ADIOI_Add_contig_flattened(MPI_Datatype contig_type)63 ADIOI_Flatlist_node * ADIOI_Add_contig_flattened(MPI_Datatype contig_type)
64 {
65 MPI_Count contig_type_sz = -1;
66 ADIOI_Flatlist_node *flat_node_p = ADIOI_Flatlist;
67
68 /* Add contig type to the end of the list if it doesn't already
69 * exist. */
70 while (flat_node_p->next)
71 {
72 if (flat_node_p->type == contig_type)
73 return flat_node_p;
74 flat_node_p = flat_node_p->next;
75 }
76 if (flat_node_p->type == contig_type)
77 return flat_node_p;
78
79 MPI_Type_size_x(contig_type, &contig_type_sz);
80 if ((flat_node_p->next = (ADIOI_Flatlist_node *) ADIOI_Malloc
81 (sizeof(ADIOI_Flatlist_node))) == NULL)
82 {
83 fprintf(stderr, "ADIOI_Add_contig_flattened: malloc next failed\n");
84 }
85 flat_node_p = flat_node_p->next;
86 flat_node_p->type = contig_type;
87 if ((flat_node_p->blocklens = (ADIO_Offset *) ADIOI_Malloc(sizeof(ADIO_Offset))) == NULL)
88 {
89 fprintf(stderr, "ADIOI_Flatlist_node: malloc blocklens failed\n");
90 }
91 if ((flat_node_p->indices = (ADIO_Offset *)
92 ADIOI_Malloc(sizeof(ADIO_Offset))) == NULL)
93 {
94 fprintf(stderr, "ADIOI_Flatlist_node: malloc indices failed\n");
95 }
96 flat_node_p->blocklens[0] = contig_type_sz;
97 flat_node_p->indices[0] = 0;
98 flat_node_p->count = 1;
99 flat_node_p->next = NULL;
100 return flat_node_p;
101 }
102
103 /* ADIOI_Exchange_file_views - Sends all the aggregators the file
104 * views and file view states of the clients. It fills in the
105 * client_file_view_state_arr for the aggregators and the
106 * my_mem_view_state for the client. It also initializes the
107 * agg_file_view_state for all clients, which is the view for each
108 * aggregator of a client's filetype. */
ADIOI_Exch_file_views(int myrank,int nprocs,int file_ptr_type,ADIO_File fd,int count,MPI_Datatype datatype,ADIO_Offset off,view_state * my_mem_view_state_arr,view_state * agg_file_view_state_arr,view_state * client_file_view_state_arr)109 void ADIOI_Exch_file_views(int myrank, int nprocs, int file_ptr_type,
110 ADIO_File fd, int count,
111 MPI_Datatype datatype, ADIO_Offset off,
112 view_state *my_mem_view_state_arr,
113 view_state *agg_file_view_state_arr,
114 view_state *client_file_view_state_arr)
115 {
116 /* Convert my own fileview to an ADIOI_Flattened type and a
117 * disp. MPI_Alltoall the count of ADIOI_Flatlist nodes.
118 * MPI_Isend/Irecv the block_lens, indices of ADIOI_Flatlist node
119 * to/from each of the aggregators with the rest of the file view
120 * state. */
121
122 int i = -1, j = -1;
123 amount_and_extra_data_t *send_count_arr = NULL;
124 amount_and_extra_data_t *recv_count_arr = NULL;
125 int send_req_arr_sz = 0;
126 int recv_req_arr_sz = 0;
127 MPI_Request *send_req_arr = NULL, *recv_req_arr = NULL;
128 MPI_Status *statuses = NULL;
129 ADIO_Offset disp_off_sz_ext_typesz[6];
130 MPI_Aint memtype_extent, filetype_extent;
131 int ret = -1;
132
133 /* parameters for datatypes */
134 ADIOI_Flatlist_node *flat_mem_p = NULL, *flat_file_p = NULL;
135 MPI_Count memtype_sz = -1;
136 int memtype_is_contig = -1;
137 ADIO_Offset filetype_sz = -1;
138
139 #ifdef AGGREGATION_PROFILE
140 MPE_Log_event (5014, 0, NULL);
141 #endif
142 /* The memtype will be freed after the call. The filetype will be
143 * freed in the close and should have been flattened in the file
144 * view. */
145 MPI_Type_size_x(datatype, &memtype_sz);
146 MPI_Type_extent(datatype, &memtype_extent);
147 if (memtype_sz == memtype_extent) {
148 memtype_is_contig = 1;
149 flat_mem_p = ADIOI_Add_contig_flattened(datatype);
150 flat_mem_p->blocklens[0] = memtype_sz*count;
151 }
152 else {
153 ADIOI_Flatten_datatype(datatype);
154 flat_mem_p = ADIOI_Flatlist;
155 while (flat_mem_p->type != datatype)
156 flat_mem_p = flat_mem_p->next;
157 }
158
159 MPI_Type_extent(fd->filetype, &filetype_extent);
160 MPI_Type_size_x(fd->filetype, &filetype_sz);
161 if (filetype_extent == filetype_sz) {
162 flat_file_p = ADIOI_Add_contig_flattened(fd->filetype);
163 flat_file_p->blocklens[0] = memtype_sz*count;
164 filetype_extent = memtype_sz*count;
165 filetype_sz = filetype_extent;
166 }
167 else {
168 flat_file_p = ADIOI_Flatlist;
169 while (flat_file_p->type != fd->filetype)
170 flat_file_p = flat_file_p->next;
171 }
172
173 disp_off_sz_ext_typesz[0] = fd->fp_ind;
174 disp_off_sz_ext_typesz[1] = fd->disp;
175 disp_off_sz_ext_typesz[2] = off;
176 disp_off_sz_ext_typesz[3] = memtype_sz*count;
177 disp_off_sz_ext_typesz[4] = (ADIO_Offset) filetype_extent;
178 disp_off_sz_ext_typesz[5] = (ADIO_Offset) filetype_sz;
179
180 if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
181 recv_count_arr = ADIOI_Calloc(nprocs, sizeof(amount_and_extra_data_t));
182 send_count_arr = ADIOI_Calloc(nprocs, sizeof(amount_and_extra_data_t));
183 } else {
184 send_count_arr = ADIOI_Calloc(fd->hints->cb_nodes,
185 sizeof(amount_and_extra_data_t));
186
187 /* only aggregators receive data */
188 if (fd->is_agg) {
189 recv_count_arr = ADIOI_Calloc(nprocs,
190 sizeof(amount_and_extra_data_t));
191 recv_req_arr = ADIOI_Malloc (nprocs * sizeof(MPI_Request));
192 for (i=0; i < nprocs; i++)
193 MPI_Irecv (&recv_count_arr[i], sizeof(amount_and_extra_data_t),
194 MPI_BYTE, i, COUNT_EXCH, fd->comm, &recv_req_arr[i]);
195 }
196
197 /* only send data to aggregators */
198 send_req_arr = ADIOI_Calloc (fd->hints->cb_nodes, sizeof(MPI_Request));
199 for (i=0; i < fd->hints->cb_nodes; i++) {
200 send_count_arr[i].count = flat_file_p->count;
201 send_count_arr[i].fp_ind = disp_off_sz_ext_typesz[0];
202 send_count_arr[i].disp = disp_off_sz_ext_typesz[1];
203 send_count_arr[i].byte_off = disp_off_sz_ext_typesz[2];
204 send_count_arr[i].sz = disp_off_sz_ext_typesz[3];
205 send_count_arr[i].ext = disp_off_sz_ext_typesz[4];
206 send_count_arr[i].type_sz = disp_off_sz_ext_typesz[5];
207 MPI_Isend (&send_count_arr[i], sizeof(amount_and_extra_data_t),
208 MPI_BYTE, fd->hints->ranklist[i], COUNT_EXCH, fd->comm,
209 &send_req_arr[i]);
210 }
211 }
212
213
214 /* Every client has to build mem and file view_states for each aggregator.
215 * We initialize their values here. and we also initialize
216 * send_count_arr */
217
218 if (memtype_is_contig) {
219 /* if memory is contigous, we now replace memtype_sz and
220 * memtype_extent with the full access size */
221 memtype_sz *= count;
222 memtype_extent = memtype_sz;
223 }
224
225 for (i = 0; i < fd->hints->cb_nodes; i++)
226 {
227 int tmp_agg_idx = fd->hints->ranklist[i];
228 memset(&(my_mem_view_state_arr[tmp_agg_idx]), 0, sizeof(view_state));
229 my_mem_view_state_arr[tmp_agg_idx].sz =
230 disp_off_sz_ext_typesz[3];
231 my_mem_view_state_arr[tmp_agg_idx].ext =
232 (ADIO_Offset) memtype_extent;
233 my_mem_view_state_arr[tmp_agg_idx].type_sz =
234 (ADIO_Offset) memtype_sz;
235 my_mem_view_state_arr[tmp_agg_idx].flat_type_p = flat_mem_p;
236 ADIOI_init_view_state(file_ptr_type,
237 1,
238 &(my_mem_view_state_arr[tmp_agg_idx]),
239 TEMP_OFF);
240 ADIOI_init_view_state(file_ptr_type,
241 1,
242 &(my_mem_view_state_arr[tmp_agg_idx]),
243 REAL_OFF);
244
245 memset(&(agg_file_view_state_arr[tmp_agg_idx]), 0, sizeof(view_state));
246 agg_file_view_state_arr[tmp_agg_idx].fp_ind =
247 disp_off_sz_ext_typesz[0];
248 agg_file_view_state_arr[tmp_agg_idx].disp =
249 disp_off_sz_ext_typesz[1];
250 agg_file_view_state_arr[tmp_agg_idx].byte_off =
251 disp_off_sz_ext_typesz[2];
252 agg_file_view_state_arr[tmp_agg_idx].sz =
253 disp_off_sz_ext_typesz[3];
254 agg_file_view_state_arr[tmp_agg_idx].ext =
255 disp_off_sz_ext_typesz[4];
256 agg_file_view_state_arr[tmp_agg_idx].type_sz =
257 disp_off_sz_ext_typesz[5];
258 agg_file_view_state_arr[tmp_agg_idx].flat_type_p = flat_file_p;
259
260 ADIOI_init_view_state(file_ptr_type,
261 1,
262 &(agg_file_view_state_arr[tmp_agg_idx]),
263 TEMP_OFF);
264 ADIOI_init_view_state(file_ptr_type,
265 1,
266 &(agg_file_view_state_arr[tmp_agg_idx]),
267 REAL_OFF);
268
269 if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
270 send_count_arr[tmp_agg_idx].count = flat_file_p->count;
271 send_count_arr[tmp_agg_idx].fp_ind = disp_off_sz_ext_typesz[0];
272 send_count_arr[tmp_agg_idx].disp = disp_off_sz_ext_typesz[1];
273 send_count_arr[tmp_agg_idx].byte_off = disp_off_sz_ext_typesz[2];
274 send_count_arr[tmp_agg_idx].sz = disp_off_sz_ext_typesz[3];
275 send_count_arr[tmp_agg_idx].ext = disp_off_sz_ext_typesz[4];
276 send_count_arr[tmp_agg_idx].type_sz = disp_off_sz_ext_typesz[5];
277 }
278 }
279
280 #ifdef DEBUG2
281 fprintf(stderr, "my own flattened memtype: ");
282 ADIOI_Print_flatlist_node(flat_mem_p);
283 fprintf(stderr, "my own flattened filetype: ");
284 ADIOI_Print_flatlist_node(flat_file_p);
285 #endif
286
287 if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
288 ret = MPI_Alltoall(send_count_arr, sizeof(amount_and_extra_data_t),
289 MPI_BYTE,
290 recv_count_arr, sizeof(amount_and_extra_data_t),
291 MPI_BYTE, fd->comm);
292 if (ret != MPI_SUCCESS)
293 {
294 fprintf(stderr, "ADIOI_Exchange_file_views: MPI_Alltoall failed "
295 "with error %d", ret);
296 return;
297 }
298 } else {
299 statuses = (MPI_Status *) ADIOI_Malloc(1 + nprocs * sizeof(MPI_Status));
300 if (fd->is_agg) {
301 MPI_Waitall(nprocs, recv_req_arr, statuses);
302 ADIOI_Free(recv_req_arr);
303 }
304 MPI_Waitall(fd->hints->cb_nodes, send_req_arr, statuses);
305 ADIOI_Free(statuses);
306 ADIOI_Free(send_req_arr);
307 }
308 #ifdef DEBUG2
309 if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
310 fprintf(stderr, "send_count_arr:");
311 for (i = 0; i < nprocs; i++)
312 {
313 fprintf(stderr, "[%d]=%d ", i, send_count_arr[i].count);
314 }
315 fprintf(stderr, "\n");
316 fprintf(stderr, "recv_count_arr:");
317 for (i = 0; i < nprocs; i++)
318 {
319 fprintf(stderr, "[%d]=%d ", i, recv_count_arr[i].count);
320 }
321 fprintf(stderr, "\n");
322 } else {
323 fprintf(stderr, "send_count_arr:");
324 for (i = 0; i < fd->hints->cb_nodes; i++)
325 {
326 fprintf(stderr, "[%d]=%d ", i, send_count_arr[i].count);
327 }
328 fprintf(stderr, "\n");
329 if (fd->is_agg) {
330 fprintf(stderr, "recv_count_arr:");
331 for (i = 0; i < nprocs; i++)
332 {
333 fprintf(stderr, "[%d]=%d ", i, recv_count_arr[i].count);
334 }
335 fprintf(stderr, "\n");
336 }
337 }
338 #endif
339
340 if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
341 for (i=0; i < fd->hints->cb_nodes; i++)
342 if (send_count_arr[i].count > 0)
343 send_req_arr_sz++;
344 }
345 /* Figure out how many counts to send/recv */
346 for (i = 0; i < nprocs; i++)
347 {
348 if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
349 if (send_count_arr[i].count > 0)
350 send_req_arr_sz++;
351 }
352 /* Only aggregators should recv*/
353 if (fd->is_agg) {
354 if (recv_count_arr[i].count > 0)
355 {
356 if ((client_file_view_state_arr[i].flat_type_p =
357 (ADIOI_Flatlist_node *) ADIOI_Malloc(
358 sizeof(ADIOI_Flatlist_node))) == NULL)
359 {
360 fprintf(stderr, "ADIOI_Exchange_file_views: malloc "
361 "flat_type_p failed\n");
362 }
363 client_file_view_state_arr[i].flat_type_p->count =
364 recv_count_arr[i].count;
365 client_file_view_state_arr[i].flat_type_p->indices =
366 (ADIO_Offset *) ADIOI_Calloc(recv_count_arr[i].count,
367 sizeof(ADIO_Offset));
368 client_file_view_state_arr[i].flat_type_p->blocklens =
369 (ADIO_Offset *) ADIOI_Calloc(recv_count_arr[i].count,
370 sizeof(ADIO_Offset));
371
372 /* Copy the extra data out of the stuff we Alltoall'd */
373 memcpy (&client_file_view_state_arr[i].fp_ind,
374 &recv_count_arr[i].fp_ind,
375 6*sizeof(ADIO_Offset));
376
377 recv_req_arr_sz++;
378 }
379 }
380 }
381
382 /* Since ADIOI_Calloc may do other things we add the +1
383 * to avoid a 0-size malloc */
384 send_req_arr = (MPI_Request *) ADIOI_Calloc(2*(send_req_arr_sz)+1,
385 sizeof(MPI_Request));
386
387 j = 0;
388 if (recv_req_arr_sz > 0) {
389 assert (fd->is_agg);
390 recv_req_arr = (MPI_Request *) ADIOI_Calloc(2*(recv_req_arr_sz),
391 sizeof(MPI_Request));
392 for (i = 0; i < nprocs; i++) {
393 if (recv_count_arr[i].count > 0) {
394 MPI_Irecv(client_file_view_state_arr[i].flat_type_p->indices,
395 recv_count_arr[i].count, ADIO_OFFSET, i,
396 INDICES, fd->comm, &recv_req_arr[j]);
397 j++;
398 MPI_Irecv(client_file_view_state_arr[i].flat_type_p->blocklens,
399 recv_count_arr[i].count, ADIO_OFFSET, i,
400 BLOCK_LENS, fd->comm, &recv_req_arr[j]);
401 j++;
402 }
403 }
404 }
405
406 if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
407 j = 0;
408 for (i = 0; i < nprocs; i++) {
409 if (send_count_arr[i].count > 0) {
410 MPI_Isend(flat_file_p->indices,
411 send_count_arr[i].count, ADIO_OFFSET, i,
412 INDICES, fd->comm, &send_req_arr[j]);
413 j++;
414 MPI_Isend(flat_file_p->blocklens,
415 send_count_arr[i].count, ADIO_OFFSET, i,
416 BLOCK_LENS, fd->comm, &send_req_arr[j]);
417 j++;
418 }
419 }
420 } else {
421 j = 0;
422 for (i = 0; i < fd->hints->cb_nodes; i++) {
423 if (send_count_arr[i].count > 0) {
424 MPI_Isend(flat_file_p->indices,
425 send_count_arr[i].count, ADIO_OFFSET,
426 fd->hints->ranklist[i], INDICES, fd->comm,
427 &send_req_arr[j]);
428 j++;
429 MPI_Isend(flat_file_p->blocklens,
430 send_count_arr[i].count, ADIO_OFFSET,
431 fd->hints->ranklist[i], BLOCK_LENS, fd->comm,
432 &send_req_arr[j]);
433 j++;
434 }
435 }
436 }
437
438 /* Since ADIOI_Malloc may do other things we add the +1
439 * to avoid a 0-size malloc */
440 statuses = (MPI_Status *)
441 ADIOI_Malloc(1 + 2 * ADIOI_MAX(send_req_arr_sz,recv_req_arr_sz)
442 * sizeof(MPI_Status));
443
444 if (send_req_arr_sz > 0) {
445 MPI_Waitall(2 * send_req_arr_sz, send_req_arr, statuses);
446 ADIOI_Free(send_count_arr);
447 ADIOI_Free(send_req_arr);
448 }
449 if (recv_req_arr_sz > 0) {
450 MPI_Waitall(2 * recv_req_arr_sz, recv_req_arr, statuses);
451 ADIOI_Free(recv_count_arr);
452 ADIOI_Free(recv_req_arr);
453 }
454 ADIOI_Free(statuses);
455
456 if (fd->is_agg == 1)
457 {
458 ADIOI_init_view_state(file_ptr_type,
459 nprocs,
460 client_file_view_state_arr,
461 TEMP_OFF);
462 ADIOI_init_view_state(file_ptr_type,
463 nprocs,
464 client_file_view_state_arr,
465 REAL_OFF);
466 }
467
468 #ifdef DEBUG
469 if (fd->is_agg == 1)
470 {
471 ADIOI_Flatlist_node *fr_node_p = ADIOI_Flatlist;
472 for (i = 0; i < nprocs; i++)
473 {
474 fprintf(stderr, "client_file_view_state_arr[%d]=(fp_ind=%Ld,"
475 "disp=%Ld,byte_off=%Ld,sz=%Ld,ext=%Ld\n", i,
476 client_file_view_state_arr[i].fp_ind,
477 client_file_view_state_arr[i].disp,
478 client_file_view_state_arr[i].byte_off,
479 client_file_view_state_arr[i].sz,
480 client_file_view_state_arr[i].ext);
481 }
482
483 while (fr_node_p->type !=
484 fd->file_realm_types[fd->my_cb_nodes_index])
485 fr_node_p = fr_node_p->next;
486 assert(fr_node_p != NULL);
487
488 fprintf(stderr, "my file realm (idx=%d,st_off=%Ld) ",
489 fd->my_cb_nodes_index,
490 fd->file_realm_st_offs[fd->my_cb_nodes_index]);
491 ADIOI_Print_flatlist_node(fr_node_p);
492 }
493 #endif
494
495 #ifdef DEBUG2
496 if (fd->is_agg == 1)
497 {
498 for (i = 0; i < nprocs; i++)
499 {
500 fprintf(stderr, "client_file_view_state_arr[%d]: ", i);
501 ADIOI_Print_flatlist_node(
502 client_file_view_state_arr[i].flat_type_p);
503 }
504 }
505 #endif
506 #ifdef AGGREGATION_PROFILE
507 MPE_Log_event (5015, 0, NULL);
508 #endif
509 }
510