1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
2 /*
3  *
4  *   Copyright (C) 1997 University of Chicago.
5  *   See COPYRIGHT notice in top-level directory.
6  */
7 
8 #include <assert.h>
9 #include "adio.h"
10 #include "adio_extern.h"
11 #ifdef AGGREGATION_PROFILE
12 #include "mpe.h"
13 #endif
14 
15 /*
16 #define DEBUG
17 #define DEBUG2
18 */
19 
/* MPI message tags used by the point-to-point traffic in this file. */
enum {
    COUNT_EXCH        = 0,  /* count + file view state messages */
    BLOCK_LENS        = 1,  /* flattened filetype block lengths */
    INDICES           = 2,  /* flattened filetype block offsets */
    FPIND_DISP_OFF_SZ = 3   /* not referenced in this part of the file */
};
24 
25 
26 typedef struct {
27     int count;
28     ADIO_Offset fp_ind;
29     ADIO_Offset disp;
30     ADIO_Offset byte_off;
31     ADIO_Offset sz;
32     ADIO_Offset ext;
33     ADIO_Offset type_sz;
34 } amount_and_extra_data_t;
35 
36 /* Debugging function to print out an ADIOI_Flatlist_node. */
ADIOI_Print_flatlist_node(ADIOI_Flatlist_node * flatlist_node_p)37 void ADIOI_Print_flatlist_node(ADIOI_Flatlist_node *flatlist_node_p)
38 {
39     int i;
40     if (flatlist_node_p == NULL)
41     {
42 	fprintf(stderr, "print flatlist node of NULL ptr\n");
43 	return;
44     }
45     fprintf(stderr, "print flatlist node count = %d (idx,blocklen)\n",
46 	    (int)flatlist_node_p->count);
47     for (i = 0; i < flatlist_node_p->count; i++)
48     {
49 	if (i % 5 == 0 && i != 0)
50 	{
51 	    fprintf(stderr, "%d=(%lld,%lld)\n", i, (long long)flatlist_node_p->indices[i],
52 		    (long long)flatlist_node_p->blocklens[i]);
53 	}
54 	else
55 	    fprintf(stderr, "%d=(%lld,%lld) ", i, (long long)flatlist_node_p->indices[i],
56 		    (long long)flatlist_node_p->blocklens[i]);
57     }
58     fprintf(stderr, "\n");
59 }
60 
61 /* Since ADIOI_Flatten_datatype won't add a contig datatype to the
62  * ADIOI_Flatlist, we can force it to do so with this function. */
ADIOI_Add_contig_flattened(MPI_Datatype contig_type)63 ADIOI_Flatlist_node * ADIOI_Add_contig_flattened(MPI_Datatype contig_type)
64 {
65     MPI_Count contig_type_sz = -1;
66     ADIOI_Flatlist_node *flat_node_p = ADIOI_Flatlist;
67 
68     /* Add contig type to the end of the list if it doesn't already
69      * exist. */
70     while (flat_node_p->next)
71     {
72 	if (flat_node_p->type == contig_type)
73 	    return flat_node_p;
74 	flat_node_p = flat_node_p->next;
75     }
76     if (flat_node_p->type == contig_type)
77 	return flat_node_p;
78 
79     MPI_Type_size_x(contig_type, &contig_type_sz);
80     if ((flat_node_p->next = (ADIOI_Flatlist_node *) ADIOI_Malloc
81 	 (sizeof(ADIOI_Flatlist_node))) == NULL)
82     {
83 	fprintf(stderr, "ADIOI_Add_contig_flattened: malloc next failed\n");
84     }
85     flat_node_p = flat_node_p->next;
86     flat_node_p->type = contig_type;
87     if ((flat_node_p->blocklens = (ADIO_Offset *) ADIOI_Malloc(sizeof(ADIO_Offset))) == NULL)
88     {
89 	fprintf(stderr, "ADIOI_Flatlist_node: malloc blocklens failed\n");
90     }
91     if ((flat_node_p->indices = (ADIO_Offset *)
92 	 ADIOI_Malloc(sizeof(ADIO_Offset))) == NULL)
93     {
94 	fprintf(stderr, "ADIOI_Flatlist_node: malloc indices failed\n");
95     }
96     flat_node_p->blocklens[0] = contig_type_sz;
97     flat_node_p->indices[0] = 0;
98     flat_node_p->count = 1;
99     flat_node_p->next = NULL;
100     return flat_node_p;
101 }
102 
103 /* ADIOI_Exchange_file_views - Sends all the aggregators the file
104  * views and file view states of the clients.  It fills in the
105  * client_file_view_state_arr for the aggregators and the
106  * my_mem_view_state for the client.  It also initializes the
107  * agg_file_view_state for all clients, which is the view for each
108  * aggregator of a client's filetype. */
ADIOI_Exch_file_views(int myrank,int nprocs,int file_ptr_type,ADIO_File fd,int count,MPI_Datatype datatype,ADIO_Offset off,view_state * my_mem_view_state_arr,view_state * agg_file_view_state_arr,view_state * client_file_view_state_arr)109 void ADIOI_Exch_file_views(int myrank, int nprocs, int file_ptr_type,
110 			   ADIO_File fd, int count,
111 			   MPI_Datatype datatype, ADIO_Offset off,
112 			   view_state *my_mem_view_state_arr,
113 			   view_state *agg_file_view_state_arr,
114 			   view_state *client_file_view_state_arr)
115 {
116     /* Convert my own fileview to an ADIOI_Flattened type and a
117      * disp. MPI_Alltoall the count of ADIOI_Flatlist nodes.
118      * MPI_Isend/Irecv the block_lens, indices of ADIOI_Flatlist node
119      * to/from each of the aggregators with the rest of the file view
120      * state. */
121 
122     int i = -1, j = -1;
123     amount_and_extra_data_t *send_count_arr = NULL;
124     amount_and_extra_data_t *recv_count_arr = NULL;
125     int send_req_arr_sz = 0;
126     int recv_req_arr_sz = 0;
127     MPI_Request *send_req_arr = NULL, *recv_req_arr = NULL;
128     MPI_Status *statuses = NULL;
129     ADIO_Offset disp_off_sz_ext_typesz[6];
130     MPI_Aint memtype_extent, filetype_extent, lb;
131     int ret = -1;
132 
133     /* parameters for datatypes */
134     ADIOI_Flatlist_node *flat_mem_p = NULL, *flat_file_p = NULL;
135     MPI_Count memtype_sz = -1;
136     int memtype_is_contig = -1;
137     ADIO_Offset filetype_sz = -1;
138 
139 #ifdef AGGREGATION_PROFILE
140     MPE_Log_event (5014, 0, NULL);
141 #endif
142     /* The memtype will be freed after the call.  The filetype will be
143      * freed in the close and should have been flattened in the file
144      * view. */
145     MPI_Type_size_x(datatype, &memtype_sz);
146     MPI_Type_get_extent(datatype, &lb, &memtype_extent);
147     if (memtype_sz == memtype_extent) {
148 	memtype_is_contig = 1;
149 	flat_mem_p = ADIOI_Add_contig_flattened(datatype);
150 	flat_mem_p->blocklens[0] = memtype_sz*count;
151     }
152     else {
153 	flat_mem_p = ADIOI_Flatten_and_find(datatype);
154     }
155 
156     MPI_Type_get_extent(fd->filetype, &lb, &filetype_extent);
157     MPI_Type_size_x(fd->filetype, &filetype_sz);
158     if (filetype_extent == filetype_sz) {
159 	flat_file_p = ADIOI_Add_contig_flattened(fd->filetype);
160 	flat_file_p->blocklens[0] = memtype_sz*count;
161 	filetype_extent = memtype_sz*count;
162 	filetype_sz = filetype_extent;
163     }
164     else {
165         flat_file_p = ADIOI_Flatlist;
166         while (flat_file_p->type != fd->filetype)
167             flat_file_p = flat_file_p->next;
168     }
169 
170     disp_off_sz_ext_typesz[0] = fd->fp_ind;
171     disp_off_sz_ext_typesz[1] = fd->disp;
172     disp_off_sz_ext_typesz[2] = off;
173     disp_off_sz_ext_typesz[3] = memtype_sz*count;
174     disp_off_sz_ext_typesz[4] = (ADIO_Offset) filetype_extent;
175     disp_off_sz_ext_typesz[5] = (ADIO_Offset) filetype_sz;
176 
177     if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
178         recv_count_arr = ADIOI_Calloc(nprocs, sizeof(amount_and_extra_data_t));
179         send_count_arr = ADIOI_Calloc(nprocs, sizeof(amount_and_extra_data_t));
180     } else {
181         send_count_arr = ADIOI_Calloc(fd->hints->cb_nodes,
182 				  sizeof(amount_and_extra_data_t));
183 
184         /* only aggregators receive data */
185         if (fd->is_agg) {
186 	    recv_count_arr = ADIOI_Calloc(nprocs,
187 			    sizeof(amount_and_extra_data_t));
188 	    recv_req_arr = ADIOI_Malloc (nprocs * sizeof(MPI_Request));
189 	    for (i=0; i < nprocs; i++)
190 	        MPI_Irecv (&recv_count_arr[i], sizeof(amount_and_extra_data_t),
191 		       MPI_BYTE, i, COUNT_EXCH, fd->comm, &recv_req_arr[i]);
192         }
193 
194         /* only send data to aggregators */
195         send_req_arr = ADIOI_Calloc (fd->hints->cb_nodes, sizeof(MPI_Request));
196         for (i=0; i < fd->hints->cb_nodes; i++) {
197 	    send_count_arr[i].count    = flat_file_p->count;
198 	    send_count_arr[i].fp_ind   = disp_off_sz_ext_typesz[0];
199 	    send_count_arr[i].disp     = disp_off_sz_ext_typesz[1];
200 	    send_count_arr[i].byte_off = disp_off_sz_ext_typesz[2];
201 	    send_count_arr[i].sz       = disp_off_sz_ext_typesz[3];
202 	    send_count_arr[i].ext      = disp_off_sz_ext_typesz[4];
203 	    send_count_arr[i].type_sz  = disp_off_sz_ext_typesz[5];
204 	    MPI_Isend (&send_count_arr[i], sizeof(amount_and_extra_data_t),
205 		   MPI_BYTE, fd->hints->ranklist[i], COUNT_EXCH, fd->comm,
206 		   &send_req_arr[i]);
207         }
208     }
209 
210 
211     /* Every client has to build mem and file view_states for each aggregator.
212      * We initialize their values here.  and we also initialize
213      * send_count_arr */
214 
215     if (memtype_is_contig) {
216 	/* if memory is contigous, we now replace memtype_sz and
217 	 * memtype_extent with the full access size */
218 	memtype_sz *= count;
219 	memtype_extent = memtype_sz;
220     }
221 
222     for (i = 0; i < fd->hints->cb_nodes; i++)
223     {
224 	int tmp_agg_idx = fd->hints->ranklist[i];
225 	memset(&(my_mem_view_state_arr[tmp_agg_idx]), 0, sizeof(view_state));
226 	my_mem_view_state_arr[tmp_agg_idx].sz          =
227 	    disp_off_sz_ext_typesz[3];
228 	my_mem_view_state_arr[tmp_agg_idx].ext         =
229 	    (ADIO_Offset) memtype_extent;
230 	my_mem_view_state_arr[tmp_agg_idx].type_sz     =
231 	    (ADIO_Offset) memtype_sz;
232 	my_mem_view_state_arr[tmp_agg_idx].flat_type_p = flat_mem_p;
233 	ADIOI_init_view_state(file_ptr_type,
234 			1,
235 			&(my_mem_view_state_arr[tmp_agg_idx]),
236 			TEMP_OFF);
237 	ADIOI_init_view_state(file_ptr_type,
238 			1,
239 			&(my_mem_view_state_arr[tmp_agg_idx]),
240 			REAL_OFF);
241 
242 	memset(&(agg_file_view_state_arr[tmp_agg_idx]), 0, sizeof(view_state));
243 	agg_file_view_state_arr[tmp_agg_idx].fp_ind    =
244 	    disp_off_sz_ext_typesz[0];
245 	agg_file_view_state_arr[tmp_agg_idx].disp      =
246 	    disp_off_sz_ext_typesz[1];
247 	agg_file_view_state_arr[tmp_agg_idx].byte_off  =
248 	    disp_off_sz_ext_typesz[2];
249 	agg_file_view_state_arr[tmp_agg_idx].sz        =
250 	    disp_off_sz_ext_typesz[3];
251 	agg_file_view_state_arr[tmp_agg_idx].ext       =
252 	    disp_off_sz_ext_typesz[4];
253 	agg_file_view_state_arr[tmp_agg_idx].type_sz   =
254 	    disp_off_sz_ext_typesz[5];
255 	agg_file_view_state_arr[tmp_agg_idx].flat_type_p = flat_file_p;
256 
257 	ADIOI_init_view_state(file_ptr_type,
258 			1,
259 			&(agg_file_view_state_arr[tmp_agg_idx]),
260 			TEMP_OFF);
261 	ADIOI_init_view_state(file_ptr_type,
262 			1,
263 			&(agg_file_view_state_arr[tmp_agg_idx]),
264 			REAL_OFF);
265 
266 	if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
267 	    send_count_arr[tmp_agg_idx].count    = flat_file_p->count;
268 	    send_count_arr[tmp_agg_idx].fp_ind   = disp_off_sz_ext_typesz[0];
269 	    send_count_arr[tmp_agg_idx].disp     = disp_off_sz_ext_typesz[1];
270 	    send_count_arr[tmp_agg_idx].byte_off = disp_off_sz_ext_typesz[2];
271 	    send_count_arr[tmp_agg_idx].sz       = disp_off_sz_ext_typesz[3];
272 	    send_count_arr[tmp_agg_idx].ext      = disp_off_sz_ext_typesz[4];
273 	    send_count_arr[tmp_agg_idx].type_sz  = disp_off_sz_ext_typesz[5];
274 	}
275     }
276 
277 #ifdef DEBUG2
278     fprintf(stderr, "my own flattened memtype: ");
279     ADIOI_Print_flatlist_node(flat_mem_p);
280     fprintf(stderr, "my own flattened filetype: ");
281     ADIOI_Print_flatlist_node(flat_file_p);
282 #endif
283 
284     if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
285         ret = MPI_Alltoall(send_count_arr, sizeof(amount_and_extra_data_t),
286 		       MPI_BYTE,
287 		       recv_count_arr, sizeof(amount_and_extra_data_t),
288 		       MPI_BYTE, fd->comm);
289         if (ret != MPI_SUCCESS)
290         {
291 	    fprintf(stderr, "ADIOI_Exchange_file_views: MPI_Alltoall failed "
292 		"with error %d", ret);
293 	    return;
294         }
295     } else {
296         statuses = (MPI_Status *) ADIOI_Malloc(1 + nprocs * sizeof(MPI_Status));
297         if (fd->is_agg) {
298 	    MPI_Waitall(nprocs, recv_req_arr, statuses);
299 	    ADIOI_Free(recv_req_arr);
300         }
301         MPI_Waitall(fd->hints->cb_nodes, send_req_arr, statuses);
302         ADIOI_Free(statuses);
303         ADIOI_Free(send_req_arr);
304     }
305 #ifdef DEBUG2
306     if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
307         fprintf(stderr, "send_count_arr:");
308         for (i = 0; i < nprocs; i++)
309         {
310 	    fprintf(stderr, "[%d]=%d ", i, send_count_arr[i].count);
311         }
312         fprintf(stderr, "\n");
313         fprintf(stderr, "recv_count_arr:");
314         for (i = 0; i < nprocs; i++)
315 	{
316 	    fprintf(stderr, "[%d]=%d ", i, recv_count_arr[i].count);
317 	}
318         fprintf(stderr, "\n");
319     } else {
320         fprintf(stderr, "send_count_arr:");
321         for (i = 0; i < fd->hints->cb_nodes; i++)
322         {
323 	    fprintf(stderr, "[%d]=%d ", i, send_count_arr[i].count);
324         }
325         fprintf(stderr, "\n");
326         if (fd->is_agg) {
327 	    fprintf(stderr, "recv_count_arr:");
328 	    for (i = 0; i < nprocs; i++)
329 	    {
330 	        fprintf(stderr, "[%d]=%d ", i, recv_count_arr[i].count);
331 	    }
332 	    fprintf(stderr, "\n");
333         }
334     }
335 #endif
336 
337     if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
338         for (i=0; i < fd->hints->cb_nodes; i++)
339 	    if (send_count_arr[i].count > 0)
340 	        send_req_arr_sz++;
341     }
342     /* Figure out how many counts to send/recv */
343     for (i = 0; i < nprocs; i++)
344     {
345         if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
346 	    if (send_count_arr[i].count > 0)
347 	        send_req_arr_sz++;
348 	}
349 	/* Only aggregators should recv*/
350 	if (fd->is_agg) {
351 	    if (recv_count_arr[i].count > 0)
352 	    {
353 		if ((client_file_view_state_arr[i].flat_type_p =
354 		     (ADIOI_Flatlist_node *) ADIOI_Malloc(
355 			 sizeof(ADIOI_Flatlist_node))) == NULL)
356 		{
357 		    fprintf(stderr, "ADIOI_Exchange_file_views: malloc "
358 			    "flat_type_p failed\n");
359 		}
360 		client_file_view_state_arr[i].flat_type_p->count =
361 		    recv_count_arr[i].count;
362 		client_file_view_state_arr[i].flat_type_p->indices =
363 		    (ADIO_Offset *) ADIOI_Calloc(recv_count_arr[i].count,
364 						 sizeof(ADIO_Offset));
365 		client_file_view_state_arr[i].flat_type_p->blocklens =
366 		    (ADIO_Offset *) ADIOI_Calloc(recv_count_arr[i].count,
367 				    sizeof(ADIO_Offset));
368 
369 		/* Copy the extra data out of the stuff we Alltoall'd */
370 		memcpy (&client_file_view_state_arr[i].fp_ind,
371 			&recv_count_arr[i].fp_ind,
372 			6*sizeof(ADIO_Offset));
373 
374 		recv_req_arr_sz++;
375 	    }
376 	}
377     }
378 
379     /* Since ADIOI_Calloc may do other things we add the +1
380      * to avoid a 0-size malloc */
381     send_req_arr = (MPI_Request *) ADIOI_Calloc(2*(send_req_arr_sz)+1,
382 						sizeof(MPI_Request));
383 
384     j = 0;
385     if (recv_req_arr_sz > 0) {
386 	assert (fd->is_agg);
387 	recv_req_arr = (MPI_Request *) ADIOI_Calloc(2*(recv_req_arr_sz),
388 						    sizeof(MPI_Request));
389     	for (i = 0; i < nprocs; i++) {
390 	    if (recv_count_arr[i].count > 0) {
391 		MPI_Irecv(client_file_view_state_arr[i].flat_type_p->indices,
392 			  recv_count_arr[i].count, ADIO_OFFSET, i,
393 			  INDICES, fd->comm, &recv_req_arr[j]);
394 		j++;
395 		MPI_Irecv(client_file_view_state_arr[i].flat_type_p->blocklens,
396 			  recv_count_arr[i].count, ADIO_OFFSET, i,
397 			  BLOCK_LENS, fd->comm, &recv_req_arr[j]);
398 		j++;
399 	    }
400 	}
401     }
402 
403     if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
404         j = 0;
405         for (i = 0; i < nprocs; i++) {
406 	    if (send_count_arr[i].count > 0) {
407 	        MPI_Isend(flat_file_p->indices,
408 		      send_count_arr[i].count, ADIO_OFFSET, i,
409                       INDICES, fd->comm, &send_req_arr[j]);
410 	        j++;
411 	        MPI_Isend(flat_file_p->blocklens,
412 		      send_count_arr[i].count, ADIO_OFFSET, i,
413                       BLOCK_LENS, fd->comm, &send_req_arr[j]);
414 	        j++;
415 	    }
416         }
417     } else {
418         j = 0;
419         for (i = 0; i < fd->hints->cb_nodes; i++) {
420 	    if (send_count_arr[i].count > 0) {
421 	        MPI_Isend(flat_file_p->indices,
422 		      send_count_arr[i].count, ADIO_OFFSET,
423 		      fd->hints->ranklist[i], INDICES, fd->comm,
424 		      &send_req_arr[j]);
425 	        j++;
426 	        MPI_Isend(flat_file_p->blocklens,
427 		      send_count_arr[i].count, ADIO_OFFSET,
428 		      fd->hints->ranklist[i], BLOCK_LENS, fd->comm,
429 		      &send_req_arr[j]);
430 	        j++;
431 	    }
432         }
433     }
434 
435     /* Since ADIOI_Malloc may do other things we add the +1
436      * to avoid a 0-size malloc */
437     statuses = (MPI_Status *)
438 	ADIOI_Malloc(1 + 2 * ADIOI_MAX(send_req_arr_sz,recv_req_arr_sz)
439 		     * sizeof(MPI_Status));
440 
441     if (send_req_arr_sz > 0) {
442 	MPI_Waitall(2 * send_req_arr_sz, send_req_arr, statuses);
443 	ADIOI_Free(send_count_arr);
444 	ADIOI_Free(send_req_arr);
445     }
446     if (recv_req_arr_sz > 0) {
447 	MPI_Waitall(2 * recv_req_arr_sz, recv_req_arr, statuses);
448 	ADIOI_Free(recv_count_arr);
449 	ADIOI_Free(recv_req_arr);
450     }
451     ADIOI_Free(statuses);
452 
453     if (fd->is_agg == 1)
454     {
455 	ADIOI_init_view_state(file_ptr_type,
456 			nprocs,
457 			client_file_view_state_arr,
458 			TEMP_OFF);
459 	ADIOI_init_view_state(file_ptr_type,
460 			nprocs,
461 			client_file_view_state_arr,
462 			REAL_OFF);
463     }
464 
465 #ifdef DEBUG
466     if (fd->is_agg == 1)
467     {
468 	ADIOI_Flatlist_node *fr_node_p = ADIOI_Flatlist;
469 	for (i = 0; i < nprocs; i++)
470 	{
471 	    fprintf(stderr, "client_file_view_state_arr[%d]=(fp_ind=%Ld,"
472 		    "disp=%Ld,byte_off=%Ld,sz=%Ld,ext=%Ld\n", i,
473 		    client_file_view_state_arr[i].fp_ind,
474 		    client_file_view_state_arr[i].disp,
475 		    client_file_view_state_arr[i].byte_off,
476 		    client_file_view_state_arr[i].sz,
477 		    client_file_view_state_arr[i].ext);
478 	}
479 
480 	while (fr_node_p->type !=
481 	       fd->file_realm_types[fd->my_cb_nodes_index])
482 	    fr_node_p = fr_node_p->next;
483 	assert(fr_node_p != NULL);
484 
485 	fprintf(stderr, "my file realm (idx=%d,st_off=%Ld) ",
486 		fd->my_cb_nodes_index,
487 		fd->file_realm_st_offs[fd->my_cb_nodes_index]);
488 	ADIOI_Print_flatlist_node(fr_node_p);
489     }
490 #endif
491 
492 #ifdef DEBUG2
493     if (fd->is_agg == 1)
494     {
495 	for (i = 0; i < nprocs; i++)
496 	{
497 	    fprintf(stderr, "client_file_view_state_arr[%d]: ", i);
498 	    ADIOI_Print_flatlist_node(
499 		client_file_view_state_arr[i].flat_type_p);
500 	}
501     }
502 #endif
503 #ifdef AGGREGATION_PROFILE
504     MPE_Log_event (5015, 0, NULL);
505 #endif
506 }
507