1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
2 /*
3  *
4  *   Copyright (C) 1997 University of Chicago.
5  *   See COPYRIGHT notice in top-level directory.
6  */
7 
8 #include <assert.h>
9 #include "adio.h"
10 #include "adio_extern.h"
11 #ifdef AGGREGATION_PROFILE
12 #include "mpe.h"
13 #endif
14 
15 /*
16 #define DEBUG
17 #define DEBUG2
18 */
19 
/* Message tags for the point-to-point traffic in ADIOI_Exch_file_views. */
#define COUNT_EXCH          0   /* count + file view state record */
#define BLOCK_LENS          1   /* flattened filetype block lengths */
#define INDICES             2   /* flattened filetype indices */
#define FPIND_DISP_OFF_SZ   3   /* not referenced in the visible code */
24 
25 
26 typedef struct {
27     int count;
28     ADIO_Offset fp_ind;
29     ADIO_Offset disp;
30     ADIO_Offset byte_off;
31     ADIO_Offset sz;
32     ADIO_Offset ext;
33     ADIO_Offset type_sz;
34 } amount_and_extra_data_t;
35 
36 /* Debugging function to print out an ADIOI_Flatlist_node. */
ADIOI_Print_flatlist_node(ADIOI_Flatlist_node * flatlist_node_p)37 void ADIOI_Print_flatlist_node(ADIOI_Flatlist_node *flatlist_node_p)
38 {
39     int i;
40     if (flatlist_node_p == NULL)
41     {
42 	fprintf(stderr, "print flatlist node of NULL ptr\n");
43 	return;
44     }
45     fprintf(stderr, "print flatlist node count = %d (idx,blocklen)\n",
46 	    (int)flatlist_node_p->count);
47     for (i = 0; i < flatlist_node_p->count; i++)
48     {
49 	if (i % 5 == 0 && i != 0)
50 	{
51 	    fprintf(stderr, "%d=(%lld,%lld)\n", i, (long long)flatlist_node_p->indices[i],
52 		    (long long)flatlist_node_p->blocklens[i]);
53 	}
54 	else
55 	    fprintf(stderr, "%d=(%lld,%lld) ", i, (long long)flatlist_node_p->indices[i],
56 		    (long long)flatlist_node_p->blocklens[i]);
57     }
58     fprintf(stderr, "\n");
59 }
60 
61 /* Since ADIOI_Flatten_datatype won't add a contig datatype to the
62  * ADIOI_Flatlist, we can force it to do so with this function. */
ADIOI_Add_contig_flattened(MPI_Datatype contig_type)63 ADIOI_Flatlist_node * ADIOI_Add_contig_flattened(MPI_Datatype contig_type)
64 {
65     MPI_Count contig_type_sz = -1;
66     ADIOI_Flatlist_node *flat_node_p = ADIOI_Flatlist;
67 
68     /* Add contig type to the end of the list if it doesn't already
69      * exist. */
70     while (flat_node_p->next)
71     {
72 	if (flat_node_p->type == contig_type)
73 	    return flat_node_p;
74 	flat_node_p = flat_node_p->next;
75     }
76     if (flat_node_p->type == contig_type)
77 	return flat_node_p;
78 
79     MPI_Type_size_x(contig_type, &contig_type_sz);
80     if ((flat_node_p->next = (ADIOI_Flatlist_node *) ADIOI_Malloc
81 	 (sizeof(ADIOI_Flatlist_node))) == NULL)
82     {
83 	fprintf(stderr, "ADIOI_Add_contig_flattened: malloc next failed\n");
84     }
85     flat_node_p = flat_node_p->next;
86     flat_node_p->type = contig_type;
87     if ((flat_node_p->blocklens = (ADIO_Offset *) ADIOI_Malloc(sizeof(ADIO_Offset))) == NULL)
88     {
89 	fprintf(stderr, "ADIOI_Flatlist_node: malloc blocklens failed\n");
90     }
91     if ((flat_node_p->indices = (ADIO_Offset *)
92 	 ADIOI_Malloc(sizeof(ADIO_Offset))) == NULL)
93     {
94 	fprintf(stderr, "ADIOI_Flatlist_node: malloc indices failed\n");
95     }
96     flat_node_p->blocklens[0] = contig_type_sz;
97     flat_node_p->indices[0] = 0;
98     flat_node_p->count = 1;
99     flat_node_p->next = NULL;
100     return flat_node_p;
101 }
102 
103 /* ADIOI_Exchange_file_views - Sends all the aggregators the file
104  * views and file view states of the clients.  It fills in the
105  * client_file_view_state_arr for the aggregators and the
106  * my_mem_view_state for the client.  It also initializes the
107  * agg_file_view_state for all clients, which is the view for each
108  * aggregator of a client's filetype. */
ADIOI_Exch_file_views(int myrank,int nprocs,int file_ptr_type,ADIO_File fd,int count,MPI_Datatype datatype,ADIO_Offset off,view_state * my_mem_view_state_arr,view_state * agg_file_view_state_arr,view_state * client_file_view_state_arr)109 void ADIOI_Exch_file_views(int myrank, int nprocs, int file_ptr_type,
110 			   ADIO_File fd, int count,
111 			   MPI_Datatype datatype, ADIO_Offset off,
112 			   view_state *my_mem_view_state_arr,
113 			   view_state *agg_file_view_state_arr,
114 			   view_state *client_file_view_state_arr)
115 {
116     /* Convert my own fileview to an ADIOI_Flattened type and a
117      * disp. MPI_Alltoall the count of ADIOI_Flatlist nodes.
118      * MPI_Isend/Irecv the block_lens, indices of ADIOI_Flatlist node
119      * to/from each of the aggregators with the rest of the file view
120      * state. */
121 
122     int i = -1, j = -1;
123     amount_and_extra_data_t *send_count_arr = NULL;
124     amount_and_extra_data_t *recv_count_arr = NULL;
125     int send_req_arr_sz = 0;
126     int recv_req_arr_sz = 0;
127     MPI_Request *send_req_arr = NULL, *recv_req_arr = NULL;
128     MPI_Status *statuses = NULL;
129     ADIO_Offset disp_off_sz_ext_typesz[6];
130     MPI_Aint memtype_extent, filetype_extent;
131     int ret = -1;
132 
133     /* parameters for datatypes */
134     ADIOI_Flatlist_node *flat_mem_p = NULL, *flat_file_p = NULL;
135     MPI_Count memtype_sz = -1;
136     int memtype_is_contig = -1;
137     ADIO_Offset filetype_sz = -1;
138 
139 #ifdef AGGREGATION_PROFILE
140     MPE_Log_event (5014, 0, NULL);
141 #endif
142     /* The memtype will be freed after the call.  The filetype will be
143      * freed in the close and should have been flattened in the file
144      * view. */
145     MPI_Type_size_x(datatype, &memtype_sz);
146     MPI_Type_extent(datatype, &memtype_extent);
147     if (memtype_sz == memtype_extent) {
148 	memtype_is_contig = 1;
149 	flat_mem_p = ADIOI_Add_contig_flattened(datatype);
150 	flat_mem_p->blocklens[0] = memtype_sz*count;
151     }
152     else {
153 	ADIOI_Flatten_datatype(datatype);
154         flat_mem_p = ADIOI_Flatlist;
155         while (flat_mem_p->type != datatype)
156             flat_mem_p = flat_mem_p->next;
157     }
158 
159     MPI_Type_extent(fd->filetype, &filetype_extent);
160     MPI_Type_size_x(fd->filetype, &filetype_sz);
161     if (filetype_extent == filetype_sz) {
162 	flat_file_p = ADIOI_Add_contig_flattened(fd->filetype);
163 	flat_file_p->blocklens[0] = memtype_sz*count;
164 	filetype_extent = memtype_sz*count;
165 	filetype_sz = filetype_extent;
166     }
167     else {
168         flat_file_p = ADIOI_Flatlist;
169         while (flat_file_p->type != fd->filetype)
170             flat_file_p = flat_file_p->next;
171     }
172 
173     disp_off_sz_ext_typesz[0] = fd->fp_ind;
174     disp_off_sz_ext_typesz[1] = fd->disp;
175     disp_off_sz_ext_typesz[2] = off;
176     disp_off_sz_ext_typesz[3] = memtype_sz*count;
177     disp_off_sz_ext_typesz[4] = (ADIO_Offset) filetype_extent;
178     disp_off_sz_ext_typesz[5] = (ADIO_Offset) filetype_sz;
179 
180     if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
181         recv_count_arr = ADIOI_Calloc(nprocs, sizeof(amount_and_extra_data_t));
182         send_count_arr = ADIOI_Calloc(nprocs, sizeof(amount_and_extra_data_t));
183     } else {
184         send_count_arr = ADIOI_Calloc(fd->hints->cb_nodes,
185 				  sizeof(amount_and_extra_data_t));
186 
187         /* only aggregators receive data */
188         if (fd->is_agg) {
189 	    recv_count_arr = ADIOI_Calloc(nprocs,
190 			    sizeof(amount_and_extra_data_t));
191 	    recv_req_arr = ADIOI_Malloc (nprocs * sizeof(MPI_Request));
192 	    for (i=0; i < nprocs; i++)
193 	        MPI_Irecv (&recv_count_arr[i], sizeof(amount_and_extra_data_t),
194 		       MPI_BYTE, i, COUNT_EXCH, fd->comm, &recv_req_arr[i]);
195         }
196 
197         /* only send data to aggregators */
198         send_req_arr = ADIOI_Calloc (fd->hints->cb_nodes, sizeof(MPI_Request));
199         for (i=0; i < fd->hints->cb_nodes; i++) {
200 	    send_count_arr[i].count    = flat_file_p->count;
201 	    send_count_arr[i].fp_ind   = disp_off_sz_ext_typesz[0];
202 	    send_count_arr[i].disp     = disp_off_sz_ext_typesz[1];
203 	    send_count_arr[i].byte_off = disp_off_sz_ext_typesz[2];
204 	    send_count_arr[i].sz       = disp_off_sz_ext_typesz[3];
205 	    send_count_arr[i].ext      = disp_off_sz_ext_typesz[4];
206 	    send_count_arr[i].type_sz  = disp_off_sz_ext_typesz[5];
207 	    MPI_Isend (&send_count_arr[i], sizeof(amount_and_extra_data_t),
208 		   MPI_BYTE, fd->hints->ranklist[i], COUNT_EXCH, fd->comm,
209 		   &send_req_arr[i]);
210         }
211     }
212 
213 
214     /* Every client has to build mem and file view_states for each aggregator.
215      * We initialize their values here.  and we also initialize
216      * send_count_arr */
217 
218     if (memtype_is_contig) {
219 	/* if memory is contigous, we now replace memtype_sz and
220 	 * memtype_extent with the full access size */
221 	memtype_sz *= count;
222 	memtype_extent = memtype_sz;
223     }
224 
225     for (i = 0; i < fd->hints->cb_nodes; i++)
226     {
227 	int tmp_agg_idx = fd->hints->ranklist[i];
228 	memset(&(my_mem_view_state_arr[tmp_agg_idx]), 0, sizeof(view_state));
229 	my_mem_view_state_arr[tmp_agg_idx].sz          =
230 	    disp_off_sz_ext_typesz[3];
231 	my_mem_view_state_arr[tmp_agg_idx].ext         =
232 	    (ADIO_Offset) memtype_extent;
233 	my_mem_view_state_arr[tmp_agg_idx].type_sz     =
234 	    (ADIO_Offset) memtype_sz;
235 	my_mem_view_state_arr[tmp_agg_idx].flat_type_p = flat_mem_p;
236 	ADIOI_init_view_state(file_ptr_type,
237 			1,
238 			&(my_mem_view_state_arr[tmp_agg_idx]),
239 			TEMP_OFF);
240 	ADIOI_init_view_state(file_ptr_type,
241 			1,
242 			&(my_mem_view_state_arr[tmp_agg_idx]),
243 			REAL_OFF);
244 
245 	memset(&(agg_file_view_state_arr[tmp_agg_idx]), 0, sizeof(view_state));
246 	agg_file_view_state_arr[tmp_agg_idx].fp_ind    =
247 	    disp_off_sz_ext_typesz[0];
248 	agg_file_view_state_arr[tmp_agg_idx].disp      =
249 	    disp_off_sz_ext_typesz[1];
250 	agg_file_view_state_arr[tmp_agg_idx].byte_off  =
251 	    disp_off_sz_ext_typesz[2];
252 	agg_file_view_state_arr[tmp_agg_idx].sz        =
253 	    disp_off_sz_ext_typesz[3];
254 	agg_file_view_state_arr[tmp_agg_idx].ext       =
255 	    disp_off_sz_ext_typesz[4];
256 	agg_file_view_state_arr[tmp_agg_idx].type_sz   =
257 	    disp_off_sz_ext_typesz[5];
258 	agg_file_view_state_arr[tmp_agg_idx].flat_type_p = flat_file_p;
259 
260 	ADIOI_init_view_state(file_ptr_type,
261 			1,
262 			&(agg_file_view_state_arr[tmp_agg_idx]),
263 			TEMP_OFF);
264 	ADIOI_init_view_state(file_ptr_type,
265 			1,
266 			&(agg_file_view_state_arr[tmp_agg_idx]),
267 			REAL_OFF);
268 
269 	if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
270 	    send_count_arr[tmp_agg_idx].count    = flat_file_p->count;
271 	    send_count_arr[tmp_agg_idx].fp_ind   = disp_off_sz_ext_typesz[0];
272 	    send_count_arr[tmp_agg_idx].disp     = disp_off_sz_ext_typesz[1];
273 	    send_count_arr[tmp_agg_idx].byte_off = disp_off_sz_ext_typesz[2];
274 	    send_count_arr[tmp_agg_idx].sz       = disp_off_sz_ext_typesz[3];
275 	    send_count_arr[tmp_agg_idx].ext      = disp_off_sz_ext_typesz[4];
276 	    send_count_arr[tmp_agg_idx].type_sz  = disp_off_sz_ext_typesz[5];
277 	}
278     }
279 
280 #ifdef DEBUG2
281     fprintf(stderr, "my own flattened memtype: ");
282     ADIOI_Print_flatlist_node(flat_mem_p);
283     fprintf(stderr, "my own flattened filetype: ");
284     ADIOI_Print_flatlist_node(flat_file_p);
285 #endif
286 
287     if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
288         ret = MPI_Alltoall(send_count_arr, sizeof(amount_and_extra_data_t),
289 		       MPI_BYTE,
290 		       recv_count_arr, sizeof(amount_and_extra_data_t),
291 		       MPI_BYTE, fd->comm);
292         if (ret != MPI_SUCCESS)
293         {
294 	    fprintf(stderr, "ADIOI_Exchange_file_views: MPI_Alltoall failed "
295 		"with error %d", ret);
296 	    return;
297         }
298     } else {
299         statuses = (MPI_Status *) ADIOI_Malloc(1 + nprocs * sizeof(MPI_Status));
300         if (fd->is_agg) {
301 	    MPI_Waitall(nprocs, recv_req_arr, statuses);
302 	    ADIOI_Free(recv_req_arr);
303         }
304         MPI_Waitall(fd->hints->cb_nodes, send_req_arr, statuses);
305         ADIOI_Free(statuses);
306         ADIOI_Free(send_req_arr);
307     }
308 #ifdef DEBUG2
309     if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
310         fprintf(stderr, "send_count_arr:");
311         for (i = 0; i < nprocs; i++)
312         {
313 	    fprintf(stderr, "[%d]=%d ", i, send_count_arr[i].count);
314         }
315         fprintf(stderr, "\n");
316         fprintf(stderr, "recv_count_arr:");
317         for (i = 0; i < nprocs; i++)
318 	{
319 	    fprintf(stderr, "[%d]=%d ", i, recv_count_arr[i].count);
320 	}
321         fprintf(stderr, "\n");
322     } else {
323         fprintf(stderr, "send_count_arr:");
324         for (i = 0; i < fd->hints->cb_nodes; i++)
325         {
326 	    fprintf(stderr, "[%d]=%d ", i, send_count_arr[i].count);
327         }
328         fprintf(stderr, "\n");
329         if (fd->is_agg) {
330 	    fprintf(stderr, "recv_count_arr:");
331 	    for (i = 0; i < nprocs; i++)
332 	    {
333 	        fprintf(stderr, "[%d]=%d ", i, recv_count_arr[i].count);
334 	    }
335 	    fprintf(stderr, "\n");
336         }
337     }
338 #endif
339 
340     if (fd->hints->cb_alltoall == ADIOI_HINT_DISABLE) {
341         for (i=0; i < fd->hints->cb_nodes; i++)
342 	    if (send_count_arr[i].count > 0)
343 	        send_req_arr_sz++;
344     }
345     /* Figure out how many counts to send/recv */
346     for (i = 0; i < nprocs; i++)
347     {
348         if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
349 	    if (send_count_arr[i].count > 0)
350 	        send_req_arr_sz++;
351 	}
352 	/* Only aggregators should recv*/
353 	if (fd->is_agg) {
354 	    if (recv_count_arr[i].count > 0)
355 	    {
356 		if ((client_file_view_state_arr[i].flat_type_p =
357 		     (ADIOI_Flatlist_node *) ADIOI_Malloc(
358 			 sizeof(ADIOI_Flatlist_node))) == NULL)
359 		{
360 		    fprintf(stderr, "ADIOI_Exchange_file_views: malloc "
361 			    "flat_type_p failed\n");
362 		}
363 		client_file_view_state_arr[i].flat_type_p->count =
364 		    recv_count_arr[i].count;
365 		client_file_view_state_arr[i].flat_type_p->indices =
366 		    (ADIO_Offset *) ADIOI_Calloc(recv_count_arr[i].count,
367 						 sizeof(ADIO_Offset));
368 		client_file_view_state_arr[i].flat_type_p->blocklens =
369 		    (ADIO_Offset *) ADIOI_Calloc(recv_count_arr[i].count,
370 				    sizeof(ADIO_Offset));
371 
372 		/* Copy the extra data out of the stuff we Alltoall'd */
373 		memcpy (&client_file_view_state_arr[i].fp_ind,
374 			&recv_count_arr[i].fp_ind,
375 			6*sizeof(ADIO_Offset));
376 
377 		recv_req_arr_sz++;
378 	    }
379 	}
380     }
381 
382     /* Since ADIOI_Calloc may do other things we add the +1
383      * to avoid a 0-size malloc */
384     send_req_arr = (MPI_Request *) ADIOI_Calloc(2*(send_req_arr_sz)+1,
385 						sizeof(MPI_Request));
386 
387     j = 0;
388     if (recv_req_arr_sz > 0) {
389 	assert (fd->is_agg);
390 	recv_req_arr = (MPI_Request *) ADIOI_Calloc(2*(recv_req_arr_sz),
391 						    sizeof(MPI_Request));
392     	for (i = 0; i < nprocs; i++) {
393 	    if (recv_count_arr[i].count > 0) {
394 		MPI_Irecv(client_file_view_state_arr[i].flat_type_p->indices,
395 			  recv_count_arr[i].count, ADIO_OFFSET, i,
396 			  INDICES, fd->comm, &recv_req_arr[j]);
397 		j++;
398 		MPI_Irecv(client_file_view_state_arr[i].flat_type_p->blocklens,
399 			  recv_count_arr[i].count, ADIO_OFFSET, i,
400 			  BLOCK_LENS, fd->comm, &recv_req_arr[j]);
401 		j++;
402 	    }
403 	}
404     }
405 
406     if (fd->hints->cb_alltoall != ADIOI_HINT_DISABLE) {
407         j = 0;
408         for (i = 0; i < nprocs; i++) {
409 	    if (send_count_arr[i].count > 0) {
410 	        MPI_Isend(flat_file_p->indices,
411 		      send_count_arr[i].count, ADIO_OFFSET, i,
412                       INDICES, fd->comm, &send_req_arr[j]);
413 	        j++;
414 	        MPI_Isend(flat_file_p->blocklens,
415 		      send_count_arr[i].count, ADIO_OFFSET, i,
416                       BLOCK_LENS, fd->comm, &send_req_arr[j]);
417 	        j++;
418 	    }
419         }
420     } else {
421         j = 0;
422         for (i = 0; i < fd->hints->cb_nodes; i++) {
423 	    if (send_count_arr[i].count > 0) {
424 	        MPI_Isend(flat_file_p->indices,
425 		      send_count_arr[i].count, ADIO_OFFSET,
426 		      fd->hints->ranklist[i], INDICES, fd->comm,
427 		      &send_req_arr[j]);
428 	        j++;
429 	        MPI_Isend(flat_file_p->blocklens,
430 		      send_count_arr[i].count, ADIO_OFFSET,
431 		      fd->hints->ranklist[i], BLOCK_LENS, fd->comm,
432 		      &send_req_arr[j]);
433 	        j++;
434 	    }
435         }
436     }
437 
438     /* Since ADIOI_Malloc may do other things we add the +1
439      * to avoid a 0-size malloc */
440     statuses = (MPI_Status *)
441 	ADIOI_Malloc(1 + 2 * ADIOI_MAX(send_req_arr_sz,recv_req_arr_sz)
442 		     * sizeof(MPI_Status));
443 
444     if (send_req_arr_sz > 0) {
445 	MPI_Waitall(2 * send_req_arr_sz, send_req_arr, statuses);
446 	ADIOI_Free(send_count_arr);
447 	ADIOI_Free(send_req_arr);
448     }
449     if (recv_req_arr_sz > 0) {
450 	MPI_Waitall(2 * recv_req_arr_sz, recv_req_arr, statuses);
451 	ADIOI_Free(recv_count_arr);
452 	ADIOI_Free(recv_req_arr);
453     }
454     ADIOI_Free(statuses);
455 
456     if (fd->is_agg == 1)
457     {
458 	ADIOI_init_view_state(file_ptr_type,
459 			nprocs,
460 			client_file_view_state_arr,
461 			TEMP_OFF);
462 	ADIOI_init_view_state(file_ptr_type,
463 			nprocs,
464 			client_file_view_state_arr,
465 			REAL_OFF);
466     }
467 
468 #ifdef DEBUG
469     if (fd->is_agg == 1)
470     {
471 	ADIOI_Flatlist_node *fr_node_p = ADIOI_Flatlist;
472 	for (i = 0; i < nprocs; i++)
473 	{
474 	    fprintf(stderr, "client_file_view_state_arr[%d]=(fp_ind=%Ld,"
475 		    "disp=%Ld,byte_off=%Ld,sz=%Ld,ext=%Ld\n", i,
476 		    client_file_view_state_arr[i].fp_ind,
477 		    client_file_view_state_arr[i].disp,
478 		    client_file_view_state_arr[i].byte_off,
479 		    client_file_view_state_arr[i].sz,
480 		    client_file_view_state_arr[i].ext);
481 	}
482 
483 	while (fr_node_p->type !=
484 	       fd->file_realm_types[fd->my_cb_nodes_index])
485 	    fr_node_p = fr_node_p->next;
486 	assert(fr_node_p != NULL);
487 
488 	fprintf(stderr, "my file realm (idx=%d,st_off=%Ld) ",
489 		fd->my_cb_nodes_index,
490 		fd->file_realm_st_offs[fd->my_cb_nodes_index]);
491 	ADIOI_Print_flatlist_node(fr_node_p);
492     }
493 #endif
494 
495 #ifdef DEBUG2
496     if (fd->is_agg == 1)
497     {
498 	for (i = 0; i < nprocs; i++)
499 	{
500 	    fprintf(stderr, "client_file_view_state_arr[%d]: ", i);
501 	    ADIOI_Print_flatlist_node(
502 		client_file_view_state_arr[i].flat_type_p);
503 	}
504     }
505 #endif
506 #ifdef AGGREGATION_PROFILE
507     MPE_Log_event (5015, 0, NULL);
508 #endif
509 }
510