1 /* -*- Mode: C; c-basic-offset:4 ; -*- */
2 /*
3 * Copyright (c) 2004-2019 The University of Tennessee and The University
4 * of Tennessee Research Foundation. All rights
5 * reserved.
6 * $COPYRIGHT$
7 *
8 * Additional copyrights may follow
9 *
10 * $HEADER$
11 */
12
13 #include "mpi.h"
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <unistd.h>
18 #include <math.h>
19
20 #if 0 && OPEN_MPI
21 extern void ompi_datatype_dump( MPI_Datatype ddt );
22 #define MPI_DDT_DUMP(ddt) ompi_datatype_dump( (ddt) )
23 #else
24 #define MPI_DDT_DUMP(ddt)
25 #endif /* OPEN_MPI */
26
27 static MPI_Datatype
create_merged_contig_with_gaps(int count)28 create_merged_contig_with_gaps(int count) /* count of the basic datatype */
29 {
30 int array_of_blocklengths[] = {1, 1, 1};
31 MPI_Aint array_of_displacements[] = {0, 8, 16};
32 MPI_Datatype array_of_types[] = {MPI_DOUBLE, MPI_LONG, MPI_CHAR};
33 MPI_Datatype type;
34
35 MPI_Type_create_struct(3, array_of_blocklengths,
36 array_of_displacements, array_of_types,
37 &type);
38 if( 1 < count ) {
39 MPI_Datatype temp = type;
40 MPI_Type_contiguous(count, temp, &type);
41 }
42 MPI_Type_commit(&type);
43 MPI_DDT_DUMP( type );
44 return type;
45 }
46
47 /* Create a non-contiguous resized datatype */
48 struct structure {
49 double not_transfered;
50 double transfered_1;
51 double transfered_2;
52 };
53
54 static MPI_Datatype
create_struct_constant_gap_resized_ddt(int number,int contig_size,int gap_size)55 create_struct_constant_gap_resized_ddt( int number, /* IGNORED: number of repetitions */
56 int contig_size, /* IGNORED: number of elements in a contiguous chunk */
57 int gap_size ) /* IGNORED: number of elements in a gap */
58 {
59 struct structure data[1];
60 MPI_Datatype struct_type, temp_type;
61 MPI_Datatype types[2] = {MPI_DOUBLE, MPI_DOUBLE};
62 int blocklens[2] = {1, 1};
63 MPI_Aint disps[3];
64
65 MPI_Get_address(&data[0].transfered_1, &disps[0]);
66 MPI_Get_address(&data[0].transfered_2, &disps[1]);
67 MPI_Get_address(&data[0], &disps[2]);
68 disps[1] -= disps[2]; /* 8 */
69 disps[0] -= disps[2]; /* 16 */
70
71 MPI_Type_create_struct(2, blocklens, disps, types, &temp_type);
72 MPI_Type_create_resized(temp_type, 0, sizeof(data[0]), &struct_type);
73 MPI_Type_commit(&struct_type);
74 MPI_Type_free(&temp_type);
75 MPI_DDT_DUMP( struct_type );
76
77 return struct_type;
78 }
79
80 /* Create a datatype similar to the one use by HPL */
81 static MPI_Datatype
create_indexed_constant_gap_ddt(int number,int contig_size,int gap_size)82 create_indexed_constant_gap_ddt( int number, /* number of repetitions */
83 int contig_size, /* number of elements in a contiguous chunk */
84 int gap_size ) /* number of elements in a gap */
85 {
86 MPI_Datatype dt, *types;
87 int i, *bLength;
88 MPI_Aint* displ;
89
90 types = (MPI_Datatype*)malloc( sizeof(MPI_Datatype) * number );
91 bLength = (int*)malloc( sizeof(int) * number );
92 displ = (MPI_Aint*)malloc( sizeof(MPI_Aint) * number );
93
94 types[0] = MPI_DOUBLE;
95 bLength[0] = contig_size;
96 displ[0] = 0;
97 for( i = 1; i < number; i++ ) {
98 types[i] = MPI_DOUBLE;
99 bLength[i] = contig_size;
100 displ[i] = displ[i-1] + sizeof(double) * (contig_size + gap_size);
101 }
102 MPI_Type_create_struct( number, bLength, displ, types, &dt );
103 MPI_DDT_DUMP( dt );
104 free(types);
105 free(bLength);
106 free(displ);
107 MPI_Type_commit( &dt );
108 return dt;
109 }
110
111 static MPI_Datatype
create_optimized_indexed_constant_gap_ddt(int number,int contig_size,int gap_size)112 create_optimized_indexed_constant_gap_ddt( int number, /* number of repetitions */
113 int contig_size, /* number of elements in a contiguous chunk */
114 int gap_size ) /* number of elements in a gap */
115 {
116 MPI_Datatype dt;
117
118 MPI_Type_vector( number, contig_size, (contig_size + gap_size), MPI_DOUBLE, &dt );
119 MPI_Type_commit( &dt );
120 MPI_DDT_DUMP( dt );
121 return dt;
122 }
123
124 typedef struct {
125 int i[2];
126 float f;
127 } internal_struct;
128 typedef struct {
129 int v1;
130 int gap1;
131 internal_struct is[3];
132 } ddt_gap;
133
134 static MPI_Datatype
create_indexed_gap_ddt(void)135 create_indexed_gap_ddt( void )
136 {
137 ddt_gap dt[2];
138 MPI_Datatype dt1, dt2, dt3;
139 int bLength[2] = { 2, 1 };
140 MPI_Datatype types[2] = { MPI_INT, MPI_FLOAT };
141 MPI_Aint displ[2];
142
143 MPI_Get_address( &(dt[0].is[0].i[0]), &(displ[0]) );
144 MPI_Get_address( &(dt[0].is[0].f), &(displ[1]) );
145 displ[1] -= displ[0];
146 displ[0] -= displ[0];
147 MPI_Type_create_struct( 2, bLength, displ, types, &dt1 );
148 /*MPI_DDT_DUMP( dt1 );*/
149 MPI_Type_contiguous( 3, dt1, &dt2 );
150 /*MPI_DDT_DUMP( dt2 );*/
151 bLength[0] = 1;
152 bLength[1] = 1;
153 MPI_Get_address( &(dt[0].v1), &(displ[0]) );
154 MPI_Get_address( &(dt[0].is[0]), &(displ[1]) );
155 displ[1] -= displ[0];
156 displ[0] -= displ[0];
157 types[0] = MPI_INT;
158 types[1] = dt2;
159 MPI_Type_create_struct( 2, bLength, displ, types, &dt3 );
160 /*MPI_DDT_DUMP( dt3 );*/
161 MPI_Type_free( &dt1 );
162 MPI_Type_free( &dt2 );
163 MPI_Type_contiguous( 10, dt3, &dt1 );
164 MPI_DDT_DUMP( dt1 );
165 MPI_Type_free( &dt3 );
166 MPI_Type_commit( &dt1 );
167 return dt1;
168 }
169
170 static MPI_Datatype
create_indexed_gap_optimized_ddt(void)171 create_indexed_gap_optimized_ddt( void )
172 {
173 MPI_Datatype dt1, dt2, dt3;
174 int bLength[3];
175 MPI_Datatype types[3];
176 MPI_Aint displ[3];
177
178 MPI_Type_contiguous( 40, MPI_BYTE, &dt1 );
179 MPI_Type_create_resized( dt1, 0, 44, &dt2 );
180
181 bLength[0] = 4;
182 bLength[1] = 9;
183 bLength[2] = 36;
184
185 types[0] = MPI_BYTE;
186 types[1] = dt2;
187 types[2] = MPI_BYTE;
188
189 displ[0] = 0;
190 displ[1] = 8;
191 displ[2] = 44 * 9 + 8;
192
193 MPI_Type_create_struct( 3, bLength, displ, types, &dt3 );
194
195 MPI_Type_free( &dt1 );
196 MPI_Type_free( &dt2 );
197 MPI_DDT_DUMP( dt3 );
198 MPI_Type_commit( &dt3 );
199 return dt3;
200 }
201
202
203 /********************************************************************
204 *******************************************************************/
205
206 #define DO_CONTIG 0x00000001
207 #define DO_CONSTANT_GAP 0x00000002
208 #define DO_INDEXED_GAP 0x00000004
209 #define DO_OPTIMIZED_INDEXED_GAP 0x00000008
210 #define DO_STRUCT_CONSTANT_GAP_RESIZED 0x00000010
211 #define DO_STRUCT_MERGED_WITH_GAP_RESIZED 0x00000020
212
213 #define DO_PACK 0x01000000
214 #define DO_UNPACK 0x02000000
215 #define DO_ISEND_RECV 0x04000000
216 #define DO_ISEND_IRECV 0x08000000
217 #define DO_IRECV_SEND 0x10000000
218 #define DO_IRECV_ISEND 0x20000000
219
220 #define MIN_LENGTH 1024
221 #define MAX_LENGTH (1024*1024)
222
223 static int cycles = 100;
224 static int trials = 20;
225 static int warmups = 2;
226
print_result(int length,int trials,double * timers)227 static void print_result( int length, int trials, double* timers )
228 {
229 double bandwidth, clock_prec, temp;
230 double min_time, max_time, average, std_dev = 0.0;
231 double ordered[trials];
232 int t, pos, quartile_start, quartile_end;
233
234 for( t = 0; t < trials; ordered[t] = timers[t], t++ );
235 for( t = 0; t < trials-1; t++ ) {
236 temp = ordered[t];
237 pos = t;
238 for( int i = t+1; i < trials; i++ ) {
239 if( temp > ordered[i] ) {
240 temp = ordered[i];
241 pos = i;
242 }
243 }
244 if( pos != t ) {
245 temp = ordered[t];
246 ordered[t] = ordered[pos];
247 ordered[pos] = temp;
248 }
249 }
250 quartile_start = trials - (3 * trials) / 4;
251 quartile_end = trials - (1 * trials) / 4;
252 clock_prec = MPI_Wtick();
253 min_time = ordered[quartile_start];
254 max_time = ordered[quartile_start];
255 average = ordered[quartile_start];
256 for( t = quartile_start + 1; t < quartile_end; t++ ) {
257 if( min_time > ordered[t] ) min_time = ordered[t];
258 if( max_time < ordered[t] ) max_time = ordered[t];
259 average += ordered[t];
260 }
261 average /= (quartile_end - quartile_start);
262 for( t = quartile_start; t < quartile_end; t++ ) {
263 std_dev += (ordered[t] - average) * (ordered[t] - average);
264 }
265 std_dev = sqrt( std_dev/(quartile_end - quartile_start) );
266
267 bandwidth = (length * clock_prec) / (1024.0 * 1024.0) / (average * clock_prec);
268 printf( "%8d\t%15g\t%10.4f MB/s [min %10g max %10g std %2.2f%%]\n", length, average, bandwidth,
269 min_time, max_time, (100.0 * std_dev) / average );
270 }
271
pack(int cycles,MPI_Datatype sdt,int scount,void * sbuf,void * packed_buf)272 static int pack( int cycles,
273 MPI_Datatype sdt, int scount, void* sbuf,
274 void* packed_buf )
275 {
276 int position, myself, c, t, outsize;
277 double timers[trials];
278
279 MPI_Type_size( sdt, &outsize );
280 outsize *= scount;
281
282 MPI_Comm_rank( MPI_COMM_WORLD, &myself );
283
284 for( t = 0; t < warmups; t++ ) {
285 for( c = 0; c < cycles; c++ ) {
286 position = 0;
287 MPI_Pack(sbuf, scount, sdt, packed_buf, outsize, &position, MPI_COMM_WORLD);
288 }
289 }
290
291 for( t = 0; t < trials; t++ ) {
292 timers[t] = MPI_Wtime();
293 for( c = 0; c < cycles; c++ ) {
294 position = 0;
295 MPI_Pack(sbuf, scount, sdt, packed_buf, outsize, &position, MPI_COMM_WORLD);
296 }
297 timers[t] = (MPI_Wtime() - timers[t]) / cycles;
298 }
299 print_result( outsize, trials, timers );
300 return 0;
301 }
302
unpack(int cycles,void * packed_buf,MPI_Datatype rdt,int rcount,void * rbuf)303 static int unpack( int cycles,
304 void* packed_buf,
305 MPI_Datatype rdt, int rcount, void* rbuf )
306 {
307 int position, myself, c, t, insize;
308 double timers[trials];
309
310 MPI_Type_size( rdt, &insize );
311 insize *= rcount;
312
313 MPI_Comm_rank( MPI_COMM_WORLD, &myself );
314
315 for( t = 0; t < warmups; t++ ) {
316 for( c = 0; c < cycles; c++ ) {
317 position = 0;
318 MPI_Unpack(packed_buf, insize, &position, rbuf, rcount, rdt, MPI_COMM_WORLD);
319 }
320 }
321
322 for( t = 0; t < trials; t++ ) {
323 timers[t] = MPI_Wtime();
324 for( c = 0; c < cycles; c++ ) {
325 position = 0;
326 MPI_Unpack(packed_buf, insize, &position, rbuf, rcount, rdt, MPI_COMM_WORLD);
327 }
328 timers[t] = (MPI_Wtime() - timers[t]) / cycles;
329 }
330 print_result( insize, trials, timers );
331 return 0;
332 }
333
isend_recv(int cycles,MPI_Datatype sdt,int scount,void * sbuf,MPI_Datatype rdt,int rcount,void * rbuf)334 static int isend_recv( int cycles,
335 MPI_Datatype sdt, int scount, void* sbuf,
336 MPI_Datatype rdt, int rcount, void* rbuf )
337 {
338 int myself, tag = 0, c, t, slength, rlength;
339 MPI_Status status;
340 MPI_Request req;
341 double timers[trials];
342
343 MPI_Type_size( sdt, &slength );
344 slength *= scount;
345 MPI_Type_size( rdt, &rlength );
346 rlength *= rcount;
347
348 MPI_Comm_rank( MPI_COMM_WORLD, &myself );
349
350 for( t = 0; t < trials; t++ ) {
351 timers[t] = MPI_Wtime();
352 for( c = 0; c < cycles; c++ ) {
353 MPI_Isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &req );
354 MPI_Recv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &status );
355 MPI_Wait( &req, &status );
356 }
357 timers[t] = (MPI_Wtime() - timers[t]) / cycles;
358 }
359 print_result( rlength, trials, timers );
360 return 0;
361 }
362
irecv_send(int cycles,MPI_Datatype sdt,int scount,void * sbuf,MPI_Datatype rdt,int rcount,void * rbuf)363 static int irecv_send( int cycles,
364 MPI_Datatype sdt, int scount, void* sbuf,
365 MPI_Datatype rdt, int rcount, void* rbuf )
366 {
367 int myself, tag = 0, c, t, slength, rlength;
368 MPI_Request req;
369 MPI_Status status;
370 double timers[trials];
371
372 MPI_Type_size( sdt, &slength );
373 slength *= scount;
374 MPI_Type_size( rdt, &rlength );
375 rlength *= rcount;
376
377 MPI_Comm_rank( MPI_COMM_WORLD, &myself );
378
379 for( t = 0; t < trials; t++ ) {
380 timers[t] = MPI_Wtime();
381 for( c = 0; c < cycles; c++ ) {
382 MPI_Irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &req );
383 MPI_Send( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD );
384 MPI_Wait( &req, &status );
385 }
386 timers[t] = (MPI_Wtime() - timers[t]) / cycles;
387 }
388 print_result( rlength, trials, timers );
389 return 0;
390 }
391
isend_irecv_wait(int cycles,MPI_Datatype sdt,int scount,void * sbuf,MPI_Datatype rdt,int rcount,void * rbuf)392 static int isend_irecv_wait( int cycles,
393 MPI_Datatype sdt, int scount, void* sbuf,
394 MPI_Datatype rdt, int rcount, void* rbuf )
395 {
396 int myself, tag = 0, c, t, slength, rlength;
397 MPI_Request requests[2];
398 MPI_Status statuses[2];
399 double timers[trials];
400
401 MPI_Type_size( sdt, &slength );
402 slength *= scount;
403 MPI_Type_size( rdt, &rlength );
404 rlength *= rcount;
405
406 MPI_Comm_rank( MPI_COMM_WORLD, &myself );
407
408 for( t = 0; t < trials; t++ ) {
409 timers[t] = MPI_Wtime();
410 for( c = 0; c < cycles; c++ ) {
411 MPI_Isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &requests[0] );
412 MPI_Irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &requests[1] );
413 MPI_Waitall( 2, requests, statuses );
414 }
415 timers[t] = (MPI_Wtime() - timers[t]) / cycles;
416 }
417 print_result( rlength, trials, timers );
418 return 0;
419 }
420
irecv_isend_wait(int cycles,MPI_Datatype sdt,int scount,void * sbuf,MPI_Datatype rdt,int rcount,void * rbuf)421 static int irecv_isend_wait( int cycles,
422 MPI_Datatype sdt, int scount, void* sbuf,
423 MPI_Datatype rdt, int rcount, void* rbuf )
424 {
425 int myself, tag = 0, c, t, slength, rlength;
426 MPI_Request requests[2];
427 MPI_Status statuses[2];
428 double timers[trials];
429
430 MPI_Type_size( sdt, &slength );
431 slength *= scount;
432 MPI_Type_size( rdt, &rlength );
433 rlength *= rcount;
434
435 MPI_Comm_rank( MPI_COMM_WORLD, &myself );
436
437 for( t = 0; t < trials; t++ ) {
438 timers[t] = MPI_Wtime();
439 for( c = 0; c < cycles; c++ ) {
440 MPI_Irecv( rbuf, rcount, rdt, myself, tag, MPI_COMM_WORLD, &requests[0] );
441 MPI_Isend( sbuf, scount, sdt, myself, tag, MPI_COMM_WORLD, &requests[1] );
442 MPI_Waitall( 2, requests, statuses );
443 }
444 timers[t] = (MPI_Wtime() - timers[t]) / cycles;
445 }
446 print_result( rlength, trials, timers);
447 return 0;
448 }
449
do_test_for_ddt(int doop,MPI_Datatype sddt,MPI_Datatype rddt,int length)450 static int do_test_for_ddt( int doop, MPI_Datatype sddt, MPI_Datatype rddt, int length )
451 {
452 MPI_Aint lb, extent;
453 char *sbuf, *rbuf;
454 int i;
455
456 MPI_Type_get_extent( sddt, &lb, &extent );
457 sbuf = (char*)malloc( length );
458 rbuf = (char*)malloc( length );
459 if( doop & DO_PACK ) {
460 printf("# Pack (max length %d)\n", length);
461 for( i = 1; i <= (length/extent); i *= 2 ) {
462 pack( cycles, sddt, i, sbuf, rbuf );
463 }
464 }
465
466 if( doop & DO_UNPACK ) {
467 printf("# Unpack (length %d)\n", length);
468 for( i = 1; i <= (length/extent); i *= 2 ) {
469 unpack( cycles, sbuf, rddt, i, rbuf );
470 }
471 }
472
473 if( doop & DO_ISEND_RECV ) {
474 printf( "# Isend recv (length %d)\n", length );
475 for( i = 1; i <= (length/extent); i *= 2 ) {
476 isend_recv( cycles, sddt, i, sbuf, rddt, i, rbuf );
477 }
478 }
479
480 if( doop & DO_ISEND_IRECV ) {
481 printf( "# Isend Irecv Wait (length %d)\n", length );
482 for( i = 1; i <= (length/extent); i *= 2 ) {
483 isend_irecv_wait( cycles, sddt, i, sbuf, rddt, i, rbuf );
484 }
485 }
486
487 if( doop & DO_IRECV_SEND ) {
488 printf( "# Irecv send (length %d)\n", length );
489 for( i = 1; i <= (length/extent); i *= 2 ) {
490 irecv_send( cycles, sddt, i, sbuf, rddt, i, rbuf );
491 }
492 }
493
494 if( doop & DO_IRECV_SEND ) {
495 printf( "# Irecv Isend Wait (length %d)\n", length );
496 for( i = 1; i <= (length/extent); i *= 2 ) {
497 irecv_isend_wait( cycles, sddt, i, sbuf, rddt, i, rbuf );
498 }
499 }
500 free( sbuf );
501 free( rbuf );
502 return 0;
503 }
504
main(int argc,char * argv[])505 int main( int argc, char* argv[] )
506 {
507 int run_tests = DO_STRUCT_MERGED_WITH_GAP_RESIZED; /* do all datatype tests by default */
508 int rank, size;
509 MPI_Datatype ddt;
510
511 run_tests |= DO_PACK | DO_UNPACK;
512
513 MPI_Init (&argc, &argv);
514
515 MPI_Comm_rank (MPI_COMM_WORLD, &rank);
516 MPI_Comm_size (MPI_COMM_WORLD, &size);
517
518 if( rank != 0 ) {
519 MPI_Finalize();
520 exit(0);
521 }
522
523 if( run_tests & DO_CONTIG ) {
524 printf( "\ncontiguous datatype\n\n" );
525 do_test_for_ddt( run_tests, MPI_INT, MPI_INT, MAX_LENGTH );
526 }
527
528 if( run_tests & DO_INDEXED_GAP ) {
529 printf( "\nindexed gap\n\n" );
530 ddt = create_indexed_gap_ddt();
531 MPI_DDT_DUMP( ddt );
532 do_test_for_ddt( run_tests, ddt, ddt, MAX_LENGTH );
533 MPI_Type_free( &ddt );
534 }
535
536 if( run_tests & DO_OPTIMIZED_INDEXED_GAP ) {
537 printf( "\noptimized indexed gap\n\n" );
538 ddt = create_indexed_gap_optimized_ddt();
539 MPI_DDT_DUMP( ddt );
540 do_test_for_ddt( run_tests, ddt, ddt, MAX_LENGTH );
541 MPI_Type_free( &ddt );
542 }
543
544 if( run_tests & DO_CONSTANT_GAP ) {
545 printf( "\nconstant indexed gap\n\n" );
546 ddt = create_indexed_constant_gap_ddt( 80, 100, 1 );
547 MPI_DDT_DUMP( ddt );
548 do_test_for_ddt( run_tests, ddt, ddt, MAX_LENGTH );
549 MPI_Type_free( &ddt );
550 }
551
552 if( run_tests & DO_CONSTANT_GAP ) {
553 printf( "\noptimized constant indexed gap\n\n" );
554 ddt = create_optimized_indexed_constant_gap_ddt( 80, 100, 1 );
555 MPI_DDT_DUMP( ddt );
556 do_test_for_ddt( run_tests, ddt, ddt, MAX_LENGTH );
557 MPI_Type_free( &ddt );
558 }
559
560 if( run_tests & DO_STRUCT_CONSTANT_GAP_RESIZED ) {
561 printf( "\nstruct constant gap resized\n\n" );
562 ddt = create_struct_constant_gap_resized_ddt( 0 /* unused */, 0 /* unused */, 0 /* unused */ );
563 MPI_DDT_DUMP( ddt );
564 do_test_for_ddt( run_tests, ddt, ddt, MAX_LENGTH );
565 MPI_Type_free( &ddt );
566 }
567
568 if( run_tests & DO_STRUCT_MERGED_WITH_GAP_RESIZED ) {
569 printf( "\nstruct constant gap resized\n\n" );
570 ddt = create_merged_contig_with_gaps( 1 );
571 MPI_DDT_DUMP( ddt );
572 do_test_for_ddt( run_tests, ddt, ddt, MAX_LENGTH );
573 MPI_Type_free( &ddt );
574 }
575
576 MPI_Finalize ();
577 exit(0);
578 }
579
580