1 #if HAVE_CONFIG_H
2 # include "config.h"
3 #endif
4
5 #if HAVE_STDIO_H
6 # include <stdio.h>
7 #endif
8 #if HAVE_STDLIB_H
9 # include <stdlib.h>
10 #endif
11 #if HAVE_STDARG_H
12 # include <stdarg.h>
13 #endif
14 #if HAVE_UNISTD_H
15 # include <unistd.h>
16 #elif HAVE_WINDOWS_H
17 # include <windows.h>
18 #endif
19 #if HAVE_MATH_H
20 # include <math.h>
21 #endif
22 #if HAVE_TIME_H
23 # include <time.h>
24 #endif
25 #if HAVE_STRING_H
26 # include <string.h>
27 #endif
28 #if HAVE_ASSERT_H
29 # include <assert.h>
30 #endif
31
32 #include <mpi.h>
33
34 #include "armci.h"
35 #include "message.h"
36
/* Explicit prototypes for the C99 math routines used in main(); presumably
 * kept for toolchains whose <math.h> does not declare them. */
extern double exp2(double);
extern double round(double);
extern double log2(double);

/* NDEBUG compiles out the body of log_debug().  It is defined after
 * <assert.h> was included above, so it does not switch off assert(). */
#define NDEBUG
/* Define LOG2FILE to make start_logging() write the report to a .dat file
 * instead of stderr. */
/*#define LOG2FILE*/

typedef int t_elem; /* type of an array element */
#define SIZE_ELEM sizeof(t_elem)

/* extra bytes added to the row length to obtain the stride of 2D arrays */
#define STRIDE_OFF (SIZE_ELEM * 4 - 1)

/* range (bytes) and number of the message sizes that are benchmarked */
#define MIN_MSG_SIZE 8
#define MAX_MSG_SIZE (1024 * 1024)
#define MSG_COUNT 20

/* last status stored by ARMCI_ASSERT */
int armci_error_code;

/* Evaluates error_code once; on a non-zero status reports it, calls pause(),
 * then cleans up ARMCI and aborts the MPI job.  Wrapped in do/while(0) so the
 * macro behaves as a single statement (safe in unbraced if/else). */
#define ARMCI_ASSERT(error_code) \
    do { \
        if ((armci_error_code = (error_code))) { \
            fprintf(stderr, "ARMCI error %d\n", armci_error_code); \
            pause(); \
            ARMCI_Cleanup(); \
            MPI_Abort(MPI_COMM_WORLD, armci_error_code); \
        } \
    } while (0)

/* clamps a negative time difference to zero; t must be an lvalue */
#define FIX_TIME(t) \
    do { \
        if ((t) < 0.0) (t) = 0.0; \
    } while (0)

/* number of processes, rank of this process, and the partner this process
 * sends to (-1 when this process does not initiate communication) */
int size, rank, second;

/* calibration table filled by time_iterations(): iterations[i] loop
 * iterations take about iterations_times[i] seconds */
#define ITERS 18
#define ITER_STEPS 20 /* number of work amounts tried per benchmark */
double iterations_times[ITERS];
int iterations[ITERS];

/* random pairs: p_srcs[i] communicates with p_dsts[i] */
int *p_srcs, *p_dsts;

enum {CONT_PUT, CONT_GET,
      STRIDED_PUT, STRIDED_GET, STRIDED_ACC,
      VECTOR_PUT, VECTOR_GET, VECTOR_ACC};
/* only the operations up to STRIDED_ACC are counted; VECTOR_* are excluded */
#define OPS_COUNT (STRIDED_ACC + 1)
#define NON_CONT(op) ((op) > CONT_GET)

/* indices into the statistics array returned by benchmark() */
enum {NOWORK, TOTAL, OVERLAP};
#define STATS_COUNT (OVERLAP + 1)
77
78 /* prints formatted numbered message with processor's rank */
log_debug(const char * fmt,...)79 int log_debug(const char *fmt, ...)
80 {
81 int r = 0;
82 #ifndef NDEBUG
83 static int log_counter = 1;
84 va_list ap;
85 va_start(ap, fmt);
86
87 printf("%03d@%1d: ", log_counter++, rank);
88 r = vprintf(fmt, ap);
89
90 va_end(ap);
91 #endif
92 return r;
93 }
94
/* stream that log_printf() writes to; NULL until start_logging() is called */
FILE *log_file = NULL;

/* Selects the destination of the report.
 *
 * Without LOG2FILE the report goes to stderr and fname is ignored.
 * With LOG2FILE a file named after the executable is opened for writing:
 *   - a trailing one-character extension (".x") is stripped from fname;
 *   - for an absolute path the last directory component is replaced by
 *     "data" (<dir>/bin/prog.x -> <dir>/data/prog.dat);
 *   - otherwise the file is ../data/<fname>.dat.
 * The process aborts if the name does not fit the buffers or the file
 * cannot be opened. */
void start_logging(const char *fname)
{
#ifdef LOG2FILE
    char exe_name[255];
    char log_path[255];
    size_t len;
    int i, n;
    int name_pos = -1; /* offset of the bare executable name in exe_name */

    n = snprintf(exe_name, sizeof exe_name, "%s", fname);
    if (n < 0 || (size_t)n >= sizeof exe_name) {
        fprintf(stderr, "executable name is too long for a log path\n");
        abort();
    }

    len = strlen(exe_name);
    if (len >= 2 && exe_name[len - 2] == '.') /* remove .x */
        exe_name[len - 2] = 0;

    if (exe_name[0] == '/') { /* full path given */
        /* scan backwards: the last '/' marks the start of the name, the
         * one before it is where the directory part gets cut off */
        for (i = ((int)strlen(exe_name)) - 1; i >= 0; i--)
            if (exe_name[i] == '/') {
                if (name_pos == -1) name_pos = i + 1;
                else {
                    exe_name[i] = 0;
                    break;
                }
            }
        log_debug("exe: path=%s, name=%s\n", exe_name, exe_name + name_pos);
        n = snprintf(log_path, sizeof log_path, "%s/data/%s.dat",
                     exe_name, exe_name + name_pos);
    } else { /* only executable name */
        n = snprintf(log_path, sizeof log_path, "../data/%s.dat", exe_name);
    }
    if (n < 0 || (size_t)n >= sizeof log_path) {
        fprintf(stderr, "log file path is too long\n");
        abort();
    }
    log_debug("log: %s\n", log_path);

    log_file = fopen(log_path, "w");

    if (!log_file) {
        perror("cannot open log file");
        abort();
    }
#else
    (void)fname;
    log_file = stderr;
#endif
}
135
finish_logging()136 void finish_logging()
137 {
138 fclose(log_file);
139 }
140
141 /* prints formatted message to ../data/<prog>.dat */
log_printf(const char * fmt,...)142 int log_printf(const char *fmt, ...)
143 {
144 va_list ap;
145 int r;
146
147 va_start(ap, fmt);
148
149 if (log_file)
150 r = vfprintf(log_file, fmt, ap);
151 else {
152 fprintf(stderr, "warning: logging is not enabled for this process\n");
153 r = vfprintf(stderr, fmt, ap);
154 }
155
156 va_end(ap);
157 return r;
158 }
159
160
161 /* computes approximate time of n iterations for variable n */
time_iterations()162 void time_iterations()
163 {
164 double time_start, time_after_start, time_stop;
165 int i, j, k, l;
166
167 for (i = 0, j = 1; i < ITERS; i++, j *= 2) {
168 time_start = armci_timer();
169 time_after_start = armci_timer();
170
171 for (l = 0, k = rand(); l < j; l++) k *= rand();
172
173 time_stop = armci_timer();
174 iterations_times[i] = time_stop - time_after_start +
175 time_start - time_after_start;
176 FIX_TIME(iterations_times[i]);
177 iterations[i] = j;
178
179 log_debug("it takes %.8f sec to iterate %d times\n",
180 iterations_times[i], iterations[i]);
181 }
182 }
183
184
185 /* computes useful overlap time for contiguous/vector/strided arrays
186 * * op - operation
187 * * msg_size - size of a message/ 1st dimension (bytes)
188 * * size2 - not used for contiguous arrays
189 * * - size of 2nd dimension for strided arrays (bytes)
190 * * - # of vector segments for vectors
191 * * returns pointer to static array of stats (STATS_COUNT doubles)
192 * */
benchmark(int op,int msg_size,int size2)193 double * benchmark(int op, int msg_size, int size2)
194 {
195 static double stats[STATS_COUNT]; /* return statistics in static array */
196
197 void **array_ptrs;
198 int stride_dist, block_sizes[2], scale = 2;
199 int i=0, j=0, k=0, l=0, less=0, more=0;
200 double time_start=0, time_after_start=0, time_after_call=0,
201 time_after_work=0, time_after_wait=0;
202 double time2call_nw=0, time2wait_nw = 1.0, time_total_nw=0;
203 double time2call_fw, time2work_fw, time2wait_fw, time_total_fw;
204 armci_hdl_t handle;
205
206 array_ptrs = malloc(sizeof(void*)*size);
207
208 log_debug("barrier O\n");
209 armci_msg_barrier();
210 /* initialize: obtain remote address and generate random array */
211 switch (op) {
212 case CONT_PUT:
213 case CONT_GET:
214 ARMCI_ASSERT(ARMCI_Malloc(array_ptrs, msg_size));
215 for (i = 0; i < msg_size; i++)
216 ((char *)array_ptrs[rank])[i] = (char)(rand() >> 24);
217 break;
218
219 /* 2D strided array of ints */
220 case STRIDED_PUT:
221 case STRIDED_GET:
222 case STRIDED_ACC:
223 block_sizes[0] = msg_size;
224 block_sizes[1] = size2;
225 stride_dist = STRIDE_OFF + msg_size;
226 log_debug("strided: dim1 = %d (%d bytes), dim2 = %d, stride = %d\n",
227 msg_size / SIZE_ELEM, msg_size, size2, stride_dist);
228 ARMCI_ASSERT(ARMCI_Malloc(array_ptrs,
229 (size2 - 1) * stride_dist + msg_size));
230
231 for (i = 0; i < size2; i++)
232 for (j = 0; j < (msg_size / ((int)SIZE_ELEM)); j++) {
233 l = stride_dist * i + SIZE_ELEM * i;
234 *(int *)((char *)array_ptrs[rank] + l) = rand();
235 }
236 break;
237 }
238
239 /* warm up call */
240 log_debug("barrier A\n");
241 armci_msg_barrier();
242 if (second != -1) {
243 log_debug("testing message size %d bytes\n", msg_size);
244 switch (op) {
245 case CONT_PUT:
246 ARMCI_INIT_HANDLE(&handle);
247 time_start = armci_timer();
248 time_after_start = armci_timer();
249
250 ARMCI_ASSERT(ARMCI_NbPut(array_ptrs[rank],
251 array_ptrs[second], msg_size,
252 second, &handle));
253 time_after_call = armci_timer();
254
255 ARMCI_ASSERT(ARMCI_Wait(&handle));
256 time_after_wait = armci_timer();
257 break;
258
259 case CONT_GET:
260 ARMCI_INIT_HANDLE(&handle);
261 time_start = armci_timer();
262 time_after_start = armci_timer();
263
264 ARMCI_ASSERT(ARMCI_NbGet(array_ptrs[second],
265 array_ptrs[rank], msg_size,
266 second, &handle));
267 time_after_call = armci_timer();
268
269 ARMCI_ASSERT(ARMCI_Wait(&handle));
270 time_after_wait = armci_timer();
271 break;
272
273 case STRIDED_PUT:
274 ARMCI_INIT_HANDLE(&handle);
275 time_start = armci_timer();
276 time_after_start = armci_timer();
277
278 ARMCI_ASSERT(ARMCI_NbPutS(array_ptrs[rank], &stride_dist,
279 array_ptrs[second], &stride_dist,
280 block_sizes, 1, second, &handle));
281 time_after_call = armci_timer();
282
283 ARMCI_ASSERT(ARMCI_Wait(&handle));
284 time_after_wait = armci_timer();
285 break;
286
287 case STRIDED_GET:
288 ARMCI_INIT_HANDLE(&handle);
289 time_start = armci_timer();
290 time_after_start = armci_timer();
291
292 ARMCI_ASSERT(ARMCI_NbGetS(array_ptrs[second], &stride_dist,
293 array_ptrs[second], &stride_dist,
294 block_sizes, 1, second, &handle));
295 time_after_call = armci_timer();
296
297 ARMCI_ASSERT(ARMCI_Wait(&handle));
298 time_after_wait = armci_timer();
299 break;
300
301 case STRIDED_ACC:
302 ARMCI_INIT_HANDLE(&handle);
303 time_start = armci_timer();
304 time_after_start = armci_timer();
305
306 ARMCI_ASSERT(ARMCI_NbAccS(ARMCI_ACC_INT, &scale,
307 array_ptrs[rank], &stride_dist,
308 array_ptrs[second], &stride_dist,
309 block_sizes, 1, second, &handle));
310 time_after_call = armci_timer();
311
312 ARMCI_ASSERT(ARMCI_Wait(&handle));
313 time_after_wait = armci_timer();
314
315 break;
316 }
317
318 time2call_nw = time_after_call - time_after_start + time_start -
319 time_after_start;
320 time2wait_nw = time_after_wait - time_after_call + time_start -
321 time_after_start;
322 time_total_nw = time_after_wait - time_after_start + time_start -
323 time_after_start;
324
325 log_debug("time (warm up): %.8f call, %.8f wait, %.8f total\n",
326 time2call_nw, time2wait_nw, time_total_nw);
327 }
328
329 log_debug("barrier B\n");
330 armci_msg_barrier();
331 if (second != -1) {
332 /* no work */
333 ARMCI_INIT_HANDLE(&handle);
334 switch (op) {
335 case CONT_PUT:
336 time_start = armci_timer();
337 time_after_start = armci_timer();
338
339 ARMCI_ASSERT(ARMCI_NbPut(array_ptrs[rank],
340 array_ptrs[second], msg_size,
341 second, &handle));
342 time_after_call = armci_timer();
343
344 ARMCI_ASSERT(ARMCI_Wait(&handle));
345 time_after_wait = armci_timer();
346 break;
347
348 case CONT_GET:
349 time_start = armci_timer();
350 time_after_start = armci_timer();
351
352 ARMCI_ASSERT(ARMCI_NbGet(array_ptrs[second],
353 array_ptrs[rank], msg_size,
354 second, &handle));
355 time_after_call = armci_timer();
356
357 ARMCI_ASSERT(ARMCI_Wait(&handle));
358 time_after_wait = armci_timer();
359 break;
360
361 case STRIDED_PUT:
362 time_start = armci_timer();
363 time_after_start = armci_timer();
364
365 ARMCI_ASSERT(ARMCI_NbPutS(array_ptrs[rank], &stride_dist,
366 array_ptrs[second], &stride_dist,
367 block_sizes, 1, second, &handle));
368 time_after_call = armci_timer();
369
370 ARMCI_ASSERT(ARMCI_Wait(&handle));
371 time_after_wait = armci_timer();
372 break;
373
374 case STRIDED_GET:
375 time_start = armci_timer();
376 time_after_start = armci_timer();
377
378 ARMCI_ASSERT(ARMCI_NbGetS(array_ptrs[second], &stride_dist,
379 array_ptrs[rank], &stride_dist,
380 block_sizes, 1, second, &handle));
381 time_after_call = armci_timer();
382
383 ARMCI_ASSERT(ARMCI_Wait(&handle));
384 time_after_wait = armci_timer();
385 break;
386
387 case STRIDED_ACC:
388 time_start = armci_timer();
389 time_after_start = armci_timer();
390
391 ARMCI_ASSERT(ARMCI_NbAccS(ARMCI_ACC_INT, &scale,
392 array_ptrs[rank], &stride_dist,
393 array_ptrs[second], &stride_dist,
394 block_sizes, 1, second, &handle));
395 time_after_call = armci_timer();
396
397 ARMCI_ASSERT(ARMCI_Wait(&handle));
398 time_after_wait = armci_timer();
399 break;
400 }
401
402 time2call_nw = time_after_call - time_after_start + time_start -
403 time_after_start;
404 FIX_TIME(time2call_nw);
405 time2wait_nw = time_after_wait - time_after_call + time_start -
406 time_after_start;
407 FIX_TIME(time2wait_nw);
408 time_total_nw = time_after_wait - time_after_start + time_start -
409 time_after_start;
410 FIX_TIME(time_total_nw);
411
412 log_debug("time (no work): %.8f call, %.8f wait, %.8f total\n",
413 time2call_nw, time2wait_nw, time_total_nw);
414 }
415
416 /* only perform tests if wait time is not 0 */
417 if (time2wait_nw > 0.0) {
418 /* time2wait_nw is always 1.0 on seconds (receiving nodes) */
419 double overlaps[ITER_STEPS], totals[ITER_STEPS];
420 if (second != -1) {
421 /* compute approximate range of iterations */
422 less = 0, more = iterations[ITERS - 1];
423 assert(time2wait_nw < iterations_times[ITERS - 1]);
424
425 for (i = 0; i < ITERS; i++)
426 if (time2wait_nw > iterations_times[i])
427 less = iterations[i];
428 else
429 break;
430 for (i = 0; i < ITERS; i++)
431 if (time2wait_nw < iterations_times[ITERS - i - 1])
432 more = iterations[ITERS - i - 1];
433 else
434 break;
435
436 log_debug("wait time (%.8f) is between %d and %d iterations\n",
437 time2wait_nw, less, more);
438 }
439
440 /* benchmark ITER_STEPS steps within computed range */
441 for (i = 0, j = less; i < ITER_STEPS;
442 i++, j += (more - less) / (ITER_STEPS - 1)) {
443 /* time noneblocking call with j interations of fake work */
444 log_debug("barrier C\n");
445 armci_msg_barrier();
446 if (second != -1) {
447 ARMCI_INIT_HANDLE(&handle);
448 switch (op) {
449 case CONT_PUT:
450 time_start = armci_timer();
451 time_after_start = armci_timer();
452
453 ARMCI_ASSERT(ARMCI_NbPut(array_ptrs[rank],
454 array_ptrs[second], msg_size,
455 second, &handle));
456 time_after_call = armci_timer();
457
458 for (l = 0, k = rand(); l < j; l++) k *= rand();
459 time_after_work = armci_timer();
460
461 ARMCI_ASSERT(ARMCI_Wait(&handle));
462 time_after_wait = armci_timer();
463 break;
464
465 case CONT_GET:
466 time_start = armci_timer();
467 time_after_start = armci_timer();
468
469 ARMCI_ASSERT(ARMCI_NbGet(array_ptrs[second],
470 array_ptrs[rank], msg_size,
471 second, &handle));
472 time_after_call = armci_timer();
473
474 for (l = 0, k = rand(); l < j; l++) k *= rand();
475 time_after_work = armci_timer();
476
477 ARMCI_ASSERT(ARMCI_Wait(&handle));
478 time_after_wait = armci_timer();
479 break;
480
481 case STRIDED_PUT:
482 time_start = armci_timer();
483 time_after_start = armci_timer();
484
485 ARMCI_ASSERT(ARMCI_NbPutS(array_ptrs[rank], &stride_dist,
486 array_ptrs[second], &stride_dist,
487 block_sizes, 1, second, &handle));
488 time_after_call = armci_timer();
489
490 for (l = 0, k = rand(); l < j; l++) k *= rand();
491 time_after_work = armci_timer();
492
493 ARMCI_ASSERT(ARMCI_Wait(&handle));
494 time_after_wait = armci_timer();
495 break;
496
497 case STRIDED_GET:
498 time_start = armci_timer();
499 time_after_start = armci_timer();
500
501 ARMCI_ASSERT(ARMCI_NbGetS(array_ptrs[second],
502 &stride_dist, array_ptrs[rank], &stride_dist,
503 block_sizes, 1, second, &handle));
504 time_after_call = armci_timer();
505
506 for (l = 0, k = rand(); l < j; l++) k *= rand();
507 time_after_work = armci_timer();
508
509 ARMCI_ASSERT(ARMCI_Wait(&handle));
510 time_after_wait = armci_timer();
511
512 break;
513
514 case STRIDED_ACC:
515 time_start = armci_timer();
516 time_after_start = armci_timer();
517
518 ARMCI_ASSERT(ARMCI_NbAccS(ARMCI_ACC_INT, &scale,
519 array_ptrs[rank], &stride_dist,
520 array_ptrs[second], &stride_dist,
521 block_sizes, 1, second, &handle));
522 time_after_call = armci_timer();
523
524 for (l = 0, k = rand(); l < j; l++) k *= rand();
525 time_after_work = armci_timer();
526
527 ARMCI_ASSERT(ARMCI_Wait(&handle));
528 time_after_wait = armci_timer();
529 break;
530 }
531
532 time2call_fw = time_after_call - time_after_start + time_start -
533 time_after_start;
534 FIX_TIME(time2call_fw);
535 time2work_fw = time_after_work - time_after_call + time_start -
536 time_after_start;
537 FIX_TIME(time2work_fw);
538 time2wait_fw = time_after_wait - time_after_work + time_start -
539 time_after_start;
540 FIX_TIME(time2wait_fw);
541 time_total_fw = time_after_wait - time_after_start +
542 time_start - time_after_start;
543 FIX_TIME(time_total_fw);
544
545 log_debug("time (%d iters): %.8f call, %.8f work, "
546 "%.8f wait %.8f total\n", j, time2call_fw, time2work_fw,
547 time2wait_fw, time_total_fw);
548
549 overlaps[i] = time2work_fw;
550 totals[i] = time_total_fw;
551 }
552 }
553
554 /* pick overlap with closest total (less or equal) */
555 if (second != -1) {
556 double closest_total, closest_overlap;
557 double smallest_total = totals[ITER_STEPS - 1],
558 smallest_overlap = overlaps[ITER_STEPS - 1];
559 for (i = ITER_STEPS - 1; i >= 0; i--) {
560 closest_total = totals[i];
561 closest_overlap = overlaps[i];
562 if (closest_total < smallest_total) {
563 smallest_total = closest_total;
564 smallest_overlap = closest_overlap;
565 }
566 if (closest_total <= time_total_nw) break;
567 }
568 if (closest_total > time_total_nw) {
569 closest_total = smallest_total;
570 closest_overlap = smallest_overlap;
571 }
572 stats[NOWORK] = time_total_nw;
573 stats[TOTAL] = closest_total;
574 stats[OVERLAP] = closest_overlap;
575 }
576 } else {
577 if (second != -1) {
578 for (i = 0; i < ITER_STEPS; i++) {
579 log_debug("barrier C0\n");
580 armci_msg_barrier();
581 }
582 stats[NOWORK] = time_total_nw;
583 stats[TOTAL] = 0;
584 stats[OVERLAP] = 0;
585 }
586 }
587
588
589 ARMCI_ASSERT(ARMCI_Free(array_ptrs[rank]));
590 free(array_ptrs);
591
592 log_debug("barrier D\n");
593 armci_msg_barrier();
594
595 return stats;
596 }
597
598
599
main(int argc,char * argv[])600 int main (int argc, char *argv[])
601 {
602 int i, j, k, l;
603 double u;
604 char buf[255];
605
606 int dist, pos, time_seed;
607
608 int msg_sizes[MSG_COUNT], dim1_sizes[MSG_COUNT], dim2[MSG_COUNT], mul_elem;
609 double *stats=NULL, *stats_all=NULL;
610 double from_log = log2(MIN_MSG_SIZE);
611 double to_log = log2(MAX_MSG_SIZE);
612 double step_log = (to_log - from_log) / (MSG_COUNT - 1);
613
614 armci_msg_init(&argc, &argv);
615 rank = armci_msg_me();
616 size = armci_msg_nproc();
617 assert((size & 1) ^ 1); /* works with even number of processors only */
618 log_debug("Message passing initialized\n");
619
620 ARMCI_ASSERT(ARMCI_Init());
621 log_debug("ARMCI initialized\n");
622
623 if (!rank) start_logging(argv[0]);
624
625 /* generate MSG_COUNT message sizes MIN_MSG_SIZE thru MAX_MSG_SIZE */
626 for (i = 0, u = from_log; i < MSG_COUNT; i++, u += step_log) {
627 mul_elem = round(exp2(u));
628 msg_sizes[i] = mul_elem % ((int)SIZE_ELEM)
629 ? (mul_elem / ((int)SIZE_ELEM) + 1) * ((int)SIZE_ELEM)
630 : mul_elem; /* multiple of SIZE_ELEM */
631 }
632
633 /* generate MSG_COUNT respective dim1 sizes and dim2 for strided */
634 for (i = 0; i < MSG_COUNT; i++) {
635 mul_elem = msg_sizes[i] / SIZE_ELEM;
636 mul_elem = sqrt(2.0 * mul_elem);
637 dim1_sizes[i] = mul_elem * SIZE_ELEM;
638 dim2[i] = mul_elem / 2;
639 }
640
641 /* print msg_sizes and appropriate derivatives (debug mode only) */
642 if (!rank) {
643 log_debug("msg_sizes:\n");
644 for (i = 0; i < MSG_COUNT; i++)
645 log_debug("cont: %d bytes | strided: %d bytes X %d\n",
646 msg_sizes[i], dim1_sizes[i], dim2[i]);
647 }
648
649 /* inialize PRNG, use seed generated on processor 0 for uniform sequence */
650 time_seed = time(NULL);
651 MPI_Bcast(&time_seed, 1, MPI_INT, 0, MPI_COMM_WORLD);
652 srand(time_seed); rand();
653 log_debug("seed: %d\n", time_seed);
654
655 /* generate random pairs of processors */
656 #define HALFSIZE (size / 2)
657 p_srcs = malloc(sizeof(int) * size);
658 assert(p_srcs);
659 for (i = 0; i < size; i++) p_srcs[i] = -1;
660 p_dsts = p_srcs + HALFSIZE;
661
662 for (i = 0, j = size - 1, pos = 0; i < size; i++, j--) {
663 dist = round((double)rand() * j / RAND_MAX + 1); /* random 1..j */
664
665 for (l = 0; l < dist; ) {
666 pos = (pos + 1 == size) ? 0 : pos + 1;
667 if ((p_srcs[pos] == -1) && (pos != i)) l++;
668 }
669 p_srcs[pos] = i;
670 }
671
672 for (i = 0, j = 0; i < HALFSIZE; i++)
673 j += sprintf(buf + j, " %d->%d", p_srcs[i], p_dsts[i]);
674 log_debug("random pairs:%s\n", buf);
675
676 /* time interations: 1 thru ITERS */
677 time_iterations();
678
679 /* determine if processor initiates communication and where it sends to,
680 * * -1 for second(receiver) */
681 second = -1;
682 for (i = 0; i < HALFSIZE; i++) if (p_srcs[i] == rank) second = p_dsts[i];
683 log_debug("second: %d\n", second);
684
685 /* allocate memory for statisticis */
686 #define MSG_OFF (STATS_COUNT * size)
687 #define OPS_OFF (MSG_OFF * MSG_COUNT)
688 stats_all = malloc(OPS_COUNT * OPS_OFF * sizeof(double));
689 assert(stats_all);
690
691 for (i = 0; i < OPS_COUNT; i++)
692 for (j = 0; j < MSG_COUNT; j++) {
693 switch (i) {
694 case CONT_PUT:
695 case CONT_GET:
696 stats = benchmark(i, msg_sizes[j], 0);
697 log_debug("stats: %8d | %.8f | %.8f | %.8f | %.2f\n",
698 msg_sizes[j], stats[NOWORK], stats[TOTAL],
699 stats[OVERLAP],
700 100.0 * stats[OVERLAP] / stats[TOTAL]);
701 break;
702
703 case STRIDED_PUT:
704 case STRIDED_GET:
705 case STRIDED_ACC:
706 stats = benchmark(i, dim1_sizes[j], dim2[j]);
707 log_debug("stats: %8d | %.8f | %.8f | %.8f | %.2f\n",
708 dim1_sizes[j] * dim2[j], stats[NOWORK],
709 stats[TOTAL], stats[OVERLAP],
710 100.0 * stats[OVERLAP] / stats[TOTAL]);
711 break;
712 }
713 MPI_Gather(stats, STATS_COUNT, MPI_DOUBLE,
714 stats_all + i * OPS_OFF + j * MSG_OFF,
715 STATS_COUNT, MPI_DOUBLE, 0, MPI_COMM_WORLD);
716 }
717
718 if (!rank)
719 for (l = 0; l < HALFSIZE; l++) { /* interate thru pairs */
720 log_printf("for pair of processors %d -> %d:\n", p_srcs[l], p_dsts[l]);
721
722 for (i = 0; i < OPS_COUNT; i++) { /* iterate thru operations */
723 switch (i) {
724 case CONT_PUT:
725 log_printf("ARMCI_NbPut\n");
726 break;
727
728 case CONT_GET:
729 log_printf("ARMCI_NbGet\n");
730 break;
731
732 case STRIDED_PUT:
733 log_printf("ARMCI_NbPutS\n");
734 break;
735
736 case STRIDED_GET:
737 log_printf("ARMCI_NbGetS\n");
738 break;
739
740 case STRIDED_ACC:
741 log_printf("ARMCI_NbAccS\n");
742 break;
743 }
744 log_printf("msg size | nowork | total | overlap |"
745 " ratio\n");
746 log_printf("---------+------------+------------+------------+"
747 "------\n");
748
749 for (j = 0; j < MSG_COUNT; j++) { /* iterate thru msg sizes */
750 k = i * OPS_OFF + j * MSG_OFF + p_srcs[l] * STATS_COUNT;
751 log_printf("%8d | %.8f | %.8f | %.8f | %.2f\n",
752 NON_CONT(i) ? dim1_sizes[j] * dim2[j]: msg_sizes[j],
753 stats_all[k + NOWORK], stats_all[k + TOTAL],
754 stats_all[k + OVERLAP],
755 (stats_all[k + NOWORK] < stats_all[k + TOTAL]) ||
756 (stats_all[k + TOTAL] <= 0.0)
757 ? 0 : 100.0 * stats_all[k + OVERLAP] /
758 stats_all[k + TOTAL]);
759 }
760 log_printf("\n");
761 }
762 }
763
764 if (!rank) finish_logging();
765
766 ARMCI_Finalize();
767 armci_msg_finalize();
768
769 free(p_srcs);
770 free(stats_all);
771
772 return 0;
773 }
774