1 #if HAVE_CONFIG_H
2 #   include "config.h"
3 #endif
4 
5 #if HAVE_STDIO_H
6 #   include <stdio.h>
7 #endif
8 #if HAVE_STDLIB_H
9 #   include <stdlib.h>
10 #endif
11 #if HAVE_STDARG_H
12 #   include <stdarg.h>
13 #endif
14 #if HAVE_UNISTD_H
15 #   include <unistd.h>
16 #elif HAVE_WINDOWS_H
17 #   include <windows.h>
18 #endif
19 #if HAVE_MATH_H
20 #   include <math.h>
21 #endif
22 #if HAVE_TIME_H
23 #   include <time.h>
24 #endif
25 #if HAVE_STRING_H
26 #   include <string.h>
27 #endif
28 #if HAVE_ASSERT_H
29 #   include <assert.h>
30 #endif
31 
32 #include <mpi.h>
33 
34 #include "armci.h"
35 #include "message.h"
36 
/* C99 math functions used by main(); declared explicitly in case <math.h>
 * does not expose them on the build platform. */
extern double exp2(double);
extern double round(double);
extern double log2(double);
/* Disables the body of log_debug().
 * NOTE(review): this definition comes after <assert.h> has been included, so
 * it presumably does NOT disable the assert() calls in this file - confirm. */
#define NDEBUG
/*#define LOG2FILE*/

typedef int t_elem; /* type of an array element */
#define SIZE_ELEM   sizeof(t_elem)


/* extra bytes added to every row of a strided array (see benchmark()) */
#define STRIDE_OFF  (SIZE_ELEM * 4 - 1)

/* range and number of message sizes generated in main() (bytes) */
#define MIN_MSG_SIZE    8
#define MAX_MSG_SIZE    (1024 * 1024)
#define MSG_COUNT       20

/* last error code stored by ARMCI_ASSERT */
int armci_error_code;
/* Evaluates error_code once; if it is non-zero, reports it on stderr, calls
 * pause(), then cleans up ARMCI and aborts the MPI job.
 * Wrapped in do/while(0) so it behaves as a single statement.
 * NOTE(review): pause() blocks until a signal is delivered, so the cleanup
 * and abort that follow may never run - confirm this is intended. */
#define ARMCI_ASSERT(error_code) do {                                       \
        if ((armci_error_code = (error_code))) {                            \
            fprintf(stderr, "ARMCI error %d\n", armci_error_code);          \
            pause();                                                        \
            ARMCI_Cleanup();                                                \
            MPI_Abort(MPI_COMM_WORLD, armci_error_code);                    \
        }                                                                   \
    } while (0)

/* clamps a negative time difference to zero */
#define FIX_TIME(t) do { if ((t) < 0.0) (t) = 0.0; } while (0)

/* number of processes, rank of this process, and the rank this process sends
 * to (-1 when this process does not initiate communication) */
int size, rank, second;

#define ITERS           18
#define ITER_STEPS      20
/* filled by time_iterations(): iterations_times[i] is the measured time of
 * iterations[i] passes of the fake-work loop */
double iterations_times[ITERS];
int iterations[ITERS];

/* process pairs: p_srcs[i] communicates with p_dsts[i] */
int *p_srcs, *p_dsts;

enum {CONT_PUT, CONT_GET,
    STRIDED_PUT, STRIDED_GET, STRIDED_ACC,
    VECTOR_PUT, VECTOR_GET, VECTOR_ACC};
/* only operations up to and including STRIDED_ACC are counted */
#define OPS_COUNT   (STRIDED_ACC + 1)
#define NON_CONT(op)  ((op) > CONT_GET)

/* indices into the statistics array returned by benchmark() */
enum {NOWORK, TOTAL, OVERLAP};
#define STATS_COUNT (OVERLAP + 1)
77 
78 /* prints formatted numbered message with processor's rank */
log_debug(const char * fmt,...)79 int log_debug(const char *fmt, ...)
80 {
81     int r = 0;
82 #ifndef NDEBUG
83     static int log_counter = 1;
84     va_list ap;
85     va_start(ap, fmt);
86 
87     printf("%03d@%1d: ", log_counter++, rank);
88     r = vprintf(fmt, ap);
89 
90     va_end(ap);
91 #endif
92     return r;
93 }
94 
/* destination of log_printf(); NULL until start_logging() is called */
FILE *log_file = NULL;

/* Selects the stream used by log_printf().
 *
 * Without LOG2FILE the log goes to stderr and fname is ignored.
 * With LOG2FILE a data file is opened for writing:
 *   - fname starts with '/': the last two path components are cut off and
 *     the file is <remaining path>/data/<name>.dat;
 *   - otherwise: ../data/<name>.dat
 * where <name> is the executable name with a trailing ".x"-style suffix
 * (a '.' followed by exactly one character) removed.
 * Aborts if the path does not fit the local buffers or the file cannot be
 * opened. */
void start_logging(const char *fname)
{
#ifdef  LOG2FILE
    char exe_name[255];
    char log_path[255];
    size_t len;
    int i, n;
    int name_off = -1; /* offset of the executable name inside exe_name */

    /* bounded copy: refuse over-long paths instead of overflowing */
    n = snprintf(exe_name, sizeof exe_name, "%s", fname);
    if (n < 0 || (size_t)n >= sizeof exe_name) {
        fprintf(stderr, "executable path is too long for log file setup\n");
        abort();
    }
    len = (size_t)n;

    if (len >= 2 && exe_name[len - 2] == '.') { /* remove .x */
        exe_name[len - 2] = 0;
        len -= 2;
    }

    if (exe_name[0] == '/') { /* full path given */
        /* scan backwards: the last '/' marks the start of the name, the one
         * before it is where the directory part gets terminated */
        for (i = (int)len - 1; i >= 0; i--)
            if (exe_name[i] == '/') {
                if (name_off == -1) name_off = i + 1;
                else {
                    exe_name[i] = 0;
                    break;
                }
            }
        log_debug("exe: path=%s, name=%s\n", exe_name, exe_name + name_off);
        n = snprintf(log_path, sizeof log_path, "%s/data/%s.dat",
                exe_name, exe_name + name_off);
    } else { /* only executable name */
        n = snprintf(log_path, sizeof log_path, "../data/%s.dat", exe_name);
    }
    if (n < 0 || (size_t)n >= sizeof log_path) {
        fprintf(stderr, "log file path is too long\n");
        abort();
    }
    log_debug("log: %s\n", log_path);

    log_file = fopen(log_path, "w");

    if (!log_file) {
        perror("cannot open log file");
        abort();
    }
#else
    (void)fname;
    log_file = stderr;
#endif
}
135 
finish_logging()136 void finish_logging()
137 {
138     fclose(log_file);
139 }
140 
141 /* prints formatted message to ../data/<prog>.dat */
log_printf(const char * fmt,...)142 int log_printf(const char *fmt, ...)
143 {
144     va_list ap;
145     int r;
146 
147     va_start(ap, fmt);
148 
149     if (log_file)
150         r = vfprintf(log_file, fmt, ap);
151     else {
152         fprintf(stderr, "warning: logging is not enabled for this process\n");
153         r = vfprintf(stderr, fmt, ap);
154     }
155 
156     va_end(ap);
157     return r;
158 }
159 
160 
161 /* computes approximate time of n iterations for variable n */
time_iterations()162 void time_iterations()
163 {
164         double time_start, time_after_start, time_stop;
165         int i, j, k, l;
166 
167         for (i = 0, j = 1; i < ITERS; i++, j *= 2) {
168                 time_start = armci_timer();
169                 time_after_start = armci_timer();
170 
171                 for (l = 0, k = rand(); l < j; l++) k *= rand();
172 
173                 time_stop = armci_timer();
174                 iterations_times[i] = time_stop - time_after_start +
175                     time_start - time_after_start;
176                 FIX_TIME(iterations_times[i]);
177                 iterations[i] = j;
178 
179                 log_debug("it takes %.8f sec to iterate %d times\n",
180                         iterations_times[i], iterations[i]);
181         }
182 }
183 
184 
185 /* computes useful overlap time for contiguous/vector/strided arrays
186  *  * op - operation
187  *   * msg_size - size of a message/ 1st dimension (bytes)
188  *    * size2 - not used for contiguous arrays
189  *     *       - size of 2nd dimension for strided arrays (bytes)
190  *      *       - # of vector segments for vectors
191  *       * returns pointer to static array of stats (STATS_COUNT doubles)
192  *        */
benchmark(int op,int msg_size,int size2)193 double * benchmark(int op, int msg_size, int size2)
194 {
195     static double stats[STATS_COUNT]; /* return statistics in static array */
196 
197     void **array_ptrs;
198     int stride_dist, block_sizes[2], scale = 2;
199     int i=0, j=0, k=0, l=0, less=0, more=0;
200     double time_start=0, time_after_start=0, time_after_call=0,
201            time_after_work=0, time_after_wait=0;
202     double time2call_nw=0, time2wait_nw = 1.0, time_total_nw=0;
203     double time2call_fw, time2work_fw, time2wait_fw, time_total_fw;
204     armci_hdl_t handle;
205 
206     array_ptrs = malloc(sizeof(void*)*size);
207 
208     log_debug("barrier O\n");
209     armci_msg_barrier();
210     /* initialize: obtain remote address and generate random array */
211     switch (op) {
212         case CONT_PUT:
213         case CONT_GET:
214             ARMCI_ASSERT(ARMCI_Malloc(array_ptrs, msg_size));
215             for (i = 0; i < msg_size; i++)
216                 ((char *)array_ptrs[rank])[i] = (char)(rand() >> 24);
217             break;
218 
219         /* 2D strided array of ints */
220         case STRIDED_PUT:
221         case STRIDED_GET:
222         case STRIDED_ACC:
223             block_sizes[0] = msg_size;
224             block_sizes[1] = size2;
225             stride_dist = STRIDE_OFF + msg_size;
226             log_debug("strided: dim1 = %d (%d bytes), dim2 = %d, stride = %d\n",
227                     msg_size / SIZE_ELEM, msg_size, size2, stride_dist);
228             ARMCI_ASSERT(ARMCI_Malloc(array_ptrs,
229                         (size2 - 1) * stride_dist + msg_size));
230 
231             for (i = 0; i < size2; i++)
232                 for (j = 0; j < (msg_size / ((int)SIZE_ELEM)); j++) {
233                     l = stride_dist * i + SIZE_ELEM * i;
234                     *(int *)((char *)array_ptrs[rank] + l) = rand();
235                 }
236             break;
237     }
238 
239     /* warm up call */
240     log_debug("barrier A\n");
241     armci_msg_barrier();
242     if (second != -1) {
243             log_debug("testing message size %d bytes\n", msg_size);
244             switch (op) {
245                 case CONT_PUT:
246                     ARMCI_INIT_HANDLE(&handle);
247                     time_start = armci_timer();
248                     time_after_start = armci_timer();
249 
250                     ARMCI_ASSERT(ARMCI_NbPut(array_ptrs[rank],
251                                 array_ptrs[second], msg_size,
252                                 second, &handle));
253                     time_after_call = armci_timer();
254 
255                     ARMCI_ASSERT(ARMCI_Wait(&handle));
256                     time_after_wait = armci_timer();
257                     break;
258 
259                 case CONT_GET:
260                     ARMCI_INIT_HANDLE(&handle);
261                     time_start = armci_timer();
262                     time_after_start = armci_timer();
263 
264                     ARMCI_ASSERT(ARMCI_NbGet(array_ptrs[second],
265                                 array_ptrs[rank], msg_size,
266                                 second, &handle));
267                     time_after_call = armci_timer();
268 
269                     ARMCI_ASSERT(ARMCI_Wait(&handle));
270                     time_after_wait = armci_timer();
271                     break;
272 
273                 case STRIDED_PUT:
274                     ARMCI_INIT_HANDLE(&handle);
275                     time_start = armci_timer();
276                     time_after_start = armci_timer();
277 
278                     ARMCI_ASSERT(ARMCI_NbPutS(array_ptrs[rank], &stride_dist,
279                                 array_ptrs[second], &stride_dist,
280                                 block_sizes, 1, second, &handle));
281                     time_after_call = armci_timer();
282 
283                     ARMCI_ASSERT(ARMCI_Wait(&handle));
284                     time_after_wait = armci_timer();
285                     break;
286 
287                 case STRIDED_GET:
288                     ARMCI_INIT_HANDLE(&handle);
289                     time_start = armci_timer();
290                     time_after_start = armci_timer();
291 
292                     ARMCI_ASSERT(ARMCI_NbGetS(array_ptrs[second], &stride_dist,
293                                 array_ptrs[second], &stride_dist,
294                                 block_sizes, 1, second, &handle));
295                     time_after_call = armci_timer();
296 
297                     ARMCI_ASSERT(ARMCI_Wait(&handle));
298                     time_after_wait = armci_timer();
299                     break;
300 
301                 case STRIDED_ACC:
302                     ARMCI_INIT_HANDLE(&handle);
303                     time_start = armci_timer();
304                     time_after_start = armci_timer();
305 
306                     ARMCI_ASSERT(ARMCI_NbAccS(ARMCI_ACC_INT, &scale,
307                                 array_ptrs[rank], &stride_dist,
308                                 array_ptrs[second], &stride_dist,
309                                 block_sizes, 1, second, &handle));
310                     time_after_call = armci_timer();
311 
312                     ARMCI_ASSERT(ARMCI_Wait(&handle));
313                     time_after_wait = armci_timer();
314 
315                     break;
316             }
317 
318             time2call_nw = time_after_call - time_after_start + time_start -
319                 time_after_start;
320             time2wait_nw = time_after_wait - time_after_call + time_start -
321                 time_after_start;
322             time_total_nw = time_after_wait - time_after_start + time_start -
323                 time_after_start;
324 
325             log_debug("time (warm up): %.8f call, %.8f wait, %.8f total\n",
326             time2call_nw, time2wait_nw, time_total_nw);
327     }
328 
329     log_debug("barrier B\n");
330     armci_msg_barrier();
331     if (second != -1) {
332             /* no work */
333             ARMCI_INIT_HANDLE(&handle);
334             switch (op) {
335                 case CONT_PUT:
336                     time_start = armci_timer();
337                     time_after_start = armci_timer();
338 
339                     ARMCI_ASSERT(ARMCI_NbPut(array_ptrs[rank],
340                                 array_ptrs[second], msg_size,
341                                 second, &handle));
342                     time_after_call = armci_timer();
343 
344                     ARMCI_ASSERT(ARMCI_Wait(&handle));
345                     time_after_wait = armci_timer();
346                     break;
347 
348                 case CONT_GET:
349                     time_start = armci_timer();
350                     time_after_start = armci_timer();
351 
352                     ARMCI_ASSERT(ARMCI_NbGet(array_ptrs[second],
353                                 array_ptrs[rank], msg_size,
354                                 second, &handle));
355                     time_after_call = armci_timer();
356 
357                     ARMCI_ASSERT(ARMCI_Wait(&handle));
358                     time_after_wait = armci_timer();
359                     break;
360 
361                 case STRIDED_PUT:
362                     time_start = armci_timer();
363                     time_after_start = armci_timer();
364 
365                     ARMCI_ASSERT(ARMCI_NbPutS(array_ptrs[rank], &stride_dist,
366                                 array_ptrs[second], &stride_dist,
367                                 block_sizes, 1, second, &handle));
368                     time_after_call = armci_timer();
369 
370                     ARMCI_ASSERT(ARMCI_Wait(&handle));
371                     time_after_wait = armci_timer();
372                     break;
373 
374                 case STRIDED_GET:
375                     time_start = armci_timer();
376                     time_after_start = armci_timer();
377 
378                     ARMCI_ASSERT(ARMCI_NbGetS(array_ptrs[second], &stride_dist,
379                                 array_ptrs[rank], &stride_dist,
380                                 block_sizes, 1, second, &handle));
381                     time_after_call = armci_timer();
382 
383                     ARMCI_ASSERT(ARMCI_Wait(&handle));
384                     time_after_wait = armci_timer();
385                     break;
386 
387                 case STRIDED_ACC:
388                     time_start = armci_timer();
389                     time_after_start = armci_timer();
390 
391                     ARMCI_ASSERT(ARMCI_NbAccS(ARMCI_ACC_INT, &scale,
392                                 array_ptrs[rank], &stride_dist,
393                                 array_ptrs[second], &stride_dist,
394                                 block_sizes, 1, second, &handle));
395                     time_after_call = armci_timer();
396 
397                     ARMCI_ASSERT(ARMCI_Wait(&handle));
398                     time_after_wait = armci_timer();
399                     break;
400             }
401 
402             time2call_nw = time_after_call - time_after_start + time_start -
403                 time_after_start;
404             FIX_TIME(time2call_nw);
405             time2wait_nw = time_after_wait - time_after_call + time_start -
406                 time_after_start;
407             FIX_TIME(time2wait_nw);
408             time_total_nw = time_after_wait - time_after_start + time_start -
409                 time_after_start;
410             FIX_TIME(time_total_nw);
411 
412             log_debug("time (no work): %.8f call, %.8f wait, %.8f total\n",
413                     time2call_nw, time2wait_nw, time_total_nw);
414     }
415 
416     /* only perform tests if wait time is not 0 */
417     if (time2wait_nw > 0.0) {
418     /* time2wait_nw is always 1.0 on seconds (receiving nodes) */
419         double overlaps[ITER_STEPS], totals[ITER_STEPS];
420         if (second !=  -1) {
421             /* compute approximate range of iterations */
422             less = 0, more = iterations[ITERS - 1];
423             assert(time2wait_nw < iterations_times[ITERS - 1]);
424 
425             for (i = 0; i < ITERS; i++)
426                 if (time2wait_nw > iterations_times[i])
427                     less = iterations[i];
428                 else
429                     break;
430             for (i = 0; i < ITERS; i++)
431                 if (time2wait_nw < iterations_times[ITERS - i - 1])
432                     more = iterations[ITERS - i - 1];
433                 else
434                     break;
435 
436             log_debug("wait time (%.8f) is between %d and %d iterations\n",
437                     time2wait_nw, less, more);
438         }
439 
440         /* benchmark ITER_STEPS steps within computed range */
441         for (i = 0, j = less; i < ITER_STEPS;
442              i++, j += (more - less) / (ITER_STEPS - 1)) {
443             /* time noneblocking call with j interations of fake work */
444             log_debug("barrier C\n");
445             armci_msg_barrier();
446             if (second != -1) {
447                 ARMCI_INIT_HANDLE(&handle);
448                 switch (op) {
449                     case CONT_PUT:
450                         time_start = armci_timer();
451                         time_after_start = armci_timer();
452 
453                         ARMCI_ASSERT(ARMCI_NbPut(array_ptrs[rank],
454                                     array_ptrs[second], msg_size,
455                                     second, &handle));
456                         time_after_call = armci_timer();
457 
458                         for (l = 0, k = rand(); l < j; l++) k *= rand();
459                         time_after_work = armci_timer();
460 
461                         ARMCI_ASSERT(ARMCI_Wait(&handle));
462                         time_after_wait = armci_timer();
463                         break;
464 
465                     case CONT_GET:
466                         time_start = armci_timer();
467                         time_after_start = armci_timer();
468 
469                         ARMCI_ASSERT(ARMCI_NbGet(array_ptrs[second],
470                                     array_ptrs[rank], msg_size,
471                                     second, &handle));
472                         time_after_call = armci_timer();
473 
474                         for (l = 0, k = rand(); l < j; l++) k *= rand();
475                         time_after_work = armci_timer();
476 
477                         ARMCI_ASSERT(ARMCI_Wait(&handle));
478                         time_after_wait = armci_timer();
479                         break;
480 
481                 case STRIDED_PUT:
482                         time_start = armci_timer();
483                         time_after_start = armci_timer();
484 
485                        ARMCI_ASSERT(ARMCI_NbPutS(array_ptrs[rank], &stride_dist,
486                                    array_ptrs[second], &stride_dist,
487                                    block_sizes, 1, second, &handle));
488                         time_after_call = armci_timer();
489 
490                         for (l = 0, k = rand(); l < j; l++) k *= rand();
491                         time_after_work = armci_timer();
492 
493                         ARMCI_ASSERT(ARMCI_Wait(&handle));
494                         time_after_wait = armci_timer();
495                         break;
496 
497                 case STRIDED_GET:
498                         time_start = armci_timer();
499                         time_after_start = armci_timer();
500 
501                         ARMCI_ASSERT(ARMCI_NbGetS(array_ptrs[second],
502                                    &stride_dist, array_ptrs[rank], &stride_dist,
503                                    block_sizes, 1, second, &handle));
504                         time_after_call = armci_timer();
505 
506                         for (l = 0, k = rand(); l < j; l++) k *= rand();
507                         time_after_work = armci_timer();
508 
509                         ARMCI_ASSERT(ARMCI_Wait(&handle));
510                         time_after_wait = armci_timer();
511 
512                         break;
513 
514                 case STRIDED_ACC:
515                         time_start = armci_timer();
516                         time_after_start = armci_timer();
517 
518                         ARMCI_ASSERT(ARMCI_NbAccS(ARMCI_ACC_INT, &scale,
519                                     array_ptrs[rank], &stride_dist,
520                                     array_ptrs[second], &stride_dist,
521                                     block_sizes, 1, second, &handle));
522                         time_after_call = armci_timer();
523 
524                         for (l = 0, k = rand(); l < j; l++) k *= rand();
525                         time_after_work = armci_timer();
526 
527                         ARMCI_ASSERT(ARMCI_Wait(&handle));
528                         time_after_wait = armci_timer();
529                         break;
530                 }
531 
532                 time2call_fw = time_after_call - time_after_start + time_start -
533                     time_after_start;
534                 FIX_TIME(time2call_fw);
535                 time2work_fw = time_after_work - time_after_call + time_start -
536                     time_after_start;
537                 FIX_TIME(time2work_fw);
538                 time2wait_fw = time_after_wait - time_after_work + time_start -
539                     time_after_start;
540                 FIX_TIME(time2wait_fw);
541                 time_total_fw = time_after_wait - time_after_start +
542                     time_start - time_after_start;
543                 FIX_TIME(time_total_fw);
544 
545                 log_debug("time (%d iters): %.8f call, %.8f work, "
546                         "%.8f wait %.8f total\n", j, time2call_fw, time2work_fw,
547                         time2wait_fw, time_total_fw);
548 
549                 overlaps[i] = time2work_fw;
550                 totals[i] = time_total_fw;
551             }
552         }
553 
554         /* pick overlap with closest total (less or equal) */
555         if (second != -1) {
556                 double closest_total, closest_overlap;
557                 double smallest_total = totals[ITER_STEPS - 1],
558                        smallest_overlap = overlaps[ITER_STEPS - 1];
559                 for (i = ITER_STEPS - 1; i >= 0; i--) {
560                         closest_total = totals[i];
561                         closest_overlap = overlaps[i];
562                         if (closest_total < smallest_total) {
563                             smallest_total = closest_total;
564                             smallest_overlap = closest_overlap;
565                         }
566                         if (closest_total <= time_total_nw) break;
567                 }
568                 if (closest_total > time_total_nw) {
569                     closest_total = smallest_total;
570                     closest_overlap = smallest_overlap;
571                 }
572                 stats[NOWORK]   = time_total_nw;
573                 stats[TOTAL]    = closest_total;
574                 stats[OVERLAP]  = closest_overlap;
575         }
576     } else {
577         if (second != -1) {
578             for (i = 0; i < ITER_STEPS; i++) {
579                 log_debug("barrier C0\n");
580                 armci_msg_barrier();
581             }
582             stats[NOWORK]   = time_total_nw;
583             stats[TOTAL]    = 0;
584             stats[OVERLAP]  = 0;
585         }
586     }
587 
588 
589     ARMCI_ASSERT(ARMCI_Free(array_ptrs[rank]));
590     free(array_ptrs);
591 
592     log_debug("barrier D\n");
593     armci_msg_barrier();
594 
595     return stats;
596 }
597 
598 
599 
main(int argc,char * argv[])600 int main (int argc, char *argv[])
601 {
602     int i, j, k, l;
603     double u;
604     char buf[255];
605 
606     int dist, pos, time_seed;
607 
608     int msg_sizes[MSG_COUNT], dim1_sizes[MSG_COUNT], dim2[MSG_COUNT], mul_elem;
609     double *stats=NULL, *stats_all=NULL;
610     double from_log = log2(MIN_MSG_SIZE);
611     double to_log   = log2(MAX_MSG_SIZE);
612     double step_log = (to_log - from_log) / (MSG_COUNT - 1);
613 
614     armci_msg_init(&argc, &argv);
615     rank = armci_msg_me();
616     size = armci_msg_nproc();
617     assert((size & 1) ^ 1); /* works with even number of processors only */
618     log_debug("Message passing initialized\n");
619 
620     ARMCI_ASSERT(ARMCI_Init());
621     log_debug("ARMCI initialized\n");
622 
623     if (!rank) start_logging(argv[0]);
624 
625     /* generate MSG_COUNT message sizes MIN_MSG_SIZE thru MAX_MSG_SIZE */
626     for (i = 0, u = from_log; i < MSG_COUNT; i++, u += step_log) {
627         mul_elem = round(exp2(u));
628         msg_sizes[i] = mul_elem % ((int)SIZE_ELEM)
629             ? (mul_elem / ((int)SIZE_ELEM) + 1) * ((int)SIZE_ELEM)
630             : mul_elem; /* multiple of SIZE_ELEM */
631     }
632 
633     /* generate MSG_COUNT respective dim1 sizes and dim2 for strided */
634     for (i = 0; i < MSG_COUNT; i++) {
635         mul_elem = msg_sizes[i] / SIZE_ELEM;
636         mul_elem = sqrt(2.0 * mul_elem);
637         dim1_sizes[i] = mul_elem * SIZE_ELEM;
638         dim2[i] = mul_elem / 2;
639     }
640 
641     /* print msg_sizes and appropriate derivatives (debug mode only) */
642     if (!rank) {
643         log_debug("msg_sizes:\n");
644         for (i = 0; i < MSG_COUNT; i++)
645             log_debug("cont: %d bytes | strided: %d bytes X %d\n",
646                     msg_sizes[i], dim1_sizes[i], dim2[i]);
647     }
648 
649     /* inialize PRNG, use seed generated on processor 0 for uniform sequence */
650     time_seed = time(NULL);
651     MPI_Bcast(&time_seed, 1, MPI_INT, 0, MPI_COMM_WORLD);
652     srand(time_seed); rand();
653     log_debug("seed: %d\n", time_seed);
654 
655     /* generate random pairs of processors */
656 #define HALFSIZE    (size / 2)
657     p_srcs = malloc(sizeof(int) * size);
658     assert(p_srcs);
659     for (i = 0; i < size; i++) p_srcs[i] = -1;
660     p_dsts = p_srcs + HALFSIZE;
661 
662     for (i = 0, j = size - 1, pos = 0; i < size; i++, j--) {
663         dist = round((double)rand() * j / RAND_MAX + 1); /* random 1..j */
664 
665         for (l = 0; l < dist; ) {
666             pos = (pos + 1 == size) ? 0 : pos + 1;
667             if ((p_srcs[pos] == -1) && (pos != i)) l++;
668         }
669         p_srcs[pos] = i;
670     }
671 
672     for (i = 0, j = 0; i < HALFSIZE; i++)
673         j += sprintf(buf + j, " %d->%d", p_srcs[i], p_dsts[i]);
674     log_debug("random pairs:%s\n", buf);
675 
676     /* time interations: 1 thru ITERS */
677     time_iterations();
678 
679     /* determine if processor initiates communication and where it sends to,
680      *      * -1 for second(receiver) */
681     second = -1;
682     for (i = 0; i < HALFSIZE; i++) if (p_srcs[i] == rank) second = p_dsts[i];
683     log_debug("second: %d\n", second);
684 
685     /* allocate memory for statisticis */
686 #define MSG_OFF (STATS_COUNT * size)
687 #define OPS_OFF (MSG_OFF * MSG_COUNT)
688     stats_all = malloc(OPS_COUNT * OPS_OFF * sizeof(double));
689     assert(stats_all);
690 
691     for (i = 0; i < OPS_COUNT; i++)
692         for (j = 0; j < MSG_COUNT; j++) {
693             switch (i) {
694                 case CONT_PUT:
695                 case CONT_GET:
696                     stats = benchmark(i, msg_sizes[j], 0);
697                     log_debug("stats: %8d | %.8f | %.8f | %.8f | %.2f\n",
698                             msg_sizes[j], stats[NOWORK], stats[TOTAL],
699                             stats[OVERLAP],
700                             100.0 * stats[OVERLAP] / stats[TOTAL]);
701                     break;
702 
703                 case STRIDED_PUT:
704                 case STRIDED_GET:
705                 case STRIDED_ACC:
706                      stats = benchmark(i, dim1_sizes[j], dim2[j]);
707                      log_debug("stats: %8d | %.8f | %.8f | %.8f | %.2f\n",
708                             dim1_sizes[j] * dim2[j], stats[NOWORK],
709                             stats[TOTAL], stats[OVERLAP],
710                             100.0 * stats[OVERLAP] / stats[TOTAL]);
711                     break;
712             }
713             MPI_Gather(stats, STATS_COUNT, MPI_DOUBLE,
714                         stats_all + i * OPS_OFF + j * MSG_OFF,
715                         STATS_COUNT, MPI_DOUBLE, 0, MPI_COMM_WORLD);
716         }
717 
718     if (!rank)
719         for (l = 0; l < HALFSIZE; l++) { /* interate thru pairs */
720             log_printf("for pair of processors %d -> %d:\n", p_srcs[l], p_dsts[l]);
721 
722             for (i = 0; i < OPS_COUNT; i++) { /* iterate thru operations */
723                 switch (i) {
724                         case CONT_PUT:
725                             log_printf("ARMCI_NbPut\n");
726                             break;
727 
728                         case CONT_GET:
729                             log_printf("ARMCI_NbGet\n");
730                             break;
731 
732                         case STRIDED_PUT:
733                             log_printf("ARMCI_NbPutS\n");
734                             break;
735 
736                         case STRIDED_GET:
737                             log_printf("ARMCI_NbGetS\n");
738                             break;
739 
740                         case STRIDED_ACC:
741                             log_printf("ARMCI_NbAccS\n");
742                             break;
743                 }
744                 log_printf("msg size |   nowork   |    total   |   overlap  |"
745                         " ratio\n");
746                 log_printf("---------+------------+------------+------------+"
747                         "------\n");
748 
749                 for (j = 0; j < MSG_COUNT; j++) { /* iterate thru msg sizes */
750                     k = i * OPS_OFF + j * MSG_OFF + p_srcs[l] * STATS_COUNT;
751                     log_printf("%8d | %.8f | %.8f | %.8f | %.2f\n",
752                             NON_CONT(i) ? dim1_sizes[j] * dim2[j]: msg_sizes[j],
753                             stats_all[k + NOWORK], stats_all[k + TOTAL],
754                             stats_all[k + OVERLAP],
755                             (stats_all[k + NOWORK] < stats_all[k + TOTAL]) ||
756                             (stats_all[k + TOTAL] <= 0.0)
757                             ? 0 : 100.0 * stats_all[k + OVERLAP] /
758                             stats_all[k + TOTAL]);
759                 }
760                 log_printf("\n");
761             }
762         }
763 
764     if (!rank) finish_logging();
765 
766     ARMCI_Finalize();
767     armci_msg_finalize();
768 
769     free(p_srcs);
770     free(stats_all);
771 
772     return 0;
773 }
774