1 #if HAVE_CONFIG_H
2 #   include "config.h"
3 #endif
4 
5 /* $Id: armci_profile.c,v 1.8 2005-11-30 10:20:53 vinod Exp $ */
6 
7 /**
8  * Profiler can profile the following ARMCI Calls:
9  *    ARMCI_Get,ARMCI_Put,ARMCI_Acc,ARMCI_NbGet,ARMCI_NbPut,ARMCI_NbAcc,
10  *    ARMCI_GetS,ARMCI_PutS,ARMCI_AccS,ARMCI_NbGetS,ARMCI_NbPutS,ARMCI_NbAccS,
11  *    ARMCI_GetV,ARMCI_PutV,ARMCI_AccV,ARMCI_NbGetV,ARMCI_NbPutV,ARMCI_NbAccV,
12  *    ARMCI_Wait, armci_wait_notify
13  *      (NOTE: As armci_notify is same as ARMCI_Put, it is not profiled.)
14  *
15  *
16  * Note #1: Right now, only process 0's profile is printed.
 * Each and every process saves its profile in the corresponding data structure.
18  * Each process prints its profile to an output file armci_profile.<myrank>
19  * when armci_profile_terminate() is called (called in ARMCI_Finalize()).
20  *
21  * Note #2: By default profiler prints msg ranges 0 to 21. Example: range 10
22  * corresponds to message ranges from 1024 bytes to 2047 bytes.
23  * Message ranges are in the power of 2. for ex:
24  * ------------------------------------
25  *  MSG_RANGE (r)        BYTES (2^r to 2^(r+1)-1)
26  * ------------------------------------
27  *      0                    0-1
28  *      1                    2-3
29  *      2                    4-7
30  *     ...                   ...
31  *      10                1024-2047 bytes
32  *     ...                   ...
33  *      20                1MB - (2MB-1)
34  *      21                  >= 2MB
35  * -------------------------------------
36  * To increase the message range, set ARMCI_MAX_MSG_RANGE accordingly.
37  *
38  * Note #3: If Stride information needs to be printed, set ARMCI_PRINT_STRIDE.
 * Stride information is printed in armci_profile_terminate() for various
 * selected message ranges and event types. Modify it according to your needs.
41  *
42  * Note #4: There is no profiling support for non-blocking operations yet!!
43  */
44 #define DEBUG_
45 #if HAVE_STDIO_H
46 #   include <stdio.h>
47 #endif
48 #if HAVE_STDLIB_H
49 #   include <stdlib.h>
50 #endif
51 #if HAVE_STRING_H
52 #   include <string.h>
53 #endif
54 #if HAVE_MATH_H
55 #   include <math.h>
56 #endif
57 #include "armci.h"
58 #include "armcip.h"
59 #include "armci_profile.h"
60 
61 #ifndef MSG_COMMS_MPI
62 #  include "tcgmsg.h"
63 #   define MP_TIMER tcg_time
64 #else
65 #  include "mpi.h"
66 #   define MP_TIMER MPI_Wtime
67 #endif
68 
69 
70 #define ARMCI_PRINT_STRIDE 1
71 #define ARMCI_MAX_MSG_RANGE 22 /* 0 to 21 */
72 
73 #if ARMCI_PRINT_STRIDE
74 
75 # define STRIDE_COUNT 1000
76 # define ARMCI_MAX_DIM 7
77 
  /* Detail record for one profiled strided transfer. Filled in by
   * armci_profile_start_strided(); `time` is filled in by
   * armci_profile_stop_strided(). */
  typedef struct armci_stride {
    int stride_levels;          /* stride_levels argument of the call */
    int proc;                   /* proc argument of the call */
    int count[ARMCI_MAX_DIM];   /* count[0..stride_levels] copied from the call */
    double time;                /* elapsed time measured when the event is closed */
  }armci_stride_t;
84 
  /* Per-descriptor summary of a vector transfer: the two integer fields
   * copied from each armci_giov_t entry by armci_profile_start_vector(). */
  typedef struct giov {
    int ptr_array_len;   /* copy of darr[i].ptr_array_len */
    int bytes;           /* copy of darr[i].bytes */
  }giov_t;
89 
  /* Detail record for one profiled vector transfer. Filled in by
   * armci_profile_start_vector(); `time` is filled in by
   * armci_profile_stop_vector(). */
  typedef struct armci_vector {
    int vec_len;     /* len argument of the call (number of giov entries) */
    int proc;        /* proc argument of the call */
    giov_t *giov;    /* malloc'ed array of vec_len entries; not freed in this file */
    double time;     /* elapsed time measured when the event is closed */
  }armci_vector_t;
96 
97 #endif
98 
99 #define ARMCI_EVENTS 24
100 
/* Human-readable event names, indexed by event type. Used in debug and
 * error messages and in the stride/vector report headers.
 * NOTE(review): the order presumably mirrors the ARMCI_PROF_* constants
 * declared in armci_profile.h -- confirm before reordering. */
char *gEventName[ARMCI_EVENTS]={
  "GET", "PUT", "ACC",
  "STRIDED GET", "STRIDED PUT", "STRIDED ACC",
  "VECTOR GET", "VECTOR PUT", "VECTOR ACC",
  "NBGET", "NBPUT", "NBACC",
  "STRIDED NBGET", "STRIDED NBPUT", "STRIDED NBACC",
  "VECTOR NBGET", "VECTOR NBPUT", "VECTOR NBACC",
  "BARRIER","ARMCI_WAIT","NOTIFY_WAIT",
  "FENCE", "ALLFENCE", "RMW"
};
111 
/* Accumulated profile for one (event type, message range) pair. */
typedef struct armci_profile {
  int count;          /* number of times called (overlapping starts are not counted) */
  double time;  /* total execution time for "count" calls */
#if ARMCI_PRINT_STRIDE
  /* Per-call detail records, STRIDE_COUNT entries each. Allocated in
   * armci_profile_init(): `stride` only for strided events, `vector` only
   * for vector events; the other pointer is NULL. */
  armci_stride_t *stride;
  armci_vector_t *vector;
#endif
}armci_profile_t;
120 
/* Profile table: one armci_profile_t per event type and message range
 * (ARMCI_MAX_MSG_RANGE ranges). Events started through
 * armci_profile_start() only ever use range 0. */
static armci_profile_t ARMCI_PROF[ARMCI_EVENTS][ARMCI_MAX_MSG_RANGE];
123 
/* Current event: the single event being timed at the moment. */
struct event_info {
  int is_set;        /* 0: no event open; 1: one event open; >1: one event open
                        plus (is_set-1) overlapping starts that are not timed */
  int event_type;    /* event type of the open event */
  int range;         /* message range of the open event */
  double start_time; /* MP_TIMER() value taken when the event was opened */
} gCURRENT_EVNT;
131 
strided_event(int e)132 static int strided_event(int e) {
133     if (e==ARMCI_PROF_GETS || e==ARMCI_PROF_PUTS || e==ARMCI_PROF_ACCS ||
134 	e==ARMCI_PROF_NBGETS || e==ARMCI_PROF_NBPUTS || e==ARMCI_PROF_NBACCS)
135        return 1;
136     return 0;
137 }
138 
armci_profile_init()139 void armci_profile_init() {
140     int i,j;
141     if(armci_me==0) {printf("\nProfiling ARMCI - ON\n");fflush(stdout);}
142 
143     gCURRENT_EVNT.is_set = 0;
144 
145     for(i=0; i<ARMCI_EVENTS; i++)
146        for(j=0; j<ARMCI_MAX_MSG_RANGE; j++) {
147 	  ARMCI_PROF[i][j].count = 0; ARMCI_PROF[i][j].time = 0.0;
148        }
149 
150 #if ARMCI_PRINT_STRIDE
151     for(i=0; i<ARMCI_EVENTS; i++) {
152        if(strided_event(i))
153 	  for(j=0; j<ARMCI_MAX_MSG_RANGE; j++) {
154 	     ARMCI_PROF[i][j].stride = (armci_stride_t*)malloc(STRIDE_COUNT*sizeof(armci_stride_t));
155 	     ARMCI_PROF[i][j].vector = NULL;
156 	     if( ARMCI_PROF[i][j].stride == NULL)
157 		armci_die("armci_profile_init(): malloc failed", armci_me);
158 	  }
159        if(i==ARMCI_PROF_GETV || i==ARMCI_PROF_PUTV || i==ARMCI_PROF_ACCV ||
160 	  i==ARMCI_PROF_NBGETV || i==ARMCI_PROF_NBPUTV || i==ARMCI_PROF_NBACCV)
161 	  for(j=0; j<ARMCI_MAX_MSG_RANGE; j++) {
162 	     ARMCI_PROF[i][j].vector = (armci_vector_t*)malloc(STRIDE_COUNT*sizeof(armci_vector_t));
163 	     ARMCI_PROF[i][j].stride = NULL;
164 	     if( ARMCI_PROF[i][j].vector == NULL)
165 		armci_die("armci_profile_init(): malloc failed", armci_me);
166 	  }
167     }
168 #endif
169 }
170 
171 #define ARMCI_EVENT_CLOSED     0
172 #define ARMCI_EVENT_NOTCLOSED -1
173 #define ARMCI_EVENT_SET        0
174 #define ARMCI_EVENT_NOTSET    -1
175 
armci_profile_set_event(int event_type,int range)176 static int armci_profile_set_event(int event_type, int range) {
177 #ifdef DEBUG
178     if(SERVER_CONTEXT)
179        printf("\n%d(s):call profile set for %s isset is %d",armci_me,
180                                    gEventName[event_type],gCURRENT_EVNT.is_set);
181     else
182        printf("\n%d:call profile set for %s isset is %d",armci_me,
183                                    gEventName[event_type],gCURRENT_EVNT.is_set);
184     fflush(stdout);
185 #endif
186     if(gCURRENT_EVNT.is_set == 0) { /* set an event */
187        gCURRENT_EVNT.is_set     = 1;
188        gCURRENT_EVNT.event_type = event_type;
189        gCURRENT_EVNT.range      = range;
190        gCURRENT_EVNT.start_time = MP_TIMER();
191        return ARMCI_EVENT_SET;
192     }
193     else gCURRENT_EVNT.is_set++; /* event overlap */
194     return ARMCI_EVENT_NOTSET;
195 }
196 
armci_profile_close_event(int event_type,int range,double * time,char * name)197 static int armci_profile_close_event(int event_type, int range, double *time,
198 				     char *name) {
199 
200     int curr_event = gCURRENT_EVNT.event_type;
201 #ifdef DEBUG
202     if(SERVER_CONTEXT)
203        printf("\n%d(s):call profile close for %s isset is %d",armci_me,
204                                    gEventName[event_type],gCURRENT_EVNT.is_set);
205     else
206        printf("\n%d:call profile close for %s isset is %d",armci_me,
207                                    gEventName[event_type],gCURRENT_EVNT.is_set);
208     fflush(stdout);
209 #endif
210 
211 
212     if(gCURRENT_EVNT.is_set==1) { /* Yep, there is an event set. So close it.*/
213        /*Check if "profile stop" is called for corresponding "profile start"*/
214        if(event_type != curr_event) {
215 	  printf(
216 		  "%d: %s: ERROR:Profile started for %s, but stopped for %s\n",
217 		  armci_me,name,gEventName[curr_event],gEventName[event_type]);
218           fflush(stdout);
219 	  armci_die("Profile_stop is called a different event", armci_me);
220        }
221 
222        *time = MP_TIMER() - gCURRENT_EVNT.start_time;
223        ARMCI_PROF[curr_event][range].time += *time;
224        gCURRENT_EVNT.is_set = 0; /* close the event */
225        return ARMCI_EVENT_CLOSED;
226     }
227     else { /* event overlapping */
228        gCURRENT_EVNT.is_set--;
229        if(gCURRENT_EVNT.is_set<=0) {
230 	  char *msg="Profile_stop is called before profile_start";
231 	    printf("%d: %s: ERROR: %s. Event Name = %s\n", armci_me,
232 		    name, msg, gEventName[curr_event]);
233             fflush(stdout);
234 	    armci_die(" profile_stop is called before profile_start", armci_me);
235        }
236     }
237     return ARMCI_EVENT_NOTCLOSED;
238 }
239 
armci_profile_start_strided(int count[],int stride_levels,int proc,int event_type)240 void armci_profile_start_strided(int count[], int stride_levels, int proc,
241 				 int event_type) {
242     int i, status, bytes=1, range;
243 
244     if(stride_levels >= ARMCI_MAX_DIM)
245        armci_die("ARMCI_PROFILE: stride_levels >= ARMCI_MAX_DIM. Increase ARMCI_MAX_DIM.", armci_me);
246 
247     /* find the message range */
248     for(i=0; i<= stride_levels; i++)  bytes *= count[i];
249     if(bytes<=0) range=0;
250     else range = (int) (log((double)bytes)/log(2.0));
251     if(range>=ARMCI_MAX_MSG_RANGE-1) range = ARMCI_MAX_MSG_RANGE-1;
252 
253     /* set the curent event for timer */
254     status = armci_profile_set_event(event_type, range);
255 
256     if(status == ARMCI_EVENT_SET) { /* new event set */
257        /* profile update: i.e. update event count */
258        ARMCI_PROF[event_type][range].count++;
259 
260 #      if ARMCI_PRINT_STRIDE
261           if(strided_event(event_type)) {
262 	     int idx = ARMCI_PROF[event_type][range].count-1;
263 	     if(idx<STRIDE_COUNT) {
264 		ARMCI_PROF[event_type][range].stride[idx].stride_levels = stride_levels;
265 		ARMCI_PROF[event_type][range].stride[idx].proc = proc;
266 		for(i=0;i<=stride_levels;i++) {
267 		   ARMCI_PROF[event_type][range].stride[idx].count[i]=count[i];
268 		}
269 	     }
270 	  }
271 #      endif
272     }
273     else { /* Do nothing. It is just an event overlap */ }
274 }
275 
armci_profile_stop_strided(int event_type)276 void armci_profile_stop_strided(int event_type) {
277     double time;
278     int status, range = gCURRENT_EVNT.range;
279 
280     status = armci_profile_close_event(event_type, range, &time,
281 				       "armci_profile_stop_strided");
282 
283 #if ARMCI_PRINT_STRIDE
284     if(status == ARMCI_EVENT_CLOSED) {
285        /* record the time of each strided data transfer */
286        if(strided_event(event_type)) {
287 	  int idx = ARMCI_PROF[event_type][range].count-1;
288 	  if(idx<STRIDE_COUNT)
289 	     ARMCI_PROF[event_type][range].stride[idx].time = time;
290        }
291     }
292 #endif
293 }
294 
/* Starts profiling a vector transfer.
 *
 * darr        array of `len` vector descriptors
 * len         number of descriptors in darr
 * proc        proc argument of the transfer (recorded for the report)
 * event_type  event id; used as the first index into ARMCI_PROF
 *
 * The sum of darr[i].bytes selects range r with 2^r <= bytes < 2^(r+1);
 * sums of 0 or 1 (or non-positive) use range 0 and everything at or above
 * 2^(ARMCI_MAX_MSG_RANGE-1) uses the last range. If no other event is open
 * the call is counted and, with ARMCI_PRINT_STRIDE enabled, the first
 * STRIDE_COUNT calls per range also record len, proc and a copy of each
 * descriptor's ptr_array_len/bytes. Calls armci_die() if that copy cannot
 * be allocated. Overlapping starts are ignored.
 */
void armci_profile_start_vector(armci_giov_t darr[], int len, int proc,
				int event_type) {

    int i, range, status;
    /* Accumulate in double: the int sum can overflow (undefined behaviour)
       for large transfers. */
    double bytes = 0.0;

    /* find the message range */
    for(i=0; i<len; i++) bytes += (double)darr[i].bytes;

    /* floor(log2(bytes)) clamped to the last range, computed by comparing
       against exact powers of two instead of log(x)/log(2.0), whose
       rounding can put an exact power of two in the range below. */
    range = 0;
    if(bytes >= 1.0) {
       double upper = 2.0; /* first size that no longer fits in "range" */
       while(range < ARMCI_MAX_MSG_RANGE-1 && bytes >= upper) {
	  range++;
	  upper *= 2.0;
       }
    }

    /* set the current event for timer */
    status = armci_profile_set_event(event_type, range);

    if(status == ARMCI_EVENT_SET) { /* new event set */
       /* profile update: i.e. update event count */
       ARMCI_PROF[event_type][range].count++;

#      if ARMCI_PRINT_STRIDE
       {
	  int idx = ARMCI_PROF[event_type][range].count-1;
	  if(idx<STRIDE_COUNT) { /* only the first STRIDE_COUNT calls are detailed */
	     armci_vector_t *rec = &ARMCI_PROF[event_type][range].vector[idx];
	     rec->vec_len = len;
	     rec->proc = proc;
	     rec->giov = NULL;
	     if(len > 0) {
		/* malloc(0) may legitimately return NULL, so only a failed
		   non-empty allocation is treated as an error */
		rec->giov = (giov_t*)malloc(len*sizeof(giov_t));
		if(rec->giov == NULL)
		   armci_die("armci_profile_start_vector(): malloc failed", armci_me);
	     }
	     for(i=0;i<len;i++) {
		rec->giov[i].ptr_array_len = darr[i].ptr_array_len;
		rec->giov[i].bytes = darr[i].bytes;
	     }
	  }
       }
#      endif
    }
}
331 
armci_profile_stop_vector(int event_type)332 void armci_profile_stop_vector(int event_type) {
333     double time;
334     int status, range = gCURRENT_EVNT.range;
335 
336     status = armci_profile_close_event(event_type, range, &time,
337 				       "armci_profile_stop_vector");
338 
339 #if ARMCI_PRINT_STRIDE
340     if(status == ARMCI_EVENT_CLOSED) {/*record time of each data transfer*/
341        int idx = ARMCI_PROF[event_type][range].count-1;
342        if(idx<STRIDE_COUNT)
343 	  ARMCI_PROF[event_type][range].vector[idx].time = time;
344     }
345 #endif
346 }
347 
armci_profile_start(int event_type)348 void armci_profile_start(int event_type) {
349     int range, status;
350 
351     /* message range is zero for events registered using this call */
352     range=0;
353 
354     /* set the curent event for timer */
355     status = armci_profile_set_event(event_type, range);
356     if(status == ARMCI_EVENT_SET) { /* new event set */
357        /* profile update: i.e. update event count */
358        ARMCI_PROF[event_type][range].count++;
359     }
360 }
361 
armci_profile_stop(int event_type)362 void armci_profile_stop(int event_type) {
363     double time;
364     int status,range = gCURRENT_EVNT.range;
365     status = armci_profile_close_event(event_type, range, &time,
366 				       "armci_profile_stop");
367 }
368 
369 #define ARMCI_HDR0(fp) fprintf(fp, "\n\n************** TOTAL DATA TRANSFERS **************\n\n");
370 #define ARMCI_HDR1(fp) fprintf(fp, "\n\n************ CONTIGUOUS DATA TRANSFER ************\n\n");
371 #define ARMCI_HDR2(fp) fprintf(fp, "\n\n********** NON-CONTIGUOUS DATA TRANSFER **********\n\n");
372 #define ARMCI_HDR3(fp) fprintf(fp, "#gets\t #puts\t #accs\t get_time   put_time   acc_time   RANGE(bytes)\n\n");
373 #define ARMCI_HDR4(fp) fprintf(fp, "SL#\tndim\t proc\t time      stride_info\n\n");
374 #define ARMCI_HDR5(fp) fprintf(fp, "SL#\tnvec\t proc\t time\t    [ #arrays\t bytes\t]\n");
375 #define ARMCI_HDR6(fp) fprintf(fp, "\n\n****** NON-BLOCKING CONTIGUOUS DATA TRANSFER *****\n\n");
376 #define ARMCI_HDR7(fp) fprintf(fp, "\n\n*** NON-BLOCKING NON-CONTIGUOUS DATA TRANSFER ****\n\n");
377 #define ARMCI_HDR8(fp) fprintf(fp, "#gets\t #puts\t #accs\t get_time   put_time   acc_time   RANGE(bytes)\n\n");
378 #define ARMCI_HDR9(fp) fprintf(fp, "\n\n******************* ARMCI MISC *******************\n\n");
379 
380 /* print profile of all get/put/acc calls for every message range */
armci_print_all(FILE * fp)381 static void armci_print_all(FILE *fp) {
382     int i, nget, nput, nacc, nrange=ARMCI_MAX_MSG_RANGE;
383     double gtime, ptime, atime;
384 
385     ARMCI_HDR0(fp); ARMCI_HDR3(fp);
386     for(i=0; i< nrange; i++) {
387 
388        nget =(ARMCI_PROF[ARMCI_PROF_GET][i].count +
389 	      ARMCI_PROF[ARMCI_PROF_GETS][i].count +
390 	      ARMCI_PROF[ARMCI_PROF_GETV][i].count +
391 	      ARMCI_PROF[ARMCI_PROF_NBGET][i].count +
392 	      ARMCI_PROF[ARMCI_PROF_NBGETS][i].count +
393 	      ARMCI_PROF[ARMCI_PROF_NBGETV][i].count);
394        nput =(ARMCI_PROF[ARMCI_PROF_PUT][i].count +
395 	      ARMCI_PROF[ARMCI_PROF_PUTS][i].count +
396 	      ARMCI_PROF[ARMCI_PROF_PUTV][i].count +
397 	      ARMCI_PROF[ARMCI_PROF_NBPUT][i].count +
398               ARMCI_PROF[ARMCI_PROF_NBPUTS][i].count +
399 	      ARMCI_PROF[ARMCI_PROF_NBPUTV][i].count);
400        nacc =(ARMCI_PROF[ARMCI_PROF_ACC][i].count +
401 	      ARMCI_PROF[ARMCI_PROF_ACCS][i].count +
402 	      ARMCI_PROF[ARMCI_PROF_ACCV][i].count +
403 	      ARMCI_PROF[ARMCI_PROF_NBACC][i].count +
404               ARMCI_PROF[ARMCI_PROF_NBACCS][i].count +
405 	      ARMCI_PROF[ARMCI_PROF_NBACCV][i].count);
406 
407        gtime = (ARMCI_PROF[ARMCI_PROF_GET][i].time +
408 		ARMCI_PROF[ARMCI_PROF_GETS][i].time +
409 		ARMCI_PROF[ARMCI_PROF_GETV][i].time +
410 		ARMCI_PROF[ARMCI_PROF_NBGET][i].time +
411 		ARMCI_PROF[ARMCI_PROF_NBGETS][i].time +
412 		ARMCI_PROF[ARMCI_PROF_NBGETV][i].time);
413        ptime = (ARMCI_PROF[ARMCI_PROF_PUT][i].time +
414 		ARMCI_PROF[ARMCI_PROF_PUTS][i].time +
415 		ARMCI_PROF[ARMCI_PROF_PUTV][i].time +
416 		ARMCI_PROF[ARMCI_PROF_NBPUT][i].time +
417 		ARMCI_PROF[ARMCI_PROF_NBPUTS][i].time +
418 		ARMCI_PROF[ARMCI_PROF_NBPUTV][i].time);
419        atime = (ARMCI_PROF[ARMCI_PROF_ACC][i].time +
420 		ARMCI_PROF[ARMCI_PROF_ACCS][i].time +
421 		ARMCI_PROF[ARMCI_PROF_ACCV][i].time +
422 		ARMCI_PROF[ARMCI_PROF_NBACC][i].time +
423                 ARMCI_PROF[ARMCI_PROF_NBACCS][i].time +
424 		ARMCI_PROF[ARMCI_PROF_NBACCV][i].time);
425 
426        fprintf(fp, "%d\t %d\t %d\t %.2e   %.2e   %.2e  ",
427                nget, nput, nacc,  gtime, ptime, atime);
428        if (i< nrange-1) fprintf(fp, "(%d-%d)\n", 1<<i, (1<<(i+1))-1);
429        else fprintf(fp, "(>=%d)\n", 1<<(ARMCI_MAX_MSG_RANGE-1));
430     }
431 }
432 
433 /* print profile of contiguous get/put/acc calls for every message range */
armci_print_contig(FILE * fp)434 static void armci_print_contig(FILE *fp) {
435     int i, nrange=ARMCI_MAX_MSG_RANGE;
436     ARMCI_HDR1(fp); ARMCI_HDR3(fp);
437     for(i=0; i< nrange; i++) {
438        fprintf(fp, "%d\t %d\t %d\t %.2e   %.2e   %.2e  ",
439 	       ARMCI_PROF[ARMCI_PROF_GET][i].count,
440 	       ARMCI_PROF[ARMCI_PROF_PUT][i].count,
441 	       ARMCI_PROF[ARMCI_PROF_ACC][i].count,
442 	       ARMCI_PROF[ARMCI_PROF_GET][i].time,
443 	       ARMCI_PROF[ARMCI_PROF_PUT][i].time,
444 	       ARMCI_PROF[ARMCI_PROF_ACC][i].time);
445        if(i< nrange-1) fprintf(fp, "(%d-%d)\n", 1<<i, (1<<(i+1))-1);
446        else fprintf(fp, "(>=%d)\n", 1<<(ARMCI_MAX_MSG_RANGE-1));
447     }
448 }
449 
450 /* This prints the number of non-contiguous get/put/acc/ calls for every
451    message range */
armci_print_noncontig(FILE * fp)452 static void armci_print_noncontig(FILE *fp) {
453     int i, nget, nput, nacc, nrange=ARMCI_MAX_MSG_RANGE;
454     double gtime, ptime, atime;
455 
456     ARMCI_HDR2(fp); ARMCI_HDR3(fp);
457     for(i=0; i< nrange; i++) {
458        nget = (ARMCI_PROF[ARMCI_PROF_GETS][i].count +
459 	       ARMCI_PROF[ARMCI_PROF_GETV][i].count);
460        nput = (ARMCI_PROF[ARMCI_PROF_PUTS][i].count +
461 	       ARMCI_PROF[ARMCI_PROF_PUTV][i].count);
462        nacc = (ARMCI_PROF[ARMCI_PROF_ACCS][i].count +
463 	       ARMCI_PROF[ARMCI_PROF_ACCV][i].count);
464        gtime = (ARMCI_PROF[ARMCI_PROF_GETS][i].time +
465 		ARMCI_PROF[ARMCI_PROF_GETV][i].time);
466        ptime = (ARMCI_PROF[ARMCI_PROF_PUTS][i].time +
467 		ARMCI_PROF[ARMCI_PROF_PUTV][i].time);
468        atime = (ARMCI_PROF[ARMCI_PROF_ACCS][i].time +
469 		ARMCI_PROF[ARMCI_PROF_ACCV][i].time);
470 
471        fprintf(fp, "%d\t %d\t %d\t %.2e   %.2e   %.2e  ",
472 	       nget, nput, nacc,  gtime, ptime, atime);
473        if (i< nrange-1) fprintf(fp, "(%d-%d)\n", 1<<i, (1<<(i+1))-1);
474        else fprintf(fp, "(>=%d)\n", 1<<(ARMCI_MAX_MSG_RANGE-1));
475     }
476 }
477 
478 /* print profile of non-blocking contiguous get/put/acc calls for every
479    message range */
armci_print_nbcontig(FILE * fp)480 static void armci_print_nbcontig(FILE *fp) {
481     int i, nrange=ARMCI_MAX_MSG_RANGE;
482     ARMCI_HDR6(fp); ARMCI_HDR8(fp);
483     for(i=0; i< nrange; i++) {
484        fprintf(fp, "%d\t %d\t %d\t %.2e   %.2e   %.2e  ",
485 	       ARMCI_PROF[ARMCI_PROF_NBGET][i].count,
486 	       ARMCI_PROF[ARMCI_PROF_NBPUT][i].count,
487 	       ARMCI_PROF[ARMCI_PROF_NBACC][i].count,
488 	       ARMCI_PROF[ARMCI_PROF_NBGET][i].time,
489 	       ARMCI_PROF[ARMCI_PROF_NBPUT][i].time,
490 	       ARMCI_PROF[ARMCI_PROF_NBACC][i].time);
491        if(i< nrange-1) fprintf(fp, "(%d-%d)\n", 1<<i, (1<<(i+1))-1);
492        else fprintf(fp, "(>=%d)\n", 1<<(ARMCI_MAX_MSG_RANGE-1));
493     }
494 }
495 
496 /* This prints the number of non-blocking non-contiguous get/put/acc/ calls
497    for every message range */
armci_print_nbnoncontig(FILE * fp)498 static void armci_print_nbnoncontig(FILE *fp) {
499     int i, nget, nput, nacc, nrange=ARMCI_MAX_MSG_RANGE;
500     double gtime, ptime, atime;
501 
502     ARMCI_HDR7(fp); ARMCI_HDR8(fp);
503     for(i=0; i< nrange; i++) {
504        nget = (ARMCI_PROF[ARMCI_PROF_NBGETS][i].count +
505 	       ARMCI_PROF[ARMCI_PROF_NBGETV][i].count);
506        nput = (ARMCI_PROF[ARMCI_PROF_NBPUTS][i].count +
507 	       ARMCI_PROF[ARMCI_PROF_NBPUTV][i].count);
508        nacc = (ARMCI_PROF[ARMCI_PROF_NBACCS][i].count +
509 	       ARMCI_PROF[ARMCI_PROF_NBACCV][i].count);
510        gtime = (ARMCI_PROF[ARMCI_PROF_NBGETS][i].time +
511 		ARMCI_PROF[ARMCI_PROF_NBGETV][i].time);
512        ptime = (ARMCI_PROF[ARMCI_PROF_NBPUTS][i].time +
513 		ARMCI_PROF[ARMCI_PROF_NBPUTV][i].time);
514        atime = (ARMCI_PROF[ARMCI_PROF_NBACCS][i].time +
515 		ARMCI_PROF[ARMCI_PROF_NBACCV][i].time);
516 
517        fprintf(fp, "%d\t %d\t %d\t %.2e   %.2e   %.2e  ",
518 	       nget, nput, nacc,  gtime, ptime, atime);
519        if (i< nrange-1) fprintf(fp, "(%d-%d)\n", 1<<i, (1<<(i+1))-1);
520        else fprintf(fp, "(>=%d)\n", 1<<(ARMCI_MAX_MSG_RANGE-1));
521     }
522 }
523 
524 /* Profile of armci_notify_wait(), ARMCI_Wait() and ARMCI_Barrier() */
armci_print_misc(FILE * fp)525 static void armci_print_misc(FILE *fp) {
526     ARMCI_HDR9(fp);
527     fprintf(fp, "#calls\t time\t   EVENT\n\n");
528     fprintf(fp, "%d\t %.2e  ARMCI_Wait()\n",
529 	    ARMCI_PROF[ARMCI_PROF_WAIT][0].count,
530 	    ARMCI_PROF[ARMCI_PROF_WAIT][0].time);
531     fprintf(fp, "%d\t %.2e  armci_notify_wait()\n",
532 	    ARMCI_PROF[ARMCI_PROF_NOTIFY][0].count,
533 	    ARMCI_PROF[ARMCI_PROF_NOTIFY][0].time);
534     fprintf(fp, "%d\t %.2e  ARMCI_Barrier()\n",
535 	    ARMCI_PROF[ARMCI_PROF_BARRIER][0].count,
536 	    ARMCI_PROF[ARMCI_PROF_BARRIER][0].time);
537     fprintf(fp, "%d\t %.2e  ARMCI_Fence()\n",
538 	    ARMCI_PROF[ARMCI_PROF_FENCE][0].count,
539 	    ARMCI_PROF[ARMCI_PROF_FENCE][0].time);
540     fprintf(fp, "%d\t %.2e  ARMCI_Allfence()\n",
541 	    ARMCI_PROF[ARMCI_PROF_ALLFENCE][0].count,
542 	    ARMCI_PROF[ARMCI_PROF_ALLFENCE][0].time);
543     fprintf(fp, "%d\t %.2e  ARMCI_Rmw()\n",
544 	    ARMCI_PROF[ARMCI_PROF_RMW][0].count,
545 	    ARMCI_PROF[ARMCI_PROF_RMW][0].time);
546 }
547 
548 #if ARMCI_PRINT_STRIDE
armci_print_warning_msg(FILE * fp,int range,int str_count)549 static void armci_print_warning_msg(FILE *fp, int range, int str_count) {
550     fprintf(fp, "WARNING: In your program, total number of data transfers\n");
551     fprintf(fp, "for message range[%d - %d] is %d. This exceeds\n",
552 	    1<<range, 1<<(range+1), str_count);
553     fprintf(fp,"the maximum # of data transfers [%d] that can be profiled.\n",
554 	    STRIDE_COUNT);
555     fprintf(fp, "Therefore profile of only first %d data \n", STRIDE_COUNT);
556     fprintf(fp, "transfers are shown below. To increase the count, set\n");
557     fprintf(fp, "STRIDE_COUNT > %d (in armci_profile.c)\n", str_count);
558 }
559 
armci_print_stridedinfo(FILE * fp,int event,int range)560 static void armci_print_stridedinfo(FILE *fp, int event, int range) {
561     int i, j, stride_levels, str_count;
562     double time=0.0;
563 
564     str_count = ARMCI_PROF[event][range].count;
565     if(str_count <=0) return;
566     if(str_count > STRIDE_COUNT) {
567        armci_print_warning_msg(fp, range, str_count);
568        str_count = STRIDE_COUNT;
569     }
570 
571     fprintf(fp, "\n\nSTRIDE INFORMATION FOR MSG_RANGE %d-%d for EVENT: %s\n",
572 	    1<<range, (1<<(range+1))-1, gEventName[event]);
573     ARMCI_HDR4(fp);
574 
575     for(i=0; i< str_count; i++) {
576        time += ARMCI_PROF[event][range].stride[i].time;
577        stride_levels  = ARMCI_PROF[event][range].stride[i].stride_levels;
578        fprintf(fp, "%d\t%d\t %d\t %.2e  (",i, stride_levels,
579 	       ARMCI_PROF[event][range].stride[i].proc,
580 	       ARMCI_PROF[event][range].stride[i].time);
581        for(j=0;j<=stride_levels;j++) {
582 	  fprintf(fp, "%d", ARMCI_PROF[event][range].stride[i].count[j]);
583 	  if(j!=stride_levels) fprintf(fp, "x");
584        }
585        fprintf(fp, ")\n");
586     }
587     /*This o/p is just for verification*/
588     fprintf(fp, "**** STRIDE_COUNT = %d ; TOTAL TIME = %.2e\n",
589 	    str_count, time);
590 }
591 
armci_print_vectorinfo(FILE * fp,int event,int range)592 static void armci_print_vectorinfo(FILE *fp, int event, int range) {
593     int i, j, vec_len, str_count;
594     double time=0.0;
595 
596     str_count = ARMCI_PROF[event][range].count;
597     if(str_count <=0) return;
598     if(str_count > STRIDE_COUNT) {
599        armci_print_warning_msg(fp, range, str_count);
600        str_count = STRIDE_COUNT;
601     }
602 
603     fprintf(fp, "\n\nVECTOR INFORMATION FOR MSG_RANGE %d-%d for EVENT: %s\n",
604 	    1<<range, (1<<(range+1))-1, gEventName[event]);
605     ARMCI_HDR5(fp);
606 
607     for(i=0; i< str_count; i++) {
608        time += ARMCI_PROF[event][range].vector[i].time;
609        vec_len  = ARMCI_PROF[event][range].vector[i].vec_len;
610        fprintf(fp, "%d\t%d\t %d\t %.2e   [  ",i, vec_len,
611 	       ARMCI_PROF[event][range].vector[i].proc,
612 	       ARMCI_PROF[event][range].vector[i].time);
613        for(j=0;j<vec_len;j++) {
614 	  fprintf(fp, "%-9d %d\t]\n",
615 		  ARMCI_PROF[event][range].vector[i].giov[j].ptr_array_len,
616 		  ARMCI_PROF[event][range].vector[i].giov[j].bytes);
617 	  if(j!=vec_len-1) fprintf(fp, "\t\t\t\t    [  ");
618        }
619     }
620     /*This o/p is just for verification*/
621     fprintf(fp, "**** STRIDE_COUNT = %d ; TOTAL TIME = %.2e\n",
622 	    str_count, time);
623 }
624 #endif /* end of ARMCI_PRINT_STRIDE */
625 
armci_profile_terminate()626 void armci_profile_terminate() {
627     FILE *fp = stdout;
628     char file_name[50];
629     sprintf(file_name, "armci_profile.%d", armci_me);
630     fp = fopen(file_name, "w");
631 
632     armci_print_all(fp);         /* total get/put/acc calls */
633     armci_print_contig(fp);      /* contiguous calls */
634     armci_print_noncontig(fp);   /* non-contiguous calls */
635     armci_print_nbcontig(fp);    /* non-blocking contiguous calls */
636     armci_print_nbnoncontig(fp); /* non-blocking non-contiguous calls */
637 
638     /* miscellaneous (barrier, armci_wait, notify_wait) */
639     armci_print_misc(fp);
640 
641 #if ARMCI_PRINT_STRIDE
642     {
643        /**
644 	* printing stride info for non-contiguous get (ARMCI_PROF_GETS) for message
645 	* range #6. 2^6 - 2^(6+1) bytes (i.e. 64-128 bytes)
646 	*    Ex: armci_print_stridedinfo(ARMCI_PROF_GETS,6);
647  	*/
648 #define ARMCI_PRINT_EVENTS 6
649        int i,j;
650        int str_event[ARMCI_PRINT_EVENTS]={ARMCI_PROF_GETS, ARMCI_PROF_PUTS,
651 					  ARMCI_PROF_ACCS, ARMCI_PROF_NBGETS,
652 					  ARMCI_PROF_NBPUTS,ARMCI_PROF_NBACCS};
653        int vec_event[ARMCI_PRINT_EVENTS]={ARMCI_PROF_GETV, ARMCI_PROF_PUTV,
654 					  ARMCI_PROF_ACCV, ARMCI_PROF_NBGETV,
655 					  ARMCI_PROF_NBPUTV,ARMCI_PROF_NBACCV};
656 
657        fprintf(fp,"\n\n***************************************************\n");
658        fprintf(fp,    " STRIDE INFORMATION for all strided data transfers\n");
659        fprintf(fp,    "***************************************************\n");
660        for(i=0; i<ARMCI_MAX_MSG_RANGE; i++)
661 	  for(j=0; j<ARMCI_PRINT_EVENTS; j++)
662 	     armci_print_stridedinfo(fp,str_event[j], i);
663 
664        fprintf(fp,"\n\n**************************************************\n");
665        fprintf(fp,    " VECTOR INFORMATION for all vector data transfers\n");
666        fprintf(fp,    "**************************************************\n");
667        for(i=0; i<ARMCI_MAX_MSG_RANGE; i++)
668 	  for(j=0; j<ARMCI_PRINT_EVENTS; j++)
669 	     armci_print_vectorinfo(fp,vec_event[j], i);
670     }
671 #endif
672     fclose(fp);
673 }
674