1 #if HAVE_CONFIG_H
2 # include "config.h"
3 #endif
4
5 /* $Id: armci_profile.c,v 1.8 2005-11-30 10:20:53 vinod Exp $ */
6
7 /**
8 * Profiler can profile the following ARMCI Calls:
9 * ARMCI_Get,ARMCI_Put,ARMCI_Acc,ARMCI_NbGet,ARMCI_NbPut,ARMCI_NbAcc,
10 * ARMCI_GetS,ARMCI_PutS,ARMCI_AccS,ARMCI_NbGetS,ARMCI_NbPutS,ARMCI_NbAccS,
11 * ARMCI_GetV,ARMCI_PutV,ARMCI_AccV,ARMCI_NbGetV,ARMCI_NbPutV,ARMCI_NbAccV,
12 * ARMCI_Wait, armci_wait_notify
13 * (NOTE: As armci_notify is same as ARMCI_Put, it is not profiled.)
14 *
15 *
 * Note #1: Only process 0 prints the "Profiling ARMCI - ON" banner.
 * Each and every process saves its profile in the corresponding data structure.
 * Each process prints its profile to an output file armci_profile.<myrank>
 * when armci_profile_terminate() is called (called in ARMCI_Finalize()).
20 *
21 * Note #2: By default profiler prints msg ranges 0 to 21. Example: range 10
22 * corresponds to message ranges from 1024 bytes to 2047 bytes.
23 * Message ranges are in the power of 2. for ex:
24 * ------------------------------------
25 * MSG_RANGE (r) BYTES (2^r to 2^(r+1)-1)
26 * ------------------------------------
27 * 0 0-1
28 * 1 2-3
29 * 2 4-7
30 * ... ...
31 * 10 1024-2047 bytes
32 * ... ...
33 * 20 1MB - (2MB-1)
34 * 21 >= 2MB
35 * -------------------------------------
36 * To increase the message range, set ARMCI_MAX_MSG_RANGE accordingly.
37 *
 * Note #3: If stride information needs to be printed, set ARMCI_PRINT_STRIDE.
 * Stride information is printed in armci_profile_terminate() for the
 * selected message ranges and event types. Modify it according to your needs.
41 *
42 * Note #4: There is no profiling support for non-blocking operations yet!!
43 */
44 #define DEBUG_
45 #if HAVE_STDIO_H
46 # include <stdio.h>
47 #endif
48 #if HAVE_STDLIB_H
49 # include <stdlib.h>
50 #endif
51 #if HAVE_STRING_H
52 # include <string.h>
53 #endif
54 #if HAVE_MATH_H
55 # include <math.h>
56 #endif
57 #include "armci.h"
58 #include "armcip.h"
59 #include "armci_profile.h"
60
61 #ifndef MSG_COMMS_MPI
62 # include "tcgmsg.h"
63 # define MP_TIMER tcg_time
64 #else
65 # include "mpi.h"
66 # define MP_TIMER MPI_Wtime
67 #endif
68
69
70 #define ARMCI_PRINT_STRIDE 1
71 #define ARMCI_MAX_MSG_RANGE 22 /* 0 to 21 */
72
73 #if ARMCI_PRINT_STRIDE
74
75 # define STRIDE_COUNT 1000
76 # define ARMCI_MAX_DIM 7
77
/* One recorded strided transfer (filled in by armci_profile_start_strided()
 * and completed by armci_profile_stop_strided()). */
typedef struct armci_stride {
    int stride_levels;          /* stride_levels argument of the profiled call */
    int proc;                   /* proc argument of the profiled call */
    int count[ARMCI_MAX_DIM];   /* copy of count[0..stride_levels] */
    double time;                /* elapsed time, stored when the event is closed */
}armci_stride_t;
84
/* Per-descriptor summary of a vector transfer: only the two integer fields
 * of the caller's armci_giov_t entry are copied, not its pointers. */
typedef struct giov {
    int ptr_array_len;  /* copied from armci_giov_t.ptr_array_len */
    int bytes;          /* copied from armci_giov_t.bytes */
}giov_t;
89
/* One recorded vector transfer (filled in by armci_profile_start_vector()
 * and completed by armci_profile_stop_vector()). */
typedef struct armci_vector {
    int vec_len;    /* number of descriptors (len argument of the call) */
    int proc;       /* proc argument of the profiled call */
    giov_t *giov;   /* malloc'ed array with one entry per descriptor */
    double time;    /* elapsed time, stored when the event is closed */
}armci_vector_t;
96
97 #endif
98
99 #define ARMCI_EVENTS 24
100
/* Printable event names, indexed by event type (the same index that selects
 * the first dimension of ARMCI_PROF).
 * NOTE(review): the order presumably has to match the ARMCI_PROF_* values
 * declared in armci_profile.h -- confirm when adding events. */
char *gEventName[ARMCI_EVENTS]={
    "GET", "PUT", "ACC",
    "STRIDED GET", "STRIDED PUT", "STRIDED ACC",
    "VECTOR GET", "VECTOR PUT", "VECTOR ACC",
    "NBGET", "NBPUT", "NBACC",
    "STRIDED NBGET", "STRIDED NBPUT", "STRIDED NBACC",
    "VECTOR NBGET", "VECTOR NBPUT", "VECTOR NBACC",
    "BARRIER","ARMCI_WAIT","NOTIFY_WAIT",
    "FENCE", "ALLFENCE", "RMW"
};
111
/* Accumulated profile for one (event type, message range) pair. */
typedef struct armci_profile {
    int count;                /* number of times called */
    double time;              /* total execution time for "count" calls */
#if ARMCI_PRINT_STRIDE
    /* Per-call records, STRIDE_COUNT entries each. armci_profile_init()
     * allocates "stride" for strided events and "vector" for vector events
     * and sets the other pointer to NULL; other events are left untouched. */
    armci_stride_t *stride;
    armci_vector_t *vector;
#endif
}armci_profile_t;
120
/* profile get/put/acc for various message ranges (i.e ARMCI_MAX_MSG_RANGE).
 * First index: event type; second index: message range (events started via
 * armci_profile_start() always use range 0). */
static armci_profile_t ARMCI_PROF[ARMCI_EVENTS][ARMCI_MAX_MSG_RANGE];
123
/* Current event. Only the outermost start/stop pair is timed; nested
 * (overlapping) starts merely bump is_set. */
struct event_info {
    int is_set;         /* 0: no event open; 1: one open; >1: overlapping starts */
    int event_type;     /* type of the outermost open event */
    int range;          /* message range of the outermost open event */
    double start_time;  /* MP_TIMER() value taken when the event was opened */
} gCURRENT_EVNT;
131
strided_event(int e)132 static int strided_event(int e) {
133 if (e==ARMCI_PROF_GETS || e==ARMCI_PROF_PUTS || e==ARMCI_PROF_ACCS ||
134 e==ARMCI_PROF_NBGETS || e==ARMCI_PROF_NBPUTS || e==ARMCI_PROF_NBACCS)
135 return 1;
136 return 0;
137 }
138
armci_profile_init()139 void armci_profile_init() {
140 int i,j;
141 if(armci_me==0) {printf("\nProfiling ARMCI - ON\n");fflush(stdout);}
142
143 gCURRENT_EVNT.is_set = 0;
144
145 for(i=0; i<ARMCI_EVENTS; i++)
146 for(j=0; j<ARMCI_MAX_MSG_RANGE; j++) {
147 ARMCI_PROF[i][j].count = 0; ARMCI_PROF[i][j].time = 0.0;
148 }
149
150 #if ARMCI_PRINT_STRIDE
151 for(i=0; i<ARMCI_EVENTS; i++) {
152 if(strided_event(i))
153 for(j=0; j<ARMCI_MAX_MSG_RANGE; j++) {
154 ARMCI_PROF[i][j].stride = (armci_stride_t*)malloc(STRIDE_COUNT*sizeof(armci_stride_t));
155 ARMCI_PROF[i][j].vector = NULL;
156 if( ARMCI_PROF[i][j].stride == NULL)
157 armci_die("armci_profile_init(): malloc failed", armci_me);
158 }
159 if(i==ARMCI_PROF_GETV || i==ARMCI_PROF_PUTV || i==ARMCI_PROF_ACCV ||
160 i==ARMCI_PROF_NBGETV || i==ARMCI_PROF_NBPUTV || i==ARMCI_PROF_NBACCV)
161 for(j=0; j<ARMCI_MAX_MSG_RANGE; j++) {
162 ARMCI_PROF[i][j].vector = (armci_vector_t*)malloc(STRIDE_COUNT*sizeof(armci_vector_t));
163 ARMCI_PROF[i][j].stride = NULL;
164 if( ARMCI_PROF[i][j].vector == NULL)
165 armci_die("armci_profile_init(): malloc failed", armci_me);
166 }
167 }
168 #endif
169 }
170
171 #define ARMCI_EVENT_CLOSED 0
172 #define ARMCI_EVENT_NOTCLOSED -1
173 #define ARMCI_EVENT_SET 0
174 #define ARMCI_EVENT_NOTSET -1
175
armci_profile_set_event(int event_type,int range)176 static int armci_profile_set_event(int event_type, int range) {
177 #ifdef DEBUG
178 if(SERVER_CONTEXT)
179 printf("\n%d(s):call profile set for %s isset is %d",armci_me,
180 gEventName[event_type],gCURRENT_EVNT.is_set);
181 else
182 printf("\n%d:call profile set for %s isset is %d",armci_me,
183 gEventName[event_type],gCURRENT_EVNT.is_set);
184 fflush(stdout);
185 #endif
186 if(gCURRENT_EVNT.is_set == 0) { /* set an event */
187 gCURRENT_EVNT.is_set = 1;
188 gCURRENT_EVNT.event_type = event_type;
189 gCURRENT_EVNT.range = range;
190 gCURRENT_EVNT.start_time = MP_TIMER();
191 return ARMCI_EVENT_SET;
192 }
193 else gCURRENT_EVNT.is_set++; /* event overlap */
194 return ARMCI_EVENT_NOTSET;
195 }
196
armci_profile_close_event(int event_type,int range,double * time,char * name)197 static int armci_profile_close_event(int event_type, int range, double *time,
198 char *name) {
199
200 int curr_event = gCURRENT_EVNT.event_type;
201 #ifdef DEBUG
202 if(SERVER_CONTEXT)
203 printf("\n%d(s):call profile close for %s isset is %d",armci_me,
204 gEventName[event_type],gCURRENT_EVNT.is_set);
205 else
206 printf("\n%d:call profile close for %s isset is %d",armci_me,
207 gEventName[event_type],gCURRENT_EVNT.is_set);
208 fflush(stdout);
209 #endif
210
211
212 if(gCURRENT_EVNT.is_set==1) { /* Yep, there is an event set. So close it.*/
213 /*Check if "profile stop" is called for corresponding "profile start"*/
214 if(event_type != curr_event) {
215 printf(
216 "%d: %s: ERROR:Profile started for %s, but stopped for %s\n",
217 armci_me,name,gEventName[curr_event],gEventName[event_type]);
218 fflush(stdout);
219 armci_die("Profile_stop is called a different event", armci_me);
220 }
221
222 *time = MP_TIMER() - gCURRENT_EVNT.start_time;
223 ARMCI_PROF[curr_event][range].time += *time;
224 gCURRENT_EVNT.is_set = 0; /* close the event */
225 return ARMCI_EVENT_CLOSED;
226 }
227 else { /* event overlapping */
228 gCURRENT_EVNT.is_set--;
229 if(gCURRENT_EVNT.is_set<=0) {
230 char *msg="Profile_stop is called before profile_start";
231 printf("%d: %s: ERROR: %s. Event Name = %s\n", armci_me,
232 name, msg, gEventName[curr_event]);
233 fflush(stdout);
234 armci_die(" profile_stop is called before profile_start", armci_me);
235 }
236 }
237 return ARMCI_EVENT_NOTCLOSED;
238 }
239
armci_profile_start_strided(int count[],int stride_levels,int proc,int event_type)240 void armci_profile_start_strided(int count[], int stride_levels, int proc,
241 int event_type) {
242 int i, status, bytes=1, range;
243
244 if(stride_levels >= ARMCI_MAX_DIM)
245 armci_die("ARMCI_PROFILE: stride_levels >= ARMCI_MAX_DIM. Increase ARMCI_MAX_DIM.", armci_me);
246
247 /* find the message range */
248 for(i=0; i<= stride_levels; i++) bytes *= count[i];
249 if(bytes<=0) range=0;
250 else range = (int) (log((double)bytes)/log(2.0));
251 if(range>=ARMCI_MAX_MSG_RANGE-1) range = ARMCI_MAX_MSG_RANGE-1;
252
253 /* set the curent event for timer */
254 status = armci_profile_set_event(event_type, range);
255
256 if(status == ARMCI_EVENT_SET) { /* new event set */
257 /* profile update: i.e. update event count */
258 ARMCI_PROF[event_type][range].count++;
259
260 # if ARMCI_PRINT_STRIDE
261 if(strided_event(event_type)) {
262 int idx = ARMCI_PROF[event_type][range].count-1;
263 if(idx<STRIDE_COUNT) {
264 ARMCI_PROF[event_type][range].stride[idx].stride_levels = stride_levels;
265 ARMCI_PROF[event_type][range].stride[idx].proc = proc;
266 for(i=0;i<=stride_levels;i++) {
267 ARMCI_PROF[event_type][range].stride[idx].count[i]=count[i];
268 }
269 }
270 }
271 # endif
272 }
273 else { /* Do nothing. It is just an event overlap */ }
274 }
275
armci_profile_stop_strided(int event_type)276 void armci_profile_stop_strided(int event_type) {
277 double time;
278 int status, range = gCURRENT_EVNT.range;
279
280 status = armci_profile_close_event(event_type, range, &time,
281 "armci_profile_stop_strided");
282
283 #if ARMCI_PRINT_STRIDE
284 if(status == ARMCI_EVENT_CLOSED) {
285 /* record the time of each strided data transfer */
286 if(strided_event(event_type)) {
287 int idx = ARMCI_PROF[event_type][range].count-1;
288 if(idx<STRIDE_COUNT)
289 ARMCI_PROF[event_type][range].stride[idx].time = time;
290 }
291 }
292 #endif
293 }
294
/*
 * Start profiling a vector transfer.
 *
 * darr:       array of len descriptors; the sum of darr[i].bytes is taken
 *             as the message size.
 * len:        number of descriptors.
 * proc:       process argument of the profiled call (recorded only).
 * event_type: event being started; indexes ARMCI_PROF and gEventName.
 *
 * The message range is floor(log2(bytes)), clamped to
 * [0, ARMCI_MAX_MSG_RANGE-1]; sizes <= 0 fall into range 0. If this call
 * opens a new event (no overlap), the event counter is incremented and,
 * with ARMCI_PRINT_STRIDE enabled, the first STRIDE_COUNT calls per
 * (event, range) get a copy of each descriptor's ptr_array_len and bytes.
 * The copy is heap-allocated; a failed allocation goes to armci_die().
 */
void armci_profile_start_vector(armci_giov_t darr[], int len, int proc,
                                int event_type) {

    int i, range, status;
    double bytes = 0.0, rem;

    /* Message size. Accumulated in a double so that large transfers cannot
     * overflow a signed int (undefined behavior). */
    for(i=0; i<len; i++) bytes += (double)darr[i].bytes;

    /* range = floor(log2(bytes)), clamped. Halving a double is exact, so
     * exact powers of two always land in the right bin. */
    range = 0;
    rem = bytes;
    while(range < ARMCI_MAX_MSG_RANGE-1 && rem >= 2.0) {
        rem /= 2.0;
        range++;
    }

    /* set the current event for timer */
    status = armci_profile_set_event(event_type, range);

    if(status == ARMCI_EVENT_SET) { /* new event set */
        /* profile update: i.e. update event count */
        ARMCI_PROF[event_type][range].count++;

#  if ARMCI_PRINT_STRIDE
        {
            int idx = ARMCI_PROF[event_type][range].count-1;
            /* only the first STRIDE_COUNT calls are recorded individually */
            if(idx<STRIDE_COUNT) {
                armci_vector_t *rec = &ARMCI_PROF[event_type][range].vector[idx];
                rec->vec_len = len;
                rec->proc = proc;
                rec->giov = NULL;   /* stays NULL for an empty vector */
                if(len > 0) {
                    rec->giov = (giov_t*)malloc((size_t)len*sizeof(giov_t));
                    if(rec->giov == NULL)
                        armci_die("armci_profile_start_vector(): malloc failed", armci_me);
                    for(i=0;i<len;i++) {
                        rec->giov[i].ptr_array_len = darr[i].ptr_array_len;
                        rec->giov[i].bytes = darr[i].bytes;
                    }
                }
            }
        }
#  endif
    }
}
331
armci_profile_stop_vector(int event_type)332 void armci_profile_stop_vector(int event_type) {
333 double time;
334 int status, range = gCURRENT_EVNT.range;
335
336 status = armci_profile_close_event(event_type, range, &time,
337 "armci_profile_stop_vector");
338
339 #if ARMCI_PRINT_STRIDE
340 if(status == ARMCI_EVENT_CLOSED) {/*record time of each data transfer*/
341 int idx = ARMCI_PROF[event_type][range].count-1;
342 if(idx<STRIDE_COUNT)
343 ARMCI_PROF[event_type][range].vector[idx].time = time;
344 }
345 #endif
346 }
347
armci_profile_start(int event_type)348 void armci_profile_start(int event_type) {
349 int range, status;
350
351 /* message range is zero for events registered using this call */
352 range=0;
353
354 /* set the curent event for timer */
355 status = armci_profile_set_event(event_type, range);
356 if(status == ARMCI_EVENT_SET) { /* new event set */
357 /* profile update: i.e. update event count */
358 ARMCI_PROF[event_type][range].count++;
359 }
360 }
361
armci_profile_stop(int event_type)362 void armci_profile_stop(int event_type) {
363 double time;
364 int status,range = gCURRENT_EVNT.range;
365 status = armci_profile_close_event(event_type, range, &time,
366 "armci_profile_stop");
367 }
368
/*
 * Section banners and column headers of the profile report.
 * Each macro expands to a single fprintf() expression WITHOUT a trailing
 * semicolon, so it behaves like an ordinary function call and can be used
 * safely as the body of an unbraced if/else. Callers write the ';'.
 */
#define ARMCI_HDR0(fp) fprintf((fp), "\n\n************** TOTAL DATA TRANSFERS **************\n\n")
#define ARMCI_HDR1(fp) fprintf((fp), "\n\n************ CONTIGUOUS DATA TRANSFER ************\n\n")
#define ARMCI_HDR2(fp) fprintf((fp), "\n\n********** NON-CONTIGUOUS DATA TRANSFER **********\n\n")
#define ARMCI_HDR3(fp) fprintf((fp), "#gets\t #puts\t #accs\t get_time put_time acc_time RANGE(bytes)\n\n")
#define ARMCI_HDR4(fp) fprintf((fp), "SL#\tndim\t proc\t time stride_info\n\n")
#define ARMCI_HDR5(fp) fprintf((fp), "SL#\tnvec\t proc\t time\t [ #arrays\t bytes\t]\n")
#define ARMCI_HDR6(fp) fprintf((fp), "\n\n****** NON-BLOCKING CONTIGUOUS DATA TRANSFER *****\n\n")
#define ARMCI_HDR7(fp) fprintf((fp), "\n\n*** NON-BLOCKING NON-CONTIGUOUS DATA TRANSFER ****\n\n")
#define ARMCI_HDR8(fp) fprintf((fp), "#gets\t #puts\t #accs\t get_time put_time acc_time RANGE(bytes)\n\n")
#define ARMCI_HDR9(fp) fprintf((fp), "\n\n******************* ARMCI MISC *******************\n\n")
379
380 /* print profile of all get/put/acc calls for every message range */
armci_print_all(FILE * fp)381 static void armci_print_all(FILE *fp) {
382 int i, nget, nput, nacc, nrange=ARMCI_MAX_MSG_RANGE;
383 double gtime, ptime, atime;
384
385 ARMCI_HDR0(fp); ARMCI_HDR3(fp);
386 for(i=0; i< nrange; i++) {
387
388 nget =(ARMCI_PROF[ARMCI_PROF_GET][i].count +
389 ARMCI_PROF[ARMCI_PROF_GETS][i].count +
390 ARMCI_PROF[ARMCI_PROF_GETV][i].count +
391 ARMCI_PROF[ARMCI_PROF_NBGET][i].count +
392 ARMCI_PROF[ARMCI_PROF_NBGETS][i].count +
393 ARMCI_PROF[ARMCI_PROF_NBGETV][i].count);
394 nput =(ARMCI_PROF[ARMCI_PROF_PUT][i].count +
395 ARMCI_PROF[ARMCI_PROF_PUTS][i].count +
396 ARMCI_PROF[ARMCI_PROF_PUTV][i].count +
397 ARMCI_PROF[ARMCI_PROF_NBPUT][i].count +
398 ARMCI_PROF[ARMCI_PROF_NBPUTS][i].count +
399 ARMCI_PROF[ARMCI_PROF_NBPUTV][i].count);
400 nacc =(ARMCI_PROF[ARMCI_PROF_ACC][i].count +
401 ARMCI_PROF[ARMCI_PROF_ACCS][i].count +
402 ARMCI_PROF[ARMCI_PROF_ACCV][i].count +
403 ARMCI_PROF[ARMCI_PROF_NBACC][i].count +
404 ARMCI_PROF[ARMCI_PROF_NBACCS][i].count +
405 ARMCI_PROF[ARMCI_PROF_NBACCV][i].count);
406
407 gtime = (ARMCI_PROF[ARMCI_PROF_GET][i].time +
408 ARMCI_PROF[ARMCI_PROF_GETS][i].time +
409 ARMCI_PROF[ARMCI_PROF_GETV][i].time +
410 ARMCI_PROF[ARMCI_PROF_NBGET][i].time +
411 ARMCI_PROF[ARMCI_PROF_NBGETS][i].time +
412 ARMCI_PROF[ARMCI_PROF_NBGETV][i].time);
413 ptime = (ARMCI_PROF[ARMCI_PROF_PUT][i].time +
414 ARMCI_PROF[ARMCI_PROF_PUTS][i].time +
415 ARMCI_PROF[ARMCI_PROF_PUTV][i].time +
416 ARMCI_PROF[ARMCI_PROF_NBPUT][i].time +
417 ARMCI_PROF[ARMCI_PROF_NBPUTS][i].time +
418 ARMCI_PROF[ARMCI_PROF_NBPUTV][i].time);
419 atime = (ARMCI_PROF[ARMCI_PROF_ACC][i].time +
420 ARMCI_PROF[ARMCI_PROF_ACCS][i].time +
421 ARMCI_PROF[ARMCI_PROF_ACCV][i].time +
422 ARMCI_PROF[ARMCI_PROF_NBACC][i].time +
423 ARMCI_PROF[ARMCI_PROF_NBACCS][i].time +
424 ARMCI_PROF[ARMCI_PROF_NBACCV][i].time);
425
426 fprintf(fp, "%d\t %d\t %d\t %.2e %.2e %.2e ",
427 nget, nput, nacc, gtime, ptime, atime);
428 if (i< nrange-1) fprintf(fp, "(%d-%d)\n", 1<<i, (1<<(i+1))-1);
429 else fprintf(fp, "(>=%d)\n", 1<<(ARMCI_MAX_MSG_RANGE-1));
430 }
431 }
432
433 /* print profile of contiguous get/put/acc calls for every message range */
armci_print_contig(FILE * fp)434 static void armci_print_contig(FILE *fp) {
435 int i, nrange=ARMCI_MAX_MSG_RANGE;
436 ARMCI_HDR1(fp); ARMCI_HDR3(fp);
437 for(i=0; i< nrange; i++) {
438 fprintf(fp, "%d\t %d\t %d\t %.2e %.2e %.2e ",
439 ARMCI_PROF[ARMCI_PROF_GET][i].count,
440 ARMCI_PROF[ARMCI_PROF_PUT][i].count,
441 ARMCI_PROF[ARMCI_PROF_ACC][i].count,
442 ARMCI_PROF[ARMCI_PROF_GET][i].time,
443 ARMCI_PROF[ARMCI_PROF_PUT][i].time,
444 ARMCI_PROF[ARMCI_PROF_ACC][i].time);
445 if(i< nrange-1) fprintf(fp, "(%d-%d)\n", 1<<i, (1<<(i+1))-1);
446 else fprintf(fp, "(>=%d)\n", 1<<(ARMCI_MAX_MSG_RANGE-1));
447 }
448 }
449
450 /* This prints the number of non-contiguous get/put/acc/ calls for every
451 message range */
armci_print_noncontig(FILE * fp)452 static void armci_print_noncontig(FILE *fp) {
453 int i, nget, nput, nacc, nrange=ARMCI_MAX_MSG_RANGE;
454 double gtime, ptime, atime;
455
456 ARMCI_HDR2(fp); ARMCI_HDR3(fp);
457 for(i=0; i< nrange; i++) {
458 nget = (ARMCI_PROF[ARMCI_PROF_GETS][i].count +
459 ARMCI_PROF[ARMCI_PROF_GETV][i].count);
460 nput = (ARMCI_PROF[ARMCI_PROF_PUTS][i].count +
461 ARMCI_PROF[ARMCI_PROF_PUTV][i].count);
462 nacc = (ARMCI_PROF[ARMCI_PROF_ACCS][i].count +
463 ARMCI_PROF[ARMCI_PROF_ACCV][i].count);
464 gtime = (ARMCI_PROF[ARMCI_PROF_GETS][i].time +
465 ARMCI_PROF[ARMCI_PROF_GETV][i].time);
466 ptime = (ARMCI_PROF[ARMCI_PROF_PUTS][i].time +
467 ARMCI_PROF[ARMCI_PROF_PUTV][i].time);
468 atime = (ARMCI_PROF[ARMCI_PROF_ACCS][i].time +
469 ARMCI_PROF[ARMCI_PROF_ACCV][i].time);
470
471 fprintf(fp, "%d\t %d\t %d\t %.2e %.2e %.2e ",
472 nget, nput, nacc, gtime, ptime, atime);
473 if (i< nrange-1) fprintf(fp, "(%d-%d)\n", 1<<i, (1<<(i+1))-1);
474 else fprintf(fp, "(>=%d)\n", 1<<(ARMCI_MAX_MSG_RANGE-1));
475 }
476 }
477
478 /* print profile of non-blocking contiguous get/put/acc calls for every
479 message range */
armci_print_nbcontig(FILE * fp)480 static void armci_print_nbcontig(FILE *fp) {
481 int i, nrange=ARMCI_MAX_MSG_RANGE;
482 ARMCI_HDR6(fp); ARMCI_HDR8(fp);
483 for(i=0; i< nrange; i++) {
484 fprintf(fp, "%d\t %d\t %d\t %.2e %.2e %.2e ",
485 ARMCI_PROF[ARMCI_PROF_NBGET][i].count,
486 ARMCI_PROF[ARMCI_PROF_NBPUT][i].count,
487 ARMCI_PROF[ARMCI_PROF_NBACC][i].count,
488 ARMCI_PROF[ARMCI_PROF_NBGET][i].time,
489 ARMCI_PROF[ARMCI_PROF_NBPUT][i].time,
490 ARMCI_PROF[ARMCI_PROF_NBACC][i].time);
491 if(i< nrange-1) fprintf(fp, "(%d-%d)\n", 1<<i, (1<<(i+1))-1);
492 else fprintf(fp, "(>=%d)\n", 1<<(ARMCI_MAX_MSG_RANGE-1));
493 }
494 }
495
496 /* This prints the number of non-blocking non-contiguous get/put/acc/ calls
497 for every message range */
armci_print_nbnoncontig(FILE * fp)498 static void armci_print_nbnoncontig(FILE *fp) {
499 int i, nget, nput, nacc, nrange=ARMCI_MAX_MSG_RANGE;
500 double gtime, ptime, atime;
501
502 ARMCI_HDR7(fp); ARMCI_HDR8(fp);
503 for(i=0; i< nrange; i++) {
504 nget = (ARMCI_PROF[ARMCI_PROF_NBGETS][i].count +
505 ARMCI_PROF[ARMCI_PROF_NBGETV][i].count);
506 nput = (ARMCI_PROF[ARMCI_PROF_NBPUTS][i].count +
507 ARMCI_PROF[ARMCI_PROF_NBPUTV][i].count);
508 nacc = (ARMCI_PROF[ARMCI_PROF_NBACCS][i].count +
509 ARMCI_PROF[ARMCI_PROF_NBACCV][i].count);
510 gtime = (ARMCI_PROF[ARMCI_PROF_NBGETS][i].time +
511 ARMCI_PROF[ARMCI_PROF_NBGETV][i].time);
512 ptime = (ARMCI_PROF[ARMCI_PROF_NBPUTS][i].time +
513 ARMCI_PROF[ARMCI_PROF_NBPUTV][i].time);
514 atime = (ARMCI_PROF[ARMCI_PROF_NBACCS][i].time +
515 ARMCI_PROF[ARMCI_PROF_NBACCV][i].time);
516
517 fprintf(fp, "%d\t %d\t %d\t %.2e %.2e %.2e ",
518 nget, nput, nacc, gtime, ptime, atime);
519 if (i< nrange-1) fprintf(fp, "(%d-%d)\n", 1<<i, (1<<(i+1))-1);
520 else fprintf(fp, "(>=%d)\n", 1<<(ARMCI_MAX_MSG_RANGE-1));
521 }
522 }
523
524 /* Profile of armci_notify_wait(), ARMCI_Wait() and ARMCI_Barrier() */
armci_print_misc(FILE * fp)525 static void armci_print_misc(FILE *fp) {
526 ARMCI_HDR9(fp);
527 fprintf(fp, "#calls\t time\t EVENT\n\n");
528 fprintf(fp, "%d\t %.2e ARMCI_Wait()\n",
529 ARMCI_PROF[ARMCI_PROF_WAIT][0].count,
530 ARMCI_PROF[ARMCI_PROF_WAIT][0].time);
531 fprintf(fp, "%d\t %.2e armci_notify_wait()\n",
532 ARMCI_PROF[ARMCI_PROF_NOTIFY][0].count,
533 ARMCI_PROF[ARMCI_PROF_NOTIFY][0].time);
534 fprintf(fp, "%d\t %.2e ARMCI_Barrier()\n",
535 ARMCI_PROF[ARMCI_PROF_BARRIER][0].count,
536 ARMCI_PROF[ARMCI_PROF_BARRIER][0].time);
537 fprintf(fp, "%d\t %.2e ARMCI_Fence()\n",
538 ARMCI_PROF[ARMCI_PROF_FENCE][0].count,
539 ARMCI_PROF[ARMCI_PROF_FENCE][0].time);
540 fprintf(fp, "%d\t %.2e ARMCI_Allfence()\n",
541 ARMCI_PROF[ARMCI_PROF_ALLFENCE][0].count,
542 ARMCI_PROF[ARMCI_PROF_ALLFENCE][0].time);
543 fprintf(fp, "%d\t %.2e ARMCI_Rmw()\n",
544 ARMCI_PROF[ARMCI_PROF_RMW][0].count,
545 ARMCI_PROF[ARMCI_PROF_RMW][0].time);
546 }
547
548 #if ARMCI_PRINT_STRIDE
armci_print_warning_msg(FILE * fp,int range,int str_count)549 static void armci_print_warning_msg(FILE *fp, int range, int str_count) {
550 fprintf(fp, "WARNING: In your program, total number of data transfers\n");
551 fprintf(fp, "for message range[%d - %d] is %d. This exceeds\n",
552 1<<range, 1<<(range+1), str_count);
553 fprintf(fp,"the maximum # of data transfers [%d] that can be profiled.\n",
554 STRIDE_COUNT);
555 fprintf(fp, "Therefore profile of only first %d data \n", STRIDE_COUNT);
556 fprintf(fp, "transfers are shown below. To increase the count, set\n");
557 fprintf(fp, "STRIDE_COUNT > %d (in armci_profile.c)\n", str_count);
558 }
559
armci_print_stridedinfo(FILE * fp,int event,int range)560 static void armci_print_stridedinfo(FILE *fp, int event, int range) {
561 int i, j, stride_levels, str_count;
562 double time=0.0;
563
564 str_count = ARMCI_PROF[event][range].count;
565 if(str_count <=0) return;
566 if(str_count > STRIDE_COUNT) {
567 armci_print_warning_msg(fp, range, str_count);
568 str_count = STRIDE_COUNT;
569 }
570
571 fprintf(fp, "\n\nSTRIDE INFORMATION FOR MSG_RANGE %d-%d for EVENT: %s\n",
572 1<<range, (1<<(range+1))-1, gEventName[event]);
573 ARMCI_HDR4(fp);
574
575 for(i=0; i< str_count; i++) {
576 time += ARMCI_PROF[event][range].stride[i].time;
577 stride_levels = ARMCI_PROF[event][range].stride[i].stride_levels;
578 fprintf(fp, "%d\t%d\t %d\t %.2e (",i, stride_levels,
579 ARMCI_PROF[event][range].stride[i].proc,
580 ARMCI_PROF[event][range].stride[i].time);
581 for(j=0;j<=stride_levels;j++) {
582 fprintf(fp, "%d", ARMCI_PROF[event][range].stride[i].count[j]);
583 if(j!=stride_levels) fprintf(fp, "x");
584 }
585 fprintf(fp, ")\n");
586 }
587 /*This o/p is just for verification*/
588 fprintf(fp, "**** STRIDE_COUNT = %d ; TOTAL TIME = %.2e\n",
589 str_count, time);
590 }
591
armci_print_vectorinfo(FILE * fp,int event,int range)592 static void armci_print_vectorinfo(FILE *fp, int event, int range) {
593 int i, j, vec_len, str_count;
594 double time=0.0;
595
596 str_count = ARMCI_PROF[event][range].count;
597 if(str_count <=0) return;
598 if(str_count > STRIDE_COUNT) {
599 armci_print_warning_msg(fp, range, str_count);
600 str_count = STRIDE_COUNT;
601 }
602
603 fprintf(fp, "\n\nVECTOR INFORMATION FOR MSG_RANGE %d-%d for EVENT: %s\n",
604 1<<range, (1<<(range+1))-1, gEventName[event]);
605 ARMCI_HDR5(fp);
606
607 for(i=0; i< str_count; i++) {
608 time += ARMCI_PROF[event][range].vector[i].time;
609 vec_len = ARMCI_PROF[event][range].vector[i].vec_len;
610 fprintf(fp, "%d\t%d\t %d\t %.2e [ ",i, vec_len,
611 ARMCI_PROF[event][range].vector[i].proc,
612 ARMCI_PROF[event][range].vector[i].time);
613 for(j=0;j<vec_len;j++) {
614 fprintf(fp, "%-9d %d\t]\n",
615 ARMCI_PROF[event][range].vector[i].giov[j].ptr_array_len,
616 ARMCI_PROF[event][range].vector[i].giov[j].bytes);
617 if(j!=vec_len-1) fprintf(fp, "\t\t\t\t [ ");
618 }
619 }
620 /*This o/p is just for verification*/
621 fprintf(fp, "**** STRIDE_COUNT = %d ; TOTAL TIME = %.2e\n",
622 str_count, time);
623 }
624 #endif /* end of ARMCI_PRINT_STRIDE */
625
armci_profile_terminate()626 void armci_profile_terminate() {
627 FILE *fp = stdout;
628 char file_name[50];
629 sprintf(file_name, "armci_profile.%d", armci_me);
630 fp = fopen(file_name, "w");
631
632 armci_print_all(fp); /* total get/put/acc calls */
633 armci_print_contig(fp); /* contiguous calls */
634 armci_print_noncontig(fp); /* non-contiguous calls */
635 armci_print_nbcontig(fp); /* non-blocking contiguous calls */
636 armci_print_nbnoncontig(fp); /* non-blocking non-contiguous calls */
637
638 /* miscellaneous (barrier, armci_wait, notify_wait) */
639 armci_print_misc(fp);
640
641 #if ARMCI_PRINT_STRIDE
642 {
643 /**
644 * printing stride info for non-contiguous get (ARMCI_PROF_GETS) for message
645 * range #6. 2^6 - 2^(6+1) bytes (i.e. 64-128 bytes)
646 * Ex: armci_print_stridedinfo(ARMCI_PROF_GETS,6);
647 */
648 #define ARMCI_PRINT_EVENTS 6
649 int i,j;
650 int str_event[ARMCI_PRINT_EVENTS]={ARMCI_PROF_GETS, ARMCI_PROF_PUTS,
651 ARMCI_PROF_ACCS, ARMCI_PROF_NBGETS,
652 ARMCI_PROF_NBPUTS,ARMCI_PROF_NBACCS};
653 int vec_event[ARMCI_PRINT_EVENTS]={ARMCI_PROF_GETV, ARMCI_PROF_PUTV,
654 ARMCI_PROF_ACCV, ARMCI_PROF_NBGETV,
655 ARMCI_PROF_NBPUTV,ARMCI_PROF_NBACCV};
656
657 fprintf(fp,"\n\n***************************************************\n");
658 fprintf(fp, " STRIDE INFORMATION for all strided data transfers\n");
659 fprintf(fp, "***************************************************\n");
660 for(i=0; i<ARMCI_MAX_MSG_RANGE; i++)
661 for(j=0; j<ARMCI_PRINT_EVENTS; j++)
662 armci_print_stridedinfo(fp,str_event[j], i);
663
664 fprintf(fp,"\n\n**************************************************\n");
665 fprintf(fp, " VECTOR INFORMATION for all vector data transfers\n");
666 fprintf(fp, "**************************************************\n");
667 for(i=0; i<ARMCI_MAX_MSG_RANGE; i++)
668 for(j=0; j<ARMCI_PRINT_EVENTS; j++)
669 armci_print_vectorinfo(fp,vec_event[j], i);
670 }
671 #endif
672 fclose(fp);
673 }
674