/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp.  2007, 2008                               */
/* ---------------------------------------------------------------- */
/**
 * \file ad_gpfs_tuning.c
 * \brief Defines ad_gpfs performance tuning
 */

/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
 *   Copyright (C) 2008 University of Chicago.
 *   See COPYRIGHT notice in top-level directory.
 */

/*---------------------------------------------------------------------
 * ad_gpfs_tuning.c
 *
 * defines global variables and functions for performance tuning and
 * functional debugging.
 *---------------------------------------------------------------------*/

#include "ad_gpfs_tuning.h"
#include "mpi.h"

#include <stdio.h>      /* fprintf(), stderr */
#include <stdlib.h>     /* getenv(), atoi(), strtol() */

#if !defined(PVFS2_SUPER_MAGIC)
  #define PVFS2_SUPER_MAGIC (0x20030528)
#endif

int     gpfsmpio_timing;
int     gpfsmpio_timing2;
int     gpfsmpio_timing_cw_level;
int     gpfsmpio_comm;
int     gpfsmpio_tunegather;
int     gpfsmpio_tuneblocking;
long    bglocklessmpio_f_type;
int     gpfsmpio_bg_nagg_pset;
int     gpfsmpio_pthreadio;
int     gpfsmpio_p2pcontig;
int     gpfsmpio_balancecontig;
int     gpfsmpio_devnullio;
int     gpfsmpio_bridgeringagg;

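/* per-phase profiling accumulators for collective writes (cw) and reads (cr),
 * indexed by the GPFSMPIO_CIO_* counters */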
double  gpfsmpio_prof_cw    [GPFSMPIO_CIO_LAST+1];
double  gpfsmpio_prof_cr    [GPFSMPIO_CIO_LAST+1];

/* set internal variables for tuning environment variables */
/** \page mpiio_vars MPIIO Configuration
 * \section env_sec Environment Variables
 * These settings are read from the environment by ad_gpfs_get_env_vars()
 * below; a short usage sketch follows this comment block.
 *
 * - GPFSMPIO_COMM - Define how data is exchanged on collective
 *   reads and writes.  Possible values:
 *   - 0 - Use MPI_Alltoallv.
 *   - 1 - Use MPI_Isend/MPI_Irecv.
 *   - Default is 0.
 *
 * - GPFSMPIO_TIMING - Collect a timing breakdown for MPI I/O collective calls.
 *   Possible values:
 *   - 0 - Do not collect/report timing.
 *   - 1 - Collect/report timing.
 *   - Default is 0.
 *
 * - GPFSMPIO_TUNEGATHER - Tune how starting and ending offsets are communicated
 *   for aggregator collective I/O.  Possible values:
 *   - 0 - Use two MPI_Allgather calls to collect starting and ending offsets.
 *   - 1 - Use MPI_Allreduce(MPI_MAX) to collect starting and ending offsets.
 *   - Default is 1.
 *
 * - GPFSMPIO_TUNEBLOCKING - Tune how aggregate file domains are
 *   calculated (block size).  Possible values:
 *   - 0 - Evenly calculate file domains across aggregators.  Also use
 *   MPI_Isend/MPI_Irecv to exchange domain information.
 *   - 1 - Align file domains with the underlying file system's block size.  Also use
 *   MPI_Alltoallv to exchange domain information.
 *   - Default is 1.
 *
 * - BGLOCKLESSMPIO_F_TYPE - Specify a file system type that should run
 *   the ad_bglockless driver.  NOTE: Using ROMIO prefixes (such as
 *   "bg:" or "bglockless:") on a file name will override this environment
 *   variable.  Possible values:
 *   - 0xnnnnnnnn - Any valid file system type (or "magic number") from
 *                  statfs() field f_type.
 *   - The default is 0x20030528 (PVFS2_SUPER_MAGIC).
 *
 * - GPFSMPIO_NAGG_PSET - Specify a ratio of "I/O aggregators" to use for each
 *   compute group (compute nodes + I/O nodes).  Possible values:
 *   - Any integer.
 *   - Default is 8.
 *
 * - GPFSMPIO_PTHREADIO - Enables a very simple form of asynchronous I/O in
 *   which a pthread is spawned to do the POSIX writes while the main thread
 *   does the data aggregation.  Useful for large files where multiple rounds
 *   are required (more than cb_buffer_size of data per aggregator).  The user
 *   must ensure there are hardware resources available for the thread to run.
 *   I am sure there is a better way to do this involving comm threads; this is
 *   just a start.  NOTE: For some reason the stats collected when this is
 *   enabled miss some of the data, so the reported data sizes are off a bit.
 *   This is a statistics issue only; the data is still written out correctly.
 *
 * - GPFSMPIO_P2PCONTIG - Does simple point-to-point communication between the
 *   aggregator and the procs that feed it.  Performance could be enhanced by a
 *   one-sided put algorithm.  Current implementation allows only 1 round of
 *   data.  Useful/allowed only when:
 *   1.) The datatype is contiguous.
 *   2.) The offsets are increasing in rank-order.
 *   3.) There are no gaps between the offsets.
 *   4.) No single rank has a data size which spans multiple file domains.
 *
 * - GPFSMPIO_BALANCECONTIG - Relevant only to BGQ.  File domain blocks are assigned
 *   to aggregators in a breadth-first fashion relative to the I/O nodes (IONs);
 *   additionally, file domains on the aggregators sharing the same bridge set and
 *   ION have contiguous offsets.  The breadth-first assignment improves performance
 *   in the case of a relatively small file whose size is less than the GPFS block
 *   size multiplied by the number of IONs.  Files: ad_gpfs_aggrs.c ad_bg_aggrs.c.
 *   Possible values:
 *   - 0 - Assign file domain blocks in the traditional manner.
 *   - 1 - If there are variable-sized file domain blocks, spread them out
 *         (balance) across bridge nodes.
 *
 * - GPFSMPIO_DEVNULLIO - Do everything *except* write to / read from the file
 *   system.  When experimenting with different two-phase I/O strategies, it's
 *   helpful to remove the highly variable file system from the experiment.
 *   - 0 (disabled) or 1 (enabled)
 *   - Default is 0.
 *
 * - GPFSMPIO_BRIDGERINGAGG - Relevant only to BGQ.  Aggregator placement
 *   optimization which forms a 5-D ring around the bridge node starting at
 *   GPFSMPIO_BRIDGERINGAGG hops away.  Experimental performance results
 *   suggest the best value is 1, and only in conjunction with GPFSMPIO_P2PCONTIG
 *   and GPFSMPIO_BALANCECONTIG.  The number of aggregators selected is still
 *   GPFSMPIO_NAGG_PSET; however, the bridge node itself is NOT selected.
 *
 */
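
/* Usage sketch (illustrative only, not part of this file): the variables above
 * are read from the environment in ad_gpfs_get_env_vars(), so an application
 * would typically export them in the job environment, or set them in the
 * program before the file is opened.  The file name and values below are
 * hypothetical.
 *
 *     setenv("GPFSMPIO_TIMING", "1", 1);        // report collective I/O timing
 *     setenv("GPFSMPIO_TUNEBLOCKING", "1", 1);  // align file domains to FS blocks
 *
 *     MPI_File fh;
 *     MPI_File_open(MPI_COMM_WORLD, "out.dat",
 *                   MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
 *     // ... collective writes (e.g. MPI_File_write_at_all) ...
 *     MPI_File_close(&fh);
 */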

void ad_gpfs_get_env_vars() {
    char *x, *dummy;

    gpfsmpio_comm   = 0;
    x = getenv( "GPFSMPIO_COMM"         );
    if (x) gpfsmpio_comm         = atoi(x);
    gpfsmpio_timing = 0;
    x = getenv( "GPFSMPIO_TIMING"       );
    if (x) gpfsmpio_timing       = atoi(x);
    gpfsmpio_tunegather = 1;
    x = getenv( "GPFSMPIO_TUNEGATHER"   );
    if (x) gpfsmpio_tunegather   = atoi(x);
    gpfsmpio_tuneblocking = 1;
    x = getenv( "GPFSMPIO_TUNEBLOCKING" );
    if (x) gpfsmpio_tuneblocking = atoi(x);
    bglocklessmpio_f_type = PVFS2_SUPER_MAGIC;
    x = getenv( "BGLOCKLESSMPIO_F_TYPE" );
    if (x) bglocklessmpio_f_type = strtol(x,&dummy,0);
    DBG_FPRINTF(stderr,"BGLOCKLESSMPIO_F_TYPE=%ld/%#lX\n",
            bglocklessmpio_f_type,bglocklessmpio_f_type);
    /* note: this value will be 'sanity checked' in ADIOI_BG_persInfo_init(),
     * when we know a bit more about what "largest possible value" and
     * "smallest possible value" should be */
    gpfsmpio_bg_nagg_pset = ADIOI_BG_NAGG_PSET_DFLT;
    x = getenv("GPFSMPIO_NAGG_PSET");
    if (x) gpfsmpio_bg_nagg_pset = atoi(x);

    gpfsmpio_pthreadio = 0;
    x = getenv( "GPFSMPIO_PTHREADIO" );
    if (x) gpfsmpio_pthreadio = atoi(x);

    gpfsmpio_p2pcontig = 0;
    x = getenv( "GPFSMPIO_P2PCONTIG" );
    if (x) gpfsmpio_p2pcontig = atoi(x);

    gpfsmpio_balancecontig = 0;
    x = getenv( "GPFSMPIO_BALANCECONTIG" );
    if (x) gpfsmpio_balancecontig = atoi(x);

    gpfsmpio_devnullio = 0;
    x = getenv( "GPFSMPIO_DEVNULLIO" );
    if (x) gpfsmpio_devnullio = atoi(x);

    gpfsmpio_bridgeringagg = 0;
    x = getenv( "GPFSMPIO_BRIDGERINGAGG" );
    if (x) gpfsmpio_bridgeringagg = atoi(x);
}
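
/* Note: each setting above follows the same pattern (set a default, then
 * override it from the environment).  A hypothetical helper along these lines
 * could consolidate that pattern; it is a sketch only and is not used here
 * (gpfsmpio_env_int is not an existing ROMIO function):
 *
 *     static int gpfsmpio_env_int(const char *name, int dflt)
 *     {
 *         const char *x = getenv(name);   // NULL when the variable is unset
 *         return x ? atoi(x) : dflt;      // otherwise parse it as an integer
 *     }
 *
 * Usage: gpfsmpio_timing = gpfsmpio_env_int("GPFSMPIO_TIMING", 0);
 */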

/* report timing breakdown for MPI I/O collective call */
void ad_gpfs_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs )
{
    int i;

    if (gpfsmpio_timing) {
        /* Timing across the whole communicator is a little bit interesting,
         * but what is *more* interesting is if we single out the aggregators
         * themselves.  Non-aggregators spend a lot of time in "exchange" not
         * exchanging data, but blocked because they are waiting for
         * aggregators to finish writing.  If we focus on just the aggregator
         * processes we get a clearer picture of the data exchange vs. I/O
         * time breakdown. */

        /* if deferred open is enabled, we could use the aggregator communicator */
        MPI_Comm agg_comm;
        int nr_aggs, agg_rank;
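        /* split off the aggregators: they pass color 1, everyone else passes
         * MPI_UNDEFINED and gets MPI_COMM_NULL back in agg_comm */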
        MPI_Comm_split(fd->comm, (fd->is_agg ? 1 : MPI_UNDEFINED), 0, &agg_comm);
        if (agg_comm != MPI_COMM_NULL) {
            MPI_Comm_size(agg_comm, &nr_aggs);
            MPI_Comm_rank(agg_comm, &agg_rank);
        }

        double *gpfsmpio_prof_org = gpfsmpio_prof_cr;
        if (rw) gpfsmpio_prof_org = gpfsmpio_prof_cw;

        double gpfsmpio_prof_avg[ GPFSMPIO_CIO_LAST ];
        double gpfsmpio_prof_max[ GPFSMPIO_CIO_LAST ];

        if (agg_comm != MPI_COMM_NULL) {
            MPI_Reduce( gpfsmpio_prof_org, gpfsmpio_prof_avg, GPFSMPIO_CIO_LAST, MPI_DOUBLE, MPI_SUM, 0, agg_comm);
            MPI_Reduce( gpfsmpio_prof_org, gpfsmpio_prof_max, GPFSMPIO_CIO_LAST, MPI_DOUBLE, MPI_MAX, 0, agg_comm);
        }
        if (agg_comm != MPI_COMM_NULL && agg_rank == 0) {

            for (i=0; i<GPFSMPIO_CIO_LAST; i++) gpfsmpio_prof_avg[i] /= nr_aggs;

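            /* derived bandwidths: average bytes moved per aggregator times the
             * number of aggregators (i.e. total bytes) divided by the slowest
             * aggregator's time for that phase */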
            gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_POSI_RW  ] =
                gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
                gpfsmpio_prof_max[ GPFSMPIO_CIO_T_POSI_RW  ];
            gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_RW  ] =
                gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
                gpfsmpio_prof_max[ GPFSMPIO_CIO_T_MPIO_RW  ];

            gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_CRW ] =
                gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
                gpfsmpio_prof_max[ GPFSMPIO_CIO_T_MPIO_CRW ];

            fprintf(stderr,"TIMING-%1s,", (rw ? "W" : "R") );
            fprintf(stderr,"SIZE: %12.4lld , ", (long long int)(gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs));
            fprintf(stderr,"SEEK-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_SEEK ]     );
            fprintf(stderr,"SEEK-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_SEEK ]     );
            fprintf(stderr,"LOCAL-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_LCOMP ]    );
            fprintf(stderr,"GATHER-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_GATHER ]   );
            fprintf(stderr,"PATTERN-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_PATANA ]   );
            fprintf(stderr,"FILEDOMAIN-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_FD_PART ]  );
            fprintf(stderr,"MYREQ-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MYREQ ]    );
            fprintf(stderr,"OTHERREQ-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_OTHREQ ]   );
            fprintf(stderr,"EXCHANGE-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH ]    );
            fprintf(stderr, "EXCHANGE-RECV_EXCH-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_RECV_EXCH]  );
            fprintf(stderr, "EXCHANGE-SETUP-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SETUP]  );
            fprintf(stderr, "EXCHANGE-NET-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_NET]  );
            fprintf(stderr, "EXCHANGE-SORT-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SORT]  );
            fprintf(stderr, "EXCHANGE-SIEVE-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SIEVE]  );
            fprintf(stderr,"POSIX-TIME-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_POSI_RW ]  );
            fprintf(stderr,"POSIX-TIME-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_POSI_RW ]  );
            fprintf(stderr,"MPIIO-CONTIG-TIME-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MPIO_RW ]  );
            fprintf(stderr,"MPIIO-STRIDED-TIME-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MPIO_CRW ] );
            fprintf(stderr,"POSIX-BW-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_POSI_RW ]  );
            fprintf(stderr,"MPI-BW-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_RW ]  );
            fprintf(stderr,"MPI-BW-collective-avg: %10.3f\n",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_CRW ] );
        }
        if (agg_comm != MPI_COMM_NULL) MPI_Comm_free(&agg_comm);
    }

}