/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp. 2007, 2008 */
/* ---------------------------------------------------------------- */
/**
 * \file ad_gpfs_tuning.c
 * \brief Defines ad_gpfs performance tuning
 */

/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
 * Copyright (C) 2008 University of Chicago.
 * See COPYRIGHT notice in top-level directory.
 */

/*---------------------------------------------------------------------
 * ad_gpfs_tuning.c
 *
 * defines global variables and functions for performance tuning and
 * functional debugging.
 *---------------------------------------------------------------------*/
#include "ad_gpfs_tuning.h"
#include "mpi.h"

/* for getenv/atoi/strtol and fprintf/stderr used below (harmless if the
 * ADIO headers already pull these in) */
#include <stdio.h>
#include <stdlib.h>

#if !defined(PVFS2_SUPER_MAGIC)
#define PVFS2_SUPER_MAGIC (0x20030528)
#endif


int gpfsmpio_timing;
int gpfsmpio_timing2;
int gpfsmpio_timing_cw_level;
int gpfsmpio_comm;
int gpfsmpio_tunegather;
int gpfsmpio_tuneblocking;
long bglocklessmpio_f_type;
int gpfsmpio_bg_nagg_pset;
int gpfsmpio_pthreadio;
int gpfsmpio_p2pcontig;
int gpfsmpio_write_aggmethod;
int gpfsmpio_read_aggmethod;
int gpfsmpio_balancecontig;
int gpfsmpio_devnullio;
int gpfsmpio_bridgeringagg;
int gpfsmpio_onesided_no_rmw;
int gpfsmpio_onesided_always_rmw;
int gpfsmpio_onesided_inform_rmw;

double gpfsmpio_prof_cw[GPFSMPIO_CIO_LAST+1];
double gpfsmpio_prof_cr[GPFSMPIO_CIO_LAST+1];

/* set internal variables for tuning environment variables */
/** \page mpiio_vars MPIIO Configuration
  \section env_sec Environment Variables
 * - GPFSMPIO_COMM - Define how data is exchanged on collective
 *   reads and writes.  Possible values:
 *   - 0 - Use MPI_Alltoallv.
 *   - 1 - Use MPI_Isend/MPI_Irecv.
 *   - Default is 0.
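 *   To make the two styles concrete, here is a hedged sketch of the
 *   exchange patterns they select (sbuf/rbuf, counts, displacements,
 *   and the reqs array are assumed to be set up already; this is not
 *   the driver's actual exchange code):
 *   \code
 *   if (gpfsmpio_comm == 0) {
 *       // one collective call moves all pairwise data at once
 *       MPI_Alltoallv(sbuf, scounts, sdispls, MPI_BYTE,
 *                     rbuf, rcounts, rdispls, MPI_BYTE, fd->comm);
 *   } else {
 *       // pairwise nonblocking point-to-point, then wait on all
 *       int nreqs = 0;
 *       for (int i = 0; i < nprocs; i++) {
 *           if (rcounts[i])
 *               MPI_Irecv(rbuf + rdispls[i], rcounts[i], MPI_BYTE,
 *                         i, 0, fd->comm, &reqs[nreqs++]);
 *           if (scounts[i])
 *               MPI_Isend(sbuf + sdispls[i], scounts[i], MPI_BYTE,
 *                         i, 0, fd->comm, &reqs[nreqs++]);
 *       }
 *       MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE);
 *   }
 *   \endcode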
 *
 * - GPFSMPIO_TIMING - collect timing breakdown for MPI I/O collective calls.
 *   Possible values:
 *   - 0 - Do not collect/report timing.
 *   - 1 - Collect/report timing.
 *   - Default is 0.
 *
 * - GPFSMPIO_TUNEGATHER - Tune how starting and ending offsets are communicated
 *   for aggregator collective I/O.  Possible values:
 *   - 0 - Use two MPI_Allgathers to collect starting and ending offsets.
 *   - 1 - Use MPI_Allreduce(MPI_MAX) to collect starting and ending offsets.
 *   - Default is 1.
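 *   A sketch of the single-reduction idea (assumed variable names;
 *   offsets assumed non-negative): each rank deposits its start/end
 *   pair into its own slot of a zero-initialized array, so one
 *   MPI_MAX reduction doubles as an all-gather and replaces two
 *   MPI_Allgather calls with a single MPI_Allreduce:
 *   \code
 *   ADIO_Offset *tmp = calloc(2 * nprocs, sizeof(ADIO_Offset));
 *   ADIO_Offset *all = malloc(2 * nprocs * sizeof(ADIO_Offset));
 *   tmp[2 * myrank]     = my_start_offset;
 *   tmp[2 * myrank + 1] = my_end_offset;
 *   MPI_Allreduce(tmp, all, 2 * nprocs, ADIO_OFFSET, MPI_MAX, fd->comm);
 *   // now all[2*i] and all[2*i+1] hold rank i's start/end offsets
 *   \endcode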
 *
 * - GPFSMPIO_TUNEBLOCKING - Tune how aggregate file domains are
 *   calculated (block size).  Possible values:
 *   - 0 - Evenly calculate file domains across aggregators.  Also use
 *     MPI_Isend/MPI_Irecv to exchange domain information.
 *   - 1 - Align file domains with the underlying file system's block size.
 *     Also use MPI_Alltoallv to exchange domain information.
 *   - Default is 1.
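 *   A minimal sketch of the alignment arithmetic for the aligned case
 *   (hypothetical helpers, not the driver's code); rounding domain
 *   boundaries to file system block boundaries keeps two aggregators
 *   from contending for GPFS byte-range locks on the same block:
 *   \code
 *   // round an offset down / up to a file system block boundary
 *   static ADIO_Offset align_down(ADIO_Offset off, ADIO_Offset blksz)
 *   { return (off / blksz) * blksz; }
 *   static ADIO_Offset align_up(ADIO_Offset off, ADIO_Offset blksz)
 *   { return ((off + blksz - 1) / blksz) * blksz; }
 *   \endcode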
 *
 * - BGLOCKLESSMPIO_F_TYPE - Specify a filesystem type that should run
 *   the ad_bglockless driver.  NOTE: Using romio prefixes (such as
 *   "bg:" or "bglockless:") on a file name will override this environment
 *   variable.  Possible values:
 *   - 0xnnnnnnnn - Any valid file system type (or "magic number") from
 *     statfs() field f_type.
 *   - The default is 0x20030528 (PVFS2_SUPER_MAGIC).
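 *   The magic number for a mounted file system can be read back with
 *   statfs(2); a sketch of the kind of check this enables (the path is
 *   a placeholder):
 *   \code
 *   #include <sys/vfs.h>
 *   struct statfs sb;
 *   if (statfs("/path/to/mount", &sb) == 0 &&
 *       (long) sb.f_type == bglocklessmpio_f_type) {
 *       // route this file through the ad_bglockless driver
 *   }
 *   \endcode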
 *
 * - GPFSMPIO_NAGG_PSET - Specify a ratio of "I/O aggregators" to use for each
 *   compute group (compute nodes + I/O nodes).  Possible values:
 *   - any integer
 *   - Default is 8
 *
 * - GPFSMPIO_PTHREADIO - Enables a very simple form of asynchronous I/O where a
 *   pthread is spawned to do the POSIX writes while the main thread does the
 *   data aggregation - useful for large files where multiple rounds are
 *   required (more than cb_buffer_size bytes of data per aggregator).  The
 *   user must ensure there are hardware resources available for the thread to
 *   run.  There is surely a better way to do this involving comm threads -
 *   this is just a start.  NOTE: for reasons not yet understood, the
 *   statistics collected while this is enabled miss some of the data, so the
 *   reported data sizes are off a bit; this is a statistics issue only - the
 *   data is still accurately written out.
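 *   A hedged sketch of the overlap pattern (assumed names, error
 *   handling elided; not the driver's actual thread code):
 *   \code
 *   #include <pthread.h>
 *   #include <unistd.h>
 *   struct wr_args { int fd; const void *buf; size_t len; off_t off; };
 *   static void *writer(void *v) {
 *       struct wr_args *a = v;
 *       pwrite(a->fd, a->buf, a->len, a->off);  // POSIX write of round N
 *       return NULL;
 *   }
 *   // ... then, in the aggregator's round loop:
 *   pthread_t tid;
 *   pthread_create(&tid, NULL, writer, &args);  // write round N
 *   aggregate_next_round();                     // overlap: pack round N+1
 *   pthread_join(tid, NULL);                    // before reusing the buffer
 *   \endcode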
 *
 * - GPFSMPIO_P2PCONTIG - Does simple point-to-point communication between the
 *   aggregator and the procs that feed it.  Performance could be enhanced by a
 *   one-sided put algorithm.  The current implementation allows only 1 round
 *   of data.  Useful/allowed only when:
 *   1.) The datatype is contiguous.
 *   2.) The offsets are increasing in rank-order.
 *   3.) There are no gaps between the offsets.
 *   4.) No single rank has a data size which spans multiple file domains.
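 *   Under those four conditions each rank owns one contiguous byte
 *   range inside exactly one file domain, so the exchange collapses to
 *   one message per feeding rank (hedged sketch with assumed names):
 *   \code
 *   if (!fd->is_agg) {
 *       // feed my contiguous chunk to my aggregator
 *       MPI_Send(buf, (int) len, MPI_BYTE, my_agg_rank, 0, fd->comm);
 *   } else {
 *       for (int r = 0; r < nfeeders; r++)
 *           MPI_Recv(cb + disp[r], cnt[r], MPI_BYTE, feeder[r], 0,
 *                    fd->comm, MPI_STATUS_IGNORE);
 *       // one round: the whole collective buffer in one POSIX write
 *       pwrite(fd->fd_sys, cb, total_len, file_offset);
 *   }
 *   \endcode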
 *
 * - GPFSMPIO_WRITE_AGGMETHOD/GPFSMPIO_READ_AGGMETHOD - Replaces the two-phase
 *   collective IO aggregation with a one-sided algorithm, significantly
 *   reducing communication and memory overhead.  Fully supports all datasets
 *   and datatypes; the only caveat is that any holes in the data when writing
 *   to a pre-existing file are ignored -- there is no read-modify-write
 *   support to maintain the correctness of regions of pre-existing data, so
 *   every byte must be explicitly written to maintain correctness.  Users must
 *   beware of middleware libraries like PNETCDF which may count on
 *   read-modify-write functionality for certain features (like fill values).
 *   Possible values:
 *   - 0 - Normal two-phase collective IO is used.
 *   - 1 - A separate one-sided MPI_Put or MPI_Get is used for each contiguous
 *     chunk of data for a compute process to write to or read from the
 *     collective buffer on the aggregator.
 *   - 2 - An MPI derived datatype is created using all the contiguous chunks,
 *     and just one call to MPI_Put or MPI_Get is done with the derived
 *     datatype.  On Blue Gene/Q optimal performance for this is achieved when
 *     paired with PAMID_TYPED_ONESIDED=1.
 *   - Default is 0
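 *   A hedged sketch of the two one-sided flavors (assumed names;
 *   window creation and synchronization elided; total_len is the sum
 *   of the chunk counts so the origin and target signatures match):
 *   \code
 *   if (aggmethod == 1) {
 *       // one MPI_Put per contiguous chunk into the aggregator's
 *       // collective-buffer window
 *       for (int c = 0; c < nchunks; c++)
 *           MPI_Put(src[c], cnt[c], MPI_BYTE, agg_rank,
 *                   win_disp[c], cnt[c], MPI_BYTE, win);
 *   } else if (aggmethod == 2) {
 *       // describe every chunk with one derived datatype and issue a
 *       // single MPI_Put from a packed, contiguous origin buffer
 *       MPI_Datatype dt;
 *       MPI_Type_indexed(nchunks, cnt, disp_int, MPI_BYTE, &dt);
 *       MPI_Type_commit(&dt);
 *       MPI_Put(packed, total_len, MPI_BYTE, agg_rank, 0, 1, dt, win);
 *       MPI_Type_free(&dt);
 *   }
 *   \endcode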
 *
 * - GPFSMPIO_ONESIDED_NO_RMW - For one-sided aggregation
 *   (GPFSMPIO_WRITE_AGGMETHOD = 1 or 2) disable the detection of holes in the
 *   data when writing to a pre-existing file requiring a read-modify-write,
 *   thereby avoiding the communication overhead for this detection.
 *   - 0 (hole detection enabled) or 1 (hole detection disabled)
 *   - Default is 0
 *
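 * - GPFSMPIO_ONESIDED_ALWAYS_RMW - For one-sided aggregation
 *   (GPFSMPIO_WRITE_AGGMETHOD = 1 or 2) always perform the read-modify-write
 *   of pre-existing file data instead of first detecting whether holes make
 *   it necessary.  (This entry was previously undocumented; the description
 *   is derived from ad_gpfs_get_env_vars below, where setting it implicitly
 *   sets GPFSMPIO_ONESIDED_NO_RMW=1, since hole detection is then pointless.)
 *   - 0 (disabled) or 1 (enabled)
 *   - Default is 0
 *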
 * - GPFSMPIO_ONESIDED_INFORM_RMW - For one-sided aggregation
 *   (GPFSMPIO_WRITE_AGGMETHOD = 1 or 2) generate an informational message
 *   telling the user whether holes exist in the data when writing to a
 *   pre-existing file requiring a read-modify-write, thereby educating the
 *   user to set GPFSMPIO_ONESIDED_NO_RMW=1 on a future run to avoid the
 *   communication overhead of this detection.
 *   - 0 (disabled) or 1 (enabled)
 *   - Default is 0
 *
 * - GPFSMPIO_BALANCECONTIG - Relevant only to BGQ.  File domain blocks are
 *   assigned to aggregators in a breadth-first fashion relative to the I/O
 *   nodes (IONs); additionally, file domains on the aggregators sharing the
 *   same bridgeset and ION have contiguous offsets.  The breadth-first
 *   assignment improves performance in the case of a relatively small file of
 *   size less than the GPFS block size multiplied by the number of IONs.
 *   Files: ad_gpfs_aggrs.c, ad_bg_aggrs.c.  Possible values:
 *   - 0 - assign file domain blocks in the traditional manner
 *   - 1 - if there are variable-sized file domain blocks, spread them out
 *     (balance) across bridge nodes
 *
 * - GPFSMPIO_DEVNULLIO - do everything *except* write to / read from the file
 *   system.  When experimenting with different two-phase I/O strategies, it's
 *   helpful to remove the highly variable file system from the experiment.
 *   - 0 (disabled) or 1 (enabled)
 *   - Default is 0
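 *   In effect the driver's I/O call sites reduce to a pattern like the
 *   following (illustrative sketch, assumed names):
 *   \code
 *   if (gpfsmpio_devnullio)
 *       err = len;                                // pretend the write happened
 *   else
 *       err = pwrite(fd->fd_sys, buf, len, off);  // real file system I/O
 *   \endcode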
 *
 * - GPFSMPIO_BRIDGERINGAGG - Relevant only to BGQ.  Aggregator placement
 *   optimization which forms a 5-D ring around the bridge node starting at
 *   GPFSMPIO_BRIDGERINGAGG hops away.  Experimental performance results
 *   suggest the best value is 1, and only in conjunction with
 *   GPFSMPIO_P2PCONTIG and GPFSMPIO_BALANCECONTIG.  The number of aggregators
 *   selected is still GPFSMPIO_NAGG_PSET; however, the bridge node itself is
 *   NOT selected.
 *
 */

void ad_gpfs_get_env_vars() {
    char *x, *dummy;

    gpfsmpio_comm = 0;
    x = getenv( "GPFSMPIO_COMM" );
    if (x) gpfsmpio_comm = atoi(x);
    gpfsmpio_timing = 0;
    x = getenv( "GPFSMPIO_TIMING" );
    if (x) gpfsmpio_timing = atoi(x);
    gpfsmpio_tunegather = 1;
    x = getenv( "GPFSMPIO_TUNEGATHER" );
    if (x) gpfsmpio_tunegather = atoi(x);
    gpfsmpio_tuneblocking = 1;
    x = getenv( "GPFSMPIO_TUNEBLOCKING" );
    if (x) gpfsmpio_tuneblocking = atoi(x);
    bglocklessmpio_f_type = PVFS2_SUPER_MAGIC;
    x = getenv( "BGLOCKLESSMPIO_F_TYPE" );
    if (x) bglocklessmpio_f_type = strtol(x,&dummy,0);
    DBG_FPRINTF(stderr,"BGLOCKLESSMPIO_F_TYPE=%ld/%#lX\n",
            bglocklessmpio_f_type,bglocklessmpio_f_type);
    /* note: this value will be 'sanity checked' in ADIOI_BG_persInfo_init(),
     * when we know a bit more about what "largest possible value" and
     * "smallest possible value" should be */
    gpfsmpio_bg_nagg_pset = ADIOI_BG_NAGG_PSET_DFLT;
    x = getenv("GPFSMPIO_NAGG_PSET");
    if (x) gpfsmpio_bg_nagg_pset = atoi(x);

    gpfsmpio_pthreadio = 0;
    x = getenv( "GPFSMPIO_PTHREADIO" );
    if (x) gpfsmpio_pthreadio = atoi(x);

    gpfsmpio_p2pcontig = 0;
    x = getenv( "GPFSMPIO_P2PCONTIG" );
    if (x) gpfsmpio_p2pcontig = atoi(x);

    gpfsmpio_write_aggmethod = 0;
    x = getenv( "GPFSMPIO_WRITE_AGGMETHOD" );
    if (x) gpfsmpio_write_aggmethod = atoi(x);

    gpfsmpio_read_aggmethod = 0;
    x = getenv( "GPFSMPIO_READ_AGGMETHOD" );
    if (x) gpfsmpio_read_aggmethod = atoi(x);

    gpfsmpio_balancecontig = 0;
    x = getenv( "GPFSMPIO_BALANCECONTIG" );
    if (x) gpfsmpio_balancecontig = atoi(x);

    gpfsmpio_devnullio = 0;
    x = getenv( "GPFSMPIO_DEVNULLIO" );
    if (x) gpfsmpio_devnullio = atoi(x);

    gpfsmpio_bridgeringagg = 0;
    x = getenv( "GPFSMPIO_BRIDGERINGAGG" );
    if (x) gpfsmpio_bridgeringagg = atoi(x);

    gpfsmpio_onesided_no_rmw = 0;
    x = getenv( "GPFSMPIO_ONESIDED_NO_RMW" );
    if (x) gpfsmpio_onesided_no_rmw = atoi(x);

    gpfsmpio_onesided_always_rmw = 0;
    x = getenv( "GPFSMPIO_ONESIDED_ALWAYS_RMW" );
    if (x) gpfsmpio_onesided_always_rmw = atoi(x);
    /* always doing the read-modify-write makes hole detection unnecessary */
    if (gpfsmpio_onesided_always_rmw)
        gpfsmpio_onesided_no_rmw = 1;

    gpfsmpio_onesided_inform_rmw = 0;
    x = getenv( "GPFSMPIO_ONESIDED_INFORM_RMW" );
    if (x) gpfsmpio_onesided_inform_rmw = atoi(x);
}

/* report timing breakdown for MPI I/O collective call */
void ad_gpfs_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs )
{
    int i;

    if (gpfsmpio_timing) {
        /* Timing across the whole communicator is a little bit interesting,
         * but what is *more* interesting is if we single out the aggregators
         * themselves.  Non-aggregators spend a lot of time in "exchange" not
         * exchanging data, but blocked because they are waiting for
         * aggregators to finish writing.  If we focus on just the aggregator
         * processes we get a clearer picture of the data exchange vs. I/O
         * time breakdown. */

        /* if deferred open enabled, we could use the aggregator communicator */
        MPI_Comm agg_comm;
        int nr_aggs, agg_rank;
        MPI_Comm_split(fd->comm, (fd->is_agg ? 1 : MPI_UNDEFINED), 0, &agg_comm);
        if (agg_comm != MPI_COMM_NULL) {
            MPI_Comm_size(agg_comm, &nr_aggs);
            MPI_Comm_rank(agg_comm, &agg_rank);
        }

        double *gpfsmpio_prof_org = gpfsmpio_prof_cr;
        if (rw) gpfsmpio_prof_org = gpfsmpio_prof_cw;

        double gpfsmpio_prof_avg[ GPFSMPIO_CIO_LAST ];
        double gpfsmpio_prof_max[ GPFSMPIO_CIO_LAST ];

        if (agg_comm != MPI_COMM_NULL) {
            MPI_Reduce( gpfsmpio_prof_org, gpfsmpio_prof_avg, GPFSMPIO_CIO_LAST, MPI_DOUBLE, MPI_SUM, 0, agg_comm);
            MPI_Reduce( gpfsmpio_prof_org, gpfsmpio_prof_max, GPFSMPIO_CIO_LAST, MPI_DOUBLE, MPI_MAX, 0, agg_comm);
        }
        if (agg_comm != MPI_COMM_NULL && agg_rank == 0) {

            for (i=0; i<GPFSMPIO_CIO_LAST; i++) gpfsmpio_prof_avg[i] /= nr_aggs;

            gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_POSI_RW ] =
                gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
                gpfsmpio_prof_max[ GPFSMPIO_CIO_T_POSI_RW ];
            gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_RW ] =
                gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
                gpfsmpio_prof_max[ GPFSMPIO_CIO_T_MPIO_RW ];

            gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_CRW ] =
                gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
                gpfsmpio_prof_max[ GPFSMPIO_CIO_T_MPIO_CRW ];

            fprintf(stderr,"TIMING-%1s,", (rw ? "W" : "R") );
            fprintf(stderr,"SIZE: %12.4lld , ", (long long int)(gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs));
            fprintf(stderr,"SEEK-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_SEEK ] );
            fprintf(stderr,"SEEK-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_SEEK ] );
            fprintf(stderr,"LOCAL-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_LCOMP ] );
            fprintf(stderr,"GATHER-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_GATHER ] );
            fprintf(stderr,"PATTERN-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_PATANA ] );
            fprintf(stderr,"FILEDOMAIN-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_FD_PART ] );
            fprintf(stderr,"MYREQ-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MYREQ ] );
            fprintf(stderr,"OTHERREQ-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_OTHREQ ] );
            fprintf(stderr,"EXCHANGE-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH ] );
            fprintf(stderr, "EXCHANGE-RECV_EXCH-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_RECV_EXCH ] );
            fprintf(stderr, "EXCHANGE-SETUP-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SETUP ] );
            fprintf(stderr, "EXCHANGE-NET-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_NET ] );
            fprintf(stderr, "EXCHANGE-SORT-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SORT ] );
            fprintf(stderr, "EXCHANGE-SIEVE-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SIEVE ] );
            fprintf(stderr,"POSIX-TIME-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_POSI_RW ] );
            fprintf(stderr,"POSIX-TIME-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_POSI_RW ] );
            fprintf(stderr,"MPIIO-CONTIG-TIME-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MPIO_RW ] );
            fprintf(stderr,"MPIIO-STRIDED-TIME-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MPIO_CRW ] );
            fprintf(stderr,"POSIX-BW-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_POSI_RW ] );
            fprintf(stderr,"MPI-BW-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_RW ] );
            fprintf(stderr,"MPI-BW-collective-avg: %10.3f\n",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_CRW ] );
        }
        if (agg_comm != MPI_COMM_NULL) MPI_Comm_free(&agg_comm);
    }

}