/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp.  2007, 2008                                */
/* ---------------------------------------------------------------- */
/**
 * \file ad_gpfs_tuning.c
 * \brief Defines ad_gpfs performance tuning
 */

/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
 *   Copyright (C) 2008 University of Chicago.
 *   See COPYRIGHT notice in top-level directory.
 */

/*---------------------------------------------------------------------
 * ad_gpfs_tuning.c
 *
 * defines global variables and functions for performance tuning and
 * functional debugging.
 *---------------------------------------------------------------------*/

#include "ad_gpfs_tuning.h"
#include "mpi.h"

#if !defined(PVFS2_SUPER_MAGIC)
#define PVFS2_SUPER_MAGIC (0x20030528)
#endif


int gpfsmpio_timing;
int gpfsmpio_timing2;
int gpfsmpio_timing_cw_level;
int gpfsmpio_comm;
int gpfsmpio_tunegather;
int gpfsmpio_tuneblocking;
long bglocklessmpio_f_type;
int gpfsmpio_bg_nagg_pset;
int gpfsmpio_pthreadio;
int gpfsmpio_p2pcontig;
int gpfsmpio_balancecontig;
int gpfsmpio_devnullio;
int gpfsmpio_bridgeringagg;

double gpfsmpio_prof_cw[GPFSMPIO_CIO_LAST+1];
double gpfsmpio_prof_cr[GPFSMPIO_CIO_LAST+1];

/* set internal variables for tuning environment variables */
/** \page mpiio_vars MPIIO Configuration
  \section env_sec Environment Variables
 * - GPFSMPIO_COMM - Define how data is exchanged on collective
 *   reads and writes.  Possible values:
 *   - 0 - Use MPI_Alltoallv.
 *   - 1 - Use MPI_Isend/MPI_Irecv.
 *   - Default is 0.
 *
 * - GPFSMPIO_TIMING - collect timing breakdown for MPI I/O collective calls.
 *   Possible values:
 *   - 0 - Do not collect/report timing.
 *   - 1 - Collect/report timing.
 *   - Default is 0.
 *
 * - GPFSMPIO_TUNEGATHER - Tune how starting and ending offsets are communicated
 *   for aggregator collective i/o.  Possible values:
 *   - 0 - Use two MPI_Allgather's to collect starting and ending offsets.
 *   - 1 - Use MPI_Allreduce(MPI_MAX) to collect starting and ending offsets.
 *   - Default is 1.
 *
 * - GPFSMPIO_TUNEBLOCKING - Tune how aggregate file domains are
 *   calculated (block size).  Possible values:
 *   - 0 - Evenly calculate file domains across aggregators.  Also use
 *     MPI_Isend/MPI_Irecv to exchange domain information.
 *   - 1 - Align file domains with the underlying file system's block size.  Also
 *     use MPI_Alltoallv to exchange domain information.
 *   - Default is 1.
 *
 * - BGLOCKLESSMPIO_F_TYPE - Specify a filesystem type that should run
 *   the ad_bglockless driver.  NOTE: Using romio prefixes (such as
 *   "bg:" or "bglockless:") on a file name will override this environment
 *   variable.  Possible values:
 *   - 0xnnnnnnnn - Any valid file system type (or "magic number") from
 *     statfs() field f_type.
 *   - The default is 0x20030528 (PVFS2_SUPER_MAGIC)
 *
 * - GPFSMPIO_NAGG_PSET - Specify a ratio of "I/O aggregators" to use for each
 *   compute group (compute nodes + i/o nodes).  Possible values:
 *   - any integer
 *   - Default is 8
 *
 * - GPFSMPIO_PTHREADIO - Enables a very simple form of asynchronous I/O in
 *   which a pthread is spawned to do the POSIX writes while the main thread
 *   performs the data aggregation.  Useful for large files where multiple
 *   rounds are required (more than cb_buffer_size of data per aggregator).
 *   The user must ensure hardware resources are available for the extra
 *   thread to run.  A better implementation could use communication threads;
 *   this is just a starting point.  NOTE: the statistics collected while this
 *   option is enabled miss some of the data, so the reported data sizes are
 *   slightly off.  This is a reporting issue only; the data is still written
 *   out correctly.
 *
 * - GPFSMPIO_P2PCONTIG - Uses simple point-to-point communication between the
 *   aggregator and the processes that feed it.  Performance could be enhanced
 *   by a one-sided put algorithm.  The current implementation allows only one
 *   round of data.  Useful/allowed only when:
 *   1.) The datatype is contiguous.
 *   2.) The offsets are increasing in rank order.
 *   3.) There are no gaps between the offsets.
 *   4.) No single rank has a data size which spans multiple file domains.
 *
 * - GPFSMPIO_BALANCECONTIG - Relevant only to BGQ.  File domain blocks are
 *   assigned to aggregators in a breadth-first fashion relative to the I/O
 *   nodes (IONs); additionally, file domains on the aggregators sharing the
 *   same bridge set and ION have contiguous offsets.  The breadth-first
 *   assignment improves performance for a relatively small file whose size is
 *   less than the GPFS block size multiplied by the number of IONs.
 *   Files: ad_gpfs_aggrs.c ad_bg_aggrs.c.  Possible values:
 *   - 0 - Assign file domain blocks in the traditional manner.
 *   - 1 - If there are variable-sized file domain blocks, spread them out
 *         (balance) across bridge nodes.
 *
 * - GPFSMPIO_DEVNULLIO - do everything *except* write to / read from the file
 *   system.  When experimenting with different two-phase I/O strategies, it's
 *   helpful to remove the highly variable file system from the experiment.
 *   - 0 (disabled) or 1 (enabled)
 *   - Default is 0
 *
 * - GPFSMPIO_BRIDGERINGAGG - Relevant only to BGQ.  Aggregator placement
 *   optimization which forms a 5-D ring around the bridge node, starting at
 *   GPFSMPIO_BRIDGERINGAGG hops away.  Experimental performance results
 *   suggest the best value is 1, and only in conjunction with
 *   GPFSMPIO_P2PCONTIG and GPFSMPIO_BALANCECONTIG.  The number of aggregators
 *   selected is still GPFSMPIO_NAGG_PSET; however, the bridge node itself is
 *   NOT selected.
 *
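 * As an illustration only (the values and program name below are arbitrary
 * examples, not recommendations), these variables are normally exported in
 * the job environment before launching the MPI program, e.g. from a
 * bash-like shell:
 * \code
 *   export GPFSMPIO_TIMING=1
 *   export GPFSMPIO_TUNEBLOCKING=1
 *   export GPFSMPIO_NAGG_PSET=16
 *   mpiexec -n 512 ./my_io_app
 * \endcode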
 */

void ad_gpfs_get_env_vars() {
    char *x, *dummy;

    gpfsmpio_comm = 0;
    x = getenv( "GPFSMPIO_COMM" );
    if (x) gpfsmpio_comm = atoi(x);
    gpfsmpio_timing = 0;
    x = getenv( "GPFSMPIO_TIMING" );
    if (x) gpfsmpio_timing = atoi(x);
    gpfsmpio_tunegather = 1;
    x = getenv( "GPFSMPIO_TUNEGATHER" );
    if (x) gpfsmpio_tunegather = atoi(x);
    gpfsmpio_tuneblocking = 1;
    x = getenv( "GPFSMPIO_TUNEBLOCKING" );
    if (x) gpfsmpio_tuneblocking = atoi(x);
    bglocklessmpio_f_type = PVFS2_SUPER_MAGIC;
    x = getenv( "BGLOCKLESSMPIO_F_TYPE" );
    if (x) bglocklessmpio_f_type = strtol(x,&dummy,0);
    DBG_FPRINTF(stderr,"BGLOCKLESSMPIO_F_TYPE=%ld/%#lX\n",
                bglocklessmpio_f_type,bglocklessmpio_f_type);
    /* note: this value will be 'sanity checked' in ADIOI_BG_persInfo_init(),
     * when we know a bit more about what "largest possible value" and
     * "smallest possible value" should be */
    gpfsmpio_bg_nagg_pset = ADIOI_BG_NAGG_PSET_DFLT;
    x = getenv("GPFSMPIO_NAGG_PSET");
    if (x) gpfsmpio_bg_nagg_pset = atoi(x);

    gpfsmpio_pthreadio = 0;
    x = getenv( "GPFSMPIO_PTHREADIO" );
    if (x) gpfsmpio_pthreadio = atoi(x);

    gpfsmpio_p2pcontig = 0;
    x = getenv( "GPFSMPIO_P2PCONTIG" );
    if (x) gpfsmpio_p2pcontig = atoi(x);

    gpfsmpio_balancecontig = 0;
    x = getenv( "GPFSMPIO_BALANCECONTIG" );
    if (x) gpfsmpio_balancecontig = atoi(x);

    gpfsmpio_devnullio = 0;
    x = getenv( "GPFSMPIO_DEVNULLIO" );
    if (x) gpfsmpio_devnullio = atoi(x);

    gpfsmpio_bridgeringagg = 0;
    x = getenv( "GPFSMPIO_BRIDGERINGAGG" );
    if (x) gpfsmpio_bridgeringagg = atoi(x);
}

/* report timing breakdown for MPI I/O collective call */
void ad_gpfs_timing_crw_report( int rw, ADIO_File fd, int myrank, int nprocs )
{
    int i;

    if (gpfsmpio_timing) {
        /* Timing across the whole communicator is a little bit interesting,
         * but what is *more* interesting is if we single out the aggregators
         * themselves.  Non-aggregators spend a lot of time in "exchange" not
         * exchanging data, but blocked because they are waiting for
         * aggregators to finish writing.  If we focus on just the aggregator
         * processes we will get a clearer picture of the data exchange
         * vs. I/O time breakdown. */

        /* if deferred open is enabled, we could use the aggregator communicator */
        MPI_Comm agg_comm;
        int nr_aggs, agg_rank;
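        /* Only aggregator ranks join agg_comm; non-aggregators pass
         * MPI_UNDEFINED as the color and therefore receive MPI_COMM_NULL. */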
        MPI_Comm_split(fd->comm, (fd->is_agg ? 1 : MPI_UNDEFINED), 0, &agg_comm);
        if(agg_comm != MPI_COMM_NULL) {
            MPI_Comm_size(agg_comm, &nr_aggs);
            MPI_Comm_rank(agg_comm, &agg_rank);
        }

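        /* Select the profile counters for this call: gpfsmpio_prof_cw holds
         * the collective-write timers, gpfsmpio_prof_cr the collective-read
         * timers.  They are then reduced across the aggregator-only
         * communicator (sum for averages, max for the worst case). */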
        double *gpfsmpio_prof_org = gpfsmpio_prof_cr;
        if (rw) gpfsmpio_prof_org = gpfsmpio_prof_cw;

        double gpfsmpio_prof_avg[ GPFSMPIO_CIO_LAST ];
        double gpfsmpio_prof_max[ GPFSMPIO_CIO_LAST ];

        if( agg_comm != MPI_COMM_NULL) {
            MPI_Reduce( gpfsmpio_prof_org, gpfsmpio_prof_avg, GPFSMPIO_CIO_LAST, MPI_DOUBLE, MPI_SUM, 0, agg_comm);
            MPI_Reduce( gpfsmpio_prof_org, gpfsmpio_prof_max, GPFSMPIO_CIO_LAST, MPI_DOUBLE, MPI_MAX, 0, agg_comm);
        }
        if (agg_comm != MPI_COMM_NULL && agg_rank == 0) {

            for (i=0; i<GPFSMPIO_CIO_LAST; i++) gpfsmpio_prof_avg[i] /= nr_aggs;

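            /* Derive effective bandwidths: the average bytes per aggregator
             * times the number of aggregators gives the total data moved,
             * divided by the slowest (max) time across aggregators for the
             * corresponding phase. */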
            gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_POSI_RW ] =
                gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
                gpfsmpio_prof_max[ GPFSMPIO_CIO_T_POSI_RW ];
            gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_RW ] =
                gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
                gpfsmpio_prof_max[ GPFSMPIO_CIO_T_MPIO_RW ];

            gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_CRW ] =
                gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs /
                gpfsmpio_prof_max[ GPFSMPIO_CIO_T_MPIO_CRW ];

            fprintf(stderr,"TIMING-%1s,", (rw ? "W" : "R") );
            fprintf(stderr,"SIZE: %12.4lld , ", (long long int)(gpfsmpio_prof_avg[ GPFSMPIO_CIO_DATA_SIZE ] * nr_aggs));
            fprintf(stderr,"SEEK-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_SEEK ] );
            fprintf(stderr,"SEEK-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_SEEK ] );
            fprintf(stderr,"LOCAL-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_LCOMP ] );
            fprintf(stderr,"GATHER-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_GATHER ] );
            fprintf(stderr,"PATTERN-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_PATANA ] );
            fprintf(stderr,"FILEDOMAIN-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_FD_PART ] );
            fprintf(stderr,"MYREQ-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MYREQ ] );
            fprintf(stderr,"OTHERREQ-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_OTHREQ ] );
            fprintf(stderr,"EXCHANGE-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH ] );
            fprintf(stderr, "EXCHANGE-RECV_EXCH-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_RECV_EXCH] );
            fprintf(stderr, "EXCHANGE-SETUP-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SETUP] );
            fprintf(stderr, "EXCHANGE-NET-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_NET] );
            fprintf(stderr, "EXCHANGE-SORT-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SORT] );
            fprintf(stderr, "EXCHANGE-SIEVE-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_DEXCH_SIEVE] );
            fprintf(stderr,"POSIX-TIME-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_POSI_RW ] );
            fprintf(stderr,"POSIX-TIME-max: %10.3f , ",
                    gpfsmpio_prof_max[ GPFSMPIO_CIO_T_POSI_RW ] );
            fprintf(stderr,"MPIIO-CONTIG-TIME-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MPIO_RW ] );
            fprintf(stderr,"MPIIO-STRIDED-TIME-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_T_MPIO_CRW ] );
            fprintf(stderr,"POSIX-BW-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_POSI_RW ] );
            fprintf(stderr,"MPI-BW-avg: %10.3f , ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_RW ] );
            fprintf(stderr,"MPI-BW-collective-avg: %10.3f\n ",
                    gpfsmpio_prof_avg[ GPFSMPIO_CIO_B_MPIO_CRW ] );
        }
        if (agg_comm != MPI_COMM_NULL) MPI_Comm_free(&agg_comm);
    }

}