1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
2 /*
3 *
4 * Copyright (C) 1997 University of Chicago.
5 * See COPYRIGHT notice in top-level directory.
6 */
7
8 #include "adio.h"
9 #include "adio_extern.h"
10 #include "adio_cb_config_list.h"
11
12 #include "mpio.h"
13 static int is_aggregator(int rank, ADIO_File fd);
14 static int uses_generic_read(ADIO_File fd);
15 static int uses_generic_write(ADIO_File fd);
16 static int build_cb_config_list(ADIO_File fd,
17 MPI_Comm orig_comm, MPI_Comm comm,
18 int rank, int procs, int *error_code);
19
ADIO_Open(MPI_Comm orig_comm,MPI_Comm comm,const char * filename,int file_system,ADIOI_Fns * ops,int access_mode,ADIO_Offset disp,MPI_Datatype etype,MPI_Datatype filetype,MPI_Info info,int perm,int * error_code)20 MPI_File ADIO_Open(MPI_Comm orig_comm,
21 MPI_Comm comm, const char *filename, int file_system,
22 ADIOI_Fns *ops,
23 int access_mode, ADIO_Offset disp, MPI_Datatype etype,
24 MPI_Datatype filetype,
25 MPI_Info info, int perm, int *error_code)
26 {
27 MPI_File mpi_fh;
28 ADIO_File fd;
29 int err, rank, procs;
30 static char myname[] = "ADIO_OPEN";
31 int max_error_code;
32 MPI_Info dupinfo;
33 int syshints_processed, can_skip;
34 char *p;
35
36 *error_code = MPI_SUCCESS;
37
38 /* obtain MPI_File handle */
39 mpi_fh = MPIO_File_create(sizeof(struct ADIOI_FileD));
40 if (mpi_fh == MPI_FILE_NULL) {
41 fd = MPI_FILE_NULL;
42 *error_code = MPIO_Err_create_code(*error_code,
43 MPIR_ERR_RECOVERABLE,
44 myname,
45 __LINE__,
46 MPI_ERR_OTHER,
47 "**nomem2",0);
48 goto fn_exit;
49
50 }
51 fd = MPIO_File_resolve(mpi_fh);
52
53 fd->cookie = ADIOI_FILE_COOKIE;
54 fd->fp_ind = disp;
55 fd->fp_sys_posn = 0;
56 fd->comm = comm; /* dup'ed in MPI_File_open */
57 fd->filename = ADIOI_Strdup(filename);
58 fd->file_system = file_system;
59 fd->fs_ptr = NULL;
60
61 fd->fns = ops;
62
63 fd->disp = disp;
64 fd->split_coll_count = 0;
65 fd->shared_fp_fd = ADIO_FILE_NULL;
66 fd->atomicity = 0;
67 fd->etype = etype; /* MPI_BYTE by default */
68 fd->filetype = filetype; /* MPI_BYTE by default */
69 fd->etype_size = 1; /* default etype is MPI_BYTE */
70
71 fd->file_realm_st_offs = NULL;
72 fd->file_realm_types = NULL;
73
74 fd->perm = perm;
75
76 fd->async_count = 0;
77
78 fd->fortran_handle = -1;
79
80 fd->err_handler = ADIOI_DFLT_ERR_HANDLER;
81
82 fd->io_buf_window = MPI_WIN_NULL;
83 fd->io_buf_put_amounts_window = MPI_WIN_NULL;
84
85 MPI_Comm_rank(comm, &rank);
86 MPI_Comm_size(comm, &procs);
87 /* create and initialize info object */
88 fd->hints = (ADIOI_Hints *)ADIOI_Calloc(1, sizeof(struct ADIOI_Hints_struct));
89 if (fd->hints == NULL) {
90 *error_code = MPIO_Err_create_code(*error_code,
91 MPIR_ERR_RECOVERABLE,
92 myname,
93 __LINE__,
94 MPI_ERR_OTHER,
95 "**nomem2",0);
96 goto fn_exit;
97 }
98 fd->hints->cb_config_list = NULL;
99 fd->hints->ranklist = NULL;
100 fd->hints->initialized = 0;
101 fd->info = MPI_INFO_NULL;
102
103 /* move system-wide hint processing *back* into open, but this time the
104 * hintfile reader will do a scalable read-and-broadcast. The global
105 * ADIOI_syshints will get initialized at first open. subsequent open
106 * calls will just use result from first open.
107 *
108 * We have two goals here:
109 * 1: avoid processing the hintfile multiple times
110 * 2: have all processes participate in hintfile processing (so we can read-and-broadcast)
111 *
112 * a code might do an "initialize from 0", so we can only skip hint
113 * processing once everyone has particpiated in hint processing */
114 if (ADIOI_syshints == MPI_INFO_NULL)
115 syshints_processed = 0;
116 else
117 syshints_processed = 1;
118
119 MPI_Allreduce(&syshints_processed, &can_skip, 1, MPI_INT, MPI_MIN, fd->comm);
120 if (!can_skip) {
121 if (ADIOI_syshints == MPI_INFO_NULL)
122 MPI_Info_create(&ADIOI_syshints);
123 ADIOI_process_system_hints(fd, ADIOI_syshints);
124 }
125
126 ADIOI_incorporate_system_hints(info, ADIOI_syshints, &dupinfo);
127 ADIO_SetInfo(fd, dupinfo, &err);
128 if (dupinfo != MPI_INFO_NULL) {
129 *error_code = MPI_Info_free(&dupinfo);
130 if (*error_code != MPI_SUCCESS)
131 goto fn_exit;
132 }
133 ADIOI_Info_set(fd->info, "romio_filesystem_type", fd->fns->fsname);
134
135 /* Instead of repeatedly allocating this buffer in collective read/write,
136 * allocating up-front might make memory management on small platforms
137 * (e.g. Blue Gene) more efficent */
138
139 fd->io_buf = ADIOI_Malloc(fd->hints->cb_buffer_size);
140 /* deferred open:
141 * we can only do this optimization if 'fd->hints->deferred_open' is set
142 * (which means the user hinted 'no_indep_rw' and collective buffering).
143 * Furthermore, we only do this if our collective read/write routines use
144 * our generic function, and not an fs-specific routine (we can defer opens
145 * only if we use our aggreagation code). */
146 if (fd->hints->deferred_open &&
147 !(uses_generic_read(fd) \
148 && uses_generic_write(fd))) {
149 fd->hints->deferred_open = 0;
150 }
151 if (ADIO_Feature(fd, ADIO_SCALABLE_OPEN))
152 /* disable deferred open on these fs so that scalable broadcast
153 * will always use the propper communicator */
154 fd->hints->deferred_open = 0;
155
156
157 /* on BlueGene, the cb_config_list is built when hints are processed. No
158 * one else does that right now */
159 if (fd->hints->ranklist == NULL) {
160 build_cb_config_list(fd, orig_comm, comm, rank, procs, error_code);
161 if (*error_code != MPI_SUCCESS)
162 goto fn_exit;
163 }
164 fd->is_open = 0;
165 fd->my_cb_nodes_index = -2;
166 fd->is_agg = is_aggregator(rank, fd);
167 /* deferred open used to split the communicator to create an "aggregator
168 * communicator", but we only used it as a way to indicate that deferred
169 * open happened. fd->is_open and fd->is_agg are sufficient */
170
171 /* actual opens start here */
172 /* generic open: one process opens to create the file, all others open */
173 /* nfs open: everybody opens or else you'll end up with "file not found"
174 * due to stupid nfs consistency semantics */
175 /* scalable open: one process opens and broadcasts results to everyone */
176
177 ADIOI_OpenColl(fd, rank, access_mode, error_code);
178
179 /* deferred open consideration: if an independent process lied about
180 * "no_indep_rw" and opens the file later (example: HDF5 uses independent
181 * i/o for metadata), that deferred open will use the access_mode provided
182 * by the user. CREATE|EXCL only makes sense here -- exclusive access in
183 * the deferred open case is going to fail and surprise the user. Turn off
184 * the excl amode bit. Save user's ammode for MPI_FILE_GET_AMODE */
185 fd->orig_access_mode = access_mode;
186 if (fd->access_mode & ADIO_EXCL) fd->access_mode ^= ADIO_EXCL;
187
188
189 /* for debugging, it can be helpful to see the hints selected. Some file
190 * systes set up the hints in the open call (e.g. lustre) */
191 p = getenv("ROMIO_PRINT_HINTS");
192 if (rank == 0 && p != NULL ) {
193 ADIOI_Info_print_keyvals(fd->info);
194 }
195
196 fn_exit:
197 MPI_Allreduce(error_code, &max_error_code, 1, MPI_INT, MPI_MAX, comm);
198 if (max_error_code != MPI_SUCCESS) {
199
200 /* If the file was successfully opened, close it */
201 if (*error_code == MPI_SUCCESS) {
202
203 /* in the deferred open case, only those who have actually
204 opened the file should close it */
205 if (fd->hints->deferred_open) {
206 if (fd->is_agg) {
207 (*(fd->fns->ADIOI_xxx_Close))(fd, error_code);
208 }
209 }
210 else {
211 (*(fd->fns->ADIOI_xxx_Close))(fd, error_code);
212 }
213 }
214 ADIOI_Free(fd->filename);
215 ADIOI_Free(fd->hints->ranklist);
216 if ( fd->hints->cb_config_list != NULL ) ADIOI_Free(fd->hints->cb_config_list);
217 ADIOI_Free(fd->hints);
218 if (fd->info != MPI_INFO_NULL) MPI_Info_free(&(fd->info));
219 ADIOI_Free(fd->io_buf);
220 ADIOI_Free(fd);
221 fd = ADIO_FILE_NULL;
222 if (*error_code == MPI_SUCCESS)
223 {
224 *error_code = MPIO_Err_create_code(MPI_SUCCESS,
225 MPIR_ERR_RECOVERABLE, myname,
226 __LINE__, MPI_ERR_IO,
227 "**oremote_fail", 0);
228 }
229 }
230
231 return fd;
232 }
233
234 /* a simple linear search. possible enancement: add a my_cb_nodes_index member
235 * ( index into cb_nodes, else -1 if not aggregator ) for faster lookups
236 *
237 * fd->hints->cb_nodes is the number of aggregators
238 * fd->hints->ranklist[] is an array of the ranks of aggregators
239 *
240 * might want to move this to adio/common/cb_config_list.c
241 */
is_aggregator(int rank,ADIO_File fd)242 int is_aggregator(int rank, ADIO_File fd ) {
243 int i;
244
245 if (fd->my_cb_nodes_index == -2) {
246 for (i=0; i< fd->hints->cb_nodes; i++ ) {
247 if ( rank == fd->hints->ranklist[i] ) {
248 fd->my_cb_nodes_index = i;
249 return 1;
250 }
251 }
252 fd->my_cb_nodes_index = -1;
253 }
254 else if (fd->my_cb_nodes_index != -1)
255 return 1;
256
257 return 0;
258 }
259
260 /*
261 * If file system implements some version of two-phase -- doesn't have to be
262 * generic -- we can still carry out the defered open optimization
263 */
uses_generic_read(ADIO_File fd)264 static int uses_generic_read(ADIO_File fd)
265 {
266 if (ADIO_Feature(fd, ADIO_TWO_PHASE))
267 return 1;
268 return 0;
269 }
270
/*
 * Write-side counterpart of the two-phase check.
 *
 * Returns 1 when ADIO_Feature() reports ADIO_TWO_PHASE for fd, else 0.
 */
static int uses_generic_write(ADIO_File fd)
{
    return ADIO_Feature(fd, ADIO_TWO_PHASE) ? 1 : 0;
}
277
/*
 * Build the aggregator rank list for fd.
 *
 * Rank 0 parses fd->hints->cb_config_list against the gathered processor
 * name array, stores the result in fd->hints->ranklist / fd->hints->cb_nodes
 * and records "cb_nodes" in fd->info.  Every rank then calls
 * ADIOI_cb_bcast_rank_map().
 *
 * fd         - file object whose hints are filled in
 * orig_comm  - passed to ADIOI_cb_gather_name_array()
 * comm       - passed to ADIOI_cb_gather_name_array()
 * rank       - caller's rank; only rank 0 parses the config list
 * procs      - number of processes; sizes the scratch rank list
 * error_code - out: set on allocation failure ("**nomem2") or when no
 *              aggregator was selected ("**ioagnomatch"); otherwise left
 *              untouched
 *
 * Always returns 0; failures are reported through error_code only.
 */
static int build_cb_config_list(ADIO_File fd,
                                MPI_Comm orig_comm, MPI_Comm comm,
                                int rank, int procs, int *error_code)
{
    ADIO_cb_name_array array;
    int *tmp_ranklist;
    int rank_ct;
    int alloc_failed = 0;
    char value[MPI_MAX_INFO_VAL + 1];
    static char myname[] = "ADIO_OPEN cb_config_list";

    /* gather the processor name array if we don't already have it */
    /* this has to be done early in ADIO_Open so that we can cache the name
     * array in both the dup'd communicator (in case we want it later) and the
     * original communicator */
    ADIOI_cb_gather_name_array(orig_comm, comm, &array);

    /* parse the cb_config_list and create a rank map on rank 0 */
    if (rank == 0) {
        rank_ct = 0;
        tmp_ranklist = (int *) ADIOI_Malloc(sizeof(int) * procs);
        if (tmp_ranklist == NULL) {
            alloc_failed = 1;
        } else {
            rank_ct = ADIOI_cb_config_list_parse(fd->hints->cb_config_list,
                                                 array, tmp_ranklist,
                                                 fd->hints->cb_nodes);

            /* store the ranklist using the minimum amount of memory */
            if (rank_ct > 0) {
                fd->hints->ranklist = (int *) ADIOI_Malloc(sizeof(int) * rank_ct);
                if (fd->hints->ranklist == NULL) {
                    alloc_failed = 1;
                    rank_ct = 0;
                } else {
                    memcpy(fd->hints->ranklist, tmp_ranklist,
                           sizeof(int) * rank_ct);
                }
            }
            ADIOI_Free(tmp_ranklist);
        }

        /* On allocation failure this is 0 with ranklist == NULL, the same
         * state the rank-map call below already sees when parsing selects
         * no aggregator. */
        fd->hints->cb_nodes = rank_ct;

        if (alloc_failed) {
            *error_code = MPIO_Err_create_code(*error_code,
                                               MPIR_ERR_RECOVERABLE,
                                               myname,
                                               __LINE__,
                                               MPI_ERR_OTHER,
                                               "**nomem2", 0);
        } else {
            /* TEMPORARY -- REMOVE WHEN NO LONGER UPDATING INFO FOR FS-INDEP. */
            ADIOI_Snprintf(value, MPI_MAX_INFO_VAL + 1, "%d", rank_ct);
            ADIOI_Info_set(fd->info, "cb_nodes", value);
        }
    }

    /* Every rank reaches this call, so rank 0 must not return before it even
     * when its allocations failed (presumably a collective broadcast from
     * rank 0 -- confirm against its definition). */
    ADIOI_cb_bcast_rank_map(fd);

    /* keep the more specific out-of-memory error where one was recorded */
    if (fd->hints->cb_nodes <= 0 && !alloc_failed) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**ioagnomatch", 0);
    }
    return 0;
}
334
335 /*
336 * vim: ts=8 sts=4 sw=4 noexpandtab
337 */
338