1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
2 /*
3  *
4  *   Copyright (C) 1997 University of Chicago.
5  *   See COPYRIGHT notice in top-level directory.
6  */
7 
8 #include "adio.h"
9 #include "adio_extern.h"
10 #include "adio_cb_config_list.h"
11 
12 #include "mpio.h"
13 static int is_aggregator(int rank, ADIO_File fd);
14 static int uses_generic_read(ADIO_File fd);
15 static int uses_generic_write(ADIO_File fd);
16 static int build_cb_config_list(ADIO_File fd,
17 	MPI_Comm orig_comm, MPI_Comm comm,
18 	int rank, int procs, int *error_code);
19 
ADIO_Open(MPI_Comm orig_comm,MPI_Comm comm,const char * filename,int file_system,ADIOI_Fns * ops,int access_mode,ADIO_Offset disp,MPI_Datatype etype,MPI_Datatype filetype,MPI_Info info,int perm,int * error_code)20 MPI_File ADIO_Open(MPI_Comm orig_comm,
21 		   MPI_Comm comm, const char *filename, int file_system,
22 		   ADIOI_Fns *ops,
23 		   int access_mode, ADIO_Offset disp, MPI_Datatype etype,
24 		   MPI_Datatype filetype,
25 		   MPI_Info info, int perm, int *error_code)
26 {
27     MPI_File mpi_fh;
28     ADIO_File fd;
29     int err, rank, procs;
30     static char myname[] = "ADIO_OPEN";
31     int  max_error_code;
32     MPI_Info dupinfo;
33     int syshints_processed, can_skip;
34     char *p;
35 
36     *error_code = MPI_SUCCESS;
37 
38     /* obtain MPI_File handle */
39     mpi_fh = MPIO_File_create(sizeof(struct ADIOI_FileD));
40     if (mpi_fh == MPI_FILE_NULL) {
41 	fd = MPI_FILE_NULL;
42 	*error_code = MPIO_Err_create_code(*error_code,
43 					   MPIR_ERR_RECOVERABLE,
44 					   myname,
45 					   __LINE__,
46 					   MPI_ERR_OTHER,
47 					   "**nomem2",0);
48 	goto fn_exit;
49 
50     }
51     fd = MPIO_File_resolve(mpi_fh);
52 
53     fd->cookie = ADIOI_FILE_COOKIE;
54     fd->fp_ind = disp;
55     fd->fp_sys_posn = 0;
56     fd->comm = comm;       /* dup'ed in MPI_File_open */
57     fd->filename = ADIOI_Strdup(filename);
58     fd->file_system = file_system;
59     fd->fs_ptr = NULL;
60 
61     fd->fns = ops;
62 
63     fd->disp = disp;
64     fd->split_coll_count = 0;
65     fd->shared_fp_fd = ADIO_FILE_NULL;
66     fd->atomicity = 0;
67     fd->etype = etype;          /* MPI_BYTE by default */
68     fd->filetype = filetype;    /* MPI_BYTE by default */
69     fd->etype_size = 1;  /* default etype is MPI_BYTE */
70 
71     fd->file_realm_st_offs = NULL;
72     fd->file_realm_types = NULL;
73 
74     fd->perm = perm;
75 
76     fd->async_count = 0;
77 
78     fd->fortran_handle = -1;
79 
80     fd->err_handler = ADIOI_DFLT_ERR_HANDLER;
81 
82     fd->io_buf_window = MPI_WIN_NULL;
83     fd->io_buf_put_amounts_window = MPI_WIN_NULL;
84 
85     MPI_Comm_rank(comm, &rank);
86     MPI_Comm_size(comm, &procs);
87 /* create and initialize info object */
88     fd->hints = (ADIOI_Hints *)ADIOI_Calloc(1, sizeof(struct ADIOI_Hints_struct));
89     if (fd->hints == NULL) {
90 	*error_code = MPIO_Err_create_code(*error_code,
91 					   MPIR_ERR_RECOVERABLE,
92 					   myname,
93 					   __LINE__,
94 					   MPI_ERR_OTHER,
95 					   "**nomem2",0);
96 	goto fn_exit;
97     }
98     fd->hints->cb_config_list = NULL;
99     fd->hints->ranklist = NULL;
100     fd->hints->initialized = 0;
101     fd->info = MPI_INFO_NULL;
102 
103     /* move system-wide hint processing *back* into open, but this time the
104      * hintfile reader will do a scalable read-and-broadcast.  The global
105      * ADIOI_syshints will get initialized at first open.  subsequent open
106      * calls will just use result from first open.
107      *
108      * We have two goals here:
109      * 1: avoid processing the hintfile multiple times
110      * 2: have all processes participate in hintfile processing (so we can read-and-broadcast)
111      *
112      * a code might do an "initialize from 0", so we can only skip hint
113      * processing once everyone has particpiated in hint processing */
114     if (ADIOI_syshints == MPI_INFO_NULL)
115 	syshints_processed = 0;
116     else
117 	syshints_processed = 1;
118 
119     MPI_Allreduce(&syshints_processed, &can_skip, 1, MPI_INT, MPI_MIN, fd->comm);
120     if (!can_skip) {
121 	if (ADIOI_syshints == MPI_INFO_NULL)
122 	    MPI_Info_create(&ADIOI_syshints);
123 	ADIOI_process_system_hints(fd, ADIOI_syshints);
124     }
125 
126     ADIOI_incorporate_system_hints(info, ADIOI_syshints, &dupinfo);
127     ADIO_SetInfo(fd, dupinfo, &err);
128     if (dupinfo != MPI_INFO_NULL) {
129 	*error_code = MPI_Info_free(&dupinfo);
130 	if (*error_code != MPI_SUCCESS)
131 	    goto fn_exit;
132     }
133     ADIOI_Info_set(fd->info, "romio_filesystem_type", fd->fns->fsname);
134 
135     /* Instead of repeatedly allocating this buffer in collective read/write,
136      * allocating up-front might make memory management on small platforms
137      * (e.g. Blue Gene) more efficent */
138 
139     fd->io_buf = ADIOI_Malloc(fd->hints->cb_buffer_size);
140      /* deferred open:
141      * we can only do this optimization if 'fd->hints->deferred_open' is set
142      * (which means the user hinted 'no_indep_rw' and collective buffering).
143      * Furthermore, we only do this if our collective read/write routines use
144      * our generic function, and not an fs-specific routine (we can defer opens
145      * only if we use our aggreagation code). */
146     if (fd->hints->deferred_open &&
147 		    !(uses_generic_read(fd) \
148 			    && uses_generic_write(fd))) {
149 	    fd->hints->deferred_open = 0;
150     }
151     if (ADIO_Feature(fd, ADIO_SCALABLE_OPEN))
152 	    /* disable deferred open on these fs so that scalable broadcast
153 	     * will always use the propper communicator */
154 	    fd->hints->deferred_open = 0;
155 
156 
157     /* on BlueGene, the cb_config_list is built when hints are processed. No
158      * one else does that right now */
159     if (fd->hints->ranklist == NULL) {
160 	build_cb_config_list(fd, orig_comm, comm, rank, procs, error_code);
161 	if (*error_code != MPI_SUCCESS)
162 	    goto fn_exit;
163     }
164     fd->is_open = 0;
165     fd->my_cb_nodes_index = -2;
166     fd->is_agg = is_aggregator(rank, fd);
167     /* deferred open used to split the communicator to create an "aggregator
168      * communicator", but we only used it as a way to indicate that deferred
169      * open happened.  fd->is_open and fd->is_agg are sufficient */
170 
171     /* actual opens start here */
172     /* generic open: one process opens to create the file, all others open */
173     /* nfs open: everybody opens or else you'll end up with "file not found"
174      * due to stupid nfs consistency semantics */
175     /* scalable open: one process opens and broadcasts results to everyone */
176 
177     ADIOI_OpenColl(fd, rank, access_mode, error_code);
178 
179     /* deferred open consideration: if an independent process lied about
180      * "no_indep_rw" and opens the file later (example: HDF5 uses independent
181      * i/o for metadata), that deferred open will use the access_mode provided
182      * by the user.  CREATE|EXCL only makes sense here -- exclusive access in
183      * the deferred open case is going to fail and surprise the user.  Turn off
184      * the excl amode bit. Save user's ammode for MPI_FILE_GET_AMODE */
185     fd->orig_access_mode = access_mode;
186     if (fd->access_mode & ADIO_EXCL) fd->access_mode ^= ADIO_EXCL;
187 
188 
189     /* for debugging, it can be helpful to see the hints selected. Some file
190      * systes set up the hints in the open call (e.g. lustre) */
191     p = getenv("ROMIO_PRINT_HINTS");
192     if (rank == 0 && p != NULL ) {
193 	ADIOI_Info_print_keyvals(fd->info);
194     }
195 
196  fn_exit:
197     MPI_Allreduce(error_code, &max_error_code, 1, MPI_INT, MPI_MAX, comm);
198     if (max_error_code != MPI_SUCCESS) {
199 
200         /* If the file was successfully opened, close it */
201         if (*error_code == MPI_SUCCESS) {
202 
203             /* in the deferred open case, only those who have actually
204                opened the file should close it */
205             if (fd->hints->deferred_open)  {
206                 if (fd->is_agg) {
207                     (*(fd->fns->ADIOI_xxx_Close))(fd, error_code);
208                 }
209             }
210             else {
211                 (*(fd->fns->ADIOI_xxx_Close))(fd, error_code);
212             }
213         }
214 	ADIOI_Free(fd->filename);
215 	ADIOI_Free(fd->hints->ranklist);
216 	if ( fd->hints->cb_config_list != NULL ) ADIOI_Free(fd->hints->cb_config_list);
217 	ADIOI_Free(fd->hints);
218 	if (fd->info != MPI_INFO_NULL) MPI_Info_free(&(fd->info));
219 	ADIOI_Free(fd->io_buf);
220 	ADIOI_Free(fd);
221         fd = ADIO_FILE_NULL;
222 	if (*error_code == MPI_SUCCESS)
223 	{
224 	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
225 					       MPIR_ERR_RECOVERABLE, myname,
226 					       __LINE__, MPI_ERR_IO,
227 					       "**oremote_fail", 0);
228 	}
229     }
230 
231     return fd;
232 }
233 
/* a simple linear search whose result is cached in fd->my_cb_nodes_index
 * ( index into ranklist[] if this rank is an aggregator, -1 if it is not,
 *   -2 if the search has not been done yet )
 *
 * fd->hints->cb_nodes is the number of aggregators
 * fd->hints->ranklist[] is an array of the ranks of aggregators
 *
 * might want to move this to adio/common/cb_config_list.c
 */
static int is_aggregator(int rank, ADIO_File fd)
{
    int i;

    /* my_cb_nodes_index caches the answer: -2 means the lookup has not been
     * done yet, -1 means "not an aggregator", anything else is this rank's
     * position in fd->hints->ranklist[]. */
    if (fd->my_cb_nodes_index != -2) {
        return (fd->my_cb_nodes_index != -1) ? 1 : 0;
    }

    /* first call: scan the list of aggregator ranks and remember the result */
    for (i = 0; i < fd->hints->cb_nodes; i++) {
        if (fd->hints->ranklist[i] == rank) {
            fd->my_cb_nodes_index = i;
            return 1;
        }
    }

    fd->my_cb_nodes_index = -1;
    return 0;
}
259 
/*
 * If the file system implements some version of two-phase -- it doesn't have
 * to be generic -- we can still carry out the deferred open optimization
 */
static int uses_generic_read(ADIO_File fd)
{
    /* 1 when the file system reports the ADIO_TWO_PHASE feature, else 0 */
    return ADIO_Feature(fd, ADIO_TWO_PHASE) ? 1 : 0;
}
270 
static int uses_generic_write(ADIO_File fd)
{
    /* 1 when the file system reports the ADIO_TWO_PHASE feature, else 0 */
    return ADIO_Feature(fd, ADIO_TWO_PHASE) ? 1 : 0;
}
277 
/*
 * Build the aggregator rank list for fd.
 *
 * Rank 0 parses fd->hints->cb_config_list against the gathered processor
 * names, stores the resulting ranks in fd->hints->ranklist, updates
 * fd->hints->cb_nodes and the "cb_nodes" info key, and the result is then
 * handed to ADIOI_cb_bcast_rank_map.
 *
 * Always returns 0; failures are reported through *error_code, which is
 * left untouched on success.
 */
static int build_cb_config_list(ADIO_File fd,
        MPI_Comm orig_comm, MPI_Comm comm,
        int rank, int procs, int *error_code)
{
    ADIO_cb_name_array array;
    static char myname[] = "ADIO_OPEN cb_config_list";

    /* gather the processor name array if we don't already have it */
    /* this has to be done early in ADIO_Open so that we can cache the name
     * array in both the dup'd communicator (in case we want it later) and the
     * original communicator */
    ADIOI_cb_gather_name_array(orig_comm, comm, &array);

    /* parse the cb_config_list and create a rank map on rank 0 */
    if (rank == 0) {
        int *tmp_ranklist;
        int rank_ct;
        char value[32];         /* ample room for a decimal int */

        /* worst case: every process is an aggregator */
        tmp_ranklist = ADIOI_Malloc(sizeof(int) * procs);
        if (tmp_ranklist == NULL) {
            /* NOTE(review): returning here skips ADIOI_cb_bcast_rank_map on
             * rank 0 only; if that routine is collective the other ranks
             * would be left waiting -- confirm. */
            *error_code = MPIO_Err_create_code(*error_code,
                                               MPIR_ERR_RECOVERABLE,
                                               myname,
                                               __LINE__,
                                               MPI_ERR_OTHER,
                                               "**nomem2", 0);
            return 0;
        }

        rank_ct = ADIOI_cb_config_list_parse(fd->hints->cb_config_list,
                                             array, tmp_ranklist,
                                             fd->hints->cb_nodes);

        /* store the ranklist using the minimum amount of memory */
        if (rank_ct > 0) {
            fd->hints->ranklist = ADIOI_Malloc(sizeof(int) * rank_ct);
            memcpy(fd->hints->ranklist, tmp_ranklist, sizeof(int) * rank_ct);
        }
        ADIOI_Free(tmp_ranklist);
        fd->hints->cb_nodes = rank_ct;

        /* TEMPORARY -- REMOVE WHEN NO LONGER UPDATING INFO FOR FS-INDEP. */
        ADIOI_Snprintf(value, sizeof(value), "%d", rank_ct);
        ADIOI_Info_set(fd->info, "cb_nodes", value);
    }

    ADIOI_cb_bcast_rank_map(fd);
    if (fd->hints->cb_nodes <= 0) {
        /* no usable aggregator; the caller releases fd when it sees the
         * error code */
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**ioagnomatch", 0);
    }
    return 0;
}
334 
335 /*
336  * vim: ts=8 sts=4 sw=4 noexpandtab
337  */
338