1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
2 /*
3  *
4  *   Copyright (C) 1997 University of Chicago.
5  *   See COPYRIGHT notice in top-level directory.
6  */
7 
8 #include "adio.h"
9 #include "adio_extern.h"
10 #include "adio_cb_config_list.h"
11 
12 #include "mpio.h"
13 
14 static int is_aggregator(int rank, ADIO_File fd);
15 static int uses_generic_read(ADIO_File fd);
16 static int uses_generic_write(ADIO_File fd);
17 static int build_cb_config_list(ADIO_File fd,
18 	MPI_Comm orig_comm, MPI_Comm comm,
19 	int rank, int procs, int *error_code);
20 
ADIO_Open(MPI_Comm orig_comm,MPI_Comm comm,const char * filename,int file_system,ADIOI_Fns * ops,int access_mode,ADIO_Offset disp,MPI_Datatype etype,MPI_Datatype filetype,MPI_Info info,int perm,int * error_code)21 MPI_File ADIO_Open(MPI_Comm orig_comm,
22 		   MPI_Comm comm, const char *filename, int file_system,
23 		   ADIOI_Fns *ops,
24 		   int access_mode, ADIO_Offset disp, MPI_Datatype etype,
25 		   MPI_Datatype filetype,
26 		   MPI_Info info, int perm, int *error_code)
27 {
28     MPI_File mpi_fh;
29     ADIO_File fd;
30     int err, rank, procs;
31     static char myname[] = "ADIO_OPEN";
32     int  max_error_code;
33     MPI_Info dupinfo;
34     int syshints_processed, can_skip;
35     char *p;
36 
37     *error_code = MPI_SUCCESS;
38 
39     /* obtain MPI_File handle */
40     mpi_fh = MPIO_File_create(sizeof(struct ADIOI_FileD));
41     if (mpi_fh == MPI_FILE_NULL) {
42     }
43     fd = MPIO_File_resolve(mpi_fh);
44 
45     fd->cookie = ADIOI_FILE_COOKIE;
46     fd->fp_ind = disp;
47     fd->fp_sys_posn = 0;
48     fd->comm = comm;       /* dup'ed in MPI_File_open */
49     fd->filename = ADIOI_Strdup(filename);
50     fd->file_system = file_system;
51     fd->fs_ptr = NULL;
52 
53     fd->fns = ops;
54 
55     fd->disp = disp;
56     fd->split_coll_count = 0;
57     fd->shared_fp_fd = ADIO_FILE_NULL;
58     fd->atomicity = 0;
59     fd->etype = etype;          /* MPI_BYTE by default */
60     fd->filetype = filetype;    /* MPI_BYTE by default */
61     fd->etype_size = 1;  /* default etype is MPI_BYTE */
62 
63     fd->file_realm_st_offs = NULL;
64     fd->file_realm_types = NULL;
65 
66     fd->perm = perm;
67 
68     fd->async_count = 0;
69 
70     fd->fortran_handle = -1;
71 
72     fd->err_handler = ADIOI_DFLT_ERR_HANDLER;
73 
74     MPI_Comm_rank(comm, &rank);
75     MPI_Comm_size(comm, &procs);
76 /* create and initialize info object */
77     fd->hints = (ADIOI_Hints *)ADIOI_Calloc(1, sizeof(struct ADIOI_Hints_struct));
78     if (fd->hints == NULL) {
79 	*error_code = MPIO_Err_create_code(*error_code,
80 					   MPIR_ERR_RECOVERABLE,
81 					   myname,
82 					   __LINE__,
83 					   MPI_ERR_OTHER,
84 					   "**nomem2",0);
85 	goto fn_exit;
86     }
87     fd->hints->cb_config_list = NULL;
88     fd->hints->ranklist = NULL;
89     fd->hints->initialized = 0;
90     fd->info = MPI_INFO_NULL;
91 
92     /* move system-wide hint processing *back* into open, but this time the
93      * hintfile reader will do a scalable read-and-broadcast.  The global
94      * ADIOI_syshints will get initialized at first open.  subsequent open
95      * calls will just use result from first open.
96      *
97      * We have two goals here:
98      * 1: avoid processing the hintfile multiple times
99      * 2: have all processes participate in hintfile processing (so we can read-and-broadcast)
100      *
101      * a code might do an "initialize from 0", so we can only skip hint
102      * processing once everyone has particpiated in hint processing */
103     if (ADIOI_syshints == MPI_INFO_NULL)
104 	syshints_processed = 0;
105     else
106 	syshints_processed = 1;
107 
108     MPI_Allreduce(&syshints_processed, &can_skip, 1, MPI_INT, MPI_MIN, fd->comm);
109     if (!can_skip) {
110 	if (ADIOI_syshints == MPI_INFO_NULL)
111 	    MPI_Info_create(&ADIOI_syshints);
112 	ADIOI_process_system_hints(fd, ADIOI_syshints);
113     }
114 
115     ADIOI_incorporate_system_hints(info, ADIOI_syshints, &dupinfo);
116     ADIO_SetInfo(fd, dupinfo, &err);
117     if (dupinfo != MPI_INFO_NULL) {
118 	*error_code = MPI_Info_free(&dupinfo);
119 	if (*error_code != MPI_SUCCESS)
120 	    goto fn_exit;
121     }
122     ADIOI_Info_set(fd->info, "romio_filesystem_type", fd->fns->fsname);
123 
124     /* Instead of repeatedly allocating this buffer in collective read/write,
125      * allocating up-front might make memory management on small platforms
126      * (e.g. Blue Gene) more efficent */
127     fd->io_buf = ADIOI_Malloc(fd->hints->cb_buffer_size);
128 
129      /* deferred open:
130      * we can only do this optimization if 'fd->hints->deferred_open' is set
131      * (which means the user hinted 'no_indep_rw' and collective buffering).
132      * Furthermore, we only do this if our collective read/write routines use
133      * our generic function, and not an fs-specific routine (we can defer opens
134      * only if we use our aggreagation code). */
135     if (fd->hints->deferred_open &&
136 		    !(uses_generic_read(fd) \
137 			    && uses_generic_write(fd))) {
138 	    fd->hints->deferred_open = 0;
139     }
140     if (ADIO_Feature(fd, ADIO_SCALABLE_OPEN))
141 	    /* disable deferred open on these fs so that scalable broadcast
142 	     * will always use the propper communicator */
143 	    fd->hints->deferred_open = 0;
144 
145 
146     /* on BlueGene, the cb_config_list is built when hints are processed. No
147      * one else does that right now */
148     if (fd->hints->ranklist == NULL) {
149 	build_cb_config_list(fd, orig_comm, comm, rank, procs, error_code);
150 	if (*error_code != MPI_SUCCESS)
151 	    goto fn_exit;
152     }
153     /* for debugging, it can be helpful to see the hints selected */
154     p = getenv("ROMIO_PRINT_HINTS");
155     if (rank == 0 && p != NULL ) {
156 	ADIOI_Info_print_keyvals(fd->info);
157     }
158 
159     fd->is_open = 0;
160     fd->my_cb_nodes_index = -2;
161     fd->is_agg = is_aggregator(rank, fd);
162     /* deferred open used to split the communicator to create an "aggregator
163      * communicator", but we only used it as a way to indicate that deferred
164      * open happened.  fd->is_open and fd->is_agg are sufficient */
165 
166     /* actual opens start here */
167     /* generic open: one process opens to create the file, all others open */
168     /* nfs open: everybody opens or else you'll end up with "file not found"
169      * due to stupid nfs consistency semantics */
170     /* scalable open: one process opens and broadcasts results to everyone */
171 
172     ADIOI_OpenColl(fd, rank, access_mode, error_code);
173 
174  fn_exit:
175     MPI_Allreduce(error_code, &max_error_code, 1, MPI_INT, MPI_MAX, comm);
176     if (max_error_code != MPI_SUCCESS) {
177 
178         /* If the file was successfully opened, close it */
179         if (*error_code == MPI_SUCCESS) {
180 
181             /* in the deferred open case, only those who have actually
182                opened the file should close it */
183             if (fd->hints->deferred_open)  {
184                 if (fd->is_agg) {
185                     (*(fd->fns->ADIOI_xxx_Close))(fd, error_code);
186                 }
187             }
188             else {
189                 (*(fd->fns->ADIOI_xxx_Close))(fd, error_code);
190             }
191         }
192 	if (fd->filename) ADIOI_Free(fd->filename);
193 	if (fd->hints->ranklist) ADIOI_Free(fd->hints->ranklist);
194 	if (fd->hints->cb_config_list) ADIOI_Free(fd->hints->cb_config_list);
195 	if (fd->hints) ADIOI_Free(fd->hints);
196 	if (fd->info != MPI_INFO_NULL) MPI_Info_free(&(fd->info));
197 	if (fd->io_buf) ADIOI_Free(fd->io_buf);
198 	ADIOI_Free(fd);
199         fd = ADIO_FILE_NULL;
200 	if (*error_code == MPI_SUCCESS)
201 	{
202 	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
203 					       MPIR_ERR_RECOVERABLE, myname,
204 					       __LINE__, MPI_ERR_IO,
205 					       "**oremote_fail", 0);
206 	}
207     }
208 
209     return fd;
210 }
211 
212 /* a simple linear search. possible enancement: add a my_cb_nodes_index member
213  * ( index into cb_nodes, else -1 if not aggregator ) for faster lookups
214  *
215  * fd->hints->cb_nodes is the number of aggregators
216  * fd->hints->ranklist[] is an array of the ranks of aggregators
217  *
218  * might want to move this to adio/common/cb_config_list.c
219  */
is_aggregator(int rank,ADIO_File fd)220 int is_aggregator(int rank, ADIO_File fd ) {
221         int i;
222 
223 	if (fd->my_cb_nodes_index == -2) {
224 	    for (i=0; i< fd->hints->cb_nodes; i++ ) {
225 		if ( rank == fd->hints->ranklist[i] ) {
226 		    fd->my_cb_nodes_index = i;
227 		    return 1;
228 		}
229 	    }
230 	    fd->my_cb_nodes_index = -1;
231         }
232 	else if (fd->my_cb_nodes_index != -1)
233 	    return 1;
234 
235         return 0;
236 }
237 
238 /*
239  * If file system implements some version of two-phase -- doesn't have to be
240  * generic -- we can still carry out the defered open optimization
241  */
uses_generic_read(ADIO_File fd)242 static int uses_generic_read(ADIO_File fd)
243 {
244     if (ADIO_Feature(fd, ADIO_TWO_PHASE))
245         return 1;
246     return 0;
247 }
248 
/* Returns 1 when the ADIO_TWO_PHASE feature is reported for fd, 0 otherwise. */
static int uses_generic_write(ADIO_File fd)
{
    return ADIO_Feature(fd, ADIO_TWO_PHASE) ? 1 : 0;
}
255 
/*
 * Build the aggregator rank list for fd from the cb_config_list hint.
 *
 * Rank 0 parses fd->hints->cb_config_list against the gathered processor
 * name array, stores the resulting ranks in fd->hints->ranklist, records the
 * count in fd->hints->cb_nodes and in the "cb_nodes" info key, and then
 * ADIOI_cb_bcast_rank_map() is called on every rank.
 *
 * Parameters:
 *   fd         - file handle whose hints are read and updated
 *   orig_comm  - passed to ADIOI_cb_gather_name_array()
 *   comm       - passed to ADIOI_cb_gather_name_array()
 *   rank       - caller's rank; only rank 0 parses the list
 *   procs      - number of processes; sizes the scratch rank list
 *   error_code - out: set to an error code when an allocation fails on
 *                rank 0 or when fd->hints->cb_nodes is not positive after
 *                the broadcast; otherwise left untouched
 *
 * Always returns 0; failures are reported through error_code only.
 */
static int build_cb_config_list(ADIO_File fd,
                                MPI_Comm orig_comm, MPI_Comm comm,
                                int rank, int procs, int *error_code)
{
    ADIO_cb_name_array array;
    int *tmp_ranklist;
    int rank_ct;
    int nomem = 0;      /* set when an allocation on rank 0 failed */
    char *value;
    static char myname[] = "ADIO_OPEN cb_config_list";

    /* gather the processor name array if we don't already have it */
    /* this has to be done early in ADIO_Open so that we can cache the name
     * array in both the dup'd communicator (in case we want it later) and the
     * original communicator */
    ADIOI_cb_gather_name_array(orig_comm, comm, &array);

    /* parse the cb_config_list and create a rank map on rank 0 */
    if (rank == 0) {
        rank_ct = 0;
        tmp_ranklist = (int *) ADIOI_Malloc(sizeof(int) * procs);
        if (tmp_ranklist == NULL) {
            nomem = 1;
        }
        else {
            rank_ct = ADIOI_cb_config_list_parse(fd->hints->cb_config_list,
                                                 array, tmp_ranklist,
                                                 fd->hints->cb_nodes);

            /* store the ranklist using the minimum amount of memory */
            if (rank_ct > 0) {
                fd->hints->ranklist = (int *) ADIOI_Malloc(sizeof(int) * rank_ct);
                if (fd->hints->ranklist == NULL) {
                    nomem = 1;
                    rank_ct = 0;
                }
                else {
                    memcpy(fd->hints->ranklist, tmp_ranklist,
                           sizeof(int) * rank_ct);
                }
            }
            ADIOI_Free(tmp_ranklist);
        }

        if (nomem) {
            *error_code = MPIO_Err_create_code(*error_code,
                                               MPIR_ERR_RECOVERABLE,
                                               myname,
                                               __LINE__,
                                               MPI_ERR_OTHER,
                                               "**nomem2", 0);
        }

        /* On allocation failure rank_ct is 0, i.e. "no aggregators".  Rank 0
         * still falls through to ADIOI_cb_bcast_rank_map() below instead of
         * returning early, so it makes the same call as every other rank.
         * NOTE(review): presumably the broadcast distributes rank 0's
         * cb_nodes so all ranks then fail the check below -- confirm against
         * ADIOI_cb_bcast_rank_map(). */
        fd->hints->cb_nodes = rank_ct;

        /* TEMPORARY -- REMOVE WHEN NO LONGER UPDATING INFO FOR FS-INDEP. */
        value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
        if (value != NULL) {
            ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", rank_ct);
            ADIOI_Info_set(fd->info, "cb_nodes", value);
            ADIOI_Free(value);
        }
        /* if 'value' could not be allocated, the info update is skipped */
    }

    ADIOI_cb_bcast_rank_map(fd);

    /* keep the out-of-memory error on rank 0 rather than replacing it */
    if (fd->hints->cb_nodes <= 0 && !nomem) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__, MPI_ERR_IO,
                                           "**ioagnomatch", 0);
    }
    return 0;
}
312 
313 /*
314  * vim: ts=8 sts=4 sw=4 noexpandtab
315  */
316