1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*-
2  * vim: ts=8 sts=4 sw=4 noexpandtab
3  *
4  *   Copyright (C) 1997 University of Chicago.
5  *   See COPYRIGHT notice in top-level directory.
6  */
7 
8 #include "ad_pvfs2.h"
9 #include "ad_pvfs2_common.h"
10 
11 /* open_status is helpful for bcasting values around */
12 struct open_status_s {
13     int error;
14     PVFS_object_ref object_ref;
15 };
16 typedef struct open_status_s open_status;
17 
18     /* steps for getting a handle:  (it gets a little convoluted, but at least
19      * it's deterministic)
20      * . lookup the file.
21      * . if lookup succeeds, but we were passed MPI_MODE_EXCL, that's an error
22      * . if lookup fails, the file might not exist.
23      *		in that case, create the file if we were passed MPI_MODE_CREATE
24      * . if the create fails, that means someone else created the file between
25      *    our call to lookup and our call to create (like if N processors all
26      *    open the same file with MPI_COMM_SELF).  Then we can just look up the
27      *    file (which now exists).
28      *
29      * the good news is that only one processor does this and broadcasts the
30      * handle to everyone else in the communicator
31      */
fake_an_open(PVFS_fs_id fs_id,char * pvfs_name,int access_mode,int nr_datafiles,PVFS_size strip_size,ADIOI_PVFS2_fs * pvfs2_fs,open_status * o_status)32 static void fake_an_open(PVFS_fs_id fs_id, char *pvfs_name, int access_mode,
33 	                 int nr_datafiles, PVFS_size strip_size,
34                          ADIOI_PVFS2_fs *pvfs2_fs,
35 			 open_status *o_status)
36 {
37     int ret;
38     PVFS_sysresp_lookup resp_lookup;
39     PVFS_sysresp_getparent resp_getparent;
40     PVFS_sysresp_create resp_create;
41     PVFS_sys_attr attribs;
42     PVFS_sys_dist* dist;
43 
44     ADIOI_PVFS2_makeattribs(&attribs);
45     if (nr_datafiles > 0 ) {
46 	attribs.dfile_count = nr_datafiles;
47 	attribs.mask |= PVFS_ATTR_SYS_DFILE_COUNT;
48     }
49 
50     dist = NULL;
51 
52     memset(&resp_lookup, 0, sizeof(resp_lookup));
53     memset(&resp_getparent, 0, sizeof(resp_getparent));
54     memset(&resp_create, 0, sizeof(resp_create));
55 
56 
57     ret = PVFS_sys_lookup(fs_id, pvfs_name,
58 	    &(pvfs2_fs->credentials), &resp_lookup, PVFS2_LOOKUP_LINK_FOLLOW);
59     if ( ret == (-PVFS_ENOENT)) {
60 	if (access_mode & ADIO_CREATE)  {
61 	    ret = PVFS_sys_getparent(fs_id, pvfs_name,
62 		    &(pvfs2_fs->credentials), &resp_getparent);
63 	    if (ret < 0) {
64 		FPRINTF(stderr, "pvfs_sys_getparent returns with %d\n", ret);
65 		o_status->error = ret;
66 		return;
67 	    }
68 
69             /* Set the distribution strip size if specified */
70             if (0 < strip_size) {
71                 /* Note that the distribution is hardcoded here */
72                 dist = PVFS_sys_dist_lookup("simple_stripe");
73                 ret = PVFS_sys_dist_setparam(dist,
74                                              "strip_size",
75                                              &strip_size);
76                 if (ret < 0)
77                 {
78                     FPRINTF(stderr,
79                             "pvfs_sys_dist_setparam returns with %d\n", ret);
80                     o_status->error = ret;
81                 }
82             }
83 
84             /* Perform file creation */
85 #ifdef HAVE_PVFS2_CREATE_WITHOUT_LAYOUT
86             ret = PVFS_sys_create(resp_getparent.basename,
87 		    resp_getparent.parent_ref, attribs,
88 		    &(pvfs2_fs->credentials), dist, &resp_create);
89 #else
90             ret = PVFS_sys_create(resp_getparent.basename,
91 		    resp_getparent.parent_ref, attribs,
92 		    &(pvfs2_fs->credentials), dist, NULL, &resp_create);
93 #endif
94 
95 	    /* if many creates are happening in this directory, the earlier
96 	     * sys_lookup may have returned ENOENT, but the sys_create could
97 	     * return EEXISTS.  That means the file has been created anyway, so
98 	     * less work for us and we can just open it up and return the
99 	     * handle */
100 	    if (ret == (-PVFS_EEXIST)) {
101 		ret = PVFS_sys_lookup(fs_id, pvfs_name,
102 			&(pvfs2_fs->credentials), &resp_lookup,
103 			PVFS2_LOOKUP_LINK_FOLLOW);
104 		if ( ret < 0 ) {
105 		    o_status->error = ret;
106 		    return;
107 		}
108 		o_status->error = ret;
109 		o_status->object_ref = resp_lookup.ref;
110 		return;
111 	    }
112 	    o_status->object_ref = resp_create.ref;
113 	} else {
114 	    FPRINTF(stderr, "cannot create file without MPI_MODE_CREATE\n");
115 	    o_status->error = ret;
116 	    return;
117 	}
118     } else if (access_mode & ADIO_EXCL) {
119 	/* lookup should not succeed if opened with EXCL */
120 	o_status->error = -PVFS_EEXIST;
121 	return;
122     } else {
123 	o_status->object_ref = resp_lookup.ref;
124     }
125     o_status->error = ret;
126     return;
127 
128 }
129 
130 
131 /* ADIOI_PVFS2_Open:
132  *  one process opens (or creates) the file, then broadcasts the result to the
133  *  remaining processors.
134  *
135  *  ADIO_Open used to perform an optimization when MPI_MODE_CREATE (and before
136  * that, MPI_MODE_EXCL) was set.  Because PVFS2 handles file lookup and
137  * creation more scalably than other file systems, ADIO_Open now skips any
138  * special handling when CREATE is set.  */
ADIOI_PVFS2_Open(ADIO_File fd,int * error_code)139 void ADIOI_PVFS2_Open(ADIO_File fd, int *error_code)
140 {
141     int rank, ret;
142     PVFS_fs_id cur_fs;
143     static char myname[] = "ADIOI_PVFS2_OPEN";
144     char pvfs_path[PVFS_NAME_MAX] = {0};
145 
146     ADIOI_PVFS2_fs *pvfs2_fs;
147 
148     /* since one process is doing the open, that means one process is also
149      * doing the error checking.  define a struct for both the object reference
150      * and the error code to broadcast to all the processors */
151 
152     open_status o_status = {0, {0, 0}};
153     MPI_Datatype open_status_type;
154     MPI_Datatype types[2] = {MPI_INT, MPI_BYTE};
155     int lens[2] = {1, sizeof(PVFS_object_ref)};
156     MPI_Aint offsets[2];
157 
158     pvfs2_fs = (ADIOI_PVFS2_fs *) ADIOI_Malloc(sizeof(ADIOI_PVFS2_fs));
159 
160     /* --BEGIN ERROR HANDLING-- */
161     if (pvfs2_fs == NULL) {
162 	*error_code = MPIO_Err_create_code(MPI_SUCCESS,
163 					   MPIR_ERR_RECOVERABLE,
164 					   myname, __LINE__,
165 					   MPI_ERR_UNKNOWN,
166 					   "Error allocating memory", 0);
167 	return;
168     }
169     /* --END ERROR HANDLING-- */
170 
171     MPI_Comm_rank(fd->comm, &rank);
172 
173     ADIOI_PVFS2_Init(error_code);
174     if (*error_code != MPI_SUCCESS)
175     {
176 	/* ADIOI_PVFS2_INIT handles creating error codes on its own */
177 	return;
178     }
179 
180     /* currently everyone gets their own credentials */
181     ADIOI_PVFS2_makecredentials(&(pvfs2_fs->credentials));
182 
183     /* one process resolves name and will later bcast to others */
184 #ifdef ADIOI_MPE_LOGGING
185     MPE_Log_event( ADIOI_MPE_open_a, 0, NULL );
186 #endif
187     if (rank == fd->hints->ranklist[0] && fd->fs_ptr == NULL) {
188 	/* given the filename, figure out which pvfs filesystem it is on */
189 	ret = PVFS_util_resolve(fd->filename, &cur_fs,
190 		pvfs_path, PVFS_NAME_MAX);
191 	if (ret < 0 ) {
192 	    PVFS_perror("PVFS_util_resolve", ret);
193 	    /* TODO: pick a good error for this */
194 	    o_status.error = -1;
195 	} else  {
196 	    fake_an_open(cur_fs, pvfs_path,
197                          fd->access_mode, fd->hints->striping_factor,
198                          fd->hints->striping_unit,
199                          pvfs2_fs, &o_status);
200 	}
201 
202 	/* store credentials and object reference in fd */
203 	pvfs2_fs->object_ref = o_status.object_ref;
204 	fd->fs_ptr = pvfs2_fs;
205     }
206 #ifdef ADIOI_MPE_LOGGING
207     MPE_Log_event( ADIOI_MPE_open_b, 0, NULL );
208 #endif
209 
210     /* broadcast status and (possibly valid) object reference */
211     MPI_Address(&o_status.error, &offsets[0]);
212     MPI_Address(&o_status.object_ref, &offsets[1]);
213 
214     MPI_Type_struct(2, lens, offsets, types, &open_status_type);
215     MPI_Type_commit(&open_status_type);
216 
217     /* Assertion: if we hit this Bcast, then all processes collectively
218      *            called this open.
219      *
220      * That's because deferred open never happens with PVFS2.
221      */
222     MPI_Bcast(MPI_BOTTOM, 1, open_status_type, fd->hints->ranklist[0],
223 	      fd->comm);
224     MPI_Type_free(&open_status_type);
225 
226     /* --BEGIN ERROR HANDLING-- */
227     if (o_status.error != 0)
228     {
229 	ADIOI_Free(pvfs2_fs);
230 	fd->fs_ptr = NULL;
231 	*error_code = MPIO_Err_create_code(MPI_SUCCESS,
232 					   MPIR_ERR_RECOVERABLE,
233 					   myname, __LINE__,
234 					   ADIOI_PVFS2_error_convert(o_status.error),
235 					   "Unknown error", 0);
236 	/* TODO: FIX STRING */
237 	return;
238     }
239     /* --END ERROR HANDLING-- */
240 
241     pvfs2_fs->object_ref = o_status.object_ref;
242     fd->fs_ptr = pvfs2_fs;
243 
244     *error_code = MPI_SUCCESS;
245     return;
246 }
247