1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*-
2 * vim: ts=8 sts=4 sw=4 noexpandtab
3 *
4 * Copyright (C) 1997 University of Chicago.
5 * See COPYRIGHT notice in top-level directory.
6 */
7
8 #include "ad_pvfs2.h"
9 #include "ad_pvfs2_common.h"
10
11 /* open_status is helpful for bcasting values around */
12 struct open_status_s {
13 int error;
14 PVFS_object_ref object_ref;
15 };
16 typedef struct open_status_s open_status;
17
18 /* steps for getting a handle: (it gets a little convoluted, but at least
19 * it's deterministic)
20 * . lookup the file.
21 * . if lookup succeeds, but we were passed MPI_MODE_EXCL, that's an error
22 * . if lookup fails, the file might not exist.
23 * in that case, create the file if we were passed MPI_MODE_CREATE
24 * . if the create fails, that means someone else created the file between
25 * our call to lookup and our call to create (like if N processors all
26 * open the same file with MPI_COMM_SELF). Then we can just look up the
27 * file (which now exists).
28 *
29 * the good news is that only one processor does this and broadcasts the
30 * handle to everyone else in the communicator
31 */
fake_an_open(PVFS_fs_id fs_id,char * pvfs_name,int access_mode,int nr_datafiles,PVFS_size strip_size,ADIOI_PVFS2_fs * pvfs2_fs,open_status * o_status)32 static void fake_an_open(PVFS_fs_id fs_id, char *pvfs_name, int access_mode,
33 int nr_datafiles, PVFS_size strip_size,
34 ADIOI_PVFS2_fs *pvfs2_fs,
35 open_status *o_status)
36 {
37 int ret;
38 PVFS_sysresp_lookup resp_lookup;
39 PVFS_sysresp_getparent resp_getparent;
40 PVFS_sysresp_create resp_create;
41 PVFS_sys_attr attribs;
42 PVFS_sys_dist* dist;
43
44 ADIOI_PVFS2_makeattribs(&attribs);
45 if (nr_datafiles > 0 ) {
46 attribs.dfile_count = nr_datafiles;
47 attribs.mask |= PVFS_ATTR_SYS_DFILE_COUNT;
48 }
49
50 dist = NULL;
51
52 memset(&resp_lookup, 0, sizeof(resp_lookup));
53 memset(&resp_getparent, 0, sizeof(resp_getparent));
54 memset(&resp_create, 0, sizeof(resp_create));
55
56
57 ret = PVFS_sys_lookup(fs_id, pvfs_name,
58 &(pvfs2_fs->credentials), &resp_lookup, PVFS2_LOOKUP_LINK_FOLLOW);
59 if ( ret == (-PVFS_ENOENT)) {
60 if (access_mode & ADIO_CREATE) {
61 ret = PVFS_sys_getparent(fs_id, pvfs_name,
62 &(pvfs2_fs->credentials), &resp_getparent);
63 if (ret < 0) {
64 FPRINTF(stderr, "pvfs_sys_getparent returns with %d\n", ret);
65 o_status->error = ret;
66 return;
67 }
68
69 /* Set the distribution strip size if specified */
70 if (0 < strip_size) {
71 /* Note that the distribution is hardcoded here */
72 dist = PVFS_sys_dist_lookup("simple_stripe");
73 ret = PVFS_sys_dist_setparam(dist,
74 "strip_size",
75 &strip_size);
76 if (ret < 0)
77 {
78 FPRINTF(stderr,
79 "pvfs_sys_dist_setparam returns with %d\n", ret);
80 o_status->error = ret;
81 }
82 }
83
84 /* Perform file creation */
85 #ifdef HAVE_PVFS2_CREATE_WITHOUT_LAYOUT
86 ret = PVFS_sys_create(resp_getparent.basename,
87 resp_getparent.parent_ref, attribs,
88 &(pvfs2_fs->credentials), dist, &resp_create);
89 #else
90 ret = PVFS_sys_create(resp_getparent.basename,
91 resp_getparent.parent_ref, attribs,
92 &(pvfs2_fs->credentials), dist, NULL, &resp_create);
93 #endif
94
95 /* if many creates are happening in this directory, the earlier
96 * sys_lookup may have returned ENOENT, but the sys_create could
97 * return EEXISTS. That means the file has been created anyway, so
98 * less work for us and we can just open it up and return the
99 * handle */
100 if (ret == (-PVFS_EEXIST)) {
101 ret = PVFS_sys_lookup(fs_id, pvfs_name,
102 &(pvfs2_fs->credentials), &resp_lookup,
103 PVFS2_LOOKUP_LINK_FOLLOW);
104 if ( ret < 0 ) {
105 o_status->error = ret;
106 return;
107 }
108 o_status->error = ret;
109 o_status->object_ref = resp_lookup.ref;
110 return;
111 }
112 o_status->object_ref = resp_create.ref;
113 } else {
114 FPRINTF(stderr, "cannot create file without MPI_MODE_CREATE\n");
115 o_status->error = ret;
116 return;
117 }
118 } else if (access_mode & ADIO_EXCL) {
119 /* lookup should not succeed if opened with EXCL */
120 o_status->error = -PVFS_EEXIST;
121 return;
122 } else {
123 o_status->object_ref = resp_lookup.ref;
124 }
125 o_status->error = ret;
126 return;
127
128 }
129
130
131 /* ADIOI_PVFS2_Open:
132 * one process opens (or creates) the file, then broadcasts the result to the
133 * remaining processors.
134 *
135 * ADIO_Open used to perform an optimization when MPI_MODE_CREATE (and before
136 * that, MPI_MODE_EXCL) was set. Because PVFS2 handles file lookup and
137 * creation more scalably than other file systems, ADIO_Open now skips any
138 * special handling when CREATE is set. */
ADIOI_PVFS2_Open(ADIO_File fd,int * error_code)139 void ADIOI_PVFS2_Open(ADIO_File fd, int *error_code)
140 {
141 int rank, ret;
142 PVFS_fs_id cur_fs;
143 static char myname[] = "ADIOI_PVFS2_OPEN";
144 char pvfs_path[PVFS_NAME_MAX] = {0};
145
146 ADIOI_PVFS2_fs *pvfs2_fs;
147
148 /* since one process is doing the open, that means one process is also
149 * doing the error checking. define a struct for both the object reference
150 * and the error code to broadcast to all the processors */
151
152 open_status o_status = {0, {0, 0}};
153 MPI_Datatype open_status_type;
154 MPI_Datatype types[2] = {MPI_INT, MPI_BYTE};
155 int lens[2] = {1, sizeof(PVFS_object_ref)};
156 MPI_Aint offsets[2];
157
158 pvfs2_fs = (ADIOI_PVFS2_fs *) ADIOI_Malloc(sizeof(ADIOI_PVFS2_fs));
159
160 /* --BEGIN ERROR HANDLING-- */
161 if (pvfs2_fs == NULL) {
162 *error_code = MPIO_Err_create_code(MPI_SUCCESS,
163 MPIR_ERR_RECOVERABLE,
164 myname, __LINE__,
165 MPI_ERR_UNKNOWN,
166 "Error allocating memory", 0);
167 return;
168 }
169 /* --END ERROR HANDLING-- */
170
171 MPI_Comm_rank(fd->comm, &rank);
172
173 ADIOI_PVFS2_Init(error_code);
174 if (*error_code != MPI_SUCCESS)
175 {
176 /* ADIOI_PVFS2_INIT handles creating error codes on its own */
177 return;
178 }
179
180 /* currently everyone gets their own credentials */
181 ADIOI_PVFS2_makecredentials(&(pvfs2_fs->credentials));
182
183 /* one process resolves name and will later bcast to others */
184 #ifdef ADIOI_MPE_LOGGING
185 MPE_Log_event( ADIOI_MPE_open_a, 0, NULL );
186 #endif
187 if (rank == fd->hints->ranklist[0] && fd->fs_ptr == NULL) {
188 /* given the filename, figure out which pvfs filesystem it is on */
189 ret = PVFS_util_resolve(fd->filename, &cur_fs,
190 pvfs_path, PVFS_NAME_MAX);
191 if (ret < 0 ) {
192 PVFS_perror("PVFS_util_resolve", ret);
193 /* TODO: pick a good error for this */
194 o_status.error = -1;
195 } else {
196 fake_an_open(cur_fs, pvfs_path,
197 fd->access_mode, fd->hints->striping_factor,
198 fd->hints->striping_unit,
199 pvfs2_fs, &o_status);
200 }
201
202 /* store credentials and object reference in fd */
203 pvfs2_fs->object_ref = o_status.object_ref;
204 fd->fs_ptr = pvfs2_fs;
205 }
206 #ifdef ADIOI_MPE_LOGGING
207 MPE_Log_event( ADIOI_MPE_open_b, 0, NULL );
208 #endif
209
210 /* broadcast status and (possibly valid) object reference */
211 MPI_Address(&o_status.error, &offsets[0]);
212 MPI_Address(&o_status.object_ref, &offsets[1]);
213
214 MPI_Type_struct(2, lens, offsets, types, &open_status_type);
215 MPI_Type_commit(&open_status_type);
216
217 /* Assertion: if we hit this Bcast, then all processes collectively
218 * called this open.
219 *
220 * That's because deferred open never happens with PVFS2.
221 */
222 MPI_Bcast(MPI_BOTTOM, 1, open_status_type, fd->hints->ranklist[0],
223 fd->comm);
224 MPI_Type_free(&open_status_type);
225
226 /* --BEGIN ERROR HANDLING-- */
227 if (o_status.error != 0)
228 {
229 ADIOI_Free(pvfs2_fs);
230 fd->fs_ptr = NULL;
231 *error_code = MPIO_Err_create_code(MPI_SUCCESS,
232 MPIR_ERR_RECOVERABLE,
233 myname, __LINE__,
234 ADIOI_PVFS2_error_convert(o_status.error),
235 "Unknown error", 0);
236 /* TODO: FIX STRING */
237 return;
238 }
239 /* --END ERROR HANDLING-- */
240
241 pvfs2_fs->object_ref = o_status.object_ref;
242 fd->fs_ptr = pvfs2_fs;
243
244 *error_code = MPI_SUCCESS;
245 return;
246 }
247