1 /*
2 * Copyright (C) by Argonne National Laboratory
3 * See COPYRIGHT in top-level directory
4 */
5
6 #include "ad_pvfs2.h"
7 #include "ad_pvfs2_common.h"
8
9 /* open_status is helpful for bcasting values around */
10 struct open_status_s {
11 int error;
12 PVFS_object_ref object_ref;
13 };
14 typedef struct open_status_s open_status;
15
16 /* steps for getting a handle: (it gets a little convoluted, but at least
17 * it's deterministic)
18 * . lookup the file.
19 * . if lookup succeeds, but we were passed MPI_MODE_EXCL, that's an error
20 * . if lookup fails, the file might not exist.
21 * in that case, create the file if we were passed MPI_MODE_CREATE
22 * . if the create fails, that means someone else created the file between
23 * our call to lookup and our call to create (like if N processors all
24 * open the same file with MPI_COMM_SELF). Then we can just look up the
25 * file (which now exists).
26 *
27 * the good news is that only one processor does this and broadcasts the
28 * handle to everyone else in the communicator
29 */
fake_an_open(PVFS_fs_id fs_id,char * pvfs_name,int access_mode,int nr_datafiles,PVFS_size strip_size,ADIOI_PVFS2_fs * pvfs2_fs,open_status * o_status)30 static void fake_an_open(PVFS_fs_id fs_id, char *pvfs_name, int access_mode,
31 int nr_datafiles, PVFS_size strip_size,
32 ADIOI_PVFS2_fs * pvfs2_fs, open_status * o_status)
33 {
34 int ret;
35 PVFS_sysresp_lookup resp_lookup;
36 PVFS_sysresp_getparent resp_getparent;
37 PVFS_sysresp_create resp_create;
38 PVFS_sys_attr attribs;
39 PVFS_sys_dist *dist;
40
41 ADIOI_PVFS2_makeattribs(&attribs);
42 if (nr_datafiles > 0) {
43 attribs.dfile_count = nr_datafiles;
44 attribs.mask |= PVFS_ATTR_SYS_DFILE_COUNT;
45 }
46
47 dist = NULL;
48
49 memset(&resp_lookup, 0, sizeof(resp_lookup));
50 memset(&resp_getparent, 0, sizeof(resp_getparent));
51 memset(&resp_create, 0, sizeof(resp_create));
52
53
54 ret = PVFS_sys_lookup(fs_id, pvfs_name,
55 &(pvfs2_fs->credentials), &resp_lookup, PVFS2_LOOKUP_LINK_FOLLOW);
56 if (ret == (-PVFS_ENOENT)) {
57 if (access_mode & ADIO_CREATE) {
58 ret = PVFS_sys_getparent(fs_id, pvfs_name, &(pvfs2_fs->credentials), &resp_getparent);
59 if (ret < 0) {
60 FPRINTF(stderr, "pvfs_sys_getparent returns with %d\n", ret);
61 o_status->error = ret;
62 return;
63 }
64
65 /* Set the distribution strip size if specified */
66 if (0 < strip_size) {
67 /* Note that the distribution is hardcoded here */
68 dist = PVFS_sys_dist_lookup("simple_stripe");
69 ret = PVFS_sys_dist_setparam(dist, "strip_size", &strip_size);
70 if (ret < 0) {
71 FPRINTF(stderr, "pvfs_sys_dist_setparam returns with %d\n", ret);
72 o_status->error = ret;
73 }
74 }
75
76 /* Perform file creation */
77 #ifdef HAVE_PVFS2_CREATE_WITHOUT_LAYOUT
78 ret = PVFS_sys_create(resp_getparent.basename,
79 resp_getparent.parent_ref, attribs,
80 &(pvfs2_fs->credentials), dist, &resp_create);
81 #else
82 ret = PVFS_sys_create(resp_getparent.basename,
83 resp_getparent.parent_ref, attribs,
84 &(pvfs2_fs->credentials), dist, NULL, &resp_create);
85 #endif
86
87 /* if many creates are happening in this directory, the earlier
88 * sys_lookup may have returned ENOENT, but the sys_create could
89 * return EEXISTS. That means the file has been created anyway, so
90 * less work for us and we can just open it up and return the
91 * handle */
92 if (ret == (-PVFS_EEXIST)) {
93 ret = PVFS_sys_lookup(fs_id, pvfs_name,
94 &(pvfs2_fs->credentials), &resp_lookup,
95 PVFS2_LOOKUP_LINK_FOLLOW);
96 if (ret < 0) {
97 o_status->error = ret;
98 return;
99 }
100 o_status->error = ret;
101 o_status->object_ref = resp_lookup.ref;
102 return;
103 }
104 o_status->object_ref = resp_create.ref;
105 } else {
106 FPRINTF(stderr, "cannot create file without MPI_MODE_CREATE\n");
107 o_status->error = ret;
108 return;
109 }
110 } else if (access_mode & ADIO_EXCL) {
111 /* lookup should not succeed if opened with EXCL */
112 o_status->error = -PVFS_EEXIST;
113 return;
114 } else {
115 o_status->object_ref = resp_lookup.ref;
116 }
117 o_status->error = ret;
118 return;
119
120 }
121
122
123 /* ADIOI_PVFS2_Open:
124 * one process opens (or creates) the file, then broadcasts the result to the
125 * remaining processors.
126 *
127 * ADIO_Open used to perform an optimization when MPI_MODE_CREATE (and before
128 * that, MPI_MODE_EXCL) was set. Because PVFS2 handles file lookup and
129 * creation more scalably than other file systems, ADIO_Open now skips any
130 * special handling when CREATE is set. */
ADIOI_PVFS2_Open(ADIO_File fd,int * error_code)131 void ADIOI_PVFS2_Open(ADIO_File fd, int *error_code)
132 {
133 int rank, ret;
134 PVFS_fs_id cur_fs;
135 static char myname[] = "ADIOI_PVFS2_OPEN";
136 char pvfs_path[PVFS_NAME_MAX] = { 0 };
137
138 ADIOI_PVFS2_fs *pvfs2_fs;
139
140 /* since one process is doing the open, that means one process is also
141 * doing the error checking. define a struct for both the object reference
142 * and the error code to broadcast to all the processors */
143
144 open_status o_status = { 0, {0, 0} };
145 MPI_Datatype open_status_type;
146 MPI_Datatype types[2] = { MPI_INT, MPI_BYTE };
147 int lens[2] = { 1, sizeof(PVFS_object_ref) };
148 MPI_Aint offsets[2];
149
150 pvfs2_fs = (ADIOI_PVFS2_fs *) ADIOI_Malloc(sizeof(ADIOI_PVFS2_fs));
151
152 /* --BEGIN ERROR HANDLING-- */
153 if (pvfs2_fs == NULL) {
154 *error_code = MPIO_Err_create_code(MPI_SUCCESS,
155 MPIR_ERR_RECOVERABLE,
156 myname, __LINE__,
157 MPI_ERR_UNKNOWN, "Error allocating memory", 0);
158 return;
159 }
160 /* --END ERROR HANDLING-- */
161
162 MPI_Comm_rank(fd->comm, &rank);
163
164 ADIOI_PVFS2_Init(error_code);
165 if (*error_code != MPI_SUCCESS) {
166 /* ADIOI_PVFS2_INIT handles creating error codes on its own */
167 return;
168 }
169
170 /* currently everyone gets their own credentials */
171 ADIOI_PVFS2_makecredentials(&(pvfs2_fs->credentials));
172
173 /* one process resolves name and will later bcast to others */
174 #ifdef ADIOI_MPE_LOGGING
175 MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
176 #endif
177 if (rank == fd->hints->ranklist[0] && fd->fs_ptr == NULL) {
178 /* given the filename, figure out which pvfs filesystem it is on */
179 ret = PVFS_util_resolve(fd->filename, &cur_fs, pvfs_path, PVFS_NAME_MAX);
180 if (ret < 0) {
181 PVFS_perror("PVFS_util_resolve", ret);
182 /* TODO: pick a good error for this */
183 o_status.error = -1;
184 } else {
185 fake_an_open(cur_fs, pvfs_path,
186 fd->access_mode, fd->hints->striping_factor,
187 fd->hints->striping_unit, pvfs2_fs, &o_status);
188 }
189
190 /* store credentials and object reference in fd */
191 pvfs2_fs->object_ref = o_status.object_ref;
192 fd->fs_ptr = pvfs2_fs;
193 }
194 #ifdef ADIOI_MPE_LOGGING
195 MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
196 #endif
197
198 /* broadcast status and (possibly valid) object reference */
199 MPI_Address(&o_status.error, &offsets[0]);
200 MPI_Address(&o_status.object_ref, &offsets[1]);
201
202 MPI_Type_struct(2, lens, offsets, types, &open_status_type);
203 MPI_Type_commit(&open_status_type);
204
205 /* Assertion: if we hit this Bcast, then all processes collectively
206 * called this open.
207 *
208 * That's because deferred open never happens with PVFS2.
209 */
210 MPI_Bcast(MPI_BOTTOM, 1, open_status_type, fd->hints->ranklist[0], fd->comm);
211 MPI_Type_free(&open_status_type);
212
213 /* --BEGIN ERROR HANDLING-- */
214 if (o_status.error != 0) {
215 ADIOI_Free(pvfs2_fs);
216 fd->fs_ptr = NULL;
217 *error_code = MPIO_Err_create_code(MPI_SUCCESS,
218 MPIR_ERR_RECOVERABLE,
219 myname, __LINE__,
220 ADIOI_PVFS2_error_convert(o_status.error),
221 "Unknown error", 0);
222 /* TODO: FIX STRING */
223 return;
224 }
225 /* --END ERROR HANDLING-- */
226
227 pvfs2_fs->object_ref = o_status.object_ref;
228 fd->fs_ptr = pvfs2_fs;
229
230 *error_code = MPI_SUCCESS;
231 return;
232 }
233