1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  */
5 
6 #include "ad_pvfs2.h"
7 #include "ad_pvfs2_common.h"
8 
9 /* open_status is helpful for bcasting values around */
10 struct open_status_s {
11     int error;
12     PVFS_object_ref object_ref;
13 };
14 typedef struct open_status_s open_status;
15 
16     /* steps for getting a handle:  (it gets a little convoluted, but at least
17      * it's deterministic)
18      * . lookup the file.
19      * . if lookup succeeds, but we were passed MPI_MODE_EXCL, that's an error
20      * . if lookup fails, the file might not exist.
21      *          in that case, create the file if we were passed MPI_MODE_CREATE
22      * . if the create fails, that means someone else created the file between
23      *    our call to lookup and our call to create (like if N processors all
24      *    open the same file with MPI_COMM_SELF).  Then we can just look up the
25      *    file (which now exists).
26      *
27      * the good news is that only one processor does this and broadcasts the
28      * handle to everyone else in the communicator
29      */
/*
 * fake_an_open - look up the named file and, if it does not exist and
 * creation was requested, create it.
 *
 *  fs_id         file system id handed to the PVFS2 lookup/getparent calls
 *  pvfs_name     file name handed to the PVFS2 lookup/getparent calls
 *  access_mode   ADIO access flags; only ADIO_CREATE and ADIO_EXCL are examined
 *  nr_datafiles  if > 0, datafile count requested for a newly created file
 *  strip_size    if > 0, "strip_size" parameter applied to the "simple_stripe"
 *                distribution of a newly created file
 *  pvfs2_fs      supplies the credentials used for every PVFS2 call
 *  o_status      output: 'error' receives the result of the last decisive
 *                PVFS2 call (or -PVFS_EEXIST when ADIO_EXCL is violated);
 *                'object_ref' receives the reference from lookup or create
 *
 * Nothing is returned; callers inspect o_status->error.
 */
static void fake_an_open(PVFS_fs_id fs_id, char *pvfs_name, int access_mode,
                         int nr_datafiles, PVFS_size strip_size,
                         ADIOI_PVFS2_fs * pvfs2_fs, open_status * o_status)
{
    int ret;
    PVFS_sysresp_lookup resp_lookup;
    PVFS_sysresp_getparent resp_getparent;
    PVFS_sysresp_create resp_create;
    PVFS_sys_attr attribs;
    PVFS_sys_dist *dist = NULL;

    ADIOI_PVFS2_makeattribs(&attribs);
    if (nr_datafiles > 0) {
        attribs.dfile_count = nr_datafiles;
        attribs.mask |= PVFS_ATTR_SYS_DFILE_COUNT;
    }

    memset(&resp_lookup, 0, sizeof(resp_lookup));
    memset(&resp_getparent, 0, sizeof(resp_getparent));
    memset(&resp_create, 0, sizeof(resp_create));

    ret = PVFS_sys_lookup(fs_id, pvfs_name,
                          &(pvfs2_fs->credentials), &resp_lookup, PVFS2_LOOKUP_LINK_FOLLOW);

    if (ret != (-PVFS_ENOENT)) {
        /* Either the file exists or the lookup failed for a reason other
         * than "no such file".  Only a lookup that did not fail is evidence
         * that the file exists, so only then is EXCL violated; a genuine
         * lookup failure is reported with its own error value. */
        if (ret >= 0 && (access_mode & ADIO_EXCL)) {
            o_status->error = -PVFS_EEXIST;
            return;
        }
        o_status->object_ref = resp_lookup.ref;
        o_status->error = ret;
        return;
    }

    /* from here on the file is known not to exist */
    if (!(access_mode & ADIO_CREATE)) {
        FPRINTF(stderr, "cannot create file without MPI_MODE_CREATE\n");
        o_status->error = ret;
        return;
    }

    ret = PVFS_sys_getparent(fs_id, pvfs_name, &(pvfs2_fs->credentials), &resp_getparent);
    if (ret < 0) {
        FPRINTF(stderr, "pvfs_sys_getparent returns with %d\n", ret);
        o_status->error = ret;
        return;
    }

    /* Set the distribution strip size if specified.  A failure here is
     * reported on stderr but does not abort the create: the outcome of the
     * open is decided by the create call below. */
    if (0 < strip_size) {
        /* Note that the distribution is hardcoded here */
        dist = PVFS_sys_dist_lookup("simple_stripe");
        if (dist == NULL) {
            FPRINTF(stderr, "pvfs_sys_dist_lookup returns NULL\n");
        } else {
            ret = PVFS_sys_dist_setparam(dist, "strip_size", &strip_size);
            if (ret < 0) {
                FPRINTF(stderr, "pvfs_sys_dist_setparam returns with %d\n", ret);
            }
        }
    }

    /* Perform file creation */
#ifdef HAVE_PVFS2_CREATE_WITHOUT_LAYOUT
    ret = PVFS_sys_create(resp_getparent.basename,
                          resp_getparent.parent_ref, attribs,
                          &(pvfs2_fs->credentials), dist, &resp_create);
#else
    ret = PVFS_sys_create(resp_getparent.basename,
                          resp_getparent.parent_ref, attribs,
                          &(pvfs2_fs->credentials), dist, NULL, &resp_create);
#endif

    /* the distribution object obtained from the lookup is no longer needed
     * once the create call has returned; release it on every path */
    if (dist != NULL) {
        PVFS_sys_dist_free(dist);
        dist = NULL;
    }

    /* if many creates are happening in this directory, the earlier
     * sys_lookup may have returned ENOENT, but the sys_create could
     * return EEXIST.  That means someone else created the file in the
     * meantime. */
    if (ret == (-PVFS_EEXIST)) {
        if (access_mode & ADIO_EXCL) {
            /* exclusive create lost the race: same outcome as when the
             * initial lookup finds the file */
            o_status->error = ret;
            return;
        }

        /* less work for us: just look the file up and return the handle */
        ret = PVFS_sys_lookup(fs_id, pvfs_name,
                              &(pvfs2_fs->credentials), &resp_lookup,
                              PVFS2_LOOKUP_LINK_FOLLOW);
        o_status->error = ret;
        if (ret < 0) {
            return;
        }
        o_status->object_ref = resp_lookup.ref;
        return;
    }

    o_status->object_ref = resp_create.ref;
    o_status->error = ret;
    return;
}
121 
122 
123 /* ADIOI_PVFS2_Open:
124  *  one process opens (or creates) the file, then broadcasts the result to the
125  *  remaining processors.
126  *
127  *  ADIO_Open used to perform an optimization when MPI_MODE_CREATE (and before
128  * that, MPI_MODE_EXCL) was set.  Because PVFS2 handles file lookup and
129  * creation more scalably than other file systems, ADIO_Open now skips any
130  * special handling when CREATE is set.  */
/*
 *  fd          file being opened; fd->filename, fd->access_mode, fd->comm and
 *              fd->hints (ranklist, striping_factor, striping_unit) are read.
 *              On success fd->fs_ptr points to a newly allocated
 *              ADIOI_PVFS2_fs holding the credentials and object reference;
 *              on an open failure it is set to NULL.
 *  error_code  output: MPI_SUCCESS, or an error code built with
 *              MPIO_Err_create_code (or left as set by ADIOI_PVFS2_Init).
 */
void ADIOI_PVFS2_Open(ADIO_File fd, int *error_code)
{
    int rank, ret;
    PVFS_fs_id cur_fs;
    static char myname[] = "ADIOI_PVFS2_OPEN";
    char pvfs_path[PVFS_NAME_MAX] = { 0 };

    ADIOI_PVFS2_fs *pvfs2_fs;

    /* since one process is doing the open, that means one process is also
     * doing the error checking.  define a struct for both the object reference
     * and the error code to broadcast to all the processors */

    open_status o_status = { 0, {0, 0} };
    MPI_Datatype open_status_type;
    MPI_Datatype types[2] = { MPI_INT, MPI_BYTE };
    int lens[2] = { 1, sizeof(PVFS_object_ref) };
    MPI_Aint offsets[2];

    pvfs2_fs = (ADIOI_PVFS2_fs *) ADIOI_Malloc(sizeof(ADIOI_PVFS2_fs));

    /* --BEGIN ERROR HANDLING-- */
    if (pvfs2_fs == NULL) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           MPI_ERR_UNKNOWN, "Error allocating memory", 0);
        return;
    }
    /* --END ERROR HANDLING-- */

    MPI_Comm_rank(fd->comm, &rank);

    ADIOI_PVFS2_Init(error_code);
    if (*error_code != MPI_SUCCESS) {
        /* ADIOI_PVFS2_INIT handles creating error codes on its own; the
         * structure allocated above has not been stored anywhere yet, so it
         * must be released here or it is lost */
        ADIOI_Free(pvfs2_fs);
        return;
    }

    /* currently everyone gets their own credentials */
    ADIOI_PVFS2_makecredentials(&(pvfs2_fs->credentials));

    /* one process resolves name and will later bcast to others */
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
#endif
    if (rank == fd->hints->ranklist[0] && fd->fs_ptr == NULL) {
        /* given the filename, figure out which pvfs filesystem it is on */
        ret = PVFS_util_resolve(fd->filename, &cur_fs, pvfs_path, PVFS_NAME_MAX);
        if (ret < 0) {
            PVFS_perror("PVFS_util_resolve", ret);
            /* TODO: pick a good error for this */
            o_status.error = -1;
        } else {
            fake_an_open(cur_fs, pvfs_path,
                         fd->access_mode, fd->hints->striping_factor,
                         fd->hints->striping_unit, pvfs2_fs, &o_status);
        }

        /* store credentials and object reference in fd */
        pvfs2_fs->object_ref = o_status.object_ref;
        fd->fs_ptr = pvfs2_fs;
    }
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
#endif

    /* broadcast status and (possibly valid) object reference.  The datatype
     * is built from the absolute addresses of the two members, which is why
     * the broadcast buffer below is MPI_BOTTOM. */
    MPI_Address(&o_status.error, &offsets[0]);
    MPI_Address(&o_status.object_ref, &offsets[1]);

    MPI_Type_struct(2, lens, offsets, types, &open_status_type);
    MPI_Type_commit(&open_status_type);

    /* Assertion: if we hit this Bcast, then all processes collectively
     *            called this open.
     *
     * That's because deferred open never happens with PVFS2.
     */
    MPI_Bcast(MPI_BOTTOM, 1, open_status_type, fd->hints->ranklist[0], fd->comm);
    MPI_Type_free(&open_status_type);

    /* --BEGIN ERROR HANDLING-- */
    if (o_status.error != 0) {
        ADIOI_Free(pvfs2_fs);
        fd->fs_ptr = NULL;
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           ADIOI_PVFS2_error_convert(o_status.error),
                                           "Unknown error", 0);
        /* TODO: FIX STRING */
        return;
    }
    /* --END ERROR HANDLING-- */

    /* every process (not only the one that performed the open) keeps its
     * own copy of the broadcast object reference */
    pvfs2_fs->object_ref = o_status.object_ref;
    fd->fs_ptr = pvfs2_fs;

    *error_code = MPI_SUCCESS;
    return;
}
233