1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 * Copyright by The HDF Group. *
3 * Copyright by the Board of Trustees of the University of Illinois. *
4 * All rights reserved. *
5 * *
6 * This file is part of HDF5. The full HDF5 copyright notice, including *
7 * terms governing use, modification, and redistribution, is contained in *
8 * the COPYING file, which can be found at the root of the source code *
9 * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
10 * If you do not have access to either file, you may request a copy from *
11 * help@hdfgroup.org. *
12 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
13
14 /*
15 * Programmer: Robb Matzke <matzke@llnl.gov>
16 * Thursday, July 29, 1999
17 *
18 * Purpose: This is the MPI-2 I/O driver.
19 *
20 */
21
22 /* Interface initialization */
23 #define H5_INTERFACE_INIT_FUNC H5FD_mpio_init_interface
24
25
26 #include "H5private.h" /* Generic Functions */
27 #include "H5Dprivate.h" /* Dataset functions */
28 #include "H5Eprivate.h" /* Error handling */
29 #include "H5Fprivate.h" /* File access */
30 #include "H5FDprivate.h" /* File drivers */
31 #include "H5FDmpi.h" /* MPI-based file drivers */
32 #include "H5Iprivate.h" /* IDs */
33 #include "H5MMprivate.h" /* Memory management */
34 #include "H5Pprivate.h" /* Property lists */
35
36 #ifdef H5_HAVE_PARALLEL
37
38 /*
39 * The driver identification number, initialized at runtime if H5_HAVE_PARALLEL
40 * is defined. This allows applications to still have the H5FD_MPIO
41 * "constants" in their source code.
42 */
43 static hid_t H5FD_MPIO_g = 0;
44
45 /* Whether to allow collective I/O operations */
46 /* (Value can be set from environment variable also) */
47 hbool_t H5FD_mpi_opt_types_g = TRUE;
48
49 /*
50 * The view is set to this value
51 */
52 static char H5FD_mpi_native_g[] = "native";
53
54 /*
55 * The description of a file belonging to this driver.
56 * The EOF value is only used just after the file is opened in order for the
57 * library to determine whether the file is empty, truncated, or okay. The MPIO
58 * driver doesn't bother to keep it updated since it's an expensive operation.
59 */
60 typedef struct H5FD_mpio_t {
61 H5FD_t pub; /*public stuff, must be first */
62 MPI_File f; /*MPIO file handle */
63 MPI_Comm comm; /*communicator */
64 MPI_Info info; /*file information */
65 int mpi_rank; /* This process's rank */
66 int mpi_size; /* Total number of processes */
67 haddr_t eof; /*end-of-file marker */
68 haddr_t eoa; /*end-of-address marker */
69 haddr_t last_eoa; /* Last known end-of-address marker */
70 } H5FD_mpio_t;
71
72 /* Private Prototypes */
73
74 /* Callbacks */
75 static void *H5FD_mpio_fapl_get(H5FD_t *_file);
76 static void *H5FD_mpio_fapl_copy(const void *_old_fa);
77 static herr_t H5FD_mpio_fapl_free(void *_fa);
78 static H5FD_t *H5FD_mpio_open(const char *name, unsigned flags, hid_t fapl_id,
79 haddr_t maxaddr);
80 static herr_t H5FD_mpio_close(H5FD_t *_file);
81 static herr_t H5FD_mpio_query(const H5FD_t *_f1, unsigned long *flags);
82 static haddr_t H5FD_mpio_get_eoa(const H5FD_t *_file, H5FD_mem_t type);
83 static herr_t H5FD_mpio_set_eoa(H5FD_t *_file, H5FD_mem_t type, haddr_t addr);
84 static haddr_t H5FD_mpio_get_eof(const H5FD_t *_file);
85 static herr_t H5FD_mpio_get_handle(H5FD_t *_file, hid_t fapl, void** file_handle);
86 static herr_t H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
87 size_t size, void *buf);
88 static herr_t H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
89 size_t size, const void *buf);
90 static herr_t H5FD_mpio_flush(H5FD_t *_file, hid_t dxpl_id, unsigned closing);
91 static herr_t H5FD_mpio_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
92 static int H5FD_mpio_mpi_rank(const H5FD_t *_file);
93 static int H5FD_mpio_mpi_size(const H5FD_t *_file);
94 static MPI_Comm H5FD_mpio_communicator(const H5FD_t *_file);
95
96 /* MPIO-specific file access properties */
97 typedef struct H5FD_mpio_fapl_t {
98 MPI_Comm comm; /*communicator */
99 MPI_Info info; /*file information */
100 } H5FD_mpio_fapl_t;
101
102 /* The MPIO file driver information */
103 static const H5FD_class_mpi_t H5FD_mpio_g = {
104 { /* Start of superclass information */
105 "mpio", /*name */
106 HADDR_MAX, /*maxaddr */
107 H5F_CLOSE_SEMI, /* fc_degree */
108 NULL, /*sb_size */
109 NULL, /*sb_encode */
110 NULL, /*sb_decode */
111 sizeof(H5FD_mpio_fapl_t), /*fapl_size */
112 H5FD_mpio_fapl_get, /*fapl_get */
113 H5FD_mpio_fapl_copy, /*fapl_copy */
114 H5FD_mpio_fapl_free, /*fapl_free */
115 0, /*dxpl_size */
116 NULL, /*dxpl_copy */
117 NULL, /*dxpl_free */
118 H5FD_mpio_open, /*open */
119 H5FD_mpio_close, /*close */
120 NULL, /*cmp */
121 H5FD_mpio_query, /*query */
122 NULL, /*get_type_map */
123 NULL, /*alloc */
124 NULL, /*free */
125 H5FD_mpio_get_eoa, /*get_eoa */
126 H5FD_mpio_set_eoa, /*set_eoa */
127 H5FD_mpio_get_eof, /*get_eof */
128 H5FD_mpio_get_handle, /*get_handle */
129 H5FD_mpio_read, /*read */
130 H5FD_mpio_write, /*write */
131 H5FD_mpio_flush, /*flush */
132 H5FD_mpio_truncate, /*truncate */
133 NULL, /*lock */
134 NULL, /*unlock */
135 H5FD_FLMAP_DICHOTOMY /*fl_map */
136 }, /* End of superclass information */
137 H5FD_mpio_mpi_rank, /*get_rank */
138 H5FD_mpio_mpi_size, /*get_size */
139 H5FD_mpio_communicator /*get_comm */
140 };
141
142 #ifdef H5FDmpio_DEBUG
143 /* Flags to control debug actions in H5Fmpio.
144 * Meant to be indexed by characters.
145 *
146 * 'c' show result of MPI_Get_count after read
147 * 'r' show read offset and size
148 * 't' trace function entry and exit
149 * 'w' show write offset and size
150 */
/* One counter per possible character value; every entry starts out as 0 */
static int H5FD_mpio_Debug[256] = {0};
160 #endif
161
162
163 /*--------------------------------------------------------------------------
164 NAME
165 H5FD_mpio_init_interface -- Initialize interface-specific information
166 USAGE
167 herr_t H5FD_mpio_init_interface()
168
169 RETURNS
170 Non-negative on success/Negative on failure
171 DESCRIPTION
172 Initializes any interface-specific data or routines. (Just calls
173 H5FD_mpio_init currently).
174
175 --------------------------------------------------------------------------*/
176 static herr_t
H5FD_mpio_init_interface(void)177 H5FD_mpio_init_interface(void)
178 {
179 FUNC_ENTER_NOAPI_NOINIT_NOERR
180
181 FUNC_LEAVE_NOAPI(H5FD_mpio_init())
182 } /* H5FD_mpio_init_interface() */
183
184
185 /*-------------------------------------------------------------------------
186 * Function: H5FD_mpio_init
187 *
188 * Purpose: Initialize this driver by registering the driver with the
189 * library.
190 *
191 * Return: Success: The driver ID for the mpio driver.
192 * Failure: Negative.
193 *
194 * Programmer: Robb Matzke
195 * Thursday, August 5, 1999
196 *
197 *-------------------------------------------------------------------------
198 */
199 hid_t
H5FD_mpio_init(void)200 H5FD_mpio_init(void)
201 {
202 #ifdef H5FDmpio_DEBUG
203 static int H5FD_mpio_Debug_inited = 0;
204 #endif /* H5FDmpio_DEBUG */
205 const char *s; /* String for environment variables */
206 hid_t ret_value; /* Return value */
207
208 FUNC_ENTER_NOAPI(FAIL)
209
210 /* Register the MPI-IO VFD, if it isn't already */
211 if(H5I_VFL != H5I_get_type(H5FD_MPIO_g))
212 H5FD_MPIO_g = H5FD_register((const H5FD_class_t *)&H5FD_mpio_g, sizeof(H5FD_class_mpi_t), FALSE);
213
214 /* Allow MPI buf-and-file-type optimizations? */
215 s = HDgetenv("HDF5_MPI_OPT_TYPES");
216 if(s && HDisdigit(*s))
217 H5FD_mpi_opt_types_g = (hbool_t)HDstrtol(s, NULL, 0);
218
219 #ifdef H5FDmpio_DEBUG
220 if(!H5FD_mpio_Debug_inited) {
221 /* Retrieve MPI-IO debugging environment variable */
222 s = HDgetenv("H5FD_mpio_Debug");
223 if(s) {
224 /* Set debug mask */
225 while(*s) {
226 H5FD_mpio_Debug[(int)*s]++;
227 s++;
228 } /* end while */
229 } /* end if */
230 H5FD_mpio_Debug_inited++;
231 } /* end if */
232 #endif /* H5FDmpio_DEBUG */
233
234 /* Set return value */
235 ret_value = H5FD_MPIO_g;
236
237 done:
238 FUNC_LEAVE_NOAPI(ret_value)
239 } /* end H5FD_mpio_init() */
240
241
242 /*---------------------------------------------------------------------------
243 * Function: H5FD_mpio_term
244 *
245 * Purpose: Shut down the VFD
246 *
247 * Return: <none>
248 *
249 * Programmer: Quincey Koziol
250 * Friday, Jan 30, 2004
251 *
252 * Modification:
253 *
254 *---------------------------------------------------------------------------
255 */
256 void
H5FD_mpio_term(void)257 H5FD_mpio_term(void)
258 {
259 FUNC_ENTER_NOAPI_NOINIT_NOERR
260
261 /* Reset VFL ID */
262 H5FD_MPIO_g=0;
263
264 FUNC_LEAVE_NOAPI_VOID
265 } /* end H5FD_mpio_term() */
266
267
268 /*-------------------------------------------------------------------------
269 * Function: H5Pset_fapl_mpio
270 *
271 * Purpose: Store the user supplied MPIO communicator comm and info in
272 * the file access property list FAPL_ID which can then be used
273 * to create and/or open the file. This function is available
274 * only in the parallel HDF5 library and is not collective.
275 *
276 * comm is the MPI communicator to be used for file open as
277 * defined in MPI_FILE_OPEN of MPI-2. This function makes a
278 * duplicate of comm. Any modification to comm after this function
279 * call returns has no effect on the access property list.
280 *
281 * info is the MPI Info object to be used for file open as
282 * defined in MPI_FILE_OPEN of MPI-2. This function makes a
283 * duplicate of info. Any modification to info after this
284 * function call returns has no effect on the access property
285 * list.
286 *
287 * If fapl_id has previously set comm and info values, they
288 * will be replaced and the old communicator and Info object
289 * are freed.
290 *
291 * Return: Success: Non-negative
292 *
293 * Failure: Negative
294 *
295 * Programmer: Albert Cheng
296 * Feb 3, 1998
297 *
298 * Modifications:
299 * Robb Matzke, 1998-02-18
300 * Check all arguments before the property list is updated so we
301 * don't leave the property list in a bad state if something
302 * goes wrong. Also, the property list data type changed to
303 * allow more generality so all the mpi-related stuff is in the
304 * `u.mpi' member. The `access_mode' will contain only
305 * mpi-related flags defined in H5Fpublic.h.
306 *
307 * Albert Cheng, 1998-04-16
308 * Removed the ACCESS_MODE argument. The access mode is changed
309 * to be controlled by data transfer property list during data
310 * read/write calls.
311 *
312 * Robb Matzke, 1999-08-06
313 * Modified to work with the virtual file layer.
314 *
315 * Raymond Lu, 2001-10-23
316 * Changed the file access list to the new generic property
317 * list.
318 *
319 * Albert Cheng, 2003-04-17
320 * Modified the description of the function that it now stores
321 * a duplicate of the communicator and INFO object. Free the
322 * old duplicates if previously set. (Work is actually done
323 * by H5P_set_driver.)
324 *
325 *-------------------------------------------------------------------------
326 */
327 herr_t
H5Pset_fapl_mpio(hid_t fapl_id,MPI_Comm comm,MPI_Info info)328 H5Pset_fapl_mpio(hid_t fapl_id, MPI_Comm comm, MPI_Info info)
329 {
330 H5FD_mpio_fapl_t fa;
331 H5P_genplist_t *plist; /* Property list pointer */
332 herr_t ret_value;
333
334 FUNC_ENTER_API(FAIL)
335 H5TRACE3("e", "iMcMi", fapl_id, comm, info);
336
337 if(fapl_id == H5P_DEFAULT)
338 HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
339
340 /* Check arguments */
341 if(NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS)))
342 HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access list")
343 if(MPI_COMM_NULL == comm)
344 HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a valid communicator")
345
346 /* Initialize driver specific properties */
347 fa.comm = comm;
348 fa.info = info;
349
350 /* duplication is done during driver setting. */
351 ret_value= H5P_set_driver(plist, H5FD_MPIO, &fa);
352
353 done:
354 FUNC_LEAVE_API(ret_value)
355 }
356
357
358 /*-------------------------------------------------------------------------
359 * Function: H5Pget_fapl_mpio
360 *
361 * Purpose: If the file access property list is set to the H5FD_MPIO
362 * driver then this function returns duplicates of the MPI
363 * communicator and Info object stored through the comm and
364 * info pointers. It is the responsibility of the application
365 * to free the returned communicator and Info object.
366 *
367 * Return: Success: Non-negative with the communicator and
368 * Info object returned through the comm and
369 * info arguments if non-null. Since they are
370 * duplicates of the stored objects, future
371 * modifications to the access property list do
372 * not affect them and it is the responsibility
373 * of the application to free them.
374 *
375 * Failure: Negative
376 *
377 * Programmer: Robb Matzke
378 * Thursday, February 26, 1998
379 *
380 * Modifications:
381 *
382 * Albert Cheng, Apr 16, 1998
383 * Removed the access_mode argument. The access_mode is changed
384 * to be controlled by data transfer property list during data
385 * read/write calls.
386 *
387 * Raymond Lu, 2001-10-23
388 * Changed the file access list to the new generic property
389 * list.
390 *
391 * Albert Cheng, 2003-04-17
392 * Return duplicates of the stored communicator and Info object.
393 *
394 *-------------------------------------------------------------------------
395 */
396 herr_t
H5Pget_fapl_mpio(hid_t fapl_id,MPI_Comm * comm,MPI_Info * info)397 H5Pget_fapl_mpio(hid_t fapl_id, MPI_Comm *comm/*out*/, MPI_Info *info/*out*/)
398 {
399 H5FD_mpio_fapl_t *fa;
400 H5P_genplist_t *plist; /* Property list pointer */
401 MPI_Comm comm_tmp=MPI_COMM_NULL;
402 int mpi_code; /* mpi return code */
403 herr_t ret_value=SUCCEED; /* Return value */
404
405 FUNC_ENTER_API(FAIL)
406 H5TRACE3("e", "ixx", fapl_id, comm, info);
407
408 if(NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS)))
409 HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access list")
410 if(H5FD_MPIO != H5P_get_driver(plist))
411 HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "incorrect VFL driver")
412 if(NULL == (fa = (H5FD_mpio_fapl_t *)H5P_get_driver_info(plist)))
413 HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "bad VFL driver info")
414
415 /* Store the duplicated communicator in a temporary variable for error */
416 /* recovery in case the INFO duplication fails. We cannot attempt to */
417 /* the value into *comm yet since if MPI_Comm_dup fails, we will end */
418 /* up freeing whatever *comm holds and that could be invalid. */
419 if (comm){
420 if (MPI_SUCCESS != (mpi_code=MPI_Comm_dup(fa->comm, &comm_tmp)))
421 HMPI_GOTO_ERROR(FAIL, "MPI_Comm_dup failed", mpi_code)
422 }
423
424 if (info){
425 if (MPI_INFO_NULL != fa->info){
426 if (MPI_SUCCESS != (mpi_code=MPI_Info_dup(fa->info, info)))
427 HMPI_GOTO_ERROR(FAIL, "MPI_Info_dup failed", mpi_code)
428 }else{
429 /* do not dup it */
430 *info = MPI_INFO_NULL;
431 }
432 }
433
434 if (comm)
435 *comm = comm_tmp;
436
437 done:
438 if (FAIL==ret_value){
439 /* need to free anything created here */
440 if (comm_tmp != MPI_COMM_NULL)
441 MPI_Comm_free(&comm_tmp);
442 }
443 FUNC_LEAVE_API(ret_value)
444 }
445
446
447 /*-------------------------------------------------------------------------
448 * Function: H5Pset_dxpl_mpio
449 *
450 * Purpose: Set the data transfer property list DXPL_ID to use transfer
451 * mode XFER_MODE. The property list can then be used to control
452 * the I/O transfer mode during data I/O operations. The valid
453 * transfer modes are:
454 *
455 * H5FD_MPIO_INDEPENDENT:
456 * Use independent I/O access (the default).
457 *
458 * H5FD_MPIO_COLLECTIVE:
459 * Use collective I/O access.
460 *
461 * Return: Success: Non-negative
462 * Failure: Negative
463 *
464 * Programmer: Albert Cheng
465 * April 2, 1998
466 *
467 *-------------------------------------------------------------------------
468 */
469 herr_t
H5Pset_dxpl_mpio(hid_t dxpl_id,H5FD_mpio_xfer_t xfer_mode)470 H5Pset_dxpl_mpio(hid_t dxpl_id, H5FD_mpio_xfer_t xfer_mode)
471 {
472 H5P_genplist_t *plist; /* Property list pointer */
473 herr_t ret_value = SUCCEED; /* Return value */
474
475 FUNC_ENTER_API(FAIL)
476 H5TRACE2("e", "iDt", dxpl_id, xfer_mode);
477
478 if(dxpl_id == H5P_DEFAULT)
479 HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
480
481 /* Check arguments */
482 if(NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
483 HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
484 if(H5FD_MPIO_INDEPENDENT != xfer_mode && H5FD_MPIO_COLLECTIVE != xfer_mode)
485 HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "incorrect xfer_mode")
486
487 /* Set the transfer mode */
488 if(H5P_set(plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode) < 0)
489 HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
490
491 done:
492 FUNC_LEAVE_API(ret_value)
493 } /* end H5Pset_dxpl_mpio() */
494
495
496 /*-------------------------------------------------------------------------
497 * Function: H5Pget_dxpl_mpio
498 *
499 * Purpose: Queries the transfer mode current set in the data transfer
500 * property list DXPL_ID. This is not collective.
501 *
502 * Return: Success: Non-negative, with the transfer mode returned
503 * through the XFER_MODE argument if it is
504 * non-null.
505 *
506 * Failure: Negative
507 *
508 * Programmer: Albert Cheng
509 * April 2, 1998
510 *
511 *-------------------------------------------------------------------------
512 */
513 herr_t
H5Pget_dxpl_mpio(hid_t dxpl_id,H5FD_mpio_xfer_t * xfer_mode)514 H5Pget_dxpl_mpio(hid_t dxpl_id, H5FD_mpio_xfer_t *xfer_mode/*out*/)
515 {
516 H5P_genplist_t *plist; /* Property list pointer */
517 herr_t ret_value = SUCCEED; /* Return value */
518
519 FUNC_ENTER_API(FAIL)
520 H5TRACE2("e", "ix", dxpl_id, xfer_mode);
521
522 if(NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
523 HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
524
525 /* Get the transfer mode */
526 if(xfer_mode)
527 if(H5P_get(plist, H5D_XFER_IO_XFER_MODE_NAME, xfer_mode) < 0)
528 HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to get value")
529
530 done:
531 FUNC_LEAVE_API(ret_value)
532 } /* end H5Pget_dxpl_mpio() */
533
534
535 /*-------------------------------------------------------------------------
536 * Function: H5Pset_dxpl_mpio_collective_opt
537 *
538 * Purpose: To set a flag to choose linked chunk I/O or multi-chunk I/O
539 * without involving decision-making inside HDF5
540 *
541 * Note: The library will do linked chunk I/O or multi-chunk I/O without
542 * involving communications for decision-making process.
543 * The library won't behave as it asks for only when we find
544 * that the low-level MPI-IO package doesn't support this.
545 *
546 * Return: Success: Non-negative
547 * Failure: Negative
548 *
549 * Programmer: Kent Yang
550 * ? ?, ?
551 *
552 *-------------------------------------------------------------------------
553 */
554 herr_t
H5Pset_dxpl_mpio_collective_opt(hid_t dxpl_id,H5FD_mpio_collective_opt_t opt_mode)555 H5Pset_dxpl_mpio_collective_opt(hid_t dxpl_id, H5FD_mpio_collective_opt_t opt_mode)
556 {
557 H5P_genplist_t *plist; /* Property list pointer */
558 herr_t ret_value = SUCCEED; /* Return value */
559
560 FUNC_ENTER_API(FAIL)
561 H5TRACE2("e", "iDc", dxpl_id, opt_mode);
562
563 if(dxpl_id == H5P_DEFAULT)
564 HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
565
566 /* Check arguments */
567 if(NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
568 HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
569
570 /* Set the transfer mode */
571 if(H5P_set(plist, H5D_XFER_MPIO_COLLECTIVE_OPT_NAME, &opt_mode) < 0)
572 HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
573
574 done:
575 FUNC_LEAVE_API(ret_value)
576 } /* end H5Pset_dxpl_mpio_collective_opt() */
577
578
579 /*-------------------------------------------------------------------------
580 * Function: H5Pset_dxpl_mpio_chunk_opt
581 *
582 * Purpose: To set a flag to choose linked chunk I/O or multi-chunk I/O
583 * without involving decision-making inside HDF5
584 *
585 * Note: The library will do linked chunk I/O or multi-chunk I/O without
586 * involving communications for decision-making process.
587 * The library won't behave as it asks for only when we find
588 * that the low-level MPI-IO package doesn't support this.
589 *
590 * Return: Success: Non-negative
591 * Failure: Negative
592 *
593 * Programmer: Kent Yang
594 * ? ?, ?
595 *
596 *-------------------------------------------------------------------------
597 */
598 herr_t
H5Pset_dxpl_mpio_chunk_opt(hid_t dxpl_id,H5FD_mpio_chunk_opt_t opt_mode)599 H5Pset_dxpl_mpio_chunk_opt(hid_t dxpl_id, H5FD_mpio_chunk_opt_t opt_mode)
600 {
601 H5P_genplist_t *plist; /* Property list pointer */
602 herr_t ret_value = SUCCEED; /* Return value */
603
604 FUNC_ENTER_API(FAIL)
605 H5TRACE2("e", "iDh", dxpl_id, opt_mode);
606
607 if(dxpl_id == H5P_DEFAULT)
608 HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
609
610 /* Check arguments */
611 if(NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
612 HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
613
614 /* Set the transfer mode */
615 if(H5P_set(plist, H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME, &opt_mode) < 0)
616 HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
617
618 done:
619 FUNC_LEAVE_API(ret_value)
620 } /* end H5Pset_dxpl_mpio_chunk_opt() */
621
622
623 /*-------------------------------------------------------------------------
624 * Function: H5Pset_dxpl_mpio_chunk_opt_num
625 *
626 * Purpose: To set a threshold for doing linked chunk IO
627 *
628 * Note: If the number is greater than the threshold set by the user,
629 * the library will do linked chunk I/O; otherwise, I/O will be
630 * done for every chunk.
631 *
632 * Return: Success: Non-negative
633 * Failure: Negative
634 *
635 * Programmer: Kent Yang
636 * ? ?, ?
637 *
638 *-------------------------------------------------------------------------
639 */
640 herr_t
H5Pset_dxpl_mpio_chunk_opt_num(hid_t dxpl_id,unsigned num_chunk_per_proc)641 H5Pset_dxpl_mpio_chunk_opt_num(hid_t dxpl_id, unsigned num_chunk_per_proc)
642 {
643 H5P_genplist_t *plist; /* Property list pointer */
644 herr_t ret_value = SUCCEED; /* Return value */
645
646 FUNC_ENTER_API(FAIL)
647 H5TRACE2("e", "iIu", dxpl_id, num_chunk_per_proc);
648
649 if(dxpl_id == H5P_DEFAULT)
650 HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
651
652 /* Check arguments */
653 if(NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
654 HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
655
656 /* Set the transfer mode */
657 if(H5P_set(plist, H5D_XFER_MPIO_CHUNK_OPT_NUM_NAME, &num_chunk_per_proc) < 0)
658 HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
659
660 done:
661 FUNC_LEAVE_API(ret_value)
662 } /* end H5Pset_dxpl_mpio_chunk_opt_num() */
663
664
665 /*-------------------------------------------------------------------------
666 * Function: H5Pset_dxpl_mpio_chunk_opt_ratio
667 *
668 * Purpose: To set a threshold for doing collective I/O for each chunk
669 *
670 * Note: The library will calculate the percentage of the number of
671 * process holding selections at each chunk. If that percentage
672 * of number of process in the individual chunk is greater than
673 * the threshold set by the user, the library will do collective
674 * chunk I/O for this chunk; otherwise, independent I/O will be
675 * done for this chunk.
676 *
677 * Return: Success: Non-negative
678 * Failure: Negative
679 *
680 * Programmer: Kent Yang
681 * ? ?, ?
682 *
683 *-------------------------------------------------------------------------
684 */
685 herr_t
H5Pset_dxpl_mpio_chunk_opt_ratio(hid_t dxpl_id,unsigned percent_num_proc_per_chunk)686 H5Pset_dxpl_mpio_chunk_opt_ratio(hid_t dxpl_id, unsigned percent_num_proc_per_chunk)
687 {
688 H5P_genplist_t *plist; /* Property list pointer */
689 herr_t ret_value = SUCCEED; /* Return value */
690
691 FUNC_ENTER_API(FAIL)
692 H5TRACE2("e", "iIu", dxpl_id, percent_num_proc_per_chunk);
693
694 if(dxpl_id == H5P_DEFAULT)
695 HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
696
697 /* Check arguments */
698 if(NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
699 HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
700
701 /* Set the transfer mode */
702 if(H5P_set(plist, H5D_XFER_MPIO_CHUNK_OPT_RATIO_NAME, &percent_num_proc_per_chunk) < 0)
703 HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
704
705 done:
706 FUNC_LEAVE_API(ret_value)
707 } /* end H5Pset_dxpl_mpio_chunk_opt_ratio() */
708
709
710 /*-------------------------------------------------------------------------
711 * Function: H5FD_mpio_fapl_get
712 *
713 * Purpose: Returns a file access property list which could be used to
714 * create another file the same as this one.
715 *
716 * Return: Success: Ptr to new file access property list with all
717 * fields copied from the file pointer.
718 *
719 * Failure: NULL
720 *
721 * Programmer: Robb Matzke
722 * Friday, August 13, 1999
723 *
724 *-------------------------------------------------------------------------
725 */
726 static void *
H5FD_mpio_fapl_get(H5FD_t * _file)727 H5FD_mpio_fapl_get(H5FD_t *_file)
728 {
729 H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
730 H5FD_mpio_fapl_t *fa = NULL;
731 void *ret_value; /* Return value */
732
733 FUNC_ENTER_NOAPI_NOINIT
734
735 HDassert(file);
736 HDassert(H5FD_MPIO == file->pub.driver_id);
737
738 if(NULL == (fa = (H5FD_mpio_fapl_t *)H5MM_calloc(sizeof(H5FD_mpio_fapl_t))))
739 HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed")
740
741 /* Duplicate communicator and Info object. */
742 if(FAIL == H5FD_mpi_comm_info_dup(file->comm, file->info, &fa->comm, &fa->info))
743 HGOTO_ERROR(H5E_INTERNAL, H5E_CANTCOPY, NULL, "Communicator/Info duplicate failed")
744
745 /* Set return value */
746 ret_value = fa;
747
748 done:
749 FUNC_LEAVE_NOAPI(ret_value)
750 }
751
752
753 /*-------------------------------------------------------------------------
754 * Function: H5FD_mpio_fapl_copy
755 *
756 * Purpose: Copies the mpio-specific file access properties.
757 *
758 * Return: Success: Ptr to a new property list
759 *
760 * Failure: NULL
761 *
762 * Programmer: Albert Cheng
763 * Jan 8, 2003
764 *
765 *-------------------------------------------------------------------------
766 */
767 static void *
H5FD_mpio_fapl_copy(const void * _old_fa)768 H5FD_mpio_fapl_copy(const void *_old_fa)
769 {
770 void *ret_value = NULL;
771 const H5FD_mpio_fapl_t *old_fa = (const H5FD_mpio_fapl_t*)_old_fa;
772 H5FD_mpio_fapl_t *new_fa = NULL;
773
774 FUNC_ENTER_NOAPI_NOINIT
775 #ifdef H5FDmpio_DEBUG
776 if (H5FD_mpio_Debug[(int)'t'])
777 fprintf(stderr, "enter H5FD_mpio_fapl_copy\n");
778 #endif
779
780 if(NULL == (new_fa = (H5FD_mpio_fapl_t *)H5MM_malloc(sizeof(H5FD_mpio_fapl_t))))
781 HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed")
782
783 /* Copy the general information */
784 HDmemcpy(new_fa, old_fa, sizeof(H5FD_mpio_fapl_t));
785
786 /* Duplicate communicator and Info object. */
787 if(FAIL == H5FD_mpi_comm_info_dup(old_fa->comm, old_fa->info, &new_fa->comm, &new_fa->info))
788 HGOTO_ERROR(H5E_INTERNAL, H5E_CANTCOPY, NULL, "Communicator/Info duplicate failed")
789 ret_value = new_fa;
790
791 done:
792 if (NULL == ret_value){
793 /* cleanup */
794 if (new_fa)
795 H5MM_xfree(new_fa);
796 }
797
798 #ifdef H5FDmpio_DEBUG
799 if (H5FD_mpio_Debug[(int)'t'])
800 fprintf(stderr, "leaving H5FD_mpio_fapl_copy\n");
801 #endif
802 FUNC_LEAVE_NOAPI(ret_value)
803 } /* end H5FD_mpio_fapl_copy() */
804
805
806 /*-------------------------------------------------------------------------
807 * Function: H5FD_mpio_fapl_free
808 *
809 * Purpose: Frees the mpio-specific file access properties.
810 *
811 * Return: Success: 0
812 *
813 * Failure: -1
814 *
815 * Programmer: Albert Cheng
816 * Jan 8, 2003
817 *
818 * Modifications:
819 *
820 *-------------------------------------------------------------------------
821 */
822 static herr_t
H5FD_mpio_fapl_free(void * _fa)823 H5FD_mpio_fapl_free(void *_fa)
824 {
825 herr_t ret_value = SUCCEED;
826 H5FD_mpio_fapl_t *fa = (H5FD_mpio_fapl_t*)_fa;
827
828 FUNC_ENTER_NOAPI_NOINIT_NOERR
829 #ifdef H5FDmpio_DEBUG
830 if (H5FD_mpio_Debug[(int)'t'])
831 fprintf(stderr, "in H5FD_mpio_fapl_free\n");
832 #endif
833 HDassert(fa);
834
835 /* Free the internal communicator and INFO object */
836 HDassert(MPI_COMM_NULL!=fa->comm);
837 H5FD_mpi_comm_info_free(&fa->comm, &fa->info);
838 H5MM_xfree(fa);
839
840 #ifdef H5FDmpio_DEBUG
841 if (H5FD_mpio_Debug[(int)'t'])
842 fprintf(stderr, "leaving H5FD_mpio_fapl_free\n");
843 #endif
844 FUNC_LEAVE_NOAPI(ret_value)
845 } /* end H5FD_mpio_fapl_free() */
846
847
848 /*-------------------------------------------------------------------------
849 * Function: H5FD_set_mpio_atomicity
850 *
851 * Purpose: Sets the atomicity mode
852 *
853 * Return: Success: Non-negative
854 *
855 * Failure: Negative
856 *
857 * Programmer: Mohamad Chaarawi
858 * Feb 14, 2012
859 *
860 *-------------------------------------------------------------------------
861 */
862 herr_t
H5FD_set_mpio_atomicity(H5FD_t * _file,hbool_t flag)863 H5FD_set_mpio_atomicity(H5FD_t *_file, hbool_t flag)
864 {
865 H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
866 int mpi_code; /* MPI return code */
867 int temp_flag;
868 herr_t ret_value = SUCCEED;
869
870 FUNC_ENTER_NOAPI_NOINIT
871
872 #ifdef H5FDmpio_DEBUG
873 if (H5FD_mpio_Debug[(int)'t'])
874 fprintf(stdout, "Entering H5FD_set_mpio_atomicity\n");
875 #endif
876
877 if (FALSE == flag)
878 temp_flag = 0;
879 else
880 temp_flag = 1;
881
882 /* set atomicity value */
883 if (MPI_SUCCESS != (mpi_code=MPI_File_set_atomicity(file->f, temp_flag)))
884 HMPI_GOTO_ERROR(FAIL, "MPI_File_set_atomicity", mpi_code)
885
886 done:
887 #ifdef H5FDmpio_DEBUG
888 if (H5FD_mpio_Debug[(int)'t'])
889 fprintf(stdout, "Leaving H5FD_set_mpio_atomicity\n");
890 #endif
891 FUNC_LEAVE_NOAPI(ret_value)
892 }
893
894
895 /*-------------------------------------------------------------------------
896 * Function: H5FD_get_mpio_atomicity
897 *
898 * Purpose: Returns the atomicity mode
899 *
900 * Return: Success: Non-negative
901 *
902 * Failure: Negative
903 *
904 * Programmer: Mohamad Chaarawi
905 * Feb 14, 2012
906 *
907 *-------------------------------------------------------------------------
908 */
909 herr_t
H5FD_get_mpio_atomicity(H5FD_t * _file,hbool_t * flag)910 H5FD_get_mpio_atomicity(H5FD_t *_file, hbool_t *flag)
911 {
912 H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
913 int mpi_code; /* MPI return code */
914 int temp_flag;
915 herr_t ret_value = SUCCEED;
916
917 FUNC_ENTER_NOAPI_NOINIT
918
919 #ifdef H5FDmpio_DEBUG
920 if (H5FD_mpio_Debug[(int)'t'])
921 fprintf(stdout, "Entering H5FD_get_mpio_atomicity\n");
922 #endif
923
924 /* get atomicity value */
925 if (MPI_SUCCESS != (mpi_code=MPI_File_get_atomicity(file->f, &temp_flag)))
926 HMPI_GOTO_ERROR(FAIL, "MPI_File_get_atomicity", mpi_code)
927
928 if (0 != temp_flag)
929 *flag = TRUE;
930 else
931 *flag = FALSE;
932
933 done:
934 #ifdef H5FDmpio_DEBUG
935 if (H5FD_mpio_Debug[(int)'t'])
936 fprintf(stdout, "Leaving H5FD_get_mpio_atomicity\n");
937 #endif
938 FUNC_LEAVE_NOAPI(ret_value)
939 }
940
941
942 /*-------------------------------------------------------------------------
943 * Function: H5FD_mpio_open
944 *
945 * Purpose: Opens a file with name NAME. The FLAGS are a bit field with
946 * purpose similar to the second argument of open(2) and which
947 * are defined in H5Fpublic.h. The file access property list
948 * FAPL_ID contains the properties driver properties and MAXADDR
949 * is the largest address which this file will be expected to
950 * access. This is collective.
951 *
952 * Return: Success: A new file pointer.
953 *
954 * Failure: NULL
955 *
956 * Programmer:
957 * January 30, 1998
958 *
959 * Modifications:
960 * Robb Matzke, 1998-02-18
961 * Added the ACCESS_PARMS argument. Moved some error checking
962 * here from elsewhere.
963 *
964 * rky, 1998-01-11
965 * Added H5FD_mpio_Debug debug flags controlled by MPI_Info.
966 *
967 * rky, 1998-08-28
968 * Init flag controlling redundant metadata writes to disk.
969 *
970 * rky, 1998-12-07
971 * Added barrier after MPI_File_set_size to prevent race
972 * condition -- subsequent writes were being truncated, causing
973 * holes in file.
974 *
975 * Robb Matzke, 1999-08-06
976 * Modified to work with the virtual file layer.
977 *
978 * rky & ppw, 1999-11-07
979 * Modified "H5FD_mpio_open" so that file-truncation is
980 * avoided for brand-new files (with zero filesize).
981 *
982 * Albert Cheng, 2003-04-17
983 * Duplicate the communicator and Info object so that file is
984 * insulated from the old one.
985 *-------------------------------------------------------------------------
986 */
987 static H5FD_t *
H5FD_mpio_open(const char * name,unsigned flags,hid_t fapl_id,haddr_t H5_ATTR_UNUSED maxaddr)988 H5FD_mpio_open(const char *name, unsigned flags, hid_t fapl_id,
989 haddr_t H5_ATTR_UNUSED maxaddr)
990 {
991 H5FD_mpio_t *file=NULL;
992 MPI_File fh;
993 unsigned file_opened=0; /* Flag to indicate that the file was successfully opened */
994 int mpi_amode;
995 int mpi_rank; /* MPI rank of this process */
996 int mpi_size; /* Total number of MPI processes */
997 int mpi_code; /* mpi return code */
998 MPI_Offset size;
999 const H5FD_mpio_fapl_t *fa=NULL;
1000 H5FD_mpio_fapl_t _fa;
1001 H5P_genplist_t *plist; /* Property list pointer */
1002 MPI_Comm comm_dup=MPI_COMM_NULL;
1003 MPI_Info info_dup=MPI_INFO_NULL;
1004 H5FD_t *ret_value; /* Return value */
1005
1006 FUNC_ENTER_NOAPI_NOINIT
1007
1008 #ifdef H5FDmpio_DEBUG
1009 if (H5FD_mpio_Debug[(int)'t']) {
1010 fprintf(stdout, "Entering H5FD_mpio_open(name=\"%s\", flags=0x%x, "
1011 "fapl_id=%d, maxaddr=%lu)\n", name, flags, (int)fapl_id, (unsigned long)maxaddr);
1012 }
1013 #endif
1014
1015 /* Obtain a pointer to mpio-specific file access properties */
1016 if(NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS)))
1017 HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, NULL, "not a file access property list")
1018 if(H5P_FILE_ACCESS_DEFAULT == fapl_id || H5FD_MPIO != H5P_get_driver(plist)) {
1019 _fa.comm = MPI_COMM_SELF; /*default*/
1020 _fa.info = MPI_INFO_NULL; /*default*/
1021 fa = &_fa;
1022 } else {
1023 if(NULL == (fa = (const H5FD_mpio_fapl_t *)H5P_get_driver_info(plist)))
1024 HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, NULL, "bad VFL driver info")
1025 }
1026
1027 /* Duplicate communicator and Info object for use by this file. */
1028 if (FAIL==H5FD_mpi_comm_info_dup(fa->comm, fa->info, &comm_dup, &info_dup))
1029 HGOTO_ERROR(H5E_INTERNAL, H5E_CANTCOPY, NULL, "Communicator/Info duplicate failed")
1030
1031 /* convert HDF5 flags to MPI-IO flags */
1032 /* some combinations are illegal; let MPI-IO figure it out */
1033 mpi_amode = (flags&H5F_ACC_RDWR) ? MPI_MODE_RDWR : MPI_MODE_RDONLY;
1034 if (flags&H5F_ACC_CREAT) mpi_amode |= MPI_MODE_CREATE;
1035 if (flags&H5F_ACC_EXCL) mpi_amode |= MPI_MODE_EXCL;
1036
1037 #ifdef H5FDmpio_DEBUG
1038 /* Check for debug commands in the info parameter */
1039 {
1040 char debug_str[128];
1041 int flag, i;
1042 if (MPI_INFO_NULL != info_dup) {
1043 MPI_Info_get(fa->info, H5F_MPIO_DEBUG_KEY, sizeof(debug_str)-1, debug_str, &flag);
1044 if (flag) {
1045 fprintf(stdout, "H5FD_mpio debug flags=%s\n", debug_str );
1046 for (i=0;
1047 debug_str[i]/*end of string*/ && i<128/*just in case*/;
1048 ++i) {
1049 H5FD_mpio_Debug[(int)debug_str[i]] = 1;
1050 }
1051 }
1052 }
1053 }
1054 #endif
1055
1056 if(MPI_SUCCESS != (mpi_code = MPI_File_open(comm_dup, name, mpi_amode, info_dup, &fh)))
1057 HMPI_GOTO_ERROR(NULL, "MPI_File_open failed", mpi_code)
1058 file_opened=1;
1059
1060 /* Get the MPI rank of this process and the total number of processes */
1061 if (MPI_SUCCESS != (mpi_code=MPI_Comm_rank (comm_dup, &mpi_rank)))
1062 HMPI_GOTO_ERROR(NULL, "MPI_Comm_rank failed", mpi_code)
1063 if (MPI_SUCCESS != (mpi_code=MPI_Comm_size (comm_dup, &mpi_size)))
1064 HMPI_GOTO_ERROR(NULL, "MPI_Comm_size failed", mpi_code)
1065
1066 /* Build the return value and initialize it */
1067 if(NULL == (file = (H5FD_mpio_t *)H5MM_calloc(sizeof(H5FD_mpio_t))))
1068 HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed")
1069 file->f = fh;
1070 file->comm = comm_dup;
1071 file->info = info_dup;
1072 file->mpi_rank = mpi_rank;
1073 file->mpi_size = mpi_size;
1074
1075 /* Only processor p0 will get the filesize and broadcast it. */
1076 if (mpi_rank == 0) {
1077 if (MPI_SUCCESS != (mpi_code=MPI_File_get_size(fh, &size)))
1078 HMPI_GOTO_ERROR(NULL, "MPI_File_get_size failed", mpi_code)
1079 } /* end if */
1080
1081 /* Broadcast file size */
1082 if(MPI_SUCCESS != (mpi_code = MPI_Bcast(&size, (int)sizeof(MPI_Offset), MPI_BYTE, 0, comm_dup)))
1083 HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code)
1084
1085 /* Determine if the file should be truncated */
1086 if(size && (flags & H5F_ACC_TRUNC)) {
1087 if (MPI_SUCCESS != (mpi_code=MPI_File_set_size(fh, (MPI_Offset)0)))
1088 HMPI_GOTO_ERROR(NULL, "MPI_File_set_size failed", mpi_code)
1089
1090 /* Don't let any proc return until all have truncated the file. */
1091 if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(comm_dup)))
1092 HMPI_GOTO_ERROR(NULL, "MPI_Barrier failed", mpi_code)
1093
1094 /* File is zero size now */
1095 size = 0;
1096 } /* end if */
1097
1098 /* Set the size of the file (from library's perspective) */
1099 file->eof = H5FD_mpi_MPIOff_to_haddr(size);
1100
1101 /* Set return value */
1102 ret_value=(H5FD_t*)file;
1103
1104 done:
1105 if(ret_value==NULL) {
1106 if(file_opened)
1107 MPI_File_close(&fh);
1108 if (MPI_COMM_NULL != comm_dup)
1109 MPI_Comm_free(&comm_dup);
1110 if (MPI_INFO_NULL != info_dup)
1111 MPI_Info_free(&info_dup);
1112 if (file)
1113 H5MM_xfree(file);
1114 } /* end if */
1115
1116 #ifdef H5FDmpio_DEBUG
1117 if (H5FD_mpio_Debug[(int)'t'])
1118 fprintf(stdout, "Leaving H5FD_mpio_open\n" );
1119 #endif
1120 FUNC_LEAVE_NOAPI(ret_value)
1121 }
1122
1123
1124 /*-------------------------------------------------------------------------
1125 * Function: H5FD_mpio_close
1126 *
1127 * Purpose: Closes a file. This is collective.
1128 *
1129 * Return: Success: Non-negative
1130 *
1131 * Failure: Negative
1132 *
1133 * Programmer: Unknown
1134 * January 30, 1998
1135 *
1136 * Modifications:
1137 * Robb Matzke, 1998-02-18
1138 * Added the ACCESS_PARMS argument.
1139 *
1140 * Robb Matzke, 1999-08-06
1141 * Modified to work with the virtual file layer.
1142 *
1143 * Albert Cheng, 2003-04-17
1144 * Free the communicator stored.
1145 *-------------------------------------------------------------------------
1146 */
1147 static herr_t
H5FD_mpio_close(H5FD_t * _file)1148 H5FD_mpio_close(H5FD_t *_file)
1149 {
1150 H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
1151 int mpi_code; /* MPI return code */
1152 herr_t ret_value=SUCCEED; /* Return value */
1153
1154 FUNC_ENTER_NOAPI_NOINIT
1155
1156 #ifdef H5FDmpio_DEBUG
1157 if (H5FD_mpio_Debug[(int)'t'])
1158 fprintf(stdout, "Entering H5FD_mpio_close\n");
1159 #endif
1160 HDassert(file);
1161 HDassert(H5FD_MPIO==file->pub.driver_id);
1162
1163 /* MPI_File_close sets argument to MPI_FILE_NULL */
1164 if (MPI_SUCCESS != (mpi_code=MPI_File_close(&(file->f)/*in,out*/)))
1165 HMPI_GOTO_ERROR(FAIL, "MPI_File_close failed", mpi_code)
1166
1167 /* Clean up other stuff */
1168 H5FD_mpi_comm_info_free(&file->comm, &file->info);
1169 H5MM_xfree(file);
1170
1171 done:
1172 #ifdef H5FDmpio_DEBUG
1173 if (H5FD_mpio_Debug[(int)'t'])
1174 fprintf(stdout, "Leaving H5FD_mpio_close\n");
1175 #endif
1176 FUNC_LEAVE_NOAPI(ret_value)
1177 }
1178
1179
1180 /*-------------------------------------------------------------------------
1181 * Function: H5FD_mpio_query
1182 *
1183 * Purpose: Set the flags that this VFL driver is capable of supporting.
1184 * (listed in H5FDpublic.h)
1185 *
1186 * Return: Success: non-negative
1187 *
1188 * Failure: negative
1189 *
1190 * Programmer: Quincey Koziol
1191 * Friday, August 25, 2000
1192 *
1193 * Modifications:
1194 *
1195 * John Mainzer -- 9/21/05
1196 * Modified code to turn off the
1197 * H5FD_FEAT_ACCUMULATE_METADATA_WRITE flag.
1198 * With the movement of
1199 * all cache writes to process 0, this flag has become
1200 * problematic in PHDF5.
1201 *
1202 *-------------------------------------------------------------------------
1203 */
1204 static herr_t
H5FD_mpio_query(const H5FD_t H5_ATTR_UNUSED * _file,unsigned long * flags)1205 H5FD_mpio_query(const H5FD_t H5_ATTR_UNUSED *_file, unsigned long *flags /* out */)
1206 {
1207 FUNC_ENTER_NOAPI_NOINIT_NOERR
1208
1209 /* Set the VFL feature flags that this driver supports */
1210 if(flags) {
1211 *flags=0;
1212 *flags|=H5FD_FEAT_AGGREGATE_METADATA; /* OK to aggregate metadata allocations */
1213 *flags|=H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */
1214 *flags|=H5FD_FEAT_HAS_MPI; /* This driver uses MPI */
1215 *flags|=H5FD_FEAT_ALLOCATE_EARLY; /* Allocate space early instead of late */
1216 } /* end if */
1217
1218 FUNC_LEAVE_NOAPI(SUCCEED)
1219 }
1220
1221
1222 /*-------------------------------------------------------------------------
1223 * Function: H5FD_mpio_get_eoa
1224 *
1225 * Purpose: Gets the end-of-address marker for the file. The EOA marker
1226 * is the first address past the last byte allocated in the
1227 * format address space.
1228 *
1229 * Return: Success: The end-of-address marker.
1230 *
1231 * Failure: HADDR_UNDEF
1232 *
1233 * Programmer: Robb Matzke
1234 * Friday, August 6, 1999
1235 *
1236 * Modifications:
1237 * Raymond Lu
1238 * 21 Dec. 2006
1239 * Added the parameter TYPE. It's only used for MULTI driver.
1240 *
1241 *-------------------------------------------------------------------------
1242 */
1243 static haddr_t
H5FD_mpio_get_eoa(const H5FD_t * _file,H5FD_mem_t H5_ATTR_UNUSED type)1244 H5FD_mpio_get_eoa(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type)
1245 {
1246 const H5FD_mpio_t *file = (const H5FD_mpio_t*)_file;
1247
1248 FUNC_ENTER_NOAPI_NOINIT_NOERR
1249
1250 HDassert(file);
1251 HDassert(H5FD_MPIO==file->pub.driver_id);
1252
1253 FUNC_LEAVE_NOAPI(file->eoa)
1254 }
1255
1256
1257 /*-------------------------------------------------------------------------
1258 * Function: H5FD_mpio_set_eoa
1259 *
1260 * Purpose: Set the end-of-address marker for the file. This function is
1261 * called shortly after an existing HDF5 file is opened in order
1262 * to tell the driver where the end of the HDF5 data is located.
1263 *
1264 * Return: Success: 0
1265 *
1266 * Failure: -1
1267 *
1268 * Programmer: Robb Matzke
1269 * Friday, August 6, 1999
1270 *
1271 * Modifications:
1272 * Raymond Lu
1273 * 21 Dec. 2006
1274 * Added the parameter TYPE. It's only used for MULTI driver.
1275 *
1276 *-------------------------------------------------------------------------
1277 */
1278 static herr_t
H5FD_mpio_set_eoa(H5FD_t * _file,H5FD_mem_t H5_ATTR_UNUSED type,haddr_t addr)1279 H5FD_mpio_set_eoa(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, haddr_t addr)
1280 {
1281 H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
1282
1283 FUNC_ENTER_NOAPI_NOINIT_NOERR
1284
1285 HDassert(file);
1286 HDassert(H5FD_MPIO==file->pub.driver_id);
1287
1288 file->eoa = addr;
1289
1290 FUNC_LEAVE_NOAPI(SUCCEED)
1291 }
1292
1293
1294 /*-------------------------------------------------------------------------
1295 * Function: H5FD_mpio_get_eof
1296 *
1297 * Purpose: Gets the end-of-file marker for the file. The EOF marker
1298 * is the real size of the file.
1299 *
1300 * The MPIO driver doesn't bother keeping this field updated
1301 * since that's a relatively expensive operation. Fortunately
1302 * the library only needs the EOF just after the file is opened
1303 * in order to determine whether the file is empty, truncated,
1304 * or okay. Therefore, any MPIO I/O function will set its value
1305 * to HADDR_UNDEF which is the error return value of this
1306 * function.
1307 *
1308 * Keeping the EOF updated (during write calls) is expensive
1309 * because any process may extend the physical end of the
1310 * file. -QAK
1311 *
1312 * Return: Success: The end-of-address marker.
1313 *
1314 * Failure: HADDR_UNDEF
1315 *
1316 * Programmer: Robb Matzke
1317 * Friday, August 6, 1999
1318 *
1319 * Modifications:
1320 *
1321 *-------------------------------------------------------------------------
1322 */
1323 static haddr_t
H5FD_mpio_get_eof(const H5FD_t * _file)1324 H5FD_mpio_get_eof(const H5FD_t *_file)
1325 {
1326 const H5FD_mpio_t *file = (const H5FD_mpio_t*)_file;
1327
1328 FUNC_ENTER_NOAPI_NOINIT_NOERR
1329
1330 HDassert(file);
1331 HDassert(H5FD_MPIO==file->pub.driver_id);
1332
1333 FUNC_LEAVE_NOAPI(file->eof)
1334 }
1335
1336
1337 /*-------------------------------------------------------------------------
1338 * Function: H5FD_mpio_get_handle
1339 *
1340 * Purpose: Returns the file handle of MPIO file driver.
1341 *
1342 * Returns: Non-negative if succeed or negative if fails.
1343 *
1344 * Programmer: Raymond Lu
1345 * Sept. 16, 2002
1346 *
1347 * Modifications:
1348 *
1349 *-------------------------------------------------------------------------
1350 */
1351 static herr_t
H5FD_mpio_get_handle(H5FD_t * _file,hid_t H5_ATTR_UNUSED fapl,void ** file_handle)1352 H5FD_mpio_get_handle(H5FD_t *_file, hid_t H5_ATTR_UNUSED fapl, void** file_handle)
1353 {
1354 H5FD_mpio_t *file = (H5FD_mpio_t *)_file;
1355 herr_t ret_value = SUCCEED;
1356
1357 FUNC_ENTER_NOAPI_NOINIT
1358
1359 if(!file_handle)
1360 HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file handle not valid")
1361
1362 *file_handle = &(file->f);
1363
1364 done:
1365 FUNC_LEAVE_NOAPI(ret_value)
1366 }
1367
1368
1369 /*-------------------------------------------------------------------------
1370 * Function: H5FD_mpio_read
1371 *
1372 * Purpose: Reads SIZE bytes of data from FILE beginning at address ADDR
1373 * into buffer BUF according to data transfer properties in
1374 * DXPL_ID using potentially complex file and buffer types to
1375 * effect the transfer.
1376 *
1377 * Reading past the end of the MPI file returns zeros instead of
1378 * failing. MPI is able to coalesce requests from different
1379 * processes (collective or independent).
1380 *
1381 * Return: Success: Zero. Result is stored in caller-supplied
1382 * buffer BUF.
1383 *
1384 * Failure: -1, Contents of buffer BUF are undefined.
1385 *
1386 * Programmer: rky, 1998-01-30
1387 *
1388 * Modifications:
1389 * Robb Matzke, 1998-02-18
1390 * Added the ACCESS_PARMS argument.
1391 *
1392 * rky, 1998-04-10
1393 * Call independent or collective MPI read, based on
1394 * ACCESS_PARMS.
1395 *
1396 * Albert Cheng, 1998-06-01
1397 * Added XFER_MODE to control independent or collective MPI
1398 * read.
1399 *
1400 * rky, 1998-08-16
1401 * Use BTYPE, FTYPE, and DISP from access parms. The guts of
1402 * H5FD_mpio_read and H5FD_mpio_write should be replaced by a
1403 * single dual-purpose routine.
1404 *
1405 * Robb Matzke, 1999-04-21
1406 * Changed XFER_MODE to XFER_PARMS for all H5F_*_read()
1407 * callbacks.
1408 *
1409 * Robb Matzke, 1999-07-28
1410 * The ADDR argument is passed by value.
1411 *
1412 * Robb Matzke, 1999-08-06
1413 * Modified to work with the virtual file layer.
1414 *
1415 * Quincey Koziol, 2002-05-14
1416 * Only call MPI_Get_count if we can use MPI_BYTE for the MPI type
1417 * for the I/O transfer. Someday we might include code to decode
1418 * the MPI type used for more complicated transfers and call
1419 * MPI_Get_count all the time.
1420 *
1421 * Quincey Koziol - 2002/06/17
1422 * Removed 'disp' parameter from H5FD_mpio_setup routine and use
1423 * the address of the dataset in MPI_File_set_view() calls, as
1424 * necessary.
1425 *
1426 * Quincey Koziol - 2002/06/24
1427 * Removed "lazy" MPI_File_set_view() calls, since they would fail
1428 * if the first I/O was a collective I/O using MPI derived types
1429 * and the next I/O was an independent I/O.
1430 *
1431 * Quincey Koziol - 2003/10/22-31
1432 * Restructured code massively, straightening out logic and finally
1433 * getting the bytes_read stuff working.
1434 *
1435 *-------------------------------------------------------------------------
1436 */
1437 static herr_t
H5FD_mpio_read(H5FD_t * _file,H5FD_mem_t H5_ATTR_UNUSED type,hid_t dxpl_id,haddr_t addr,size_t size,void * buf)1438 H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t dxpl_id, haddr_t addr, size_t size,
1439 void *buf/*out*/)
1440 {
1441 H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
1442 MPI_Offset mpi_off;
1443 MPI_Status mpi_stat; /* Status from I/O operation */
1444 int mpi_code; /* mpi return code */
1445 MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */
1446 int size_i; /* Integer copy of 'size' to read */
1447 int bytes_read; /* Number of bytes read in */
1448 int n;
1449 int type_size; /* MPI datatype used for I/O's size */
1450 int io_size; /* Actual number of bytes requested */
1451 H5P_genplist_t *plist = NULL; /* Property list pointer */
1452 hbool_t use_view_this_time = FALSE;
1453 herr_t ret_value = SUCCEED;
1454
1455 FUNC_ENTER_NOAPI_NOINIT
1456
1457 #ifdef H5FDmpio_DEBUG
1458 if (H5FD_mpio_Debug[(int)'t'])
1459 fprintf(stdout, "Entering H5FD_mpio_read\n" );
1460 #endif
1461 HDassert(file);
1462 HDassert(H5FD_MPIO==file->pub.driver_id);
1463 /* Make certain we have the correct type of property list */
1464 HDassert(H5I_GENPROP_LST==H5I_get_type(dxpl_id));
1465 HDassert(TRUE==H5P_isa_class(dxpl_id,H5P_DATASET_XFER));
1466 HDassert(buf);
1467
1468 /* Portably initialize MPI status variable */
1469 HDmemset(&mpi_stat,0,sizeof(MPI_Status));
1470
1471 /* some numeric conversions */
1472 if (H5FD_mpi_haddr_to_MPIOff(addr, &mpi_off/*out*/)<0)
1473 HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off")
1474 size_i = (int)size;
1475 if ((hsize_t)size_i != size)
1476 HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from size to size_i")
1477
1478 #ifdef H5FDmpio_DEBUG
1479 if (H5FD_mpio_Debug[(int)'r'])
1480 fprintf(stdout, "in H5FD_mpio_read mpi_off=%ld size_i=%d\n",
1481 (long)mpi_off, size_i );
1482 #endif
1483
1484 /* Only look for MPI views for raw data transfers */
1485 if(type==H5FD_MEM_DRAW) {
1486 H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
1487
1488 /* Obtain the data transfer properties */
1489 if(NULL == (plist = (H5P_genplist_t *)H5I_object(dxpl_id)))
1490 HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
1491
1492 /* get the transfer mode from the dxpl */
1493 if(H5P_get(plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode)<0)
1494 HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode")
1495
1496 /*
1497 * Set up for a fancy xfer using complex types, or single byte block. We
1498 * wouldn't need to rely on the use_view field if MPI semantics allowed
1499 * us to test that btype=ftype=MPI_BYTE (or even MPI_TYPE_NULL, which
1500 * could mean "use MPI_BYTE" by convention).
1501 */
1502 if(xfer_mode==H5FD_MPIO_COLLECTIVE) {
1503 MPI_Datatype file_type;
1504
1505 /* Remember that views are used */
1506 use_view_this_time = TRUE;
1507
1508 /* prepare for a full-blown xfer using btype, ftype, and disp */
1509 if(H5P_get(plist, H5FD_MPI_XFER_MEM_MPI_TYPE_NAME, &buf_type)<0)
1510 HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O type property")
1511 if(H5P_get(plist, H5FD_MPI_XFER_FILE_MPI_TYPE_NAME, &file_type)<0)
1512 HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O type property")
1513
1514 /*
1515 * Set the file view when we are using MPI derived types
1516 */
1517 if (MPI_SUCCESS != (mpi_code=MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type, H5FD_mpi_native_g, file->info)))
1518 HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
1519
1520 /* When using types, use the address as the displacement for
1521 * MPI_File_set_view and reset the address for the read to zero
1522 */
1523 mpi_off=0;
1524 } /* end if */
1525 } /* end if */
1526
1527 /* Read the data. */
1528 if(use_view_this_time) {
1529 H5FD_mpio_collective_opt_t coll_opt_mode;
1530
1531 #ifdef H5FDmpio_DEBUG
1532 if (H5FD_mpio_Debug[(int)'t'])
1533 fprintf(stdout, "H5FD_mpio_read: using MPIO collective mode\n");
1534 #endif
1535 /* Get the collective_opt property to check whether the application wants to do IO individually. */
1536 HDassert(plist);
1537
1538 /* get the transfer mode from the dxpl */
1539 if(H5P_get(plist, H5D_XFER_MPIO_COLLECTIVE_OPT_NAME, &coll_opt_mode)<0)
1540 HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property")
1541
1542 if(coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
1543 #ifdef H5FDmpio_DEBUG
1544 if(H5FD_mpio_Debug[(int)'t'])
1545 fprintf(stdout, "H5FD_mpio_read: doing MPI collective IO\n");
1546 #endif
1547 if(MPI_SUCCESS != (mpi_code = MPI_File_read_at_all(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
1548 HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code)
1549 } /* end if */
1550 else {
1551 #ifdef H5FDmpio_DEBUG
1552 if(H5FD_mpio_Debug[(int)'t'])
1553 fprintf(stdout, "H5FD_mpio_read: doing MPI independent IO\n");
1554 #endif
1555
1556 if(MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
1557 HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
1558 } /* end else */
1559
1560 /*
1561 * Reset the file view when we used MPI derived types
1562 */
1563 if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, H5FD_mpi_native_g, file->info)))
1564 HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
1565 } else {
1566 if(MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
1567 HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
1568 }
1569
1570 /* How many bytes were actually read? */
1571 /* [This works because the "basic elements" we use for all our MPI derived
1572 * types are MPI_BYTE. We should be using the 'buf_type' for the MPI
1573 * datatype in this call though... (We aren't because using it causes
1574 * the LANL "qsc" machine to dump core - 12/19/03) - QAK]
1575 */
1576 if (MPI_SUCCESS != (mpi_code=MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read)))
1577 HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
1578
1579 /* Get the type's size */
1580 if (MPI_SUCCESS != (mpi_code=MPI_Type_size(buf_type,&type_size)))
1581 HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code)
1582
1583 /* Compute the actual number of bytes requested */
1584 io_size=type_size*size_i;
1585
1586 /* Check for read failure */
1587 if (bytes_read<0 || bytes_read>io_size)
1588 HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed")
1589
1590 /*
1591 * This gives us zeroes beyond end of physical MPI file.
1592 */
1593 if ((n=(io_size-bytes_read)) > 0)
1594 HDmemset((char*)buf+bytes_read, 0, (size_t)n);
1595
1596 done:
1597 #ifdef H5FDmpio_DEBUG
1598 if (H5FD_mpio_Debug[(int)'t'])
1599 fprintf(stdout, "Leaving H5FD_mpio_read\n" );
1600 #endif
1601
1602 FUNC_LEAVE_NOAPI(ret_value)
1603 }
1604
1605
1606 /*-------------------------------------------------------------------------
1607 * Function: H5FD_mpio_write
1608 *
1609 * Purpose: Writes SIZE bytes of data to FILE beginning at address ADDR
1610 * from buffer BUF according to data transfer properties in
1611 * DXPL_ID using potentially complex file and buffer types to
1612 * effect the transfer.
1613 *
1614 * MPI is able to coalesce requests from different processes
1615 * (collective and independent).
1616 *
1617 * Return: Success: Zero. USE_TYPES and OLD_USE_TYPES in the
1618 * access params are altered.
1619 *
1620 * Failure: -1, USE_TYPES and OLD_USE_TYPES in the
1621 * access params may be altered.
1622 *
1623 * Programmer: Unknown
1624 * January 30, 1998
1625 *
1626 * Modifications:
1627 * rky, 1998-08-28
1628 * If the file->allsame flag is set, we assume that all the
1629 * procs in the relevant MPI communicator will write identical
1630 * data at identical offsets in the file, so only proc 0 will
1631 * write, and all other procs will wait for p0 to finish. This
1632 * is useful for writing metadata, for example. Note that we
1633 * don't _check_ that the data is identical. Also, the mechanism
1634 * we use to eliminate the redundant writes is by requiring a
1635 * call to H5FD_mpio_tas_allsame before the write, which is
1636 * rather klugey. Would it be better to pass a parameter to
1637 * low-level writes like H5F_block_write and H5F_low_write,
1638 * instead? Or...??? Also, when I created this mechanism I
1639 * wanted to minimize the difference in behavior between the old
1640 * way of doing things (i.e., all procs write) and the new way,
1641 * so the writes are eliminated at the very lowest level, here
1642 * in H5FD_mpio_write. It may be better to rethink that, and
1643 * short-circuit the writes at a higher level (e.g., at the
1644 * points in the code where H5FD_mpio_tas_allsame is called).
1645 *
1646 *
1647 * Robb Matzke, 1998-02-18
1648 * Added the ACCESS_PARMS argument.
1649 *
1650 * rky, 1998-04-10
1651 * Call independent or collective MPI write, based on
1652 * ACCESS_PARMS.
1653 *
1654 * rky, 1998-04-24
1655 * Removed redundant write from H5FD_mpio_write.
1656 *
1657 * Albert Cheng, 1998-06-01
1658 * Added XFER_MODE to control independent or collective MPI
1659 * write.
1660 *
1661 * rky, 1998-08-16
1662 * Use BTYPE, FTYPE, and DISP from access parms. The guts of
1663 * H5FD_mpio_read and H5FD_mpio_write should be replaced by a
1664 * single dual-purpose routine.
1665 *
1666 * rky, 1998-08-28
1667 * Added ALLSAME parameter to make all but proc 0 skip the
1668 * actual write.
1669 *
1670 * Robb Matzke, 1999-04-21
1671 * Changed XFER_MODE to XFER_PARMS for all H5FD_*_write()
1672 * callbacks.
1673 *
1674 * Robb Matzke, 1999-07-28
1675 * The ADDR argument is passed by value.
1676 *
1677 * Robb Matzke, 1999-08-06
1678 * Modified to work with the virtual file layer.
1679 *
1680 * Albert Cheng, 1999-12-19
1681 * When only-p0-write-allsame-data, p0 Bcasts the
1682 * ret_value to other processes. This prevents
1683 * a racing condition (that other processes try to
1684 * read the file before p0 finishes writing) and also
1685 * allows all processes to report the same ret_value.
1686 *
1687 * Kim Yates, Pat Weidhaas, 2000-09-26
1688 * Move block of coding where only p0 writes after the
1689 * MPI_File_set_view call.
1690 *
1691 * Quincey Koziol, 2002-05-10
1692 * Instead of always writing metadata from process 0, spread the
1693 * burden among all the processes by using a round-robin rotation
1694 * scheme.
1695 *
1696 * Quincey Koziol, 2002-05-10
1697 * Removed allsame code, keying off the type parameter instead.
1698 *
1699 * Quincey Koziol, 2002-05-14
1700 * Only call MPI_Get_count if we can use MPI_BYTE for the MPI type
1701 * for the I/O transfer. Someday we might include code to decode
1702 * the MPI type used for more complicated transfers and call
1703 * MPI_Get_count all the time.
1704 *
1705 * Quincey Koziol - 2002/06/17
1706 * Removed 'disp' parameter from H5FD_mpio_setup routine and use
1707 * the address of the dataset in MPI_File_set_view() calls, as
1708 * necessary.
1709 *
1710 * Quincey Koziol - 2002/06/24
1711 * Removed "lazy" MPI_File_set_view() calls, since they would fail
1712 * if the first I/O was a collective I/O using MPI derived types
1713 * and the next I/O was an independent I/O.
1714 *
1715 * Quincey Koziol - 2002/07/18
1716 * Added "block_before_meta_write" dataset transfer flag, which
1717 * is set during writes from a metadata cache flush and indicates
1718 * that all the processes must sync up before (one of them)
1719 * writing metadata.
1720 *
1721 * Quincey Koziol - 2003/10/22-31
1722 * Restructured code massively, straightening out logic and finally
1723 * getting the bytes_written stuff working.
1724 *
1725 *-------------------------------------------------------------------------
1726 */
1727 static herr_t
H5FD_mpio_write(H5FD_t * _file,H5FD_mem_t type,hid_t dxpl_id,haddr_t addr,size_t size,const void * buf)1728 H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
1729 size_t size, const void *buf)
1730 {
1731 H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
1732 MPI_Offset mpi_off;
1733 MPI_Status mpi_stat; /* Status from I/O operation */
1734 MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */
1735 int mpi_code; /* MPI return code */
1736 int size_i, bytes_written;
1737 int type_size; /* MPI datatype used for I/O's size */
1738 int io_size; /* Actual number of bytes requested */
1739 hbool_t use_view_this_time = FALSE;
1740 H5P_genplist_t *plist = NULL; /* Property list pointer */
1741 herr_t ret_value = SUCCEED;
1742
1743 FUNC_ENTER_NOAPI_NOINIT
1744
1745 #ifdef H5FDmpio_DEBUG
1746 if (H5FD_mpio_Debug[(int)'t'])
1747 fprintf(stdout, "Entering H5FD_mpio_write\n" );
1748 #endif
1749 HDassert(file);
1750 HDassert(H5FD_MPIO==file->pub.driver_id);
1751 /* Make certain we have the correct type of property list */
1752 HDassert(H5I_GENPROP_LST==H5I_get_type(dxpl_id));
1753 HDassert(TRUE==H5P_isa_class(dxpl_id,H5P_DATASET_XFER));
1754 HDassert(buf);
1755
1756 /* Portably initialize MPI status variable */
1757 HDmemset(&mpi_stat, 0, sizeof(MPI_Status));
1758
1759 /* some numeric conversions */
1760 if(H5FD_mpi_haddr_to_MPIOff(addr, &mpi_off) < 0)
1761 HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off")
1762 size_i = (int)size;
1763 if((hsize_t)size_i != size)
1764 HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from size to size_i")
1765
1766 #ifdef H5FDmpio_DEBUG
1767 if(H5FD_mpio_Debug[(int)'w'])
1768 fprintf(stdout, "in H5FD_mpio_write mpi_off=%ld size_i=%d\n", (long)mpi_off, size_i);
1769 #endif
1770
1771 if(type == H5FD_MEM_DRAW) {
1772 H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
1773
1774 /* Obtain the data transfer properties */
1775 if(NULL == (plist = (H5P_genplist_t *)H5I_object(dxpl_id)))
1776 HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
1777
1778 /* get the transfer mode from the dxpl */
1779 if(H5P_get(plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode)<0)
1780 HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode")
1781
1782 /*
1783 * Set up for a fancy xfer using complex types, or single byte block. We
1784 * wouldn't need to rely on the use_view field if MPI semantics allowed
1785 * us to test that btype=ftype=MPI_BYTE (or even MPI_TYPE_NULL, which
1786 * could mean "use MPI_BYTE" by convention).
1787 */
1788 if(xfer_mode == H5FD_MPIO_COLLECTIVE) {
1789 MPI_Datatype file_type;
1790
1791 /* Remember that views are used */
1792 use_view_this_time = TRUE;
1793
1794 /* prepare for a full-blown xfer using btype, ftype, and disp */
1795 if(H5P_get(plist, H5FD_MPI_XFER_MEM_MPI_TYPE_NAME, &buf_type) < 0)
1796 HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O type property")
1797 if(H5P_get(plist, H5FD_MPI_XFER_FILE_MPI_TYPE_NAME, &file_type) < 0)
1798 HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O type property")
1799
1800 /*
1801 * Set the file view when we are using MPI derived types
1802 */
1803 if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type, H5FD_mpi_native_g, file->info)))
1804 HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
1805
1806 /* When using types, use the address as the displacement for
1807 * MPI_File_set_view and reset the address for the read to zero
1808 */
1809 mpi_off = 0;
1810 } /* end if */
1811 } /* end if */
1812 else {
1813 #if 0 /* JRM -- 3/23/10 */ /* this is no longer always the case */
1814 /* Only one process can do the actual metadata write */
1815 if(file->mpi_rank != H5_PAR_META_WRITE)
1816 #ifdef LATER
1817 HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "can't write metadata from non-zero rank")
1818 #else /* LATER */
1819 HGOTO_DONE(SUCCEED) /* skip the actual write */
1820 #endif /* LATER */
1821 #endif /* JRM */
1822 } /* end if */
1823
1824 /* Write the data. */
1825 if(use_view_this_time) {
1826 H5FD_mpio_collective_opt_t coll_opt_mode;
1827
1828 #ifdef H5FDmpio_DEBUG
1829 if(H5FD_mpio_Debug[(int)'t'])
1830 fprintf(stdout, "H5FD_mpio_write: using MPIO collective mode\n");
1831 #endif
1832 /* Get the collective_opt property to check whether the application wants to do IO individually. */
1833 HDassert(plist);
1834 /* get the transfer mode from the dxpl */
1835 if(H5P_get(plist, H5D_XFER_MPIO_COLLECTIVE_OPT_NAME, &coll_opt_mode)<0)
1836 HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property")
1837
1838 if(coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
1839 #ifdef H5FDmpio_DEBUG
1840 if(H5FD_mpio_Debug[(int)'t'])
1841 fprintf(stdout, "H5FD_mpio_write: doing MPI collective IO\n");
1842 #endif
1843 if(MPI_SUCCESS != (mpi_code = MPI_File_write_at_all(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
1844 HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code)
1845 } /* end if */
1846 else {
1847 #ifdef H5FDmpio_DEBUG
1848 if(H5FD_mpio_Debug[(int)'t'])
1849 fprintf(stdout, "H5FD_mpio_write: doing MPI independent IO\n");
1850 #endif
1851 if(MPI_SUCCESS != (mpi_code = MPI_File_write_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
1852 HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code)
1853 } /* end else */
1854
1855 /* Reset the file view when we used MPI derived types */
1856 if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, H5FD_mpi_native_g, file->info)))
1857 HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
1858 } else {
1859 if(MPI_SUCCESS != (mpi_code = MPI_File_write_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
1860 HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code)
1861 }
1862
1863 /* How many bytes were actually written? */
1864 /* [This works because the "basic elements" we use for all our MPI derived
1865 * types are MPI_BYTE. We should be using the 'buf_type' for the MPI
1866 * datatype in this call though... (We aren't because using it causes
1867 * the LANL "qsc" machine to dump core - 12/19/03) - QAK]
1868 */
1869 if(MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_written)))
1870 HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
1871
1872 /* Get the type's size */
1873 if(MPI_SUCCESS != (mpi_code = MPI_Type_size(buf_type, &type_size)))
1874 HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code)
1875
1876 /* Compute the actual number of bytes requested */
1877 io_size = type_size * size_i;
1878
1879 /* Check for write failure */
1880 if(bytes_written != io_size)
1881 HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "file write failed")
1882
1883 /* Forget the EOF value (see H5FD_mpio_get_eof()) --rpm 1999-08-06 */
1884 file->eof = HADDR_UNDEF;
1885
1886 done:
1887 #ifdef H5FDmpio_DEBUG
1888 if(H5FD_mpio_Debug[(int)'t'])
1889 fprintf(stdout, "proc %d: Leaving H5FD_mpio_write with ret_value=%d\n",
1890 file->mpi_rank, ret_value );
1891 #endif
1892 FUNC_LEAVE_NOAPI(ret_value)
1893 } /* end H5FD_mpio_write() */
1894
1895
1896 /*-------------------------------------------------------------------------
1897 * Function: H5FD_mpio_flush
1898 *
1899 * Purpose: Makes sure that all data is on disk. This is collective.
1900 *
1901 * Return: Success: Non-negative
1902 *
1903 * Failure: Negative
1904 *
1905 * Programmer: Robb Matzke
1906 * January 30, 1998
1907 *
1908 *-------------------------------------------------------------------------
1909 */
1910 static herr_t
H5FD_mpio_flush(H5FD_t * _file,hid_t H5_ATTR_UNUSED dxpl_id,unsigned closing)1911 H5FD_mpio_flush(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, unsigned closing)
1912 {
1913 H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
1914 int mpi_code; /* mpi return code */
1915 herr_t ret_value = SUCCEED;
1916
1917 FUNC_ENTER_NOAPI_NOINIT
1918
1919 #ifdef H5FDmpio_DEBUG
1920 if(H5FD_mpio_Debug[(int)'t'])
1921 HDfprintf(stdout, "Entering %s\n", FUNC);
1922 #endif
1923 HDassert(file);
1924 HDassert(H5FD_MPIO == file->pub.driver_id);
1925
1926 /* Only sync the file if we are not going to immediately close it */
1927 if(!closing) {
1928 if(MPI_SUCCESS != (mpi_code = MPI_File_sync(file->f)))
1929 HMPI_GOTO_ERROR(FAIL, "MPI_File_sync failed", mpi_code)
1930 } /* end if */
1931
1932 done:
1933 #ifdef H5FDmpio_DEBUG
1934 if(H5FD_mpio_Debug[(int)'t'])
1935 HDfprintf(stdout, "Leaving %s\n", FUNC);
1936 #endif
1937
1938 FUNC_LEAVE_NOAPI(ret_value)
1939 } /* end H5FD_mpio_flush() */
1940
1941
1942 /*-------------------------------------------------------------------------
1943 * Function: H5FD_mpio_truncate
1944 *
1945 * Purpose: Make certain the file's size matches it's allocated size
1946 *
1947 * Return: Success: Non-negative
1948 * Failure: Negative
1949 *
1950 * Programmer: Quincey Koziol
1951 * January 31, 2008
1952 *
1953 *-------------------------------------------------------------------------
1954 */
1955 static herr_t
H5FD_mpio_truncate(H5FD_t * _file,hid_t H5_ATTR_UNUSED dxpl_id,hbool_t H5_ATTR_UNUSED closing)1956 H5FD_mpio_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5_ATTR_UNUSED closing)
1957 {
1958 H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
1959 herr_t ret_value = SUCCEED;
1960
1961 FUNC_ENTER_NOAPI_NOINIT
1962
1963 #ifdef H5FDmpio_DEBUG
1964 if(H5FD_mpio_Debug[(int)'t'])
1965 HDfprintf(stdout, "Entering %s\n", FUNC);
1966 #endif
1967 HDassert(file);
1968 HDassert(H5FD_MPIO == file->pub.driver_id);
1969
1970 /* Extend the file to make sure it's large enough, then sync.
1971 * Unfortunately, keeping track of EOF is an expensive operation, so
1972 * we can't just check whether EOF<EOA like with other drivers.
1973 * Therefore we'll just read the byte at EOA-1 and then write it back. */
1974 if(file->eoa > file->last_eoa) {
1975 int mpi_code; /* mpi return code */
1976 MPI_Offset mpi_off;
1977
1978 if(H5FD_mpi_haddr_to_MPIOff(file->eoa, &mpi_off) < 0)
1979 HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "cannot convert from haddr_t to MPI_Offset")
1980
1981 /* Extend the file's size */
1982 if(MPI_SUCCESS != (mpi_code = MPI_File_set_size(file->f, mpi_off)))
1983 HMPI_GOTO_ERROR(FAIL, "MPI_File_set_size failed", mpi_code)
1984
1985 /* Don't let any proc return until all have extended the file.
1986 * (Prevents race condition where some processes go ahead and write
1987 * more data to the file before all the processes have finished making
1988 * it the shorter length, potentially truncating the file and dropping
1989 * the new data written)
1990 */
1991 if(MPI_SUCCESS != (mpi_code = MPI_Barrier(file->comm)))
1992 HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
1993
1994 /* Update the 'last' eoa value */
1995 file->last_eoa = file->eoa;
1996 } /* end if */
1997
1998 done:
1999 #ifdef H5FDmpio_DEBUG
2000 if(H5FD_mpio_Debug[(int)'t'])
2001 HDfprintf(stdout, "Leaving %s\n", FUNC);
2002 #endif
2003
2004 FUNC_LEAVE_NOAPI(ret_value)
2005 } /* end H5FD_mpio_truncate() */
2006
2007
2008 /*-------------------------------------------------------------------------
2009 * Function: H5FD_mpio_mpi_rank
2010 *
2011 * Purpose: Returns the MPI rank for a process
2012 *
2013 * Return: Success: non-negative
2014 * Failure: negative
2015 *
2016 * Programmer: Quincey Koziol
2017 * Thursday, May 16, 2002
2018 *
2019 * Modifications:
2020 *
2021 *-------------------------------------------------------------------------
2022 */
2023 static int
H5FD_mpio_mpi_rank(const H5FD_t * _file)2024 H5FD_mpio_mpi_rank(const H5FD_t *_file)
2025 {
2026 const H5FD_mpio_t *file = (const H5FD_mpio_t*)_file;
2027
2028 FUNC_ENTER_NOAPI_NOINIT_NOERR
2029
2030 HDassert(file);
2031 HDassert(H5FD_MPIO==file->pub.driver_id);
2032
2033 FUNC_LEAVE_NOAPI(file->mpi_rank)
2034 } /* end H5FD_mpio_mpi_rank() */
2035
2036
2037 /*-------------------------------------------------------------------------
2038 * Function: H5FD_mpio_mpi_size
2039 *
2040 * Purpose: Returns the number of MPI processes
2041 *
2042 * Return: Success: non-negative
2043 * Failure: negative
2044 *
2045 * Programmer: Quincey Koziol
2046 * Thursday, May 16, 2002
2047 *
2048 * Modifications:
2049 *
2050 *-------------------------------------------------------------------------
2051 */
2052 static int
H5FD_mpio_mpi_size(const H5FD_t * _file)2053 H5FD_mpio_mpi_size(const H5FD_t *_file)
2054 {
2055 const H5FD_mpio_t *file = (const H5FD_mpio_t*)_file;
2056
2057 FUNC_ENTER_NOAPI_NOINIT_NOERR
2058
2059 HDassert(file);
2060 HDassert(H5FD_MPIO==file->pub.driver_id);
2061
2062 FUNC_LEAVE_NOAPI(file->mpi_size)
2063 } /* end H5FD_mpio_mpi_size() */
2064
2065
2066 /*-------------------------------------------------------------------------
2067 * Function: H5FD_mpio_communicator
2068 *
2069 * Purpose: Returns the MPI communicator for the file.
2070 *
2071 * Return: Success: The communicator
2072 *
2073 * Failure: NULL
2074 *
2075 * Programmer: Robb Matzke
2076 * Monday, August 9, 1999
2077 *
2078 * Modifications:
2079 *
2080 *-------------------------------------------------------------------------
2081 */
2082 static MPI_Comm
H5FD_mpio_communicator(const H5FD_t * _file)2083 H5FD_mpio_communicator(const H5FD_t *_file)
2084 {
2085 const H5FD_mpio_t *file = (const H5FD_mpio_t*)_file;
2086
2087 FUNC_ENTER_NOAPI_NOINIT_NOERR
2088
2089 HDassert(file);
2090 HDassert(H5FD_MPIO==file->pub.driver_id);
2091
2092 FUNC_LEAVE_NOAPI(file->comm)
2093 }
2094
2095 #endif /* H5_HAVE_PARALLEL */
2096
2097