1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 * Copyright by The HDF Group. *
3 * Copyright by the Board of Trustees of the University of Illinois. *
4 * All rights reserved. *
5 * *
6 * This file is part of HDF5. The full HDF5 copyright notice, including *
7 * terms governing use, modification, and redistribution, is contained in *
8 * the COPYING file, which can be found at the root of the source code *
9 * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
10 * If you do not have access to either file, you may request a copy from *
11 * help@hdfgroup.org. *
12 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
13
14 /*
15 * Programmer: Robb Matzke <matzke@llnl.gov>
16 * Thursday, July 29, 1999
17 *
18 * Purpose: This is the MPI-2 I/O driver.
19 *
20 */
21
22 #include "H5FDdrvr_module.h" /* This source code file is part of the H5FD driver module */
23
24
25 #include "H5private.h" /* Generic Functions */
26 #include "H5CXprivate.h" /* API Contexts */
27 #include "H5Dprivate.h" /* Dataset functions */
28 #include "H5Eprivate.h" /* Error handling */
29 #include "H5Fprivate.h" /* File access */
30 #include "H5FDprivate.h" /* File drivers */
31 #include "H5FDmpi.h" /* MPI-based file drivers */
32 #include "H5Iprivate.h" /* IDs */
33 #include "H5MMprivate.h" /* Memory management */
34 #include "H5Pprivate.h" /* Property lists */
35
36 #ifdef H5_HAVE_PARALLEL
37
38 /*
39 * The driver identification number, initialized at runtime if H5_HAVE_PARALLEL
40 * is defined. This allows applications to still have the H5FD_MPIO
41 * "constants" in their source code.
42 */
/* Driver ID: 0 until H5FD_mpio_init() stores the result of H5FD_register() */
/* here; H5FD_mpio_term() resets it to 0. */
43 static hid_t H5FD_MPIO_g = 0;
44
45 /* Whether to allow collective I/O operations (H5FD_mpio_init() describes */
46 /* this as allowing MPI buf-and-file-type optimizations). Defaults to TRUE; */
/* H5FD_mpio_init() overrides it from the HDF5_MPI_OPT_TYPES environment */
/* variable when that variable is set and begins with a digit. */
47 hbool_t H5FD_mpi_opt_types_g = TRUE;
48
49 /*
50 * The view is set to this value
51 * (NOTE(review): the use site is not in this part of the file; presumably
 * this is the data representation name given when setting the MPI file
 * view -- confirm.)
 */
52 static char H5FD_mpi_native_g[] = "native";
53
54 /*
55 * The description of a file belonging to this driver.
56 * The EOF value is only used just after the file is opened in order for the
57 * library to determine whether the file is empty, truncated, or okay. The MPIO
58 * driver doesn't bother to keep it updated since it's an expensive operation.
59 */
60 typedef struct H5FD_mpio_t {
61 H5FD_t pub; /* Public VFD information; must be the first member */
62 MPI_File f; /* MPI-IO file handle */
63 MPI_Comm comm; /* MPI communicator */
64 MPI_Info info; /* MPI Info object (file information) */
65 int mpi_rank; /* This process's rank */
66 int mpi_size; /* Total number of processes */
67 haddr_t eof; /* End-of-file marker */
68 haddr_t eoa; /* End-of-address marker */
69 haddr_t last_eoa; /* Last known end-of-address marker */
70 haddr_t local_eof; /* Local end-of-file address for each process */
71 } H5FD_mpio_t;
72
73 /* Private Prototypes */
74
75 /* Callbacks */
76 static herr_t H5FD_mpio_term(void);
77 static void *H5FD_mpio_fapl_get(H5FD_t *_file);
78 static void *H5FD_mpio_fapl_copy(const void *_old_fa);
79 static herr_t H5FD_mpio_fapl_free(void *_fa);
80 static H5FD_t *H5FD_mpio_open(const char *name, unsigned flags, hid_t fapl_id,
81 haddr_t maxaddr);
82 static herr_t H5FD_mpio_close(H5FD_t *_file);
83 static herr_t H5FD_mpio_query(const H5FD_t *_f1, unsigned long *flags);
84 static haddr_t H5FD_mpio_get_eoa(const H5FD_t *_file, H5FD_mem_t type);
85 static herr_t H5FD_mpio_set_eoa(H5FD_t *_file, H5FD_mem_t type, haddr_t addr);
86 static haddr_t H5FD_mpio_get_eof(const H5FD_t *_file, H5FD_mem_t type);
87 static herr_t H5FD_mpio_get_handle(H5FD_t *_file, hid_t fapl, void** file_handle);
88 static herr_t H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
89 size_t size, void *buf);
90 static herr_t H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
91 size_t size, const void *buf);
92 static herr_t H5FD_mpio_flush(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
93 static herr_t H5FD_mpio_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
94 static int H5FD_mpio_mpi_rank(const H5FD_t *_file);
95 static int H5FD_mpio_mpi_size(const H5FD_t *_file);
96 static MPI_Comm H5FD_mpio_communicator(const H5FD_t *_file);
97 static herr_t H5FD_mpio_get_info(H5FD_t *_file, void** mpi_info);
98
99 /* The MPIO file driver information */
/* Class table handed to H5FD_register() by H5FD_mpio_init(). The initializer
 * is positional: the trailing comment on each entry names the slot it fills.
 * NULL entries are callbacks for which this driver supplies no function.
 */
100 static const H5FD_class_mpi_t H5FD_mpio_g = {
101 { /* Start of superclass information */
102 "mpio", /* name */
103 HADDR_MAX, /* maxaddr */
104 H5F_CLOSE_SEMI, /* fc_degree */
105 H5FD_mpio_term, /* terminate */
106 NULL, /* sb_size */
107 NULL, /* sb_encode */
108 NULL, /* sb_decode */
109 sizeof(H5FD_mpio_fapl_t), /* fapl_size */
110 H5FD_mpio_fapl_get, /* fapl_get */
111 H5FD_mpio_fapl_copy, /* fapl_copy */
112 H5FD_mpio_fapl_free, /* fapl_free */
113 0, /* dxpl_size */
114 NULL, /* dxpl_copy */
115 NULL, /* dxpl_free */
116 H5FD_mpio_open, /* open */
117 H5FD_mpio_close, /* close */
118 NULL, /* cmp */
119 H5FD_mpio_query, /* query */
120 NULL, /* get_type_map */
121 NULL, /* alloc */
122 NULL, /* free */
123 H5FD_mpio_get_eoa, /* get_eoa */
124 H5FD_mpio_set_eoa, /* set_eoa */
125 H5FD_mpio_get_eof, /* get_eof */
126 H5FD_mpio_get_handle, /* get_handle */
127 H5FD_mpio_read, /* read */
128 H5FD_mpio_write, /* write */
129 H5FD_mpio_flush, /* flush */
130 H5FD_mpio_truncate, /* truncate */
131 NULL, /* lock */
132 NULL, /* unlock */
133 H5FD_FLMAP_DICHOTOMY /* fl_map */
134 }, /* End of superclass information */
135 H5FD_mpio_mpi_rank, /* get_rank */
136 H5FD_mpio_mpi_size, /* get_size */
137 H5FD_mpio_communicator, /* get_comm */
138 H5FD_mpio_get_info /* get_info */
139 };
140
141 #ifdef H5FDmpio_DEBUG
/* Flags to control debug actions in H5FDmpio.
 * Meant to be indexed by characters: one counter per character code, all
 * starting at zero.  H5FD_mpio_init() increments the counter of every
 * character found in the H5FD_mpio_Debug environment variable.
 *
 *    'c' show result of MPI_Get_count after read
 *    'r' show read offset and size
 *    't' trace function entry and exit
 *    'w' show write offset and size
 */
static int H5FD_mpio_Debug[256] = {0};
159 #endif
160
161
162 /*--------------------------------------------------------------------------
163 NAME
164 H5FD__init_package -- Initialize interface-specific information
165 USAGE
166 herr_t H5FD__init_package()
167 RETURNS
168 Non-negative on success/Negative on failure
169 DESCRIPTION
170 Initializes any interface-specific data or routines. (Just calls
171 H5FD_mpio_init currently).
172
173 --------------------------------------------------------------------------*/
174 static herr_t
H5FD__init_package(void)175 H5FD__init_package(void)
176 {
177 herr_t ret_value = SUCCEED;
178
179 FUNC_ENTER_STATIC
180
181 if(H5FD_mpio_init() < 0)
182 HGOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "unable to initialize mpio VFD")
183
184 done:
185 FUNC_LEAVE_NOAPI(ret_value)
186 } /* H5FD__init_package() */
187
188
189 /*-------------------------------------------------------------------------
190 * Function: H5FD_mpio_init
191 *
192 * Purpose: Initialize this driver by registering the driver with the
193 * library.
194 *
195 * Return: Success: The driver ID for the mpio driver.
196 * Failure: Negative.
197 *
198 * Programmer: Robb Matzke
199 * Thursday, August 5, 1999
200 *
201 *-------------------------------------------------------------------------
202 */
203 hid_t
H5FD_mpio_init(void)204 H5FD_mpio_init(void)
205 {
206 #ifdef H5FDmpio_DEBUG
207 static int H5FD_mpio_Debug_inited = 0;
208 #endif /* H5FDmpio_DEBUG */
209 const char *s; /* String for environment variables */
210 hid_t ret_value; /* Return value */
211
212 FUNC_ENTER_NOAPI(FAIL)
213
214 /* Register the MPI-IO VFD, if it isn't already */
215 if(H5I_VFL != H5I_get_type(H5FD_MPIO_g))
216 H5FD_MPIO_g = H5FD_register((const H5FD_class_t *)&H5FD_mpio_g, sizeof(H5FD_class_mpi_t), FALSE);
217
218 /* Allow MPI buf-and-file-type optimizations? */
219 s = HDgetenv("HDF5_MPI_OPT_TYPES");
220 if(s && HDisdigit(*s))
221 H5FD_mpi_opt_types_g = (hbool_t)HDstrtol(s, NULL, 0);
222
223 #ifdef H5FDmpio_DEBUG
224 if(!H5FD_mpio_Debug_inited) {
225 /* Retrieve MPI-IO debugging environment variable */
226 s = HDgetenv("H5FD_mpio_Debug");
227 if(s) {
228 /* Set debug mask */
229 while(*s) {
230 H5FD_mpio_Debug[(int)*s]++;
231 s++;
232 } /* end while */
233 } /* end if */
234 H5FD_mpio_Debug_inited++;
235 } /* end if */
236 #endif /* H5FDmpio_DEBUG */
237
238 /* Set return value */
239 ret_value = H5FD_MPIO_g;
240
241 done:
242 FUNC_LEAVE_NOAPI(ret_value)
243 } /* end H5FD_mpio_init() */
244
245
246 /*---------------------------------------------------------------------------
247 * Function: H5FD_mpio_term
248 *
249 * Purpose: Shut down the VFD
250 *
251 * Returns: Non-negative on success or negative on failure
252 *
253 * Programmer: Quincey Koziol
254 * Friday, Jan 30, 2004
255 *
256 *---------------------------------------------------------------------------
257 */
258 static herr_t
H5FD_mpio_term(void)259 H5FD_mpio_term(void)
260 {
261 FUNC_ENTER_NOAPI_NOINIT_NOERR
262
263 /* Reset VFL ID */
264 H5FD_MPIO_g=0;
265
266 FUNC_LEAVE_NOAPI(SUCCEED)
267 } /* end H5FD_mpio_term() */
268
269
270 /*-------------------------------------------------------------------------
271 * Function: H5Pset_fapl_mpio
272 *
273 * Purpose: Store the user supplied MPIO communicator comm and info in
274 * the file access property list FAPL_ID which can then be used
275 * to create and/or open the file. This function is available
276 * only in the parallel HDF5 library and is not collective.
277 *
278 * comm is the MPI communicator to be used for file open as
279 * defined in MPI_FILE_OPEN of MPI-2. This function makes a
280 * duplicate of comm. Any modification to comm after this function
281 * call returns has no effect on the access property list.
282 *
283 * info is the MPI Info object to be used for file open as
284 * defined in MPI_FILE_OPEN of MPI-2. This function makes a
285 * duplicate of info. Any modification to info after this
286 * function call returns has no effect on the access property
287 * list.
288 *
289 * If fapl_id has previously set comm and info values, they
290 * will be replaced and the old communicator and Info object
291 * are freed.
292 *
293 * Return: Success: Non-negative
294 *
295 * Failure: Negative
296 *
297 * Programmer: Albert Cheng
298 * Feb 3, 1998
299 *
300 *-------------------------------------------------------------------------
301 */
302 herr_t
H5Pset_fapl_mpio(hid_t fapl_id,MPI_Comm comm,MPI_Info info)303 H5Pset_fapl_mpio(hid_t fapl_id, MPI_Comm comm, MPI_Info info)
304 {
305 H5FD_mpio_fapl_t fa;
306 H5P_genplist_t *plist; /* Property list pointer */
307 herr_t ret_value;
308
309 FUNC_ENTER_API(FAIL)
310 H5TRACE3("e", "iMcMi", fapl_id, comm, info);
311
312 if(fapl_id == H5P_DEFAULT)
313 HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
314
315 /* Check arguments */
316 if(NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS)))
317 HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access list")
318 if(MPI_COMM_NULL == comm)
319 HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a valid communicator")
320
321 /* Initialize driver specific properties */
322 fa.comm = comm;
323 fa.info = info;
324
325 /* duplication is done during driver setting. */
326 ret_value = H5P_set_driver(plist, H5FD_MPIO, &fa);
327
328 done:
329 FUNC_LEAVE_API(ret_value)
330 } /* H5Pset_fapl_mpio() */
331
332
333 /*-------------------------------------------------------------------------
334 * Function: H5Pget_fapl_mpio
335 *
336 * Purpose: If the file access property list is set to the H5FD_MPIO
337 * driver then this function returns duplicates of the MPI
338 * communicator and Info object stored through the comm and
339 * info pointers. It is the responsibility of the application
340 * to free the returned communicator and Info object.
341 *
342 * Return: Success: Non-negative with the communicator and
343 * Info object returned through the comm and
344 * info arguments if non-null. Since they are
345 * duplicates of the stored objects, future
346 * modifications to the access property list do
347 * not affect them and it is the responsibility
348 * of the application to free them.
349 *
350 * Failure: Negative
351 *
352 * Programmer: Robb Matzke
353 * Thursday, February 26, 1998
354 *
355 *-------------------------------------------------------------------------
356 */
357 herr_t
H5Pget_fapl_mpio(hid_t fapl_id,MPI_Comm * comm,MPI_Info * info)358 H5Pget_fapl_mpio(hid_t fapl_id, MPI_Comm *comm/*out*/, MPI_Info *info/*out*/)
359 {
360 H5P_genplist_t *plist; /* Property list pointer */
361 const H5FD_mpio_fapl_t *fa; /* MPIO fapl info */
362 MPI_Comm comm_tmp = MPI_COMM_NULL;
363 hbool_t comm_copied = FALSE; /* MPI Comm has been duplicated */
364 int mpi_code; /* MPI return code */
365 herr_t ret_value = SUCCEED; /* Return value */
366
367 FUNC_ENTER_API(FAIL)
368 H5TRACE3("e", "ixx", fapl_id, comm, info);
369
370 if(NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS)))
371 HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access list")
372 if(H5FD_MPIO != H5P_peek_driver(plist))
373 HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "incorrect VFL driver")
374 if(NULL == (fa = (const H5FD_mpio_fapl_t *)H5P_peek_driver_info(plist)))
375 HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "bad VFL driver info")
376
377 /* Store the duplicated communicator in a temporary variable for error */
378 /* recovery in case the INFO duplication fails. */
379 if(comm) {
380 if(MPI_SUCCESS != (mpi_code = MPI_Comm_dup(fa->comm, &comm_tmp)))
381 HMPI_GOTO_ERROR(FAIL, "MPI_Comm_dup failed", mpi_code)
382 comm_copied = TRUE;
383 } /* end if */
384
385 if(info) {
386 if(MPI_INFO_NULL != fa->info) {
387 if(MPI_SUCCESS != (mpi_code = MPI_Info_dup(fa->info, info)))
388 HMPI_GOTO_ERROR(FAIL, "MPI_Info_dup failed", mpi_code)
389 } /* end if */
390 else
391 /* do not dup it */
392 *info = MPI_INFO_NULL;
393 } /* end if */
394
395 /* Store the copied communicator, now that the Info object has been
396 * successfully copied.
397 */
398 if(comm)
399 *comm = comm_tmp;
400
401 done:
402 if(ret_value < 0)
403 /* need to free anything created here */
404 if(comm_copied)
405 MPI_Comm_free(&comm_tmp);
406
407 FUNC_LEAVE_API(ret_value)
408 } /* end H5Pget_fapl_mpio() */
409
410
411 /*-------------------------------------------------------------------------
412 * Function: H5Pset_dxpl_mpio
413 *
414 * Purpose: Set the data transfer property list DXPL_ID to use transfer
415 * mode XFER_MODE. The property list can then be used to control
416 * the I/O transfer mode during data I/O operations. The valid
417 * transfer modes are:
418 *
419 * H5FD_MPIO_INDEPENDENT:
420 * Use independent I/O access (the default).
421 *
422 * H5FD_MPIO_COLLECTIVE:
423 * Use collective I/O access.
424 *
425 * Return: Success: Non-negative
426 * Failure: Negative
427 *
428 * Programmer: Albert Cheng
429 * April 2, 1998
430 *
431 *-------------------------------------------------------------------------
432 */
433 herr_t
H5Pset_dxpl_mpio(hid_t dxpl_id,H5FD_mpio_xfer_t xfer_mode)434 H5Pset_dxpl_mpio(hid_t dxpl_id, H5FD_mpio_xfer_t xfer_mode)
435 {
436 H5P_genplist_t *plist; /* Property list pointer */
437 herr_t ret_value = SUCCEED; /* Return value */
438
439 FUNC_ENTER_API(FAIL)
440 H5TRACE2("e", "iDt", dxpl_id, xfer_mode);
441
442 if(dxpl_id == H5P_DEFAULT)
443 HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
444
445 /* Check arguments */
446 if(NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
447 HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
448 if(H5FD_MPIO_INDEPENDENT != xfer_mode && H5FD_MPIO_COLLECTIVE != xfer_mode)
449 HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "incorrect xfer_mode")
450
451 /* Set the transfer mode */
452 if(H5P_set(plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode) < 0)
453 HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
454
455 done:
456 FUNC_LEAVE_API(ret_value)
457 } /* end H5Pset_dxpl_mpio() */
458
459
460 /*-------------------------------------------------------------------------
461 * Function: H5Pget_dxpl_mpio
462 *
463 * Purpose: Queries the transfer mode current set in the data transfer
464 * property list DXPL_ID. This is not collective.
465 *
466 * Return: Success: Non-negative, with the transfer mode returned
467 * through the XFER_MODE argument if it is
468 * non-null.
469 *
470 * Failure: Negative
471 *
472 * Programmer: Albert Cheng
473 * April 2, 1998
474 *
475 *-------------------------------------------------------------------------
476 */
477 herr_t
H5Pget_dxpl_mpio(hid_t dxpl_id,H5FD_mpio_xfer_t * xfer_mode)478 H5Pget_dxpl_mpio(hid_t dxpl_id, H5FD_mpio_xfer_t *xfer_mode/*out*/)
479 {
480 H5P_genplist_t *plist; /* Property list pointer */
481 herr_t ret_value = SUCCEED; /* Return value */
482
483 FUNC_ENTER_API(FAIL)
484 H5TRACE2("e", "ix", dxpl_id, xfer_mode);
485
486 if(NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
487 HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
488
489 /* Get the transfer mode */
490 if(xfer_mode)
491 if(H5P_get(plist, H5D_XFER_IO_XFER_MODE_NAME, xfer_mode) < 0)
492 HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to get value")
493
494 done:
495 FUNC_LEAVE_API(ret_value)
496 } /* end H5Pget_dxpl_mpio() */
497
498
499 /*-------------------------------------------------------------------------
500 * Function: H5Pset_dxpl_mpio_collective_opt
501 *
502 * Purpose: To set a flag to choose linked chunk I/O or multi-chunk I/O
503 * without involving decision-making inside HDF5
504 *
505 * Note: The library will do linked chunk I/O or multi-chunk I/O without
506 * involving communications for decision-making process.
507 * The library won't behave as it asks for only when we find
508 * that the low-level MPI-IO package doesn't support this.
509 *
510 * Return: Success: Non-negative
511 * Failure: Negative
512 *
513 * Programmer: Kent Yang
514 * ? ?, ?
515 *
516 *-------------------------------------------------------------------------
517 */
518 herr_t
H5Pset_dxpl_mpio_collective_opt(hid_t dxpl_id,H5FD_mpio_collective_opt_t opt_mode)519 H5Pset_dxpl_mpio_collective_opt(hid_t dxpl_id, H5FD_mpio_collective_opt_t opt_mode)
520 {
521 H5P_genplist_t *plist; /* Property list pointer */
522 herr_t ret_value = SUCCEED; /* Return value */
523
524 FUNC_ENTER_API(FAIL)
525 H5TRACE2("e", "iDc", dxpl_id, opt_mode);
526
527 if(dxpl_id == H5P_DEFAULT)
528 HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
529
530 /* Check arguments */
531 if(NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
532 HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
533
534 /* Set the transfer mode */
535 if(H5P_set(plist, H5D_XFER_MPIO_COLLECTIVE_OPT_NAME, &opt_mode) < 0)
536 HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
537
538 done:
539 FUNC_LEAVE_API(ret_value)
540 } /* end H5Pset_dxpl_mpio_collective_opt() */
541
542
543 /*-------------------------------------------------------------------------
544 * Function: H5Pset_dxpl_mpio_chunk_opt
545 *
546 * Purpose: To set a flag to choose linked chunk I/O or multi-chunk I/O
547 * without involving decision-making inside HDF5
548 *
549 * Note: The library will do linked chunk I/O or multi-chunk I/O without
550 * involving communications for decision-making process.
551 * The library won't behave as it asks for only when we find
552 * that the low-level MPI-IO package doesn't support this.
553 *
554 * Return: Success: Non-negative
555 * Failure: Negative
556 *
557 * Programmer: Kent Yang
558 * ? ?, ?
559 *
560 *-------------------------------------------------------------------------
561 */
562 herr_t
H5Pset_dxpl_mpio_chunk_opt(hid_t dxpl_id,H5FD_mpio_chunk_opt_t opt_mode)563 H5Pset_dxpl_mpio_chunk_opt(hid_t dxpl_id, H5FD_mpio_chunk_opt_t opt_mode)
564 {
565 H5P_genplist_t *plist; /* Property list pointer */
566 herr_t ret_value = SUCCEED; /* Return value */
567
568 FUNC_ENTER_API(FAIL)
569 H5TRACE2("e", "iDh", dxpl_id, opt_mode);
570
571 if(dxpl_id == H5P_DEFAULT)
572 HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
573
574 /* Check arguments */
575 if(NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
576 HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
577
578 /* Set the transfer mode */
579 if(H5P_set(plist, H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME, &opt_mode) < 0)
580 HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
581
582 done:
583 FUNC_LEAVE_API(ret_value)
584 } /* end H5Pset_dxpl_mpio_chunk_opt() */
585
586
587 /*-------------------------------------------------------------------------
588 * Function: H5Pset_dxpl_mpio_chunk_opt_num
589 *
590 * Purpose: To set a threshold for doing linked chunk IO
591 *
592 * Note: If the number is greater than the threshold set by the user,
593 * the library will do linked chunk I/O; otherwise, I/O will be
594 * done for every chunk.
595 *
596 * Return: Success: Non-negative
597 * Failure: Negative
598 *
599 * Programmer: Kent Yang
600 * ? ?, ?
601 *
602 *-------------------------------------------------------------------------
603 */
604 herr_t
H5Pset_dxpl_mpio_chunk_opt_num(hid_t dxpl_id,unsigned num_chunk_per_proc)605 H5Pset_dxpl_mpio_chunk_opt_num(hid_t dxpl_id, unsigned num_chunk_per_proc)
606 {
607 H5P_genplist_t *plist; /* Property list pointer */
608 herr_t ret_value = SUCCEED; /* Return value */
609
610 FUNC_ENTER_API(FAIL)
611 H5TRACE2("e", "iIu", dxpl_id, num_chunk_per_proc);
612
613 if(dxpl_id == H5P_DEFAULT)
614 HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
615
616 /* Check arguments */
617 if(NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
618 HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
619
620 /* Set the transfer mode */
621 if(H5P_set(plist, H5D_XFER_MPIO_CHUNK_OPT_NUM_NAME, &num_chunk_per_proc) < 0)
622 HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
623
624 done:
625 FUNC_LEAVE_API(ret_value)
626 } /* end H5Pset_dxpl_mpio_chunk_opt_num() */
627
628
629 /*-------------------------------------------------------------------------
630 * Function: H5Pset_dxpl_mpio_chunk_opt_ratio
631 *
632 * Purpose: To set a threshold for doing collective I/O for each chunk
633 *
634 * Note: The library will calculate the percentage of the number of
635 * process holding selections at each chunk. If that percentage
636 * of number of process in the individual chunk is greater than
637 * the threshold set by the user, the library will do collective
638 * chunk I/O for this chunk; otherwise, independent I/O will be
639 * done for this chunk.
640 *
641 * Return: Success: Non-negative
642 * Failure: Negative
643 *
644 * Programmer: Kent Yang
645 * ? ?, ?
646 *
647 *-------------------------------------------------------------------------
648 */
649 herr_t
H5Pset_dxpl_mpio_chunk_opt_ratio(hid_t dxpl_id,unsigned percent_num_proc_per_chunk)650 H5Pset_dxpl_mpio_chunk_opt_ratio(hid_t dxpl_id, unsigned percent_num_proc_per_chunk)
651 {
652 H5P_genplist_t *plist; /* Property list pointer */
653 herr_t ret_value = SUCCEED; /* Return value */
654
655 FUNC_ENTER_API(FAIL)
656 H5TRACE2("e", "iIu", dxpl_id, percent_num_proc_per_chunk);
657
658 if(dxpl_id == H5P_DEFAULT)
659 HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
660
661 /* Check arguments */
662 if(NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
663 HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
664
665 /* Set the transfer mode */
666 if(H5P_set(plist, H5D_XFER_MPIO_CHUNK_OPT_RATIO_NAME, &percent_num_proc_per_chunk) < 0)
667 HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
668
669 done:
670 FUNC_LEAVE_API(ret_value)
671 } /* end H5Pset_dxpl_mpio_chunk_opt_ratio() */
672
673
674 /*-------------------------------------------------------------------------
675 * Function: H5FD_mpio_fapl_get
676 *
677 * Purpose: Returns a file access property list which could be used to
678 * create another file the same as this one.
679 *
680 * Return: Success: Ptr to new file access property list with all
681 * fields copied from the file pointer.
682 *
683 * Failure: NULL
684 *
685 * Programmer: Robb Matzke
686 * Friday, August 13, 1999
687 *
688 *-------------------------------------------------------------------------
689 */
690 static void *
H5FD_mpio_fapl_get(H5FD_t * _file)691 H5FD_mpio_fapl_get(H5FD_t *_file)
692 {
693 H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
694 H5FD_mpio_fapl_t *fa = NULL;
695 void *ret_value; /* Return value */
696
697 FUNC_ENTER_NOAPI_NOINIT
698
699 HDassert(file);
700 HDassert(H5FD_MPIO == file->pub.driver_id);
701
702 if(NULL == (fa = (H5FD_mpio_fapl_t *)H5MM_calloc(sizeof(H5FD_mpio_fapl_t))))
703 HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed")
704
705 /* Duplicate communicator and Info object. */
706 if(FAIL == H5FD_mpi_comm_info_dup(file->comm, file->info, &fa->comm, &fa->info))
707 HGOTO_ERROR(H5E_INTERNAL, H5E_CANTCOPY, NULL, "Communicator/Info duplicate failed")
708
709 /* Set return value */
710 ret_value = fa;
711
712 done:
713 FUNC_LEAVE_NOAPI(ret_value)
714 }
715
716
717 /*-------------------------------------------------------------------------
718 * Function: H5FD_mpio_fapl_copy
719 *
720 * Purpose: Copies the mpio-specific file access properties.
721 *
722 * Return: Success: Ptr to a new property list
723 *
724 * Failure: NULL
725 *
726 * Programmer: Albert Cheng
727 * Jan 8, 2003
728 *
729 *-------------------------------------------------------------------------
730 */
731 static void *
H5FD_mpio_fapl_copy(const void * _old_fa)732 H5FD_mpio_fapl_copy(const void *_old_fa)
733 {
734 void *ret_value = NULL;
735 const H5FD_mpio_fapl_t *old_fa = (const H5FD_mpio_fapl_t*)_old_fa;
736 H5FD_mpio_fapl_t *new_fa = NULL;
737
738 FUNC_ENTER_NOAPI_NOINIT
739 #ifdef H5FDmpio_DEBUG
740 if (H5FD_mpio_Debug[(int)'t'])
741 HDfprintf(stderr, "enter H5FD_mpio_fapl_copy\n");
742 #endif
743
744 if(NULL == (new_fa = (H5FD_mpio_fapl_t *)H5MM_malloc(sizeof(H5FD_mpio_fapl_t))))
745 HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed")
746
747 /* Copy the general information */
748 HDmemcpy(new_fa, old_fa, sizeof(H5FD_mpio_fapl_t));
749
750 /* Duplicate communicator and Info object. */
751 if(FAIL == H5FD_mpi_comm_info_dup(old_fa->comm, old_fa->info, &new_fa->comm, &new_fa->info))
752 HGOTO_ERROR(H5E_INTERNAL, H5E_CANTCOPY, NULL, "Communicator/Info duplicate failed")
753 ret_value = new_fa;
754
755 done:
756 if (NULL == ret_value){
757 /* cleanup */
758 if (new_fa)
759 H5MM_xfree(new_fa);
760 }
761
762 #ifdef H5FDmpio_DEBUG
763 if (H5FD_mpio_Debug[(int)'t'])
764 HDfprintf(stderr, "leaving H5FD_mpio_fapl_copy\n");
765 #endif
766 FUNC_LEAVE_NOAPI(ret_value)
767 } /* end H5FD_mpio_fapl_copy() */
768
769
770 /*-------------------------------------------------------------------------
771 * Function: H5FD_mpio_fapl_free
772 *
773 * Purpose: Frees the mpio-specific file access properties.
774 *
775 * Return: Success: 0
776 *
777 * Failure: -1
778 *
779 * Programmer: Albert Cheng
780 * Jan 8, 2003
781 *
782 * Modifications:
783 *
784 *-------------------------------------------------------------------------
785 */
786 static herr_t
H5FD_mpio_fapl_free(void * _fa)787 H5FD_mpio_fapl_free(void *_fa)
788 {
789 herr_t ret_value = SUCCEED;
790 H5FD_mpio_fapl_t *fa = (H5FD_mpio_fapl_t*)_fa;
791
792 FUNC_ENTER_NOAPI_NOINIT_NOERR
793 #ifdef H5FDmpio_DEBUG
794 if (H5FD_mpio_Debug[(int)'t'])
795 HDfprintf(stderr, "in H5FD_mpio_fapl_free\n");
796 #endif
797 HDassert(fa);
798
799 /* Free the internal communicator and INFO object */
800 HDassert(MPI_COMM_NULL!=fa->comm);
801 H5FD_mpi_comm_info_free(&fa->comm, &fa->info);
802 H5MM_xfree(fa);
803
804 #ifdef H5FDmpio_DEBUG
805 if (H5FD_mpio_Debug[(int)'t'])
806 HDfprintf(stderr, "leaving H5FD_mpio_fapl_free\n");
807 #endif
808 FUNC_LEAVE_NOAPI(ret_value)
809 } /* end H5FD_mpio_fapl_free() */
810
811
812 /*-------------------------------------------------------------------------
813 * Function: H5FD_set_mpio_atomicity
814 *
815 * Purpose: Sets the atomicity mode
816 *
817 * Return: Success: Non-negative
818 *
819 * Failure: Negative
820 *
821 * Programmer: Mohamad Chaarawi
822 * Feb 14, 2012
823 *
824 *-------------------------------------------------------------------------
825 */
826 herr_t
H5FD_set_mpio_atomicity(H5FD_t * _file,hbool_t flag)827 H5FD_set_mpio_atomicity(H5FD_t *_file, hbool_t flag)
828 {
829 H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
830 int mpi_code; /* MPI return code */
831 int temp_flag;
832 herr_t ret_value = SUCCEED;
833
834 FUNC_ENTER_NOAPI_NOINIT
835
836 #ifdef H5FDmpio_DEBUG
837 if (H5FD_mpio_Debug[(int)'t'])
838 HDfprintf(stdout, "Entering H5FD_set_mpio_atomicity\n");
839 #endif
840
841 if (FALSE == flag)
842 temp_flag = 0;
843 else
844 temp_flag = 1;
845
846 /* set atomicity value */
847 if (MPI_SUCCESS != (mpi_code=MPI_File_set_atomicity(file->f, temp_flag)))
848 HMPI_GOTO_ERROR(FAIL, "MPI_File_set_atomicity", mpi_code)
849
850 done:
851 #ifdef H5FDmpio_DEBUG
852 if (H5FD_mpio_Debug[(int)'t'])
853 HDfprintf(stdout, "Leaving H5FD_set_mpio_atomicity\n");
854 #endif
855 FUNC_LEAVE_NOAPI(ret_value)
856 }
857
858
859 /*-------------------------------------------------------------------------
860 * Function: H5FD_get_mpio_atomicity
861 *
862 * Purpose: Returns the atomicity mode
863 *
864 * Return: Success: Non-negative
865 *
866 * Failure: Negative
867 *
868 * Programmer: Mohamad Chaarawi
869 * Feb 14, 2012
870 *
871 *-------------------------------------------------------------------------
872 */
873 herr_t
H5FD_get_mpio_atomicity(H5FD_t * _file,hbool_t * flag)874 H5FD_get_mpio_atomicity(H5FD_t *_file, hbool_t *flag)
875 {
876 H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
877 int mpi_code; /* MPI return code */
878 int temp_flag;
879 herr_t ret_value = SUCCEED;
880
881 FUNC_ENTER_NOAPI_NOINIT
882
883 #ifdef H5FDmpio_DEBUG
884 if (H5FD_mpio_Debug[(int)'t'])
885 HDfprintf(stdout, "Entering H5FD_get_mpio_atomicity\n");
886 #endif
887
888 /* get atomicity value */
889 if (MPI_SUCCESS != (mpi_code=MPI_File_get_atomicity(file->f, &temp_flag)))
890 HMPI_GOTO_ERROR(FAIL, "MPI_File_get_atomicity", mpi_code)
891
892 if (0 != temp_flag)
893 *flag = TRUE;
894 else
895 *flag = FALSE;
896
897 done:
898 #ifdef H5FDmpio_DEBUG
899 if (H5FD_mpio_Debug[(int)'t'])
900 HDfprintf(stdout, "Leaving H5FD_get_mpio_atomicity\n");
901 #endif
902 FUNC_LEAVE_NOAPI(ret_value)
903 }
904
905
906 /*-------------------------------------------------------------------------
907 * Function: H5FD_mpio_open
908 *
909 * Purpose: Opens a file with name NAME. The FLAGS are a bit field with
910 * purpose similar to the second argument of open(2) and which
911 * are defined in H5Fpublic.h. The file access property list
912 * FAPL_ID contains the driver properties and MAXADDR
913 * is the largest address which this file will be expected to
914 * access. This is collective.
915 *
916 * Return: Success: A new file pointer.
917 *
918 * Failure: NULL
919 *
920 * Programmer:
921 * January 30, 1998
922 *
923 *-------------------------------------------------------------------------
924 */
925 static H5FD_t *
H5FD_mpio_open(const char * name,unsigned flags,hid_t fapl_id,haddr_t H5_ATTR_UNUSED maxaddr)926 H5FD_mpio_open(const char *name, unsigned flags, hid_t fapl_id,
927 haddr_t H5_ATTR_UNUSED maxaddr)
928 {
929 H5FD_mpio_t *file=NULL;
930 MPI_File fh;
931 unsigned file_opened=0; /* Flag to indicate that the file was successfully opened */
932 int mpi_amode;
933 int mpi_rank; /* MPI rank of this process */
934 int mpi_size; /* Total number of MPI processes */
935 int mpi_code; /* mpi return code */
936 MPI_Offset size;
937 const H5FD_mpio_fapl_t *fa = NULL;
938 H5FD_mpio_fapl_t _fa;
939 H5P_genplist_t *plist; /* Property list pointer */
940 MPI_Comm comm_dup = MPI_COMM_NULL;
941 MPI_Info info_dup = MPI_INFO_NULL;
942 H5FD_t *ret_value; /* Return value */
943
944 FUNC_ENTER_NOAPI_NOINIT
945
946 #ifdef H5FDmpio_DEBUG
947 if (H5FD_mpio_Debug[(int)'t']) {
948 HDfprintf(stdout, "Entering H5FD_mpio_open(name=\"%s\", flags=0x%x, "
949 "fapl_id=%d, maxaddr=%lu)\n", name, flags, (int)fapl_id, (unsigned long)maxaddr);
950 }
951 #endif
952
953 /* Obtain a pointer to mpio-specific file access properties */
954 if(NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS)))
955 HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, NULL, "not a file access property list")
956 if(H5P_FILE_ACCESS_DEFAULT == fapl_id || H5FD_MPIO != H5P_peek_driver(plist)) {
957 _fa.comm = MPI_COMM_SELF; /*default*/
958 _fa.info = MPI_INFO_NULL; /*default*/
959 fa = &_fa;
960 } /* end if */
961 else {
962 if(NULL == (fa = (const H5FD_mpio_fapl_t *)H5P_peek_driver_info(plist)))
963 HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, NULL, "bad VFL driver info")
964 } /* end else */
965
966 /* Duplicate communicator and Info object for use by this file. */
967 if(FAIL == H5FD_mpi_comm_info_dup(fa->comm, fa->info, &comm_dup, &info_dup))
968 HGOTO_ERROR(H5E_INTERNAL, H5E_CANTCOPY, NULL, "Communicator/Info duplicate failed")
969
970 /* convert HDF5 flags to MPI-IO flags */
971 /* some combinations are illegal; let MPI-IO figure it out */
972 mpi_amode = (flags & H5F_ACC_RDWR) ? MPI_MODE_RDWR : MPI_MODE_RDONLY;
973 if(flags & H5F_ACC_CREAT)
974 mpi_amode |= MPI_MODE_CREATE;
975 if(flags & H5F_ACC_EXCL)
976 mpi_amode |= MPI_MODE_EXCL;
977
978 #ifdef H5FDmpio_DEBUG
979 /* Check for debug commands in the info parameter */
980 {
981 if(MPI_INFO_NULL != info_dup) {
982 char debug_str[128];
983 int flag;
984
985 MPI_Info_get(fa->info, H5F_MPIO_DEBUG_KEY, sizeof(debug_str) - 1, debug_str, &flag);
986 if(flag) {
987 int i;
988
989 HDfprintf(stdout, "H5FD_mpio debug flags = '%s'\n", debug_str);
990 for(i = 0; debug_str[i]/*end of string*/ && i < 128/*just in case*/; ++i)
991 H5FD_mpio_Debug[(int)debug_str[i]] = 1;
992 }
993 }
994 }
995 #endif
996
997 if(MPI_SUCCESS != (mpi_code = MPI_File_open(comm_dup, name, mpi_amode, info_dup, &fh)))
998 HMPI_GOTO_ERROR(NULL, "MPI_File_open failed", mpi_code)
999 file_opened=1;
1000
1001 /* Get the MPI rank of this process and the total number of processes */
1002 if (MPI_SUCCESS != (mpi_code=MPI_Comm_rank (comm_dup, &mpi_rank)))
1003 HMPI_GOTO_ERROR(NULL, "MPI_Comm_rank failed", mpi_code)
1004 if (MPI_SUCCESS != (mpi_code=MPI_Comm_size (comm_dup, &mpi_size)))
1005 HMPI_GOTO_ERROR(NULL, "MPI_Comm_size failed", mpi_code)
1006
1007 /* Build the return value and initialize it */
1008 if(NULL == (file = (H5FD_mpio_t *)H5MM_calloc(sizeof(H5FD_mpio_t))))
1009 HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed")
1010 file->f = fh;
1011 file->comm = comm_dup;
1012 file->info = info_dup;
1013 file->mpi_rank = mpi_rank;
1014 file->mpi_size = mpi_size;
1015
1016 /* Only processor p0 will get the filesize and broadcast it. */
1017 if (mpi_rank == 0) {
1018 if (MPI_SUCCESS != (mpi_code=MPI_File_get_size(fh, &size)))
1019 HMPI_GOTO_ERROR(NULL, "MPI_File_get_size failed", mpi_code)
1020 } /* end if */
1021
1022 /* Broadcast file size */
1023 if(MPI_SUCCESS != (mpi_code = MPI_Bcast(&size, (int)sizeof(MPI_Offset), MPI_BYTE, 0, comm_dup)))
1024 HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code)
1025
1026 /* Determine if the file should be truncated */
1027 if(size && (flags & H5F_ACC_TRUNC)) {
1028 if (MPI_SUCCESS != (mpi_code=MPI_File_set_size(fh, (MPI_Offset)0)))
1029 HMPI_GOTO_ERROR(NULL, "MPI_File_set_size failed", mpi_code)
1030
1031 /* Don't let any proc return until all have truncated the file. */
1032 if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(comm_dup)))
1033 HMPI_GOTO_ERROR(NULL, "MPI_Barrier failed", mpi_code)
1034
1035 /* File is zero size now */
1036 size = 0;
1037 } /* end if */
1038
1039 /* Set the size of the file (from library's perspective) */
1040 file->eof = H5FD_mpi_MPIOff_to_haddr(size);
1041 file->local_eof = file->eof;
1042
1043 /* Set return value */
1044 ret_value=(H5FD_t*)file;
1045
1046 done:
1047 if(ret_value==NULL) {
1048 if(file_opened)
1049 MPI_File_close(&fh);
1050 if (MPI_COMM_NULL != comm_dup)
1051 MPI_Comm_free(&comm_dup);
1052 if (MPI_INFO_NULL != info_dup)
1053 MPI_Info_free(&info_dup);
1054 if (file)
1055 H5MM_xfree(file);
1056 } /* end if */
1057
1058 #ifdef H5FDmpio_DEBUG
1059 if (H5FD_mpio_Debug[(int)'t'])
1060 HDfprintf(stdout, "Leaving H5FD_mpio_open\n" );
1061 #endif
1062 FUNC_LEAVE_NOAPI(ret_value)
1063 } /* end H5FD_mpio_open() */
1064
1065
1066 /*-------------------------------------------------------------------------
1067 * Function: H5FD_mpio_close
1068 *
1069 * Purpose: Closes a file. This is collective.
1070 *
1071 * Return: Success: Non-negative
1072 *
1073 * Failure: Negative
1074 *
1075 * Programmer: Unknown
1076 * January 30, 1998
1077 *
1078 * Modifications:
1079 * Robb Matzke, 1998-02-18
1080 * Added the ACCESS_PARMS argument.
1081 *
1082 * Robb Matzke, 1999-08-06
1083 * Modified to work with the virtual file layer.
1084 *
1085 * Albert Cheng, 2003-04-17
1086 * Free the communicator stored.
1087 *-------------------------------------------------------------------------
1088 */
1089 static herr_t
H5FD_mpio_close(H5FD_t * _file)1090 H5FD_mpio_close(H5FD_t *_file)
1091 {
1092 H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
1093 int mpi_code; /* MPI return code */
1094 herr_t ret_value=SUCCEED; /* Return value */
1095
1096 FUNC_ENTER_NOAPI_NOINIT
1097
1098 #ifdef H5FDmpio_DEBUG
1099 if (H5FD_mpio_Debug[(int)'t'])
1100 HDfprintf(stdout, "Entering H5FD_mpio_close\n");
1101 #endif
1102 HDassert(file);
1103 HDassert(H5FD_MPIO==file->pub.driver_id);
1104
1105 /* MPI_File_close sets argument to MPI_FILE_NULL */
1106 if (MPI_SUCCESS != (mpi_code=MPI_File_close(&(file->f)/*in,out*/)))
1107 HMPI_GOTO_ERROR(FAIL, "MPI_File_close failed", mpi_code)
1108
1109 /* Clean up other stuff */
1110 H5FD_mpi_comm_info_free(&file->comm, &file->info);
1111 H5MM_xfree(file);
1112
1113 done:
1114 #ifdef H5FDmpio_DEBUG
1115 if (H5FD_mpio_Debug[(int)'t'])
1116 HDfprintf(stdout, "Leaving H5FD_mpio_close\n");
1117 #endif
1118 FUNC_LEAVE_NOAPI(ret_value)
1119 }
1120
1121
1122 /*-------------------------------------------------------------------------
1123 * Function: H5FD_mpio_query
1124 *
1125 * Purpose: Set the flags that this VFL driver is capable of supporting.
1126 * (listed in H5FDpublic.h)
1127 *
1128 * Return: Success: non-negative
1129 *
1130 * Failure: negative
1131 *
1132 * Programmer: Quincey Koziol
1133 * Friday, August 25, 2000
1134 *
1135 * Modifications:
1136 *
1137 * John Mainzer -- 9/21/05
1138 * Modified code to turn off the
1139 * H5FD_FEAT_ACCUMULATE_METADATA_WRITE flag.
1140 * With the movement of
1141 * all cache writes to process 0, this flag has become
1142 * problematic in PHDF5.
1143 *
1144 *-------------------------------------------------------------------------
1145 */
1146 static herr_t
H5FD_mpio_query(const H5FD_t H5_ATTR_UNUSED * _file,unsigned long * flags)1147 H5FD_mpio_query(const H5FD_t H5_ATTR_UNUSED *_file, unsigned long *flags /* out */)
1148 {
1149 FUNC_ENTER_NOAPI_NOINIT_NOERR
1150
1151 /* Set the VFL feature flags that this driver supports */
1152 if(flags) {
1153 *flags=0;
1154 *flags |= H5FD_FEAT_AGGREGATE_METADATA; /* OK to aggregate metadata allocations */
1155 *flags |= H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */
1156 *flags |= H5FD_FEAT_HAS_MPI; /* This driver uses MPI */
1157 *flags |= H5FD_FEAT_ALLOCATE_EARLY; /* Allocate space early instead of late */
1158 *flags |= H5FD_FEAT_DEFAULT_VFD_COMPATIBLE; /* VFD creates a file which can be opened with the default VFD */
1159 } /* end if */
1160
1161 FUNC_LEAVE_NOAPI(SUCCEED)
1162 }
1163
1164
1165 /*-------------------------------------------------------------------------
1166 * Function: H5FD_mpio_get_eoa
1167 *
1168 * Purpose: Gets the end-of-address marker for the file. The EOA marker
1169 * is the first address past the last byte allocated in the
1170 * format address space.
1171 *
1172 * Return: Success: The end-of-address marker.
1173 *
1174 * Failure: HADDR_UNDEF
1175 *
1176 * Programmer: Robb Matzke
1177 * Friday, August 6, 1999
1178 *
1179 * Modifications:
1180 * Raymond Lu
1181 * 21 Dec. 2006
1182 * Added the parameter TYPE. It's only used for MULTI driver.
1183 *
1184 *-------------------------------------------------------------------------
1185 */
1186 static haddr_t
H5FD_mpio_get_eoa(const H5FD_t * _file,H5FD_mem_t H5_ATTR_UNUSED type)1187 H5FD_mpio_get_eoa(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type)
1188 {
1189 const H5FD_mpio_t *file = (const H5FD_mpio_t*)_file;
1190
1191 FUNC_ENTER_NOAPI_NOINIT_NOERR
1192
1193 HDassert(file);
1194 HDassert(H5FD_MPIO==file->pub.driver_id);
1195
1196 FUNC_LEAVE_NOAPI(file->eoa)
1197 }
1198
1199
1200 /*-------------------------------------------------------------------------
1201 * Function: H5FD_mpio_set_eoa
1202 *
1203 * Purpose: Set the end-of-address marker for the file. This function is
1204 * called shortly after an existing HDF5 file is opened in order
1205 * to tell the driver where the end of the HDF5 data is located.
1206 *
1207 * Return: Success: 0
1208 *
1209 * Failure: -1
1210 *
1211 * Programmer: Robb Matzke
1212 * Friday, August 6, 1999
1213 *
1214 * Modifications:
1215 * Raymond Lu
1216 * 21 Dec. 2006
1217 * Added the parameter TYPE. It's only used for MULTI driver.
1218 *
1219 *-------------------------------------------------------------------------
1220 */
1221 static herr_t
H5FD_mpio_set_eoa(H5FD_t * _file,H5FD_mem_t H5_ATTR_UNUSED type,haddr_t addr)1222 H5FD_mpio_set_eoa(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, haddr_t addr)
1223 {
1224 H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
1225
1226 FUNC_ENTER_NOAPI_NOINIT_NOERR
1227
1228 HDassert(file);
1229 HDassert(H5FD_MPIO==file->pub.driver_id);
1230
1231 file->eoa = addr;
1232
1233 FUNC_LEAVE_NOAPI(SUCCEED)
1234 }
1235
1236
1237 /*-------------------------------------------------------------------------
1238 * Function: H5FD_mpio_get_eof
1239 *
1240 * Purpose: Gets the end-of-file marker for the file. The EOF marker
1241 * is the real size of the file.
1242 *
1243 * The MPIO driver doesn't bother keeping this field updated
1244 * since that's a relatively expensive operation. Fortunately
1245 * the library only needs the EOF just after the file is opened
1246 * in order to determine whether the file is empty, truncated,
1247 * or okay. Therefore, any MPIO I/O function will set its value
1248 * to HADDR_UNDEF which is the error return value of this
1249 * function.
1250 *
1251 * Keeping the EOF updated (during write calls) is expensive
1252 * because any process may extend the physical end of the
1253 * file. -QAK
1254 *
 * Return:      Success:        The end-of-file marker.
 *
 *              Failure:        HADDR_UNDEF
1258 *
1259 * Programmer: Robb Matzke
1260 * Friday, August 6, 1999
1261 *
1262 * Modifications:
1263 *
1264 *-------------------------------------------------------------------------
1265 */
1266 static haddr_t
H5FD_mpio_get_eof(const H5FD_t * _file,H5FD_mem_t H5_ATTR_UNUSED type)1267 H5FD_mpio_get_eof(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type)
1268 {
1269 const H5FD_mpio_t *file = (const H5FD_mpio_t*)_file;
1270
1271 FUNC_ENTER_NOAPI_NOINIT_NOERR
1272
1273 HDassert(file);
1274 HDassert(H5FD_MPIO==file->pub.driver_id);
1275
1276 FUNC_LEAVE_NOAPI(file->eof)
1277 }
1278
1279
1280 /*-------------------------------------------------------------------------
1281 * Function: H5FD_mpio_get_handle
1282 *
1283 * Purpose: Returns the file handle of MPIO file driver.
1284 *
1285 * Returns: Non-negative if succeed or negative if fails.
1286 *
1287 * Programmer: Raymond Lu
1288 * Sept. 16, 2002
1289 *
1290 * Modifications:
1291 *
1292 *-------------------------------------------------------------------------
1293 */
1294 static herr_t
H5FD_mpio_get_handle(H5FD_t * _file,hid_t H5_ATTR_UNUSED fapl,void ** file_handle)1295 H5FD_mpio_get_handle(H5FD_t *_file, hid_t H5_ATTR_UNUSED fapl, void** file_handle)
1296 {
1297 H5FD_mpio_t *file = (H5FD_mpio_t *)_file;
1298 herr_t ret_value = SUCCEED;
1299
1300 FUNC_ENTER_NOAPI_NOINIT
1301
1302 if(!file_handle)
1303 HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file handle not valid")
1304
1305 *file_handle = &(file->f);
1306
1307 done:
1308 FUNC_LEAVE_NOAPI(ret_value)
1309 }
1310
1311
1312 /*-------------------------------------------------------------------------
1313 * Function: H5FD_mpio_get_info
1314 *
1315 * Purpose: Returns the file info of MPIO file driver.
1316 *
1317 * Returns: Non-negative if succeed or negative if fails.
1318 *
1319 * Programmer: John Mainzer
1320 * April 4, 2017
1321 *
1322 * Modifications:
1323 *
1324 *-------------------------------------------------------------------------
1325 */
1326 static herr_t
H5FD_mpio_get_info(H5FD_t * _file,void ** mpi_info)1327 H5FD_mpio_get_info(H5FD_t *_file, void** mpi_info)
1328 {
1329 H5FD_mpio_t *file = (H5FD_mpio_t *)_file;
1330 herr_t ret_value = SUCCEED;
1331
1332 FUNC_ENTER_NOAPI_NOINIT
1333
1334 if(!mpi_info)
1335 HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "mpi info not valid")
1336
1337 *mpi_info = &(file->info);
1338
1339 done:
1340 FUNC_LEAVE_NOAPI(ret_value)
1341
1342 } /* H5FD_mpio_get_info() */
1343
1344
1345 /*-------------------------------------------------------------------------
1346 * Function: H5FD_mpio_read
1347 *
1348 * Purpose: Reads SIZE bytes of data from FILE beginning at address ADDR
1349 * into buffer BUF according to data transfer properties in
1350 * DXPL_ID using potentially complex file and buffer types to
1351 * effect the transfer.
1352 *
1353 * Reading past the end of the MPI file returns zeros instead of
1354 * failing. MPI is able to coalesce requests from different
1355 * processes (collective or independent).
1356 *
1357 * Return: Success: Zero. Result is stored in caller-supplied
1358 * buffer BUF.
1359 *
1360 * Failure: -1, Contents of buffer BUF are undefined.
1361 *
1362 * Programmer: rky, 1998-01-30
1363 *
1364 * Modifications:
1365 * Robb Matzke, 1998-02-18
1366 * Added the ACCESS_PARMS argument.
1367 *
1368 * rky, 1998-04-10
1369 * Call independent or collective MPI read, based on
1370 * ACCESS_PARMS.
1371 *
1372 * Albert Cheng, 1998-06-01
1373 * Added XFER_MODE to control independent or collective MPI
1374 * read.
1375 *
1376 * rky, 1998-08-16
1377 * Use BTYPE, FTYPE, and DISP from access parms. The guts of
1378 * H5FD_mpio_read and H5FD_mpio_write should be replaced by a
1379 * single dual-purpose routine.
1380 *
1381 * Robb Matzke, 1999-04-21
1382 * Changed XFER_MODE to XFER_PARMS for all H5F_*_read()
1383 * callbacks.
1384 *
1385 * Robb Matzke, 1999-07-28
1386 * The ADDR argument is passed by value.
1387 *
1388 * Robb Matzke, 1999-08-06
1389 * Modified to work with the virtual file layer.
1390 *
1391 * Quincey Koziol, 2002-05-14
1392 * Only call MPI_Get_count if we can use MPI_BYTE for the MPI type
1393 * for the I/O transfer. Someday we might include code to decode
1394 * the MPI type used for more complicated transfers and call
1395 * MPI_Get_count all the time.
1396 *
1397 * Quincey Koziol - 2002/06/17
1398 * Removed 'disp' parameter from H5FD_mpio_setup routine and use
1399 * the address of the dataset in MPI_File_set_view() calls, as
1400 * necessary.
1401 *
1402 * Quincey Koziol - 2002/06/24
1403 * Removed "lazy" MPI_File_set_view() calls, since they would fail
1404 * if the first I/O was a collective I/O using MPI derived types
1405 * and the next I/O was an independent I/O.
1406 *
1407 * Quincey Koziol - 2003/10/22-31
1408 * Restructured code massively, straightening out logic and finally
1409 * getting the bytes_read stuff working.
1410 *
1411 *-------------------------------------------------------------------------
1412 */
1413 static herr_t
H5FD_mpio_read(H5FD_t * _file,H5FD_mem_t H5_ATTR_UNUSED type,hid_t H5_ATTR_UNUSED dxpl_id,haddr_t addr,size_t size,void * buf)1414 H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type,
1415 hid_t H5_ATTR_UNUSED dxpl_id, haddr_t addr, size_t size, void *buf/*out*/)
1416 {
1417 H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
1418 MPI_Offset mpi_off;
1419 MPI_Status mpi_stat; /* Status from I/O operation */
1420 int mpi_code; /* mpi return code */
1421 MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */
1422 int size_i; /* Integer copy of 'size' to read */
1423 #if MPI_VERSION >= 3
1424 MPI_Count bytes_read = 0; /* Number of bytes read in */
1425 MPI_Count type_size; /* MPI datatype used for I/O's size */
1426 MPI_Count io_size; /* Actual number of bytes requested */
1427 MPI_Count n;
1428 #else
1429 int bytes_read = 0; /* Number of bytes read in */
1430 int type_size; /* MPI datatype used for I/O's size */
1431 int io_size; /* Actual number of bytes requested */
1432 int n;
1433 #endif
1434 hbool_t use_view_this_time = FALSE;
1435 hbool_t rank0_bcast = FALSE; /* If read-with-rank0-and-bcast flag was used */
1436 herr_t ret_value = SUCCEED;
1437
1438 FUNC_ENTER_NOAPI_NOINIT
1439
1440 #ifdef H5FDmpio_DEBUG
1441 if(H5FD_mpio_Debug[(int)'t'])
1442 HDfprintf(stdout, "%s: Entering\n", FUNC);
1443 #endif
1444
1445 /* Sanity checks */
1446 HDassert(file);
1447 HDassert(H5FD_MPIO==file->pub.driver_id);
1448 HDassert(buf);
1449
1450 /* Portably initialize MPI status variable */
1451 HDmemset(&mpi_stat,0,sizeof(MPI_Status));
1452
1453 /* some numeric conversions */
1454 if(H5FD_mpi_haddr_to_MPIOff(addr, &mpi_off/*out*/) < 0)
1455 HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off")
1456 size_i = (int)size;
1457 if((hsize_t)size_i != size)
1458 HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from size to size_i")
1459
1460 #ifdef H5FDmpio_DEBUG
1461 if(H5FD_mpio_Debug[(int)'r'])
1462 HDfprintf(stdout, "%s: mpi_off = %ld size_i = %d\n", FUNC, (long)mpi_off, size_i);
1463 #endif
1464
1465 /* Only look for MPI views for raw data transfers */
1466 if(type == H5FD_MEM_DRAW) {
1467 H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
1468
1469 /* Get the transfer mode from the API context */
1470 if(H5CX_get_io_xfer_mode(&xfer_mode) < 0)
1471 HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode")
1472
1473 /*
1474 * Set up for a fancy xfer using complex types, or single byte block. We
1475 * wouldn't need to rely on the use_view field if MPI semantics allowed
1476 * us to test that btype=ftype=MPI_BYTE (or even MPI_TYPE_NULL, which
1477 * could mean "use MPI_BYTE" by convention).
1478 */
1479 if(xfer_mode == H5FD_MPIO_COLLECTIVE) {
1480 MPI_Datatype file_type;
1481
1482 /* Remember that views are used */
1483 use_view_this_time = TRUE;
1484
1485 /* Prepare for a full-blown xfer using btype, ftype, and disp */
1486 if(H5CX_get_mpi_coll_datatypes(&buf_type, &file_type) < 0)
1487 HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O datatypes")
1488
1489 /*
1490 * Set the file view when we are using MPI derived types
1491 */
1492 if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type, H5FD_mpi_native_g, file->info)))
1493 HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
1494
1495 /* When using types, use the address as the displacement for
1496 * MPI_File_set_view and reset the address for the read to zero
1497 */
1498 mpi_off = 0;
1499 } /* end if */
1500 } /* end if */
1501
1502 /* Read the data. */
1503 if(use_view_this_time) {
1504 H5FD_mpio_collective_opt_t coll_opt_mode;
1505
1506 #ifdef H5FDmpio_DEBUG
1507 if(H5FD_mpio_Debug[(int)'r'])
1508 HDfprintf(stdout, "%s: using MPIO collective mode\n", FUNC);
1509 #endif
1510 /* Get the collective_opt property to check whether the application wants to do IO individually. */
1511 if(H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0)
1512 HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property")
1513
1514 if(coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
1515 #ifdef H5FDmpio_DEBUG
1516 if(H5FD_mpio_Debug[(int)'r'])
1517 HDfprintf(stdout, "%s: doing MPI collective IO\n", FUNC);
1518 #endif
1519 /* Check whether we should read from rank 0 and broadcast to other ranks */
1520 if(H5CX_get_mpio_rank0_bcast()) {
1521 #ifdef H5FDmpio_DEBUG
1522 if(H5FD_mpio_Debug[(int)'r'])
1523 HDfprintf(stdout, "%s: doing read-rank0-and-MPI_Bcast\n", FUNC);
1524 #endif
1525 /* Indicate path we've taken */
1526 rank0_bcast = TRUE;
1527
1528 /* Read on rank 0 Bcast to other ranks */
1529 if(file->mpi_rank == 0)
1530 if(MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
1531 HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
1532 if(MPI_SUCCESS != (mpi_code = MPI_Bcast(buf, size_i, buf_type, 0, file->comm)))
1533 HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
1534 } /* end if */
1535 else
1536 if(MPI_SUCCESS != (mpi_code = MPI_File_read_at_all(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
1537 HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code)
1538 } /* end if */
1539 else {
1540 #ifdef H5FDmpio_DEBUG
1541 if(H5FD_mpio_Debug[(int)'r'])
1542 HDfprintf(stdout, "%s: doing MPI independent IO\n", FUNC);
1543 #endif
1544
1545 if(MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
1546 HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
1547 } /* end else */
1548
1549 /*
1550 * Reset the file view when we used MPI derived types
1551 */
1552 if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, H5FD_mpi_native_g, file->info)))
1553 HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
1554 } /* end if */
1555 else
1556 if(MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
1557 HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
1558
1559 /* Only retrieve bytes read if this rank _actually_ participated in I/O */
1560 if(!rank0_bcast || (rank0_bcast && file->mpi_rank == 0) ) {
1561 /* How many bytes were actually read? */
1562 #if MPI_VERSION >= 3
1563 if(MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read)))
1564 #else
1565 if(MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read)))
1566 #endif
1567 HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
1568 } /* end if */
1569
1570 /* If the rank0-bcast feature was used, broadcast the # of bytes read to
1571 * other ranks, which didn't perform any I/O.
1572 */
1573 /* NOTE: This could be optimized further to be combined with the broadcast
1574 * of the data. (QAK - 2019/1/2)
1575 */
1576 if(rank0_bcast)
1577 if(MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_LONG_LONG, 0, file->comm))
1578 HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", 0)
1579
1580 /* Get the type's size */
1581 #if MPI_VERSION >= 3
1582 if(MPI_SUCCESS != (mpi_code = MPI_Type_size_x(buf_type, &type_size)))
1583 #else
1584 if(MPI_SUCCESS != (mpi_code = MPI_Type_size(buf_type, &type_size)))
1585 #endif
1586 HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code)
1587
1588 /* Compute the actual number of bytes requested */
1589 io_size = type_size * size_i;
1590
1591 /* Check for read failure */
1592 if(bytes_read < 0 || bytes_read > io_size)
1593 HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed")
1594
1595 /*
1596 * This gives us zeroes beyond end of physical MPI file.
1597 */
1598 if((n = (io_size - bytes_read)) > 0)
1599 HDmemset((char*)buf+bytes_read, 0, (size_t)n);
1600
1601 done:
1602 #ifdef H5FDmpio_DEBUG
1603 if(H5FD_mpio_Debug[(int)'t'])
1604 HDfprintf(stdout, "%s: Leaving\n", FUNC);
1605 #endif
1606
1607 FUNC_LEAVE_NOAPI(ret_value)
1608 }
1609
1610
1611 /*-------------------------------------------------------------------------
1612 * Function: H5FD_mpio_write
1613 *
1614 * Purpose: Writes SIZE bytes of data to FILE beginning at address ADDR
1615 * from buffer BUF according to data transfer properties in
1616 * DXPL_ID using potentially complex file and buffer types to
1617 * effect the transfer.
1618 *
1619 * MPI is able to coalesce requests from different processes
1620 * (collective and independent).
1621 *
1622 * Return: Success: Zero. USE_TYPES and OLD_USE_TYPES in the
1623 * access params are altered.
1624 *
1625 * Failure: -1, USE_TYPES and OLD_USE_TYPES in the
1626 * access params may be altered.
1627 *
1628 * Programmer: Unknown
1629 * January 30, 1998
1630 *
1631 * Modifications:
1632 * rky, 1998-08-28
1633 * If the file->allsame flag is set, we assume that all the
1634 * procs in the relevant MPI communicator will write identical
1635 * data at identical offsets in the file, so only proc 0 will
1636 * write, and all other procs will wait for p0 to finish. This
1637 * is useful for writing metadata, for example. Note that we
1638 * don't _check_ that the data is identical. Also, the mechanism
1639 * we use to eliminate the redundant writes is by requiring a
1640 * call to H5FD_mpio_tas_allsame before the write, which is
1641 * rather klugey. Would it be better to pass a parameter to
1642 * low-level writes like H5F_block_write and H5F_low_write,
1643 * instead? Or...??? Also, when I created this mechanism I
1644 * wanted to minimize the difference in behavior between the old
1645 * way of doing things (i.e., all procs write) and the new way,
1646 * so the writes are eliminated at the very lowest level, here
1647 * in H5FD_mpio_write. It may be better to rethink that, and
1648 * short-circuit the writes at a higher level (e.g., at the
1649 * points in the code where H5FD_mpio_tas_allsame is called).
1650 *
1651 *
1652 * Robb Matzke, 1998-02-18
1653 * Added the ACCESS_PARMS argument.
1654 *
1655 * rky, 1998-04-10
1656 * Call independent or collective MPI write, based on
1657 * ACCESS_PARMS.
1658 *
1659 * rky, 1998-04-24
1660 * Removed redundant write from H5FD_mpio_write.
1661 *
1662 * Albert Cheng, 1998-06-01
1663 * Added XFER_MODE to control independent or collective MPI
1664 * write.
1665 *
1666 * rky, 1998-08-16
1667 * Use BTYPE, FTYPE, and DISP from access parms. The guts of
1668 * H5FD_mpio_read and H5FD_mpio_write should be replaced by a
1669 * single dual-purpose routine.
1670 *
1671 * rky, 1998-08-28
1672 * Added ALLSAME parameter to make all but proc 0 skip the
1673 * actual write.
1674 *
1675 * Robb Matzke, 1999-04-21
1676 * Changed XFER_MODE to XFER_PARMS for all H5FD_*_write()
1677 * callbacks.
1678 *
1679 * Robb Matzke, 1999-07-28
1680 * The ADDR argument is passed by value.
1681 *
1682 * Robb Matzke, 1999-08-06
1683 * Modified to work with the virtual file layer.
1684 *
1685 * Albert Cheng, 1999-12-19
1686 * When only-p0-write-allsame-data, p0 Bcasts the
1687 * ret_value to other processes. This prevents
1688 * a racing condition (that other processes try to
1689 * read the file before p0 finishes writing) and also
1690 * allows all processes to report the same ret_value.
1691 *
1692 * Kim Yates, Pat Weidhaas, 2000-09-26
1693 * Move block of coding where only p0 writes after the
1694 * MPI_File_set_view call.
1695 *
1696 * Quincey Koziol, 2002-05-10
1697 * Instead of always writing metadata from process 0, spread the
1698 * burden among all the processes by using a round-robin rotation
1699 * scheme.
1700 *
1701 * Quincey Koziol, 2002-05-10
1702 * Removed allsame code, keying off the type parameter instead.
1703 *
1704 * Quincey Koziol, 2002-05-14
1705 * Only call MPI_Get_count if we can use MPI_BYTE for the MPI type
1706 * for the I/O transfer. Someday we might include code to decode
1707 * the MPI type used for more complicated transfers and call
1708 * MPI_Get_count all the time.
1709 *
1710 * Quincey Koziol - 2002/06/17
1711 * Removed 'disp' parameter from H5FD_mpio_setup routine and use
1712 * the address of the dataset in MPI_File_set_view() calls, as
1713 * necessary.
1714 *
1715 * Quincey Koziol - 2002/06/24
1716 * Removed "lazy" MPI_File_set_view() calls, since they would fail
1717 * if the first I/O was a collective I/O using MPI derived types
1718 * and the next I/O was an independent I/O.
1719 *
1720 * Quincey Koziol - 2002/07/18
1721 * Added "block_before_meta_write" dataset transfer flag, which
1722 * is set during writes from a metadata cache flush and indicates
1723 * that all the processes must sync up before (one of them)
1724 * writing metadata.
1725 *
1726 * Quincey Koziol - 2003/10/22-31
1727 * Restructured code massively, straightening out logic and finally
1728 * getting the bytes_written stuff working.
1729 *
1730 *-------------------------------------------------------------------------
1731 */
1732 static herr_t
H5FD_mpio_write(H5FD_t * _file,H5FD_mem_t type,hid_t H5_ATTR_UNUSED dxpl_id,haddr_t addr,size_t size,const void * buf)1733 H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id,
1734 haddr_t addr, size_t size, const void *buf)
1735 {
1736 H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
1737 MPI_Offset mpi_off;
1738 MPI_Status mpi_stat; /* Status from I/O operation */
1739 MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */
1740 int mpi_code; /* MPI return code */
1741 #if MPI_VERSION >= 3
1742 MPI_Count bytes_written;
1743 MPI_Count type_size; /* MPI datatype used for I/O's size */
1744 MPI_Count io_size; /* Actual number of bytes requested */
1745 #else
1746 int bytes_written;
1747 int type_size; /* MPI datatype used for I/O's size */
1748 int io_size; /* Actual number of bytes requested */
1749 #endif
1750 int size_i;
1751 hbool_t use_view_this_time = FALSE;
1752 H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
1753 herr_t ret_value = SUCCEED;
1754
1755 FUNC_ENTER_NOAPI_NOINIT
1756
1757 #ifdef H5FDmpio_DEBUG
1758 if (H5FD_mpio_Debug[(int)'t'])
1759 HDfprintf(stdout, "Entering H5FD_mpio_write\n" );
1760 #endif
1761 HDassert(file);
1762 HDassert(H5FD_MPIO==file->pub.driver_id);
1763 HDassert(buf);
1764
1765 /* Verify that no data is written when between MPI_Barrier()s during file flush */
1766 HDassert(!H5CX_get_mpi_file_flushing());
1767
1768 /* Portably initialize MPI status variable */
1769 HDmemset(&mpi_stat, 0, sizeof(MPI_Status));
1770
1771 /* some numeric conversions */
1772 if(H5FD_mpi_haddr_to_MPIOff(addr, &mpi_off) < 0)
1773 HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off")
1774 size_i = (int)size;
1775 if((hsize_t)size_i != size)
1776 HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from size to size_i")
1777
1778 #ifdef H5FDmpio_DEBUG
1779 if(H5FD_mpio_Debug[(int)'w'])
1780 HDfprintf(stdout, "in H5FD_mpio_write mpi_off=%ld size_i=%d\n", (long)mpi_off, size_i);
1781 #endif
1782
1783 /* Get the transfer mode from the API context */
1784 if(H5CX_get_io_xfer_mode(&xfer_mode) < 0)
1785 HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode")
1786
1787 /*
1788 * Set up for a fancy xfer using complex types, or single byte block. We
1789 * wouldn't need to rely on the use_view field if MPI semantics allowed
1790 * us to test that btype=ftype=MPI_BYTE (or even MPI_TYPE_NULL, which
1791 * could mean "use MPI_BYTE" by convention).
1792 */
1793 if(xfer_mode == H5FD_MPIO_COLLECTIVE) {
1794 MPI_Datatype file_type;
1795
1796 /* Remember that views are used */
1797 use_view_this_time = TRUE;
1798
1799 /* Prepare for a full-blown xfer using btype, ftype, and disp */
1800 if(H5CX_get_mpi_coll_datatypes(&buf_type, &file_type) < 0)
1801 HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O datatypes")
1802
1803 /*
1804 * Set the file view when we are using MPI derived types
1805 */
1806 if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type, H5FD_mpi_native_g, file->info)))
1807 HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
1808
1809 /* When using types, use the address as the displacement for
1810 * MPI_File_set_view and reset the address for the write to zero
1811 */
1812 mpi_off = 0;
1813 } /* end if */
1814
1815 /* Write the data. */
1816 if(use_view_this_time) {
1817 H5FD_mpio_collective_opt_t coll_opt_mode;
1818
1819 #ifdef H5FDmpio_DEBUG
1820 if(H5FD_mpio_Debug[(int)'t'])
1821 HDfprintf(stdout, "H5FD_mpio_write: using MPIO collective mode\n");
1822 #endif
1823
1824 /* Get the collective_opt property to check whether the application wants to do IO individually. */
1825 if(H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0)
1826 HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property")
1827
1828 if(coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
1829 #ifdef H5FDmpio_DEBUG
1830 if(H5FD_mpio_Debug[(int)'t'])
1831 HDfprintf(stdout, "H5FD_mpio_write: doing MPI collective IO\n");
1832 #endif
1833 if(MPI_SUCCESS != (mpi_code = MPI_File_write_at_all(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
1834 HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code)
1835 } /* end if */
1836 else {
1837 if(type != H5FD_MEM_DRAW)
1838 HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "Metadata Coll opt property should be collective at this point")
1839 #ifdef H5FDmpio_DEBUG
1840 if(H5FD_mpio_Debug[(int)'t'])
1841 HDfprintf(stdout, "H5FD_mpio_write: doing MPI independent IO\n");
1842 #endif
1843 if(MPI_SUCCESS != (mpi_code = MPI_File_write_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
1844 HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code)
1845 } /* end else */
1846
1847 /* Reset the file view when we used MPI derived types */
1848 if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, H5FD_mpi_native_g, file->info)))
1849 HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
1850 } else {
1851 if(MPI_SUCCESS != (mpi_code = MPI_File_write_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
1852 HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code)
1853 }
1854
1855 /* How many bytes were actually written? */
1856 #if MPI_VERSION >= 3
1857 if(MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_written)))
1858 #else
1859 if(MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_written)))
1860 #endif
1861 HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
1862
1863 /* Get the type's size */
1864 #if MPI_VERSION >= 3
1865 if(MPI_SUCCESS != (mpi_code = MPI_Type_size_x(buf_type, &type_size)))
1866 #else
1867 if(MPI_SUCCESS != (mpi_code = MPI_Type_size(buf_type, &type_size)))
1868 #endif
1869 HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code)
1870
1871 /* Compute the actual number of bytes requested */
1872 io_size = type_size * size_i;
1873
1874 /* Check for write failure */
1875 if(bytes_written != io_size)
1876 HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "file write failed")
1877
1878 /* Each process will keep track of its perceived EOF value locally, and
1879 * ultimately we will reduce this value to the maximum amongst all
1880 * processes, but until then keep the actual eof at HADDR_UNDEF just in
1881 * case something bad happens before that point. (rather have a value
1882 * we know is wrong sitting around rather than one that could only
1883 * potentially be wrong.) */
1884 file->eof = HADDR_UNDEF;
1885
1886 if(bytes_written && ((bytes_written + addr) > file->local_eof))
1887 file->local_eof = addr + bytes_written;
1888
1889 done:
1890 #ifdef H5FDmpio_DEBUG
1891 if(H5FD_mpio_Debug[(int)'t'])
1892 HDfprintf(stdout, "proc %d: Leaving H5FD_mpio_write with ret_value=%d\n",
1893 file->mpi_rank, ret_value );
1894 #endif
1895 FUNC_LEAVE_NOAPI(ret_value)
1896 } /* end H5FD_mpio_write() */
1897
1898
1899 /*-------------------------------------------------------------------------
1900 * Function: H5FD_mpio_flush
1901 *
1902 * Purpose: Makes sure that all data is on disk. This is collective.
1903 *
1904 * Return: Success: Non-negative
1905 *
1906 * Failure: Negative
1907 *
1908 * Programmer: Robb Matzke
1909 * January 30, 1998
1910 *
1911 *-------------------------------------------------------------------------
1912 */
1913 static herr_t
H5FD_mpio_flush(H5FD_t * _file,hid_t H5_ATTR_UNUSED dxpl_id,hbool_t closing)1914 H5FD_mpio_flush(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t closing)
1915 {
1916 H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
1917 int mpi_code; /* mpi return code */
1918 herr_t ret_value = SUCCEED;
1919
1920 FUNC_ENTER_NOAPI_NOINIT
1921
1922 #ifdef H5FDmpio_DEBUG
1923 if(H5FD_mpio_Debug[(int)'t'])
1924 HDfprintf(stdout, "Entering %s\n", FUNC);
1925 #endif
1926 HDassert(file);
1927 HDassert(H5FD_MPIO == file->pub.driver_id);
1928
1929 /* Only sync the file if we are not going to immediately close it */
1930 if(!closing)
1931 if(MPI_SUCCESS != (mpi_code = MPI_File_sync(file->f)))
1932 HMPI_GOTO_ERROR(FAIL, "MPI_File_sync failed", mpi_code)
1933
1934 done:
1935 #ifdef H5FDmpio_DEBUG
1936 if(H5FD_mpio_Debug[(int)'t'])
1937 HDfprintf(stdout, "Leaving %s\n", FUNC);
1938 #endif
1939
1940 FUNC_LEAVE_NOAPI(ret_value)
1941 } /* end H5FD_mpio_flush() */
1942
1943
1944 /*-------------------------------------------------------------------------
1945 * Function: H5FD_mpio_truncate
1946 *
1947 * Purpose: Make certain the file's size matches it's allocated size
1948 *
1949 * This is a little sticky in the mpio case, as it is not
1950 * easy for us to track the current EOF by extracting it from
1951 * write calls.
1952 *
1953 * Instead, we first check to see if the eoa has changed since
1954 * the last call to this function. If it has, we call
1955 * MPI_File_get_size() to determine the current EOF, and
1956 * only call MPI_File_set_size() if this value disagrees
1957 * with the current eoa.
1958 *
1959 * Return: Success: Non-negative
1960 * Failure: Negative
1961 *
1962 * Programmer: Quincey Koziol
1963 * January 31, 2008
1964 *
1965 * Changes: Heavily reworked to avoid unnecessary MPI_File_set_size()
1966 * calls. The hope is that these calls are superfluous in the
1967 * typical case, allowing us to avoid truncates most of the
1968 * time.
1969 *
1970 * The basic idea is to query the file system to get the
1971 * current eof, and only truncate if the file systems
1972 * conception of the eof disagrees with our eoa.
1973 *
1974 * JRM -- 10/27/17
1975 *
1976 *-------------------------------------------------------------------------
1977 */
1978 static herr_t
H5FD_mpio_truncate(H5FD_t * _file,hid_t H5_ATTR_UNUSED dxpl_id,hbool_t H5_ATTR_UNUSED closing)1979 H5FD_mpio_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5_ATTR_UNUSED closing)
1980 {
1981 H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
1982 herr_t ret_value = SUCCEED;
1983
1984 FUNC_ENTER_NOAPI_NOINIT
1985
1986 #ifdef H5FDmpio_DEBUG
1987 if(H5FD_mpio_Debug[(int)'t'])
1988 HDfprintf(stdout, "Entering %s\n", FUNC);
1989 #endif
1990 HDassert(file);
1991 HDassert(H5FD_MPIO == file->pub.driver_id);
1992
1993 if(!H5F_addr_eq(file->eoa, file->last_eoa)) {
1994 int mpi_code; /* mpi return code */
1995 MPI_Offset size;
1996 MPI_Offset needed_eof;
1997
1998 /* In principle, it is possible for the size returned by the
1999 * call to MPI_File_get_size() to depend on whether writes from
2000 * all proceeses have completed at the time process 0 makes the
2001 * call.
2002 *
2003 * In practice, most (all?) truncate calls will come after a barrier
2004 * and with no interviening writes to the file (with the possible
2005 * exception of sueprblock / superblock extension message updates).
2006 *
2007 * Check the "MPI file closing" flag in the API context to determine
2008 * if we can skip the barrier.
2009 */
2010 if(!H5CX_get_mpi_file_flushing())
2011 if(MPI_SUCCESS != (mpi_code = MPI_Barrier(file->comm)))
2012 HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
2013
2014 /* Only processor p0 will get the filesize and broadcast it. */
2015 /* (Note that throwing an error here will cause non-rank 0 processes
2016 * to hang in following Bcast. -QAK, 3/17/2018)
2017 */
2018 if(0 == file->mpi_rank)
2019 if(MPI_SUCCESS != (mpi_code = MPI_File_get_size(file->f, &size)))
2020 HMPI_GOTO_ERROR(FAIL, "MPI_File_get_size failed", mpi_code)
2021
2022 /* Broadcast file size */
2023 if(MPI_SUCCESS != (mpi_code = MPI_Bcast(&size, (int)sizeof(MPI_Offset), MPI_BYTE, 0, file->comm)))
2024 HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
2025
2026 if(H5FD_mpi_haddr_to_MPIOff(file->eoa, &needed_eof) < 0)
2027 HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "cannot convert from haddr_t to MPI_Offset")
2028
2029 /* eoa != eof. Set eof to eoa */
2030 if(size != needed_eof) {
2031 /* Extend the file's size */
2032 if(MPI_SUCCESS != (mpi_code = MPI_File_set_size(file->f, needed_eof)))
2033 HMPI_GOTO_ERROR(FAIL, "MPI_File_set_size failed", mpi_code)
2034
2035 /* In general, we must wait until all processes have finished
2036 * the truncate before any process can continue, since it is
2037 * possible that a process would write at the end of the
2038 * file, and this write would be discarded by the truncate.
2039 *
2040 * While this is an issue for a user initiated flush, it may
2041 * not be an issue at file close. If so, we may be able to
2042 * optimize out the following barrier in that case.
2043 */
2044 if(MPI_SUCCESS != (mpi_code = MPI_Barrier(file->comm)))
2045 HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
2046 } /* end if */
2047
2048 /* Update the 'last' eoa value */
2049 file->last_eoa = file->eoa;
2050 } /* end if */
2051
2052 done:
2053 #ifdef H5FDmpio_DEBUG
2054 if(H5FD_mpio_Debug[(int)'t'])
2055 HDfprintf(stdout, "Leaving %s\n", FUNC);
2056 #endif
2057
2058 FUNC_LEAVE_NOAPI(ret_value)
2059 } /* end H5FD_mpio_truncate() */
2060
2061
2062 /*-------------------------------------------------------------------------
2063 * Function: H5FD_mpio_mpi_rank
2064 *
2065 * Purpose: Returns the MPI rank for a process
2066 *
2067 * Return: Success: non-negative
2068 * Failure: negative
2069 *
2070 * Programmer: Quincey Koziol
2071 * Thursday, May 16, 2002
2072 *
2073 * Modifications:
2074 *
2075 *-------------------------------------------------------------------------
2076 */
2077 static int
H5FD_mpio_mpi_rank(const H5FD_t * _file)2078 H5FD_mpio_mpi_rank(const H5FD_t *_file)
2079 {
2080 const H5FD_mpio_t *file = (const H5FD_mpio_t*)_file;
2081
2082 FUNC_ENTER_NOAPI_NOINIT_NOERR
2083
2084 HDassert(file);
2085 HDassert(H5FD_MPIO==file->pub.driver_id);
2086
2087 FUNC_LEAVE_NOAPI(file->mpi_rank)
2088 } /* end H5FD_mpio_mpi_rank() */
2089
2090
2091 /*-------------------------------------------------------------------------
2092 * Function: H5FD_mpio_mpi_size
2093 *
2094 * Purpose: Returns the number of MPI processes
2095 *
2096 * Return: Success: non-negative
2097 * Failure: negative
2098 *
2099 * Programmer: Quincey Koziol
2100 * Thursday, May 16, 2002
2101 *
2102 * Modifications:
2103 *
2104 *-------------------------------------------------------------------------
2105 */
2106 static int
H5FD_mpio_mpi_size(const H5FD_t * _file)2107 H5FD_mpio_mpi_size(const H5FD_t *_file)
2108 {
2109 const H5FD_mpio_t *file = (const H5FD_mpio_t*)_file;
2110
2111 FUNC_ENTER_NOAPI_NOINIT_NOERR
2112
2113 HDassert(file);
2114 HDassert(H5FD_MPIO==file->pub.driver_id);
2115
2116 FUNC_LEAVE_NOAPI(file->mpi_size)
2117 } /* end H5FD_mpio_mpi_size() */
2118
2119
2120 /*-------------------------------------------------------------------------
2121 * Function: H5FD_mpio_communicator
2122 *
2123 * Purpose: Returns the MPI communicator for the file.
2124 *
2125 * Return: Success: The communicator
2126 *
2127 * Failure: NULL
2128 *
2129 * Programmer: Robb Matzke
2130 * Monday, August 9, 1999
2131 *
2132 * Modifications:
2133 *
2134 *-------------------------------------------------------------------------
2135 */
2136 static MPI_Comm
H5FD_mpio_communicator(const H5FD_t * _file)2137 H5FD_mpio_communicator(const H5FD_t *_file)
2138 {
2139 const H5FD_mpio_t *file = (const H5FD_mpio_t*)_file;
2140
2141 FUNC_ENTER_NOAPI_NOINIT_NOERR
2142
2143 HDassert(file);
2144 HDassert(H5FD_MPIO==file->pub.driver_id);
2145
2146 FUNC_LEAVE_NOAPI(file->comm)
2147 } /* end H5FD_mpio_communicator() */
2148
2149 #endif /* H5_HAVE_PARALLEL */
2150
2151