1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  * Copyright by The HDF Group.                                               *
3  * Copyright by the Board of Trustees of the University of Illinois.         *
4  * All rights reserved.                                                      *
5  *                                                                           *
6  * This file is part of HDF5.  The full HDF5 copyright notice, including     *
7  * terms governing use, modification, and redistribution, is contained in    *
8  * the COPYING file, which can be found at the root of the source code       *
9  * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases.  *
10  * If you do not have access to either file, you may request a copy from     *
11  * help@hdfgroup.org.                                                        *
12  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
13 
14 /*
15  * Programmer:  Robb Matzke <matzke@llnl.gov>
16  *              Thursday, July 29, 1999
17  *
18  * Purpose:  This is the MPI-2 I/O driver.
19  *
20  */
21 
22 /* Interface initialization */
23 #define H5_INTERFACE_INIT_FUNC  H5FD_mpio_init_interface
24 
25 
26 #include "H5private.h"    /* Generic Functions      */
27 #include "H5Dprivate.h"    /* Dataset functions      */
28 #include "H5Eprivate.h"    /* Error handling        */
29 #include "H5Fprivate.h"    /* File access        */
30 #include "H5FDprivate.h"  /* File drivers        */
31 #include "H5FDmpi.h"            /* MPI-based file drivers    */
32 #include "H5Iprivate.h"    /* IDs            */
33 #include "H5MMprivate.h"  /* Memory management      */
34 #include "H5Pprivate.h"         /* Property lists                       */
35 
36 #ifdef H5_HAVE_PARALLEL
37 
38 /*
39  * The driver identification number, initialized at runtime if H5_HAVE_PARALLEL
40  * is defined. This allows applications to still have the H5FD_MPIO
41  * "constants" in their source code.
42  */
43 static hid_t H5FD_MPIO_g = 0;
44 
45 /* Whether to allow collective I/O operations */
46 /* (Value can be set from environment variable also) */
47 hbool_t H5FD_mpi_opt_types_g = TRUE;
48 
49 /*
50  * The view is set to this value
51  */
52 static char H5FD_mpi_native_g[] = "native";
53 
54 /*
55  * The description of a file belonging to this driver.
56  * The EOF value is only used just after the file is opened in order for the
57  * library to determine whether the file is empty, truncated, or okay. The MPIO
58  * driver doesn't bother to keep it updated since it's an expensive operation.
59  */
60 typedef struct H5FD_mpio_t {
61     H5FD_t  pub;    /*public stuff, must be first    */
62     MPI_File  f;    /*MPIO file handle      */
63     MPI_Comm  comm;    /*communicator        */
64     MPI_Info  info;    /*file information      */
65     int         mpi_rank;       /* This process's rank                  */
66     int         mpi_size;       /* Total number of processes            */
67     haddr_t  eof;    /*end-of-file marker      */
68     haddr_t  eoa;    /*end-of-address marker      */
69     haddr_t  last_eoa;  /* Last known end-of-address marker  */
70 } H5FD_mpio_t;
71 
72 /* Private Prototypes */
73 
74 /* Callbacks */
75 static void *H5FD_mpio_fapl_get(H5FD_t *_file);
76 static void *H5FD_mpio_fapl_copy(const void *_old_fa);
77 static herr_t H5FD_mpio_fapl_free(void *_fa);
78 static H5FD_t *H5FD_mpio_open(const char *name, unsigned flags, hid_t fapl_id,
79             haddr_t maxaddr);
80 static herr_t H5FD_mpio_close(H5FD_t *_file);
81 static herr_t H5FD_mpio_query(const H5FD_t *_f1, unsigned long *flags);
82 static haddr_t H5FD_mpio_get_eoa(const H5FD_t *_file, H5FD_mem_t type);
83 static herr_t H5FD_mpio_set_eoa(H5FD_t *_file, H5FD_mem_t type, haddr_t addr);
84 static haddr_t H5FD_mpio_get_eof(const H5FD_t *_file);
85 static herr_t  H5FD_mpio_get_handle(H5FD_t *_file, hid_t fapl, void** file_handle);
86 static herr_t H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
87             size_t size, void *buf);
88 static herr_t H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
89             size_t size, const void *buf);
90 static herr_t H5FD_mpio_flush(H5FD_t *_file, hid_t dxpl_id, unsigned closing);
91 static herr_t H5FD_mpio_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
92 static int H5FD_mpio_mpi_rank(const H5FD_t *_file);
93 static int H5FD_mpio_mpi_size(const H5FD_t *_file);
94 static MPI_Comm H5FD_mpio_communicator(const H5FD_t *_file);
95 
96 /* MPIO-specific file access properties */
97 typedef struct H5FD_mpio_fapl_t {
98     MPI_Comm    comm;    /*communicator      */
99     MPI_Info    info;    /*file information    */
100 } H5FD_mpio_fapl_t;
101 
102 /* The MPIO file driver information */
103 static const H5FD_class_mpi_t H5FD_mpio_g = {
104     {   /* Start of superclass information */
105     "mpio",          /*name      */
106     HADDR_MAX,          /*maxaddr    */
107     H5F_CLOSE_SEMI,        /* fc_degree    */
108     NULL,          /*sb_size    */
109     NULL,          /*sb_encode    */
110     NULL,          /*sb_decode    */
111     sizeof(H5FD_mpio_fapl_t),      /*fapl_size    */
112     H5FD_mpio_fapl_get,        /*fapl_get    */
113     H5FD_mpio_fapl_copy,      /*fapl_copy    */
114     H5FD_mpio_fapl_free,       /*fapl_free    */
115     0,                        /*dxpl_size    */
116     NULL,          /*dxpl_copy    */
117     NULL,          /*dxpl_free    */
118     H5FD_mpio_open,        /*open      */
119     H5FD_mpio_close,        /*close      */
120     NULL,          /*cmp      */
121     H5FD_mpio_query,                    /*query      */
122     NULL,          /*get_type_map    */
123     NULL,          /*alloc      */
124     NULL,          /*free      */
125     H5FD_mpio_get_eoa,        /*get_eoa    */
126     H5FD_mpio_set_eoa,         /*set_eoa    */
127     H5FD_mpio_get_eof,        /*get_eof    */
128     H5FD_mpio_get_handle,                       /*get_handle            */
129     H5FD_mpio_read,        /*read      */
130     H5FD_mpio_write,        /*write      */
131     H5FD_mpio_flush,        /*flush      */
132     H5FD_mpio_truncate,        /*truncate    */
133     NULL,                                       /*lock                  */
134     NULL,                                       /*unlock                */
135     H5FD_FLMAP_DICHOTOMY                        /*fl_map                */
136     },  /* End of superclass information */
137     H5FD_mpio_mpi_rank,                         /*get_rank              */
138     H5FD_mpio_mpi_size,                         /*get_size              */
139     H5FD_mpio_communicator                      /*get_comm              */
140 };
141 
142 #ifdef H5FDmpio_DEBUG
143 /* Flags to control debug actions in H5Fmpio.
144  * Meant to be indexed by characters.
145  *
146  * 'c' show result of MPI_Get_count after read
147  * 'r' show read offset and size
148  * 't' trace function entry and exit
149  * 'w' show write offset and size
150  */
/* One counter per possible character value; every entry starts at zero. */
static int H5FD_mpio_Debug[256] = { 0 };
160 #endif
161 
162 
163 /*--------------------------------------------------------------------------
164 NAME
165    H5FD_mpio_init_interface -- Initialize interface-specific information
166 USAGE
167     herr_t H5FD_mpio_init_interface()
168 
169 RETURNS
170     Non-negative on success/Negative on failure
171 DESCRIPTION
172     Initializes any interface-specific data or routines.  (Just calls
173     H5FD_mpio_init currently).
174 
175 --------------------------------------------------------------------------*/
176 static herr_t
H5FD_mpio_init_interface(void)177 H5FD_mpio_init_interface(void)
178 {
179     FUNC_ENTER_NOAPI_NOINIT_NOERR
180 
181     FUNC_LEAVE_NOAPI(H5FD_mpio_init())
182 } /* H5FD_mpio_init_interface() */
183 
184 
185 /*-------------------------------------------------------------------------
186  * Function:  H5FD_mpio_init
187  *
188  * Purpose:  Initialize this driver by registering the driver with the
189  *    library.
190  *
191  * Return:  Success:  The driver ID for the mpio driver.
192  *    Failure:  Negative.
193  *
194  * Programmer:  Robb Matzke
195  *              Thursday, August 5, 1999
196  *
197  *-------------------------------------------------------------------------
198  */
199 hid_t
H5FD_mpio_init(void)200 H5FD_mpio_init(void)
201 {
202 #ifdef H5FDmpio_DEBUG
203     static int H5FD_mpio_Debug_inited = 0;
204 #endif /* H5FDmpio_DEBUG */
205     const char *s;              /* String for environment variables */
206     hid_t ret_value;        	/* Return value */
207 
208     FUNC_ENTER_NOAPI(FAIL)
209 
210     /* Register the MPI-IO VFD, if it isn't already */
211     if(H5I_VFL != H5I_get_type(H5FD_MPIO_g))
212         H5FD_MPIO_g = H5FD_register((const H5FD_class_t *)&H5FD_mpio_g, sizeof(H5FD_class_mpi_t), FALSE);
213 
214     /* Allow MPI buf-and-file-type optimizations? */
215     s = HDgetenv("HDF5_MPI_OPT_TYPES");
216     if(s && HDisdigit(*s))
217         H5FD_mpi_opt_types_g = (hbool_t)HDstrtol(s, NULL, 0);
218 
219 #ifdef H5FDmpio_DEBUG
220     if(!H5FD_mpio_Debug_inited) {
221         /* Retrieve MPI-IO debugging environment variable */
222         s = HDgetenv("H5FD_mpio_Debug");
223         if(s) {
224             /* Set debug mask */
225 	    while(*s) {
226 		H5FD_mpio_Debug[(int)*s]++;
227 		s++;
228 	    } /* end while */
229         } /* end if */
230         H5FD_mpio_Debug_inited++;
231     } /* end if */
232 #endif /* H5FDmpio_DEBUG */
233 
234     /* Set return value */
235     ret_value = H5FD_MPIO_g;
236 
237 done:
238     FUNC_LEAVE_NOAPI(ret_value)
239 } /* end H5FD_mpio_init() */
240 
241 
242 /*---------------------------------------------------------------------------
243  * Function:  H5FD_mpio_term
244  *
245  * Purpose:  Shut down the VFD
246  *
247  * Return:  <none>
248  *
249  * Programmer:  Quincey Koziol
250  *              Friday, Jan 30, 2004
251  *
252  * Modification:
253  *
254  *---------------------------------------------------------------------------
255  */
256 void
H5FD_mpio_term(void)257 H5FD_mpio_term(void)
258 {
259     FUNC_ENTER_NOAPI_NOINIT_NOERR
260 
261     /* Reset VFL ID */
262     H5FD_MPIO_g=0;
263 
264     FUNC_LEAVE_NOAPI_VOID
265 } /* end H5FD_mpio_term() */
266 
267 
268 /*-------------------------------------------------------------------------
269  * Function:  H5Pset_fapl_mpio
270  *
271  * Purpose:  Store the user supplied MPIO communicator comm and info in
272  *    the file access property list FAPL_ID which can then be used
273  *    to create and/or open the file.  This function is available
274  *    only in the parallel HDF5 library and is not collective.
275  *
276  *    comm is the MPI communicator to be used for file open as
277  *    defined in MPI_FILE_OPEN of MPI-2. This function makes a
278  *    duplicate of comm. Any modification to comm after this function
279  *    call returns has no effect on the access property list.
280  *
281  *    info is the MPI Info object to be used for file open as
282  *    defined in MPI_FILE_OPEN of MPI-2. This function makes a
283  *    duplicate of info. Any modification to info after this
284  *    function call returns has no effect on the access property
285  *    list.
286  *
287  *              If fapl_id has previously set comm and info values, they
288  *              will be replaced and the old communicator and Info object
289  *              are freed.
290  *
291  * Return:  Success:  Non-negative
292  *
293  *     Failure:  Negative
294  *
295  * Programmer:  Albert Cheng
296  *    Feb 3, 1998
297  *
298  * Modifications:
299  *    Robb Matzke, 1998-02-18
300  *    Check all arguments before the property list is updated so we
301  *    don't leave the property list in a bad state if something
302  *    goes wrong.  Also, the property list data type changed to
303  *    allow more generality so all the mpi-related stuff is in the
304  *    `u.mpi' member.  The `access_mode' will contain only
305  *     mpi-related flags defined in H5Fpublic.h.
306  *
307  *    Albert Cheng, 1998-04-16
308  *    Removed the ACCESS_MODE argument.  The access mode is changed
309  *    to be controlled by data transfer property list during data
310  *    read/write calls.
311  *
312  *     Robb Matzke, 1999-08-06
313  *    Modified to work with the virtual file layer.
314  *
315  *    Raymond Lu, 2001-10-23
316  *    Changed the file access list to the new generic property
317  *    list.
318  *
319  *    Albert Cheng, 2003-04-17
320  *    Modified the description of the function that it now stores
321  *    a duplicate of the communicator and INFO object.  Free the
322  *    old duplicates if previously set.  (Work is actually done
323  *    by H5P_set_driver.)
324  *
325  *-------------------------------------------------------------------------
326  */
327 herr_t
H5Pset_fapl_mpio(hid_t fapl_id,MPI_Comm comm,MPI_Info info)328 H5Pset_fapl_mpio(hid_t fapl_id, MPI_Comm comm, MPI_Info info)
329 {
330     H5FD_mpio_fapl_t  fa;
331     H5P_genplist_t *plist;      /* Property list pointer */
332     herr_t ret_value;
333 
334     FUNC_ENTER_API(FAIL)
335     H5TRACE3("e", "iMcMi", fapl_id, comm, info);
336 
337     if(fapl_id == H5P_DEFAULT)
338         HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
339 
340     /* Check arguments */
341     if(NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS)))
342         HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access list")
343     if(MPI_COMM_NULL == comm)
344   HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a valid communicator")
345 
346     /* Initialize driver specific properties */
347     fa.comm = comm;
348     fa.info = info;
349 
350     /* duplication is done during driver setting. */
351     ret_value= H5P_set_driver(plist, H5FD_MPIO, &fa);
352 
353 done:
354     FUNC_LEAVE_API(ret_value)
355 }
356 
357 
358 /*-------------------------------------------------------------------------
359  * Function:  H5Pget_fapl_mpio
360  *
361  * Purpose:  If the file access property list is set to the H5FD_MPIO
362  *    driver then this function returns duplicates of the MPI
363  *    communicator and Info object stored through the comm and
364  *    info pointers.  It is the responsibility of the application
365  *    to free the returned communicator and Info object.
366  *
367  * Return:  Success:  Non-negative with the communicator and
368  *        Info object returned through the comm and
369  *        info arguments if non-null. Since they are
370  *        duplicates of the stored objects, future
371  *        modifications to the access property list do
372  *        not affect them and it is the responsibility
373  *        of the application to free them.
374  *
375  *     Failure:  Negative
376  *
377  * Programmer:  Robb Matzke
378  *    Thursday, February 26, 1998
379  *
380  * Modifications:
381  *
382  *          Albert Cheng, Apr 16, 1998
383  *          Removed the access_mode argument.  The access_mode is changed
384  *          to be controlled by data transfer property list during data
385  *          read/write calls.
386  *
387  *    Raymond Lu, 2001-10-23
388  *    Changed the file access list to the new generic property
389  *    list.
390  *
391  *    Albert Cheng, 2003-04-17
392  *    Return duplicates of the stored communicator and Info object.
393  *
394  *-------------------------------------------------------------------------
395  */
396 herr_t
H5Pget_fapl_mpio(hid_t fapl_id,MPI_Comm * comm,MPI_Info * info)397 H5Pget_fapl_mpio(hid_t fapl_id, MPI_Comm *comm/*out*/, MPI_Info *info/*out*/)
398 {
399     H5FD_mpio_fapl_t  *fa;
400     H5P_genplist_t *plist;      /* Property list pointer */
401     MPI_Comm  comm_tmp=MPI_COMM_NULL;
402     int    mpi_code;    /* mpi return code */
403     herr_t      ret_value=SUCCEED;      /* Return value */
404 
405     FUNC_ENTER_API(FAIL)
406     H5TRACE3("e", "ixx", fapl_id, comm, info);
407 
408     if(NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS)))
409         HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access list")
410     if(H5FD_MPIO != H5P_get_driver(plist))
411         HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "incorrect VFL driver")
412     if(NULL == (fa = (H5FD_mpio_fapl_t *)H5P_get_driver_info(plist)))
413         HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "bad VFL driver info")
414 
415     /* Store the duplicated communicator in a temporary variable for error */
416     /* recovery in case the INFO duplication fails.  We cannot attempt to */
417     /* the value into *comm yet since if MPI_Comm_dup fails, we will end  */
418     /* up freeing whatever *comm holds and that could be invalid. */
419     if (comm){
420   if (MPI_SUCCESS != (mpi_code=MPI_Comm_dup(fa->comm, &comm_tmp)))
421       HMPI_GOTO_ERROR(FAIL, "MPI_Comm_dup failed", mpi_code)
422     }
423 
424     if (info){
425   if (MPI_INFO_NULL != fa->info){
426       if (MPI_SUCCESS != (mpi_code=MPI_Info_dup(fa->info, info)))
427     HMPI_GOTO_ERROR(FAIL, "MPI_Info_dup failed", mpi_code)
428   }else{
429       /* do not dup it */
430       *info = MPI_INFO_NULL;
431   }
432     }
433 
434     if (comm)
435         *comm = comm_tmp;
436 
437 done:
438     if (FAIL==ret_value){
439   /* need to free anything created here */
440   if (comm_tmp != MPI_COMM_NULL)
441       MPI_Comm_free(&comm_tmp);
442     }
443     FUNC_LEAVE_API(ret_value)
444 }
445 
446 
447 /*-------------------------------------------------------------------------
448  * Function:  H5Pset_dxpl_mpio
449  *
450  * Purpose:  Set the data transfer property list DXPL_ID to use transfer
451  *    mode XFER_MODE. The property list can then be used to control
452  *    the I/O transfer mode during data I/O operations. The valid
453  *    transfer modes are:
454  *
455  *     H5FD_MPIO_INDEPENDENT:
456  *      Use independent I/O access (the default).
457  *
458  *     H5FD_MPIO_COLLECTIVE:
459  *      Use collective I/O access.
460  *
461  * Return:  Success:  Non-negative
462  *     Failure:  Negative
463  *
464  * Programmer:  Albert Cheng
465  *    April 2, 1998
466  *
467  *-------------------------------------------------------------------------
468  */
469 herr_t
H5Pset_dxpl_mpio(hid_t dxpl_id,H5FD_mpio_xfer_t xfer_mode)470 H5Pset_dxpl_mpio(hid_t dxpl_id, H5FD_mpio_xfer_t xfer_mode)
471 {
472     H5P_genplist_t *plist;      /* Property list pointer */
473     herr_t ret_value = SUCCEED; /* Return value */
474 
475     FUNC_ENTER_API(FAIL)
476     H5TRACE2("e", "iDt", dxpl_id, xfer_mode);
477 
478     if(dxpl_id == H5P_DEFAULT)
479         HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
480 
481     /* Check arguments */
482     if(NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
483         HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
484     if(H5FD_MPIO_INDEPENDENT != xfer_mode && H5FD_MPIO_COLLECTIVE != xfer_mode)
485         HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "incorrect xfer_mode")
486 
487     /* Set the transfer mode */
488     if(H5P_set(plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode) < 0)
489         HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
490 
491 done:
492     FUNC_LEAVE_API(ret_value)
493 } /* end H5Pset_dxpl_mpio() */
494 
495 
496 /*-------------------------------------------------------------------------
497  * Function:  H5Pget_dxpl_mpio
498  *
499  * Purpose:  Queries the transfer mode current set in the data transfer
500  *    property list DXPL_ID. This is not collective.
501  *
502  * Return:  Success:  Non-negative, with the transfer mode returned
503  *        through the XFER_MODE argument if it is
504  *        non-null.
505  *
506  *     Failure:  Negative
507  *
508  * Programmer:  Albert Cheng
509  *    April 2, 1998
510  *
511  *-------------------------------------------------------------------------
512  */
513 herr_t
H5Pget_dxpl_mpio(hid_t dxpl_id,H5FD_mpio_xfer_t * xfer_mode)514 H5Pget_dxpl_mpio(hid_t dxpl_id, H5FD_mpio_xfer_t *xfer_mode/*out*/)
515 {
516     H5P_genplist_t *plist;              /* Property list pointer */
517     herr_t      ret_value = SUCCEED;    /* Return value */
518 
519     FUNC_ENTER_API(FAIL)
520     H5TRACE2("e", "ix", dxpl_id, xfer_mode);
521 
522     if(NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
523         HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
524 
525     /* Get the transfer mode */
526     if(xfer_mode)
527         if(H5P_get(plist, H5D_XFER_IO_XFER_MODE_NAME, xfer_mode) < 0)
528             HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to get value")
529 
530 done:
531     FUNC_LEAVE_API(ret_value)
532 } /* end H5Pget_dxpl_mpio() */
533 
534 
535 /*-------------------------------------------------------------------------
536  * Function:  H5Pset_dxpl_mpio_collective_opt
537  *
538  * Purpose:	To set a flag to choose linked chunk I/O or multi-chunk I/O
539  *		without involving decision-making inside HDF5
540  *
541  * Note:	The library will do linked chunk I/O or multi-chunk I/O without
542  *		involving communications for decision-making process.
543  *		The library won't behave as it asks for only when we find
544  *		that the low-level MPI-IO package doesn't support this.
545  *
546  * Return:	Success:	Non-negative
547  * 		Failure:	Negative
548  *
549  * Programmer:	Kent Yang
550  *		? ?, ?
551  *
552  *-------------------------------------------------------------------------
553  */
554 herr_t
H5Pset_dxpl_mpio_collective_opt(hid_t dxpl_id,H5FD_mpio_collective_opt_t opt_mode)555 H5Pset_dxpl_mpio_collective_opt(hid_t dxpl_id, H5FD_mpio_collective_opt_t opt_mode)
556 {
557     H5P_genplist_t *plist;      /* Property list pointer */
558     herr_t ret_value = SUCCEED; /* Return value */
559 
560     FUNC_ENTER_API(FAIL)
561     H5TRACE2("e", "iDc", dxpl_id, opt_mode);
562 
563     if(dxpl_id == H5P_DEFAULT)
564         HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
565 
566     /* Check arguments */
567     if(NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
568         HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
569 
570     /* Set the transfer mode */
571     if(H5P_set(plist, H5D_XFER_MPIO_COLLECTIVE_OPT_NAME, &opt_mode) < 0)
572         HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
573 
574 done:
575     FUNC_LEAVE_API(ret_value)
576 } /* end H5Pset_dxpl_mpio_collective_opt() */
577 
578 
579 /*-------------------------------------------------------------------------
580  * Function:  H5Pset_dxpl_mpio_chunk_opt
581  *
582  * Purpose:	To set a flag to choose linked chunk I/O or multi-chunk I/O
583  *		without involving decision-making inside HDF5
584  *
585  * Note:	The library will do linked chunk I/O or multi-chunk I/O without
586  *		involving communications for decision-making process.
587  *		The library won't behave as it asks for only when we find
588  *		that the low-level MPI-IO package doesn't support this.
589  *
590  * Return:	Success:	Non-negative
591  * 		Failure:	Negative
592  *
593  * Programmer:	Kent Yang
594  *		? ?, ?
595  *
596  *-------------------------------------------------------------------------
597  */
598 herr_t
H5Pset_dxpl_mpio_chunk_opt(hid_t dxpl_id,H5FD_mpio_chunk_opt_t opt_mode)599 H5Pset_dxpl_mpio_chunk_opt(hid_t dxpl_id, H5FD_mpio_chunk_opt_t opt_mode)
600 {
601     H5P_genplist_t *plist;      /* Property list pointer */
602     herr_t ret_value = SUCCEED; /* Return value */
603 
604     FUNC_ENTER_API(FAIL)
605     H5TRACE2("e", "iDh", dxpl_id, opt_mode);
606 
607     if(dxpl_id == H5P_DEFAULT)
608         HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
609 
610     /* Check arguments */
611     if(NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
612         HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
613 
614     /* Set the transfer mode */
615     if(H5P_set(plist, H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME, &opt_mode) < 0)
616         HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
617 
618 done:
619     FUNC_LEAVE_API(ret_value)
620 } /* end H5Pset_dxpl_mpio_chunk_opt() */
621 
622 
623 /*-------------------------------------------------------------------------
624  * Function:  H5Pset_dxpl_mpio_chunk_opt_num
625  *
626  * Purpose:	To set a threshold for doing linked chunk IO
627  *
628  * Note:	If the number is greater than the threshold set by the user,
629  *		the library will do linked chunk I/O; otherwise, I/O will be
630  *		done for every chunk.
631  *
632  * Return:	Success:	Non-negative
633  * 		Failure:	Negative
634  *
635  * Programmer:	Kent Yang
636  *		? ?, ?
637  *
638  *-------------------------------------------------------------------------
639  */
640 herr_t
H5Pset_dxpl_mpio_chunk_opt_num(hid_t dxpl_id,unsigned num_chunk_per_proc)641 H5Pset_dxpl_mpio_chunk_opt_num(hid_t dxpl_id, unsigned num_chunk_per_proc)
642 {
643     H5P_genplist_t *plist;      /* Property list pointer */
644     herr_t ret_value = SUCCEED; /* Return value */
645 
646     FUNC_ENTER_API(FAIL)
647     H5TRACE2("e", "iIu", dxpl_id, num_chunk_per_proc);
648 
649     if(dxpl_id == H5P_DEFAULT)
650         HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
651 
652     /* Check arguments */
653     if(NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
654         HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
655 
656     /* Set the transfer mode */
657     if(H5P_set(plist, H5D_XFER_MPIO_CHUNK_OPT_NUM_NAME, &num_chunk_per_proc) < 0)
658         HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
659 
660 done:
661     FUNC_LEAVE_API(ret_value)
662 } /* end H5Pset_dxpl_mpio_chunk_opt_num() */
663 
664 
665 /*-------------------------------------------------------------------------
666  * Function:  H5Pset_dxpl_mpio_chunk_opt_ratio
667  *
668  * Purpose:	To set a threshold for doing collective I/O for each chunk
669  *
670  * Note:	The library will calculate the percentage of the number of
671  *		process holding selections at each chunk. If that percentage
672  *		of number of process in the individual chunk is greater than
673  *		the threshold set by the user, the library will do collective
674  *		chunk I/O for this chunk; otherwise, independent I/O will be
675  *		done for this chunk.
676  *
677  * Return:	Success:	Non-negative
678  * 		Failure:	Negative
679  *
680  * Programmer:	Kent Yang
681  *		? ?, ?
682  *
683  *-------------------------------------------------------------------------
684  */
685 herr_t
H5Pset_dxpl_mpio_chunk_opt_ratio(hid_t dxpl_id,unsigned percent_num_proc_per_chunk)686 H5Pset_dxpl_mpio_chunk_opt_ratio(hid_t dxpl_id, unsigned percent_num_proc_per_chunk)
687 {
688     H5P_genplist_t *plist;      /* Property list pointer */
689     herr_t ret_value = SUCCEED; /* Return value */
690 
691     FUNC_ENTER_API(FAIL)
692     H5TRACE2("e", "iIu", dxpl_id, percent_num_proc_per_chunk);
693 
694     if(dxpl_id == H5P_DEFAULT)
695         HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
696 
697     /* Check arguments */
698     if(NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
699         HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
700 
701     /* Set the transfer mode */
702     if(H5P_set(plist, H5D_XFER_MPIO_CHUNK_OPT_RATIO_NAME, &percent_num_proc_per_chunk) < 0)
703         HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
704 
705 done:
706     FUNC_LEAVE_API(ret_value)
707 } /* end H5Pset_dxpl_mpio_chunk_opt_ratio() */
708 
709 
710 /*-------------------------------------------------------------------------
711  * Function:  H5FD_mpio_fapl_get
712  *
713  * Purpose:  Returns a file access property list which could be used to
714  *    create another file the same as this one.
715  *
716  * Return:  Success:  Ptr to new file access property list with all
717  *        fields copied from the file pointer.
718  *
719  *    Failure:  NULL
720  *
721  * Programmer:  Robb Matzke
722  *              Friday, August 13, 1999
723  *
724  *-------------------------------------------------------------------------
725  */
726 static void *
H5FD_mpio_fapl_get(H5FD_t * _file)727 H5FD_mpio_fapl_get(H5FD_t *_file)
728 {
729     H5FD_mpio_t    *file = (H5FD_mpio_t*)_file;
730     H5FD_mpio_fapl_t  *fa = NULL;
731     void      *ret_value;       /* Return value */
732 
733     FUNC_ENTER_NOAPI_NOINIT
734 
735     HDassert(file);
736     HDassert(H5FD_MPIO == file->pub.driver_id);
737 
738     if(NULL == (fa = (H5FD_mpio_fapl_t *)H5MM_calloc(sizeof(H5FD_mpio_fapl_t))))
739         HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed")
740 
741     /* Duplicate communicator and Info object. */
742     if(FAIL == H5FD_mpi_comm_info_dup(file->comm, file->info, &fa->comm, &fa->info))
743   HGOTO_ERROR(H5E_INTERNAL, H5E_CANTCOPY, NULL, "Communicator/Info duplicate failed")
744 
745     /* Set return value */
746     ret_value = fa;
747 
748 done:
749     FUNC_LEAVE_NOAPI(ret_value)
750 }
751 
752 
753 /*-------------------------------------------------------------------------
754  * Function:  H5FD_mpio_fapl_copy
755  *
756  * Purpose:  Copies the mpio-specific file access properties.
757  *
758  * Return:  Success:  Ptr to a new property list
759  *
760  *    Failure:  NULL
761  *
762  * Programmer:  Albert Cheng
763  *              Jan  8, 2003
764  *
765  *-------------------------------------------------------------------------
766  */
767 static void *
H5FD_mpio_fapl_copy(const void * _old_fa)768 H5FD_mpio_fapl_copy(const void *_old_fa)
769 {
770     void    *ret_value = NULL;
771     const H5FD_mpio_fapl_t *old_fa = (const H5FD_mpio_fapl_t*)_old_fa;
772     H5FD_mpio_fapl_t  *new_fa = NULL;
773 
774     FUNC_ENTER_NOAPI_NOINIT
775 #ifdef H5FDmpio_DEBUG
776 if (H5FD_mpio_Debug[(int)'t'])
777 fprintf(stderr, "enter H5FD_mpio_fapl_copy\n");
778 #endif
779 
780     if(NULL == (new_fa = (H5FD_mpio_fapl_t *)H5MM_malloc(sizeof(H5FD_mpio_fapl_t))))
781         HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed")
782 
783     /* Copy the general information */
784     HDmemcpy(new_fa, old_fa, sizeof(H5FD_mpio_fapl_t));
785 
786     /* Duplicate communicator and Info object. */
787     if(FAIL == H5FD_mpi_comm_info_dup(old_fa->comm, old_fa->info, &new_fa->comm, &new_fa->info))
788   HGOTO_ERROR(H5E_INTERNAL, H5E_CANTCOPY, NULL, "Communicator/Info duplicate failed")
789     ret_value = new_fa;
790 
791 done:
792     if (NULL == ret_value){
793   /* cleanup */
794   if (new_fa)
795       H5MM_xfree(new_fa);
796     }
797 
798 #ifdef H5FDmpio_DEBUG
799 if (H5FD_mpio_Debug[(int)'t'])
800 fprintf(stderr, "leaving H5FD_mpio_fapl_copy\n");
801 #endif
802     FUNC_LEAVE_NOAPI(ret_value)
803 } /* end H5FD_mpio_fapl_copy() */
804 
805 
806 /*-------------------------------------------------------------------------
807  * Function:  H5FD_mpio_fapl_free
808  *
809  * Purpose:  Frees the mpio-specific file access properties.
810  *
811  * Return:  Success:  0
812  *
813  *    Failure:  -1
814  *
815  * Programmer:  Albert Cheng
816  *              Jan  8, 2003
817  *
818  * Modifications:
819  *
820  *-------------------------------------------------------------------------
821  */
822 static herr_t
H5FD_mpio_fapl_free(void * _fa)823 H5FD_mpio_fapl_free(void *_fa)
824 {
825     herr_t    ret_value = SUCCEED;
826     H5FD_mpio_fapl_t  *fa = (H5FD_mpio_fapl_t*)_fa;
827 
828     FUNC_ENTER_NOAPI_NOINIT_NOERR
829 #ifdef H5FDmpio_DEBUG
830 if (H5FD_mpio_Debug[(int)'t'])
831 fprintf(stderr, "in H5FD_mpio_fapl_free\n");
832 #endif
833     HDassert(fa);
834 
835     /* Free the internal communicator and INFO object */
836     HDassert(MPI_COMM_NULL!=fa->comm);
837     H5FD_mpi_comm_info_free(&fa->comm, &fa->info);
838     H5MM_xfree(fa);
839 
840 #ifdef H5FDmpio_DEBUG
841 if (H5FD_mpio_Debug[(int)'t'])
842 fprintf(stderr, "leaving H5FD_mpio_fapl_free\n");
843 #endif
844     FUNC_LEAVE_NOAPI(ret_value)
845 } /* end H5FD_mpio_fapl_free() */
846 
847 
848 /*-------------------------------------------------------------------------
849  * Function:	H5FD_set_mpio_atomicity
850  *
851  * Purpose:	Sets the atomicity mode
852  *
853  * Return:	Success:	Non-negative
854  *
855  * 		Failure:	Negative
856  *
857  * Programmer:	Mohamad Chaarawi
858  *		Feb 14, 2012
859  *
860  *-------------------------------------------------------------------------
861  */
862 herr_t
H5FD_set_mpio_atomicity(H5FD_t * _file,hbool_t flag)863 H5FD_set_mpio_atomicity(H5FD_t *_file, hbool_t flag)
864 {
865     H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
866     int          mpi_code;               /* MPI return code */
867     int          temp_flag;
868     herr_t       ret_value = SUCCEED;
869 
870     FUNC_ENTER_NOAPI_NOINIT
871 
872 #ifdef H5FDmpio_DEBUG
873     if (H5FD_mpio_Debug[(int)'t'])
874     	fprintf(stdout, "Entering H5FD_set_mpio_atomicity\n");
875 #endif
876 
877     if (FALSE == flag)
878         temp_flag = 0;
879     else
880         temp_flag = 1;
881 
882     /* set atomicity value */
883     if (MPI_SUCCESS != (mpi_code=MPI_File_set_atomicity(file->f, temp_flag)))
884         HMPI_GOTO_ERROR(FAIL, "MPI_File_set_atomicity", mpi_code)
885 
886 done:
887 #ifdef H5FDmpio_DEBUG
888     if (H5FD_mpio_Debug[(int)'t'])
889     	fprintf(stdout, "Leaving H5FD_set_mpio_atomicity\n");
890 #endif
891     FUNC_LEAVE_NOAPI(ret_value)
892 }
893 
894 
895 /*-------------------------------------------------------------------------
896  * Function:	H5FD_get_mpio_atomicity
897  *
898  * Purpose:	Returns the atomicity mode
899  *
900  * Return:	Success:	Non-negative
901  *
902  * 		Failure:	Negative
903  *
904  * Programmer:	Mohamad Chaarawi
905  *		Feb 14, 2012
906  *
907  *-------------------------------------------------------------------------
908  */
909 herr_t
H5FD_get_mpio_atomicity(H5FD_t * _file,hbool_t * flag)910 H5FD_get_mpio_atomicity(H5FD_t *_file, hbool_t *flag)
911 {
912     H5FD_mpio_t *file = (H5FD_mpio_t*)_file;
913     int          mpi_code;               /* MPI return code */
914     int          temp_flag;
915     herr_t       ret_value = SUCCEED;
916 
917     FUNC_ENTER_NOAPI_NOINIT
918 
919 #ifdef H5FDmpio_DEBUG
920     if (H5FD_mpio_Debug[(int)'t'])
921     	fprintf(stdout, "Entering H5FD_get_mpio_atomicity\n");
922 #endif
923 
924     /* get atomicity value */
925     if (MPI_SUCCESS != (mpi_code=MPI_File_get_atomicity(file->f, &temp_flag)))
926         HMPI_GOTO_ERROR(FAIL, "MPI_File_get_atomicity", mpi_code)
927 
928     if (0 != temp_flag)
929         *flag = TRUE;
930     else
931         *flag = FALSE;
932 
933 done:
934 #ifdef H5FDmpio_DEBUG
935     if (H5FD_mpio_Debug[(int)'t'])
936     	fprintf(stdout, "Leaving H5FD_get_mpio_atomicity\n");
937 #endif
938     FUNC_LEAVE_NOAPI(ret_value)
939 }
940 
941 
942 /*-------------------------------------------------------------------------
943  * Function:    H5FD_mpio_open
944  *
945  * Purpose:     Opens a file with name NAME.  The FLAGS are a bit field with
946  *    purpose similar to the second argument of open(2) and which
947  *    are defined in H5Fpublic.h. The file access property list
 *    FAPL_ID contains the driver-specific properties and MAXADDR
949  *    is the largest address which this file will be expected to
950  *    access.  This is collective.
951  *
952  * Return:      Success:        A new file pointer.
953  *
954  *              Failure:        NULL
955  *
956  * Programmer:
957  *              January 30, 1998
958  *
959  * Modifications:
960  *     Robb Matzke, 1998-02-18
961  *    Added the ACCESS_PARMS argument.  Moved some error checking
962  *    here from elsewhere.
963  *
964  *        rky, 1998-01-11
965  *        Added H5FD_mpio_Debug debug flags controlled by MPI_Info.
966  *
967  *        rky, 1998-08-28
968  *    Init flag controlling redundant metadata writes to disk.
969  *
970  *        rky, 1998-12-07
971  *    Added barrier after MPI_File_set_size to prevent race
972  *    condition -- subsequent writes were being truncated, causing
973  *    holes in file.
974  *
975  *     Robb Matzke, 1999-08-06
976  *    Modified to work with the virtual file layer.
977  *
978  *        rky & ppw, 1999-11-07
979  *    Modified "H5FD_mpio_open" so that file-truncation is
980  *              avoided for brand-new files (with zero filesize).
981  *
982  *     Albert Cheng, 2003-04-17
983  *     Duplicate the communicator and Info object so that file is
984  *     insulated from the old one.
985  *-------------------------------------------------------------------------
986  */
987 static H5FD_t *
H5FD_mpio_open(const char * name,unsigned flags,hid_t fapl_id,haddr_t H5_ATTR_UNUSED maxaddr)988 H5FD_mpio_open(const char *name, unsigned flags, hid_t fapl_id,
989 	       haddr_t H5_ATTR_UNUSED maxaddr)
990 {
991     H5FD_mpio_t      *file=NULL;
992     MPI_File      fh;
993     unsigned                    file_opened=0;  /* Flag to indicate that the file was successfully opened */
994     int        mpi_amode;
995     int        mpi_rank;       /* MPI rank of this process */
996     int        mpi_size;       /* Total number of MPI processes */
997     int        mpi_code;  /* mpi return code */
998     MPI_Offset      size;
999     const H5FD_mpio_fapl_t  *fa=NULL;
1000     H5FD_mpio_fapl_t    _fa;
1001     H5P_genplist_t *plist;      /* Property list pointer */
1002     MPI_Comm                    comm_dup=MPI_COMM_NULL;
1003     MPI_Info                    info_dup=MPI_INFO_NULL;
1004     H5FD_t      *ret_value;     /* Return value */
1005 
1006     FUNC_ENTER_NOAPI_NOINIT
1007 
1008 #ifdef H5FDmpio_DEBUG
1009     if (H5FD_mpio_Debug[(int)'t']) {
1010       fprintf(stdout, "Entering H5FD_mpio_open(name=\"%s\", flags=0x%x, "
1011     "fapl_id=%d, maxaddr=%lu)\n", name, flags, (int)fapl_id, (unsigned long)maxaddr);
1012     }
1013 #endif
1014 
1015     /* Obtain a pointer to mpio-specific file access properties */
1016     if(NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS)))
1017         HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, NULL, "not a file access property list")
1018     if(H5P_FILE_ACCESS_DEFAULT == fapl_id || H5FD_MPIO != H5P_get_driver(plist)) {
1019         _fa.comm = MPI_COMM_SELF; /*default*/
1020         _fa.info = MPI_INFO_NULL; /*default*/
1021         fa = &_fa;
1022     } else {
1023         if(NULL == (fa = (const H5FD_mpio_fapl_t *)H5P_get_driver_info(plist)))
1024 	    HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, NULL, "bad VFL driver info")
1025     }
1026 
1027     /* Duplicate communicator and Info object for use by this file. */
1028     if (FAIL==H5FD_mpi_comm_info_dup(fa->comm, fa->info, &comm_dup, &info_dup))
1029         HGOTO_ERROR(H5E_INTERNAL, H5E_CANTCOPY, NULL, "Communicator/Info duplicate failed")
1030 
1031     /* convert HDF5 flags to MPI-IO flags */
1032     /* some combinations are illegal; let MPI-IO figure it out */
1033     mpi_amode  = (flags&H5F_ACC_RDWR) ? MPI_MODE_RDWR : MPI_MODE_RDONLY;
1034     if (flags&H5F_ACC_CREAT)  mpi_amode |= MPI_MODE_CREATE;
1035     if (flags&H5F_ACC_EXCL)  mpi_amode |= MPI_MODE_EXCL;
1036 
1037 #ifdef H5FDmpio_DEBUG
1038     /* Check for debug commands in the info parameter */
1039     {
1040   char debug_str[128];
1041         int flag, i;
1042         if (MPI_INFO_NULL != info_dup) {
1043             MPI_Info_get(fa->info, H5F_MPIO_DEBUG_KEY, sizeof(debug_str)-1, debug_str, &flag);
1044             if (flag) {
1045                 fprintf(stdout, "H5FD_mpio debug flags=%s\n", debug_str );
1046                 for (i=0;
1047                      debug_str[i]/*end of string*/ && i<128/*just in case*/;
1048                      ++i) {
1049                     H5FD_mpio_Debug[(int)debug_str[i]] = 1;
1050                 }
1051             }
1052         }
1053     }
1054 #endif
1055 
1056     if(MPI_SUCCESS != (mpi_code = MPI_File_open(comm_dup, name, mpi_amode, info_dup, &fh)))
1057         HMPI_GOTO_ERROR(NULL, "MPI_File_open failed", mpi_code)
1058     file_opened=1;
1059 
1060     /* Get the MPI rank of this process and the total number of processes */
1061     if (MPI_SUCCESS != (mpi_code=MPI_Comm_rank (comm_dup, &mpi_rank)))
1062         HMPI_GOTO_ERROR(NULL, "MPI_Comm_rank failed", mpi_code)
1063     if (MPI_SUCCESS != (mpi_code=MPI_Comm_size (comm_dup, &mpi_size)))
1064         HMPI_GOTO_ERROR(NULL, "MPI_Comm_size failed", mpi_code)
1065 
1066     /* Build the return value and initialize it */
1067     if(NULL == (file = (H5FD_mpio_t *)H5MM_calloc(sizeof(H5FD_mpio_t))))
1068         HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed")
1069     file->f = fh;
1070     file->comm = comm_dup;
1071     file->info = info_dup;
1072     file->mpi_rank = mpi_rank;
1073     file->mpi_size = mpi_size;
1074 
1075     /* Only processor p0 will get the filesize and broadcast it. */
1076     if (mpi_rank == 0) {
1077         if (MPI_SUCCESS != (mpi_code=MPI_File_get_size(fh, &size)))
1078             HMPI_GOTO_ERROR(NULL, "MPI_File_get_size failed", mpi_code)
1079     } /* end if */
1080 
1081     /* Broadcast file size */
1082     if(MPI_SUCCESS != (mpi_code = MPI_Bcast(&size, (int)sizeof(MPI_Offset), MPI_BYTE, 0, comm_dup)))
1083         HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code)
1084 
1085     /* Determine if the file should be truncated */
1086     if(size && (flags & H5F_ACC_TRUNC)) {
1087         if (MPI_SUCCESS != (mpi_code=MPI_File_set_size(fh, (MPI_Offset)0)))
1088             HMPI_GOTO_ERROR(NULL, "MPI_File_set_size failed", mpi_code)
1089 
1090         /* Don't let any proc return until all have truncated the file. */
1091         if (MPI_SUCCESS!= (mpi_code=MPI_Barrier(comm_dup)))
1092             HMPI_GOTO_ERROR(NULL, "MPI_Barrier failed", mpi_code)
1093 
1094         /* File is zero size now */
1095         size = 0;
1096     } /* end if */
1097 
1098     /* Set the size of the file (from library's perspective) */
1099     file->eof = H5FD_mpi_MPIOff_to_haddr(size);
1100 
1101     /* Set return value */
1102     ret_value=(H5FD_t*)file;
1103 
1104 done:
1105     if(ret_value==NULL) {
1106         if(file_opened)
1107             MPI_File_close(&fh);
1108   if (MPI_COMM_NULL != comm_dup)
1109       MPI_Comm_free(&comm_dup);
1110   if (MPI_INFO_NULL != info_dup)
1111       MPI_Info_free(&info_dup);
1112   if (file)
1113       H5MM_xfree(file);
1114     } /* end if */
1115 
1116 #ifdef H5FDmpio_DEBUG
1117     if (H5FD_mpio_Debug[(int)'t'])
1118         fprintf(stdout, "Leaving H5FD_mpio_open\n" );
1119 #endif
1120     FUNC_LEAVE_NOAPI(ret_value)
1121 }
1122 
1123 
1124 /*-------------------------------------------------------------------------
1125  * Function:    H5FD_mpio_close
1126  *
1127  * Purpose:     Closes a file.  This is collective.
1128  *
1129  * Return:      Success:  Non-negative
1130  *
1131  *     Failure:  Negative
1132  *
1133  * Programmer:  Unknown
1134  *              January 30, 1998
1135  *
1136  * Modifications:
1137  *     Robb Matzke, 1998-02-18
1138  *    Added the ACCESS_PARMS argument.
1139  *
1140  *     Robb Matzke, 1999-08-06
1141  *    Modified to work with the virtual file layer.
1142  *
1143  *     Albert Cheng, 2003-04-17
1144  *    Free the communicator stored.
1145  *-------------------------------------------------------------------------
1146  */
1147 static herr_t
H5FD_mpio_close(H5FD_t * _file)1148 H5FD_mpio_close(H5FD_t *_file)
1149 {
1150     H5FD_mpio_t  *file = (H5FD_mpio_t*)_file;
1151     int    mpi_code;          /* MPI return code */
1152     herr_t      ret_value=SUCCEED;      /* Return value */
1153 
1154     FUNC_ENTER_NOAPI_NOINIT
1155 
1156 #ifdef H5FDmpio_DEBUG
1157     if (H5FD_mpio_Debug[(int)'t'])
1158       fprintf(stdout, "Entering H5FD_mpio_close\n");
1159 #endif
1160     HDassert(file);
1161     HDassert(H5FD_MPIO==file->pub.driver_id);
1162 
1163     /* MPI_File_close sets argument to MPI_FILE_NULL */
1164     if (MPI_SUCCESS != (mpi_code=MPI_File_close(&(file->f)/*in,out*/)))
1165         HMPI_GOTO_ERROR(FAIL, "MPI_File_close failed", mpi_code)
1166 
1167     /* Clean up other stuff */
1168     H5FD_mpi_comm_info_free(&file->comm, &file->info);
1169     H5MM_xfree(file);
1170 
1171 done:
1172 #ifdef H5FDmpio_DEBUG
1173     if (H5FD_mpio_Debug[(int)'t'])
1174       fprintf(stdout, "Leaving H5FD_mpio_close\n");
1175 #endif
1176     FUNC_LEAVE_NOAPI(ret_value)
1177 }
1178 
1179 
1180 /*-------------------------------------------------------------------------
1181  * Function:  H5FD_mpio_query
1182  *
1183  * Purpose:  Set the flags that this VFL driver is capable of supporting.
1184  *              (listed in H5FDpublic.h)
1185  *
1186  * Return:  Success:  non-negative
1187  *
1188  *    Failure:  negative
1189  *
1190  * Programmer:  Quincey Koziol
1191  *              Friday, August 25, 2000
1192  *
1193  * Modifications:
1194  *
1195  *    John Mainzer -- 9/21/05
1196  *    Modified code to turn off the
1197  *    H5FD_FEAT_ACCUMULATE_METADATA_WRITE flag.
1198  *              With the movement of
1199  *    all cache writes to process 0, this flag has become
1200  *    problematic in PHDF5.
1201  *
1202  *-------------------------------------------------------------------------
1203  */
1204 static herr_t
H5FD_mpio_query(const H5FD_t H5_ATTR_UNUSED * _file,unsigned long * flags)1205 H5FD_mpio_query(const H5FD_t H5_ATTR_UNUSED *_file, unsigned long *flags /* out */)
1206 {
1207     FUNC_ENTER_NOAPI_NOINIT_NOERR
1208 
1209     /* Set the VFL feature flags that this driver supports */
1210     if(flags) {
1211         *flags=0;
1212         *flags|=H5FD_FEAT_AGGREGATE_METADATA;  /* OK to aggregate metadata allocations */
1213         *flags|=H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */
1214         *flags|=H5FD_FEAT_HAS_MPI;             /* This driver uses MPI */
1215         *flags|=H5FD_FEAT_ALLOCATE_EARLY;      /* Allocate space early instead of late */
1216     } /* end if */
1217 
1218     FUNC_LEAVE_NOAPI(SUCCEED)
1219 }
1220 
1221 
1222 /*-------------------------------------------------------------------------
1223  * Function:  H5FD_mpio_get_eoa
1224  *
1225  * Purpose:  Gets the end-of-address marker for the file. The EOA marker
1226  *    is the first address past the last byte allocated in the
1227  *    format address space.
1228  *
1229  * Return:  Success:  The end-of-address marker.
1230  *
1231  *    Failure:  HADDR_UNDEF
1232  *
1233  * Programmer:  Robb Matzke
1234  *              Friday, August  6, 1999
1235  *
1236  * Modifications:
1237  *              Raymond Lu
1238  *              21 Dec. 2006
1239  *              Added the parameter TYPE.  It's only used for MULTI driver.
1240  *
1241  *-------------------------------------------------------------------------
1242  */
1243 static haddr_t
H5FD_mpio_get_eoa(const H5FD_t * _file,H5FD_mem_t H5_ATTR_UNUSED type)1244 H5FD_mpio_get_eoa(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type)
1245 {
1246     const H5FD_mpio_t  *file = (const H5FD_mpio_t*)_file;
1247 
1248     FUNC_ENTER_NOAPI_NOINIT_NOERR
1249 
1250     HDassert(file);
1251     HDassert(H5FD_MPIO==file->pub.driver_id);
1252 
1253     FUNC_LEAVE_NOAPI(file->eoa)
1254 }
1255 
1256 
1257 /*-------------------------------------------------------------------------
1258  * Function:  H5FD_mpio_set_eoa
1259  *
1260  * Purpose:  Set the end-of-address marker for the file. This function is
1261  *    called shortly after an existing HDF5 file is opened in order
1262  *    to tell the driver where the end of the HDF5 data is located.
1263  *
1264  * Return:  Success:  0
1265  *
1266  *    Failure:  -1
1267  *
1268  * Programmer:  Robb Matzke
1269  *              Friday, August 6, 1999
1270  *
1271  * Modifications:
1272  *              Raymond Lu
1273  *              21 Dec. 2006
1274  *              Added the parameter TYPE.  It's only used for MULTI driver.
1275  *
1276  *-------------------------------------------------------------------------
1277  */
1278 static herr_t
H5FD_mpio_set_eoa(H5FD_t * _file,H5FD_mem_t H5_ATTR_UNUSED type,haddr_t addr)1279 H5FD_mpio_set_eoa(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, haddr_t addr)
1280 {
1281     H5FD_mpio_t  *file = (H5FD_mpio_t*)_file;
1282 
1283     FUNC_ENTER_NOAPI_NOINIT_NOERR
1284 
1285     HDassert(file);
1286     HDassert(H5FD_MPIO==file->pub.driver_id);
1287 
1288     file->eoa = addr;
1289 
1290     FUNC_LEAVE_NOAPI(SUCCEED)
1291 }
1292 
1293 
1294 /*-------------------------------------------------------------------------
1295  * Function:  H5FD_mpio_get_eof
1296  *
1297  * Purpose:  Gets the end-of-file marker for the file. The EOF marker
1298  *    is the real size of the file.
1299  *
1300  *    The MPIO driver doesn't bother keeping this field updated
1301  *    since that's a relatively expensive operation. Fortunately
1302  *    the library only needs the EOF just after the file is opened
1303  *    in order to determine whether the file is empty, truncated,
1304  *    or okay.  Therefore, any MPIO I/O function will set its value
1305  *    to HADDR_UNDEF which is the error return value of this
1306  *    function.
1307  *
1308  *              Keeping the EOF updated (during write calls) is expensive
1309  *              because any process may extend the physical end of the
1310  *              file. -QAK
1311  *
1312  * Return:  Success:  The end-of-address marker.
1313  *
1314  *    Failure:  HADDR_UNDEF
1315  *
1316  * Programmer:  Robb Matzke
1317  *              Friday, August  6, 1999
1318  *
1319  * Modifications:
1320  *
1321  *-------------------------------------------------------------------------
1322  */
1323 static haddr_t
H5FD_mpio_get_eof(const H5FD_t * _file)1324 H5FD_mpio_get_eof(const H5FD_t *_file)
1325 {
1326     const H5FD_mpio_t  *file = (const H5FD_mpio_t*)_file;
1327 
1328     FUNC_ENTER_NOAPI_NOINIT_NOERR
1329 
1330     HDassert(file);
1331     HDassert(H5FD_MPIO==file->pub.driver_id);
1332 
1333     FUNC_LEAVE_NOAPI(file->eof)
1334 }
1335 
1336 
1337 /*-------------------------------------------------------------------------
1338  * Function:       H5FD_mpio_get_handle
1339  *
1340  * Purpose:        Returns the file handle of MPIO file driver.
1341  *
1342  * Returns:        Non-negative if succeed or negative if fails.
1343  *
1344  * Programmer:     Raymond Lu
1345  *                 Sept. 16, 2002
1346  *
1347  * Modifications:
1348  *
1349  *-------------------------------------------------------------------------
1350 */
1351 static herr_t
H5FD_mpio_get_handle(H5FD_t * _file,hid_t H5_ATTR_UNUSED fapl,void ** file_handle)1352 H5FD_mpio_get_handle(H5FD_t *_file, hid_t H5_ATTR_UNUSED fapl, void** file_handle)
1353 {
1354     H5FD_mpio_t         *file = (H5FD_mpio_t *)_file;
1355     herr_t              ret_value = SUCCEED;
1356 
1357     FUNC_ENTER_NOAPI_NOINIT
1358 
1359     if(!file_handle)
1360         HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file handle not valid")
1361 
1362     *file_handle = &(file->f);
1363 
1364 done:
1365     FUNC_LEAVE_NOAPI(ret_value)
1366 }
1367 
1368 
1369 /*-------------------------------------------------------------------------
1370  * Function:  H5FD_mpio_read
1371  *
1372  * Purpose:  Reads SIZE bytes of data from FILE beginning at address ADDR
1373  *    into buffer BUF according to data transfer properties in
1374  *    DXPL_ID using potentially complex file and buffer types to
1375  *    effect the transfer.
1376  *
1377  *    Reading past the end of the MPI file returns zeros instead of
1378  *    failing.  MPI is able to coalesce requests from different
1379  *    processes (collective or independent).
1380  *
1381  * Return:  Success:  Zero. Result is stored in caller-supplied
1382  *        buffer BUF.
1383  *
1384  *    Failure:  -1, Contents of buffer BUF are undefined.
1385  *
1386  * Programmer:  rky, 1998-01-30
1387  *
1388  * Modifications:
1389  *     Robb Matzke, 1998-02-18
1390  *    Added the ACCESS_PARMS argument.
1391  *
1392  *     rky, 1998-04-10
1393  *    Call independent or collective MPI read, based on
1394  *    ACCESS_PARMS.
1395  *
1396  *     Albert Cheng, 1998-06-01
1397  *    Added XFER_MODE to control independent or collective MPI
1398  *    read.
1399  *
1400  *     rky, 1998-08-16
1401  *    Use BTYPE, FTYPE, and DISP from access parms. The guts of
1402  *    H5FD_mpio_read and H5FD_mpio_write should be replaced by a
1403  *    single dual-purpose routine.
1404  *
1405  *     Robb Matzke, 1999-04-21
1406  *    Changed XFER_MODE to XFER_PARMS for all H5F_*_read()
1407  *    callbacks.
1408  *
1409  *     Robb Matzke, 1999-07-28
1410  *    The ADDR argument is passed by value.
1411  *
1412  *     Robb Matzke, 1999-08-06
1413  *    Modified to work with the virtual file layer.
1414  *
1415  *    Quincey Koziol,  2002-05-14
1416  *    Only call MPI_Get_count if we can use MPI_BYTE for the MPI type
1417  *              for the I/O transfer.  Someday we might include code to decode
1418  *              the MPI type used for more complicated transfers and call
1419  *              MPI_Get_count all the time.
1420  *
1421  *              Quincey Koziol - 2002/06/17
1422  *              Removed 'disp' parameter from H5FD_mpio_setup routine and use
1423  *              the address of the dataset in MPI_File_set_view() calls, as
1424  *              necessary.
1425  *
1426  *              Quincey Koziol - 2002/06/24
1427  *              Removed "lazy" MPI_File_set_view() calls, since they would fail
1428  *              if the first I/O was a collective I/O using MPI derived types
1429  *              and the next I/O was an independent I/O.
1430  *
1431  *              Quincey Koziol - 2003/10/22-31
1432  *              Restructured code massively, straightening out logic and finally
1433  *              getting the bytes_read stuff working.
1434  *
1435  *-------------------------------------------------------------------------
1436  */
1437 static herr_t
H5FD_mpio_read(H5FD_t * _file,H5FD_mem_t H5_ATTR_UNUSED type,hid_t dxpl_id,haddr_t addr,size_t size,void * buf)1438 H5FD_mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t dxpl_id, haddr_t addr, size_t size,
1439          void *buf/*out*/)
1440 {
1441     H5FD_mpio_t      *file = (H5FD_mpio_t*)_file;
1442     MPI_Offset      mpi_off;
1443     MPI_Status      mpi_stat;       /* Status from I/O operation */
1444     int        mpi_code;  /* mpi return code */
1445     MPI_Datatype    buf_type = MPI_BYTE;      /* MPI description of the selection in memory */
1446     int             size_i;         /* Integer copy of 'size' to read */
1447     int             bytes_read;     /* Number of bytes read in */
1448     int             n;
1449     int                         type_size;      /* MPI datatype used for I/O's size */
1450     int                         io_size;        /* Actual number of bytes requested */
1451     H5P_genplist_t              *plist = NULL;  /* Property list pointer */
1452     hbool_t      use_view_this_time = FALSE;
1453     herr_t                ret_value = SUCCEED;
1454 
1455     FUNC_ENTER_NOAPI_NOINIT
1456 
1457 #ifdef H5FDmpio_DEBUG
1458     if (H5FD_mpio_Debug[(int)'t'])
1459       fprintf(stdout, "Entering H5FD_mpio_read\n" );
1460 #endif
1461     HDassert(file);
1462     HDassert(H5FD_MPIO==file->pub.driver_id);
1463     /* Make certain we have the correct type of property list */
1464     HDassert(H5I_GENPROP_LST==H5I_get_type(dxpl_id));
1465     HDassert(TRUE==H5P_isa_class(dxpl_id,H5P_DATASET_XFER));
1466     HDassert(buf);
1467 
1468     /* Portably initialize MPI status variable */
1469     HDmemset(&mpi_stat,0,sizeof(MPI_Status));
1470 
1471     /* some numeric conversions */
1472     if (H5FD_mpi_haddr_to_MPIOff(addr, &mpi_off/*out*/)<0)
1473         HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off")
1474     size_i = (int)size;
1475     if ((hsize_t)size_i != size)
1476         HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from size to size_i")
1477 
1478 #ifdef H5FDmpio_DEBUG
1479     if (H5FD_mpio_Debug[(int)'r'])
1480         fprintf(stdout, "in H5FD_mpio_read  mpi_off=%ld  size_i=%d\n",
1481     (long)mpi_off, size_i );
1482 #endif
1483 
1484     /* Only look for MPI views for raw data transfers */
1485     if(type==H5FD_MEM_DRAW) {
1486         H5FD_mpio_xfer_t            xfer_mode;   /* I/O transfer mode */
1487 
1488         /* Obtain the data transfer properties */
1489         if(NULL == (plist = (H5P_genplist_t *)H5I_object(dxpl_id)))
1490             HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
1491 
1492         /* get the transfer mode from the dxpl */
1493         if(H5P_get(plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode)<0)
1494             HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode")
1495 
1496         /*
1497          * Set up for a fancy xfer using complex types, or single byte block. We
1498          * wouldn't need to rely on the use_view field if MPI semantics allowed
1499          * us to test that btype=ftype=MPI_BYTE (or even MPI_TYPE_NULL, which
1500          * could mean "use MPI_BYTE" by convention).
1501          */
1502         if(xfer_mode==H5FD_MPIO_COLLECTIVE) {
1503             MPI_Datatype    file_type;
1504 
1505             /* Remember that views are used */
1506             use_view_this_time = TRUE;
1507 
1508             /* prepare for a full-blown xfer using btype, ftype, and disp */
1509             if(H5P_get(plist, H5FD_MPI_XFER_MEM_MPI_TYPE_NAME, &buf_type)<0)
1510                 HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O type property")
1511             if(H5P_get(plist, H5FD_MPI_XFER_FILE_MPI_TYPE_NAME, &file_type)<0)
1512                 HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O type property")
1513 
1514             /*
1515              * Set the file view when we are using MPI derived types
1516              */
1517             if (MPI_SUCCESS != (mpi_code=MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type, H5FD_mpi_native_g,  file->info)))
1518                 HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
1519 
1520             /* When using types, use the address as the displacement for
1521              * MPI_File_set_view and reset the address for the read to zero
1522              */
1523             mpi_off=0;
1524         } /* end if */
1525     } /* end if */
1526 
1527     /* Read the data. */
1528     if(use_view_this_time) {
1529        H5FD_mpio_collective_opt_t coll_opt_mode;
1530 
1531 #ifdef H5FDmpio_DEBUG
1532   if (H5FD_mpio_Debug[(int)'t'])
1533       fprintf(stdout, "H5FD_mpio_read: using MPIO collective mode\n");
1534 #endif
1535         /* Get the collective_opt property to check whether the application wants to do IO individually. */
1536         HDassert(plist);
1537 
1538         /* get the transfer mode from the dxpl */
1539         if(H5P_get(plist, H5D_XFER_MPIO_COLLECTIVE_OPT_NAME, &coll_opt_mode)<0)
1540             HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property")
1541 
1542         if(coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
1543 #ifdef H5FDmpio_DEBUG
1544             if(H5FD_mpio_Debug[(int)'t'])
1545                 fprintf(stdout, "H5FD_mpio_read: doing MPI collective IO\n");
1546 #endif
1547             if(MPI_SUCCESS != (mpi_code = MPI_File_read_at_all(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
1548                 HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code)
1549         } /* end if */
1550         else {
1551 #ifdef H5FDmpio_DEBUG
1552             if(H5FD_mpio_Debug[(int)'t'])
1553                 fprintf(stdout, "H5FD_mpio_read: doing MPI independent IO\n");
1554 #endif
1555 
1556             if(MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
1557                 HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
1558         } /* end else */
1559 
1560         /*
1561          * Reset the file view when we used MPI derived types
1562          */
1563         if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, H5FD_mpi_native_g, file->info)))
1564             HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
1565     } else {
1566         if(MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
1567             HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
1568     }
1569 
1570     /* How many bytes were actually read? */
1571     /* [This works because the "basic elements" we use for all our MPI derived
1572      *  types are MPI_BYTE.  We should be using the 'buf_type' for the MPI
1573      *  datatype in this call though... (We aren't because using it causes
1574      *  the LANL "qsc" machine to dump core - 12/19/03) - QAK]
1575      */
1576     if (MPI_SUCCESS != (mpi_code=MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read)))
1577         HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
1578 
1579     /* Get the type's size */
1580     if (MPI_SUCCESS != (mpi_code=MPI_Type_size(buf_type,&type_size)))
1581         HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code)
1582 
1583     /* Compute the actual number of bytes requested */
1584     io_size=type_size*size_i;
1585 
1586     /* Check for read failure */
1587     if (bytes_read<0 || bytes_read>io_size)
1588         HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed")
1589 
1590     /*
1591      * This gives us zeroes beyond end of physical MPI file.
1592      */
1593     if ((n=(io_size-bytes_read)) > 0)
1594         HDmemset((char*)buf+bytes_read, 0, (size_t)n);
1595 
1596 done:
1597 #ifdef H5FDmpio_DEBUG
1598     if (H5FD_mpio_Debug[(int)'t'])
1599       fprintf(stdout, "Leaving H5FD_mpio_read\n" );
1600 #endif
1601 
1602     FUNC_LEAVE_NOAPI(ret_value)
1603 }
1604 
1605 
1606 /*-------------------------------------------------------------------------
1607  * Function:  H5FD_mpio_write
1608  *
1609  * Purpose:  Writes SIZE bytes of data to FILE beginning at address ADDR
1610  *    from buffer BUF according to data transfer properties in
1611  *    DXPL_ID using potentially complex file and buffer types to
1612  *    effect the transfer.
1613  *
1614  *    MPI is able to coalesce requests from different processes
1615  *    (collective and independent).
1616  *
1617  * Return:  Success:  Zero. USE_TYPES and OLD_USE_TYPES in the
1618  *        access params are altered.
1619  *
1620  *    Failure:  -1, USE_TYPES and OLD_USE_TYPES in the
1621  *        access params may be altered.
1622  *
1623  * Programmer:  Unknown
1624  *              January 30, 1998
1625  *
1626  * Modifications:
1627  *    rky, 1998-08-28
1628  *    If the file->allsame flag is set, we assume that all the
1629  *    procs in the relevant MPI communicator will write identical
1630  *    data at identical offsets in the file, so only proc 0 will
1631  *    write, and all other procs will wait for p0 to finish. This
1632  *    is useful for writing metadata, for example. Note that we
1633  *    don't _check_ that the data is identical. Also, the mechanism
1634  *    we use to eliminate the redundant writes is by requiring a
1635  *    call to H5FD_mpio_tas_allsame before the write, which is
1636  *    rather klugey. Would it be better to pass a parameter to
1637  *    low-level writes like H5F_block_write and H5F_low_write,
1638  *    instead?  Or...??? Also, when I created this mechanism I
1639  *    wanted to minimize the difference in behavior between the old
1640  *    way of doing things (i.e., all procs write) and the new way,
1641  *    so the writes are eliminated at the very lowest level, here
1642  *    in H5FD_mpio_write. It may be better to rethink that, and
1643  *    short-circuit the writes at a higher level (e.g., at the
1644  *    points in the code where H5FD_mpio_tas_allsame is called).
1645  *
1646  *
1647  *     Robb Matzke, 1998-02-18
1648  *    Added the ACCESS_PARMS argument.
1649  *
1650  *     rky, 1998-04-10
1651  *    Call independent or collective MPI write, based on
1652  *    ACCESS_PARMS.
1653  *
1654  *     rky, 1998-04-24
1655  *    Removed redundant write from H5FD_mpio_write.
1656  *
1657  *     Albert Cheng, 1998-06-01
1658  *    Added XFER_MODE to control independent or collective MPI
1659  *    write.
1660  *
1661  *     rky, 1998-08-16
1662  *    Use BTYPE, FTYPE, and DISP from access parms. The guts of
1663  *    H5FD_mpio_read and H5FD_mpio_write should be replaced by a
1664  *    single dual-purpose routine.
1665  *
1666  *     rky, 1998-08-28
1667  *    Added ALLSAME parameter to make all but proc 0 skip the
1668  *    actual write.
1669  *
1670  *     Robb Matzke, 1999-04-21
1671  *    Changed XFER_MODE to XFER_PARMS for all H5FD_*_write()
1672  *    callbacks.
1673  *
1674  *     Robb Matzke, 1999-07-28
1675  *    The ADDR argument is passed by value.
1676  *
1677  *     Robb Matzke, 1999-08-06
1678  *    Modified to work with the virtual file layer.
1679  *
1680  *    Albert Cheng, 1999-12-19
1681  *    When only-p0-write-allsame-data, p0 Bcasts the
1682  *    ret_value to other processes.  This prevents
1683  *    a racing condition (that other processes try to
1684  *    read the file before p0 finishes writing) and also
1685  *    allows all processes to report the same ret_value.
1686  *
1687  *    Kim Yates, Pat Weidhaas,  2000-09-26
1688  *    Move block of coding where only p0 writes after the
1689  *              MPI_File_set_view call.
1690  *
1691  *    Quincey Koziol,  2002-05-10
1692  *    Instead of always writing metadata from process 0, spread the
1693  *              burden among all the processes by using a round-robin rotation
1694  *              scheme.
1695  *
1696  *    Quincey Koziol,  2002-05-10
1697  *    Removed allsame code, keying off the type parameter instead.
1698  *
1699  *    Quincey Koziol,  2002-05-14
1700  *    Only call MPI_Get_count if we can use MPI_BYTE for the MPI type
1701  *              for the I/O transfer.  Someday we might include code to decode
1702  *              the MPI type used for more complicated transfers and call
1703  *              MPI_Get_count all the time.
1704  *
1705  *              Quincey Koziol - 2002/06/17
1706  *              Removed 'disp' parameter from H5FD_mpio_setup routine and use
1707  *              the address of the dataset in MPI_File_set_view() calls, as
1708  *              necessary.
1709  *
1710  *              Quincey Koziol - 2002/06/24
1711  *              Removed "lazy" MPI_File_set_view() calls, since they would fail
1712  *              if the first I/O was a collective I/O using MPI derived types
1713  *              and the next I/O was an independent I/O.
1714  *
1715  *              Quincey Koziol - 2002/07/18
1716  *              Added "block_before_meta_write" dataset transfer flag, which
1717  *              is set during writes from a metadata cache flush and indicates
1718  *              that all the processes must sync up before (one of them)
1719  *              writing metadata.
1720  *
1721  *              Quincey Koziol - 2003/10/22-31
1722  *              Restructured code massively, straightening out logic and finally
1723  *              getting the bytes_written stuff working.
1724  *
1725  *-------------------------------------------------------------------------
1726  */
1727 static herr_t
H5FD_mpio_write(H5FD_t * _file,H5FD_mem_t type,hid_t dxpl_id,haddr_t addr,size_t size,const void * buf)1728 H5FD_mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
1729     size_t size, const void *buf)
1730 {
1731     H5FD_mpio_t      *file = (H5FD_mpio_t*)_file;
1732     MPI_Offset        mpi_off;
1733     MPI_Status      mpi_stat;       /* Status from I/O operation */
1734     MPI_Datatype    buf_type = MPI_BYTE;      /* MPI description of the selection in memory */
1735     int              mpi_code;  /* MPI return code */
1736     int             size_i, bytes_written;
1737     int                         type_size;      /* MPI datatype used for I/O's size */
1738     int                         io_size;        /* Actual number of bytes requested */
1739     hbool_t      use_view_this_time = FALSE;
1740     H5P_genplist_t              *plist = NULL;  /* Property list pointer */
1741     herr_t                ret_value = SUCCEED;
1742 
1743     FUNC_ENTER_NOAPI_NOINIT
1744 
1745 #ifdef H5FDmpio_DEBUG
1746     if (H5FD_mpio_Debug[(int)'t'])
1747       fprintf(stdout, "Entering H5FD_mpio_write\n" );
1748 #endif
1749     HDassert(file);
1750     HDassert(H5FD_MPIO==file->pub.driver_id);
1751     /* Make certain we have the correct type of property list */
1752     HDassert(H5I_GENPROP_LST==H5I_get_type(dxpl_id));
1753     HDassert(TRUE==H5P_isa_class(dxpl_id,H5P_DATASET_XFER));
1754     HDassert(buf);
1755 
1756     /* Portably initialize MPI status variable */
1757     HDmemset(&mpi_stat, 0, sizeof(MPI_Status));
1758 
1759     /* some numeric conversions */
1760     if(H5FD_mpi_haddr_to_MPIOff(addr, &mpi_off) < 0)
1761         HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off")
1762     size_i = (int)size;
1763     if((hsize_t)size_i != size)
1764         HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from size to size_i")
1765 
1766 #ifdef H5FDmpio_DEBUG
1767     if(H5FD_mpio_Debug[(int)'w'])
1768         fprintf(stdout, "in H5FD_mpio_write  mpi_off=%ld  size_i=%d\n", (long)mpi_off, size_i);
1769 #endif
1770 
1771     if(type == H5FD_MEM_DRAW) {
1772         H5FD_mpio_xfer_t            xfer_mode;   /* I/O transfer mode */
1773 
1774         /* Obtain the data transfer properties */
1775         if(NULL == (plist = (H5P_genplist_t *)H5I_object(dxpl_id)))
1776             HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
1777 
1778         /* get the transfer mode from the dxpl */
1779         if(H5P_get(plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode)<0)
1780             HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode")
1781 
1782         /*
1783          * Set up for a fancy xfer using complex types, or single byte block. We
1784          * wouldn't need to rely on the use_view field if MPI semantics allowed
1785          * us to test that btype=ftype=MPI_BYTE (or even MPI_TYPE_NULL, which
1786          * could mean "use MPI_BYTE" by convention).
1787          */
1788         if(xfer_mode == H5FD_MPIO_COLLECTIVE) {
1789             MPI_Datatype    file_type;
1790 
1791             /* Remember that views are used */
1792             use_view_this_time = TRUE;
1793 
1794             /* prepare for a full-blown xfer using btype, ftype, and disp */
1795             if(H5P_get(plist, H5FD_MPI_XFER_MEM_MPI_TYPE_NAME, &buf_type) < 0)
1796                 HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O type property")
1797             if(H5P_get(plist, H5FD_MPI_XFER_FILE_MPI_TYPE_NAME, &file_type) < 0)
1798                 HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O type property")
1799 
1800             /*
1801              * Set the file view when we are using MPI derived types
1802              */
1803             if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type, H5FD_mpi_native_g, file->info)))
1804                 HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
1805 
1806             /* When using types, use the address as the displacement for
1807              * MPI_File_set_view and reset the address for the read to zero
1808              */
1809             mpi_off = 0;
1810         } /* end if */
1811     } /* end if */
1812     else {
1813 #if 0 /* JRM -- 3/23/10 */ /* this is no longer always the case */
1814         /* Only one process can do the actual metadata write */
1815         if(file->mpi_rank != H5_PAR_META_WRITE)
1816 #ifdef LATER
1817             HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "can't write metadata from non-zero rank")
1818 #else /* LATER */
1819             HGOTO_DONE(SUCCEED) /* skip the actual write */
1820 #endif /* LATER */
1821 #endif /* JRM */
1822     } /* end if */
1823 
1824     /* Write the data. */
1825     if(use_view_this_time) {
1826         H5FD_mpio_collective_opt_t coll_opt_mode;
1827 
1828 #ifdef H5FDmpio_DEBUG
1829         if(H5FD_mpio_Debug[(int)'t'])
1830             fprintf(stdout, "H5FD_mpio_write: using MPIO collective mode\n");
1831 #endif
1832         /* Get the collective_opt property to check whether the application wants to do IO individually. */
1833         HDassert(plist);
1834         /* get the transfer mode from the dxpl */
1835         if(H5P_get(plist, H5D_XFER_MPIO_COLLECTIVE_OPT_NAME, &coll_opt_mode)<0)
1836             HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property")
1837 
1838         if(coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
1839 #ifdef H5FDmpio_DEBUG
1840             if(H5FD_mpio_Debug[(int)'t'])
1841                 fprintf(stdout, "H5FD_mpio_write: doing MPI collective IO\n");
1842 #endif
1843             if(MPI_SUCCESS != (mpi_code = MPI_File_write_at_all(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
1844                 HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code)
1845         } /* end if */
1846         else {
1847 #ifdef H5FDmpio_DEBUG
1848             if(H5FD_mpio_Debug[(int)'t'])
1849                 fprintf(stdout, "H5FD_mpio_write: doing MPI independent IO\n");
1850 #endif
1851             if(MPI_SUCCESS != (mpi_code = MPI_File_write_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
1852                 HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code)
1853         } /* end else */
1854 
1855         /* Reset the file view when we used MPI derived types */
1856         if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE, H5FD_mpi_native_g,  file->info)))
1857             HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
1858     } else {
1859         if(MPI_SUCCESS != (mpi_code = MPI_File_write_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
1860             HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code)
1861     }
1862 
1863     /* How many bytes were actually written? */
1864     /* [This works because the "basic elements" we use for all our MPI derived
1865      *  types are MPI_BYTE.  We should be using the 'buf_type' for the MPI
1866      *  datatype in this call though... (We aren't because using it causes
1867      *  the LANL "qsc" machine to dump core - 12/19/03) - QAK]
1868      */
1869     if(MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_written)))
1870         HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
1871 
1872     /* Get the type's size */
1873     if(MPI_SUCCESS != (mpi_code = MPI_Type_size(buf_type, &type_size)))
1874         HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code)
1875 
1876     /* Compute the actual number of bytes requested */
1877     io_size = type_size * size_i;
1878 
1879     /* Check for write failure */
1880     if(bytes_written != io_size)
1881         HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "file write failed")
1882 
1883     /* Forget the EOF value (see H5FD_mpio_get_eof()) --rpm 1999-08-06 */
1884     file->eof = HADDR_UNDEF;
1885 
1886 done:
1887 #ifdef H5FDmpio_DEBUG
1888     if(H5FD_mpio_Debug[(int)'t'])
1889       fprintf(stdout, "proc %d: Leaving H5FD_mpio_write with ret_value=%d\n",
1890       file->mpi_rank, ret_value );
1891 #endif
1892     FUNC_LEAVE_NOAPI(ret_value)
1893 } /* end H5FD_mpio_write() */
1894 
1895 
1896 /*-------------------------------------------------------------------------
1897  * Function:    H5FD_mpio_flush
1898  *
1899  * Purpose:     Makes sure that all data is on disk.  This is collective.
1900  *
1901  * Return:      Success:  Non-negative
1902  *
1903  *     Failure:  Negative
1904  *
1905  * Programmer:  Robb Matzke
1906  *              January 30, 1998
1907  *
1908  *-------------------------------------------------------------------------
1909  */
1910 static herr_t
H5FD_mpio_flush(H5FD_t * _file,hid_t H5_ATTR_UNUSED dxpl_id,unsigned closing)1911 H5FD_mpio_flush(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, unsigned closing)
1912 {
1913     H5FD_mpio_t    *file = (H5FD_mpio_t*)_file;
1914     int      mpi_code;  /* mpi return code */
1915     herr_t              ret_value = SUCCEED;
1916 
1917     FUNC_ENTER_NOAPI_NOINIT
1918 
1919 #ifdef H5FDmpio_DEBUG
1920     if(H5FD_mpio_Debug[(int)'t'])
1921       HDfprintf(stdout, "Entering %s\n", FUNC);
1922 #endif
1923     HDassert(file);
1924     HDassert(H5FD_MPIO == file->pub.driver_id);
1925 
1926     /* Only sync the file if we are not going to immediately close it */
1927     if(!closing) {
1928         if(MPI_SUCCESS != (mpi_code = MPI_File_sync(file->f)))
1929             HMPI_GOTO_ERROR(FAIL, "MPI_File_sync failed", mpi_code)
1930     } /* end if */
1931 
1932 done:
1933 #ifdef H5FDmpio_DEBUG
1934     if(H5FD_mpio_Debug[(int)'t'])
1935       HDfprintf(stdout, "Leaving %s\n", FUNC);
1936 #endif
1937 
1938     FUNC_LEAVE_NOAPI(ret_value)
1939 } /* end H5FD_mpio_flush() */
1940 
1941 
1942 /*-------------------------------------------------------------------------
1943  * Function:    H5FD_mpio_truncate
1944  *
1945  * Purpose:     Make certain the file's size matches it's allocated size
1946  *
1947  * Return:      Success:  Non-negative
1948  *     Failure:  Negative
1949  *
1950  * Programmer:  Quincey Koziol
1951  *              January 31, 2008
1952  *
1953  *-------------------------------------------------------------------------
1954  */
1955 static herr_t
H5FD_mpio_truncate(H5FD_t * _file,hid_t H5_ATTR_UNUSED dxpl_id,hbool_t H5_ATTR_UNUSED closing)1956 H5FD_mpio_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5_ATTR_UNUSED closing)
1957 {
1958     H5FD_mpio_t    *file = (H5FD_mpio_t*)_file;
1959     herr_t              ret_value = SUCCEED;
1960 
1961     FUNC_ENTER_NOAPI_NOINIT
1962 
1963 #ifdef H5FDmpio_DEBUG
1964     if(H5FD_mpio_Debug[(int)'t'])
1965       HDfprintf(stdout, "Entering %s\n", FUNC);
1966 #endif
1967     HDassert(file);
1968     HDassert(H5FD_MPIO == file->pub.driver_id);
1969 
1970     /* Extend the file to make sure it's large enough, then sync.
1971      * Unfortunately, keeping track of EOF is an expensive operation, so
1972      * we can't just check whether EOF<EOA like with other drivers.
1973      * Therefore we'll just read the byte at EOA-1 and then write it back. */
1974     if(file->eoa > file->last_eoa) {
1975         int    mpi_code;  /* mpi return code */
1976         MPI_Offset      mpi_off;
1977 
1978         if(H5FD_mpi_haddr_to_MPIOff(file->eoa, &mpi_off) < 0)
1979             HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "cannot convert from haddr_t to MPI_Offset")
1980 
1981         /* Extend the file's size */
1982         if(MPI_SUCCESS != (mpi_code = MPI_File_set_size(file->f, mpi_off)))
1983             HMPI_GOTO_ERROR(FAIL, "MPI_File_set_size failed", mpi_code)
1984 
1985 	/* Don't let any proc return until all have extended the file.
1986          * (Prevents race condition where some processes go ahead and write
1987          * more data to the file before all the processes have finished making
1988          * it the shorter length, potentially truncating the file and dropping
1989          * the new data written)
1990          */
1991         if(MPI_SUCCESS != (mpi_code = MPI_Barrier(file->comm)))
1992             HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
1993 
1994         /* Update the 'last' eoa value */
1995         file->last_eoa = file->eoa;
1996     } /* end if */
1997 
1998 done:
1999 #ifdef H5FDmpio_DEBUG
2000     if(H5FD_mpio_Debug[(int)'t'])
2001       HDfprintf(stdout, "Leaving %s\n", FUNC);
2002 #endif
2003 
2004     FUNC_LEAVE_NOAPI(ret_value)
2005 } /* end H5FD_mpio_truncate() */
2006 
2007 
2008 /*-------------------------------------------------------------------------
2009  * Function:  H5FD_mpio_mpi_rank
2010  *
2011  * Purpose:  Returns the MPI rank for a process
2012  *
2013  * Return:  Success: non-negative
2014  *    Failure: negative
2015  *
2016  * Programmer:  Quincey Koziol
2017  *              Thursday, May 16, 2002
2018  *
2019  * Modifications:
2020  *
2021  *-------------------------------------------------------------------------
2022  */
2023 static int
H5FD_mpio_mpi_rank(const H5FD_t * _file)2024 H5FD_mpio_mpi_rank(const H5FD_t *_file)
2025 {
2026     const H5FD_mpio_t  *file = (const H5FD_mpio_t*)_file;
2027 
2028     FUNC_ENTER_NOAPI_NOINIT_NOERR
2029 
2030     HDassert(file);
2031     HDassert(H5FD_MPIO==file->pub.driver_id);
2032 
2033     FUNC_LEAVE_NOAPI(file->mpi_rank)
2034 } /* end H5FD_mpio_mpi_rank() */
2035 
2036 
2037 /*-------------------------------------------------------------------------
2038  * Function:  H5FD_mpio_mpi_size
2039  *
2040  * Purpose:  Returns the number of MPI processes
2041  *
2042  * Return:  Success: non-negative
2043  *    Failure: negative
2044  *
2045  * Programmer:  Quincey Koziol
2046  *              Thursday, May 16, 2002
2047  *
2048  * Modifications:
2049  *
2050  *-------------------------------------------------------------------------
2051  */
2052 static int
H5FD_mpio_mpi_size(const H5FD_t * _file)2053 H5FD_mpio_mpi_size(const H5FD_t *_file)
2054 {
2055     const H5FD_mpio_t  *file = (const H5FD_mpio_t*)_file;
2056 
2057     FUNC_ENTER_NOAPI_NOINIT_NOERR
2058 
2059     HDassert(file);
2060     HDassert(H5FD_MPIO==file->pub.driver_id);
2061 
2062     FUNC_LEAVE_NOAPI(file->mpi_size)
2063 } /* end H5FD_mpio_mpi_size() */
2064 
2065 
2066 /*-------------------------------------------------------------------------
2067  * Function:  H5FD_mpio_communicator
2068  *
2069  * Purpose:  Returns the MPI communicator for the file.
2070  *
2071  * Return:  Success:  The communicator
2072  *
2073  *    Failure:  NULL
2074  *
2075  * Programmer:  Robb Matzke
2076  *              Monday, August  9, 1999
2077  *
2078  * Modifications:
2079  *
2080  *-------------------------------------------------------------------------
2081  */
2082 static MPI_Comm
H5FD_mpio_communicator(const H5FD_t * _file)2083 H5FD_mpio_communicator(const H5FD_t *_file)
2084 {
2085     const H5FD_mpio_t  *file = (const H5FD_mpio_t*)_file;
2086 
2087     FUNC_ENTER_NOAPI_NOINIT_NOERR
2088 
2089     HDassert(file);
2090     HDassert(H5FD_MPIO==file->pub.driver_id);
2091 
2092     FUNC_LEAVE_NOAPI(file->comm)
2093 }
2094 
2095 #endif /* H5_HAVE_PARALLEL */
2096 
2097