1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  * Copyright by The HDF Group.                                               *
3  * All rights reserved.                                                      *
4  *                                                                           *
5  * This file is part of HDF5.  The full HDF5 copyright notice, including     *
6  * terms governing use, modification, and redistribution, is contained in    *
7  * the COPYING file, which can be found at the root of the source code       *
8  * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases.  *
9  * If you do not have access to either file, you may request a copy from     *
10  * help@hdfgroup.org.                                                        *
11  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
12 
13 /*
14  * Author: Albert Cheng of NCSA, Oct 24, 2001.
15  */
16 
17 #include "hdf5.h"
18 
19 #ifdef H5_STDC_HEADERS
20 #include <errno.h>
21 #include <fcntl.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #endif
25 
26 #ifdef H5_HAVE_UNISTD_H
27 #include <sys/types.h>
28 #include <unistd.h>
29 #endif
30 
31 #ifdef H5_HAVE_SYS_STAT_H
32 #include <sys/stat.h>
33 #endif
34 
35 #ifdef H5_HAVE_PARALLEL
36 
37 #include <mpi.h>
38 
39 #ifndef MPI_FILE_NULL           /*MPIO may be defined in mpi.h already       */
40 #   include <mpio.h>
41 #endif  /* !MPI_FILE_NULL */
42 
43 #include "pio_perf.h"
44 
45 /* Macro definitions */
46 
47 #if H5_VERS_MAJOR == 1 && H5_VERS_MINOR == 6
48 #    define H5DCREATE(fd, name, type, space, dcpl)    H5Dcreate(fd, name, type, space, dcpl)
49 #    define H5DOPEN(fd, name)                         H5Dopen(fd, name)
50 #else
51 #    define H5DCREATE(fd, name, type, space, dcpl)    H5Dcreate2(fd, name, type, space, H5P_DEFAULT, dcpl, H5P_DEFAULT)
52 #    define H5DOPEN(fd, name)                         H5Dopen2(fd, name, H5P_DEFAULT)
53 #endif
54 
55 /* sizes of various items. these sizes won't change during program execution */
56 /* The following three must have the same type */
57 #define ELMT_SIZE           (sizeof(unsigned char))     /* we're doing bytes */
58 #define ELMT_MPI_TYPE       MPI_BYTE
59 #define ELMT_H5_TYPE        H5T_NATIVE_UCHAR
60 
61 #define GOTOERROR(errcode)  { ret_code = errcode; goto done; }
62 #define GOTODONE        { goto done; }
63 #define ERRMSG(mesg) {                                                  \
64     HDfprintf(stderr, "Proc %d: ", pio_mpi_rank_g);                       \
65     HDfprintf(stderr, "*** Assertion failed (%s) at line %4d in %s\n",    \
66         mesg, (int)__LINE__, __FILE__);                             \
67 }
68 
69 #define MSG(mesg) {                                     \
70     HDfprintf(stderr, "Proc %d: ", pio_mpi_rank_g);       \
71     HDfprintf(stderr, "(%s) at line %4d in %s\n",         \
72         mesg, (int)__LINE__, __FILE__);             \
73 }
74 
75 /* verify: if val is false (0), print mesg. */
76 #define VRFY(val, mesg) do {                            \
77     if (!val) {                                         \
78     ERRMSG(mesg);                                   \
79     GOTOERROR(FAIL);                                \
80     }                                                   \
81 } while(0)
82 
83 
84 /* POSIX I/O macros */
85 #ifdef H5_HAVE_WIN32_API
86 /* Can't link against the library, so this test will use the older, non-Unicode
87  * _open() call on Windows.
88  */
89 #define HDopen(S,F,...)           _open(S, F | _O_BINARY, __VA_ARGS__)
90 #endif /* H5_HAVE_WIN32_API */
91 #define POSIXCREATE(fn)           HDopen(fn, O_CREAT|O_TRUNC|O_RDWR, 0600)
92 #define POSIXOPEN(fn, F)          HDopen(fn, F, 0600)
93 #define POSIXCLOSE(F)             HDclose(F)
94 #define POSIXSEEK(F,L)            HDlseek(F, L, SEEK_SET)
95 #define POSIXWRITE(F,B,S)         HDwrite(F,B,S)
96 #define POSIXREAD(F,B,S)          HDread(F,B,S)
97 
98 enum {
99     PIO_CREATE = 1,
100     PIO_WRITE = 2,
101     PIO_READ = 4
102 };
103 
104 /* Global variables */
105 static int  clean_file_g = -1;  /*whether to cleanup temporary test     */
106 /*files. -1 is not defined;             */
107 /*0 is no cleanup; 1 is do cleanup      */
108 
109 /*
110  * In a parallel machine, the filesystem suitable for compiling is
111  * unlikely a parallel file system that is suitable for parallel I/O.
112  * There is no standard pathname for the parallel file system.  /tmp
113  * is about the best guess.
114  */
115 #ifndef HDF5_PARAPREFIX
116 #    define HDF5_PARAPREFIX     ""
117 #endif  /* !HDF5_PARAPREFIX */
118 
119 #ifndef MIN
120 #   define MIN(a,b) ((a) < (b) ? (a) : (b))
121 #endif  /* !MIN */
122 
123 /* the different types of file descriptors we can expect */
124 typedef union _file_descr {
125     int         posixfd;    /* POSIX file handle*/
126     MPI_File    mpifd;      /* MPI file         */
127     hid_t       h5fd;       /* HDF5 file        */
128 } file_descr;
129 
130 /* local functions */
131 static char  *pio_create_filename(iotype iot, const char *base_name,
132     char *fullname, size_t size);
133 static herr_t do_write(results *res, file_descr *fd, parameters *parms,
134     long ndsets, off_t nelmts, size_t buf_size, void *buffer);
135 static herr_t do_read(results *res, file_descr *fd, parameters *parms,
136     long ndsets, off_t nelmts, size_t buf_size, void *buffer /*out*/);
137 static herr_t do_fopen(parameters *param, char *fname, file_descr *fd /*out*/,
138     int flags);
139 static herr_t do_fclose(iotype iot, file_descr *fd);
140 static void do_cleanupfile(iotype iot, char *fname);
141 
142 /*
143  * Function:        do_pio
144  * Purpose:         PIO Engine where Parallel IO are executed.
145  * Return:          results
146  * Programmer:      Albert Cheng, Bill Wendling 2001/12/12
147  * Modifications:
148  *  Added 2D testing (Christian Chilan, 10. August 2005)
149  */
150     results
do_pio(parameters param)151 do_pio(parameters param)
152 {
153     /* return codes */
154     herr_t      ret_code = 0;   /*return code                           */
155     results     res;
156 
157     file_descr  fd;
158     iotype      iot;
159 
160     char        fname[FILENAME_MAX];
161     long    nf;
162     long        ndsets;
163     off_t       nbytes;         /*number of bytes per dataset  */
164     off_t   snbytes;    /*general dataset size          */
165                                 /*for 1D, it is the actual dataset size */
166                                 /*for 2D, it is the size of a side of the dataset square */
167     char        *buffer = NULL; /*data buffer pointer           */
168     size_t      buf_size;       /*general buffer size in bytes     */
169                                 /*for 1D, it is the actual buffer size */
170                                 /*for 2D, it is the length of the buffer rectangle */
171     size_t      blk_size;       /*data block size in bytes      */
172     size_t  bsize;      /*actual buffer size            */
173 
174     /* HDF5 variables */
175     herr_t          hrc;        /*HDF5 return code              */
176 
177     /* Sanity check parameters */
178 
179     /* IO type */
180     iot = param.io_type;
181 
182     switch (iot) {
183         case MPIO:
184             fd.mpifd = MPI_FILE_NULL;
185             res.timers = io_time_new(MPI_CLOCK);
186             break;
187         case POSIXIO:
188             fd.posixfd = -1;
189             res.timers = io_time_new(MPI_CLOCK);
190             break;
191         case PHDF5:
192             fd.h5fd = -1;
193             res.timers = io_time_new(MPI_CLOCK);
194             break;
195         default:
196             /* unknown request */
197             HDfprintf(stderr, "Unknown IO type request (%d)\n", iot);
198             GOTOERROR(FAIL);
199     }
200 
201     ndsets = param.num_dsets;       /* number of datasets per file          */
202     nbytes = param.num_bytes;       /* number of bytes per dataset          */
203     buf_size = param.buf_size;
204     blk_size = param.blk_size;
205 
206     if (!param.dim2d){
207         snbytes = nbytes;   /* General dataset size     */
208         bsize = buf_size;   /* Actual buffer size       */
209     }
210     else {
211         snbytes = (off_t)sqrt(nbytes);  /* General dataset size     */
212         bsize = buf_size * blk_size;    /* Actual buffer size       */
213     }
214 
215     if (param.num_files < 0 ) {
216     HDfprintf(stderr,
217         "number of files must be >= 0 (%ld)\n",
218         param.num_files);
219     GOTOERROR(FAIL);
220     }
221 
222     if (ndsets < 0 ) {
223     HDfprintf(stderr,
224         "number of datasets per file must be >= 0 (%ld)\n",
225         ndsets);
226     GOTOERROR(FAIL);
227     }
228 
229     if (param.num_procs <= 0 ) {
230     HDfprintf(stderr,
231         "maximum number of process to use must be > 0 (%d)\n",
232         param.num_procs);
233     GOTOERROR(FAIL);
234     }
235 
236     /* Validate transfer buffer size & block size*/
237     if(blk_size<=0) {
238     HDfprintf(stderr,
239         "Transfer block size (%zu) must be > 0\n", blk_size);
240     GOTOERROR(FAIL);
241     }
242     if(buf_size<=0) {
243     HDfprintf(stderr,
244         "Transfer buffer size (%zu) must be > 0\n", buf_size);
245     GOTOERROR(FAIL);
246     }
247     if ((buf_size % blk_size) != 0){
248     HDfprintf(stderr,
249         "Transfer buffer size (%zu) must be a multiple of the "
250         "interleaved I/O block size (%zu)\n",
251         buf_size, blk_size);
252     GOTOERROR(FAIL);
253     }
254     if((snbytes%pio_mpi_nprocs_g)!=0) {
255     HDfprintf(stderr,
256               "Dataset size (%" H5_PRINTF_LL_WIDTH "d) must be a multiple of the "
257               "number of processes (%d)\n",
258               (long long)snbytes, pio_mpi_nprocs_g);
259     GOTOERROR(FAIL);
260     }
261 
262     if (!param.dim2d){
263         if(((snbytes/pio_mpi_nprocs_g)%buf_size)!=0) {
264         HDfprintf(stderr,
265             "Dataset size/process (%" H5_PRINTF_LL_WIDTH "d) must be a multiple of the "
266             "trasfer buffer size (%zu)\n",
267             (long long)(snbytes/pio_mpi_nprocs_g), buf_size);
268         GOTOERROR(FAIL);
269         }
270     }
271     else {
272         if((snbytes%buf_size)!=0) {
273         HDfprintf(stderr,
274             "Dataset side size (%" H5_PRINTF_LL_WIDTH "d) must be a multiple of the "
275             "trasfer buffer size (%zu)\n",
276             (long long)snbytes, buf_size);
277         GOTOERROR(FAIL);
278         }
279     }
280 
281     /* Allocate transfer buffer */
282     if ((buffer = malloc(bsize)) == NULL){
283     HDfprintf(stderr, "malloc for transfer buffer size (%zu) failed\n",
284         bsize);
285     GOTOERROR(FAIL);
286     }
287 
288     if (pio_debug_level >= 4) {
289     int myrank;
290 
291     MPI_Comm_rank(pio_comm_g, &myrank);
292 
293     /* output all of the times for all iterations */
294     if (myrank == 0)
295         HDfprintf(output, "Timer details:\n");
296     }
297 
298     for (nf = 1; nf <= param.num_files; nf++) {
299     /*
300      * Write performance measurement
301      */
302     /* Open file for write */
303     char base_name[256];
304 
305     HDsprintf(base_name, "#pio_tmp_%lu", nf);
306     pio_create_filename(iot, base_name, fname, sizeof(fname));
307     if (pio_debug_level > 0)
308         HDfprintf(output, "rank %d: data filename=%s\n",
309             pio_mpi_rank_g, fname);
310 
311     /* Need barrier to make sure everyone starts at the same time */
312     MPI_Barrier(pio_comm_g);
313 
314     set_time(res.timers, HDF5_GROSS_WRITE_FIXED_DIMS, TSTART);
315     hrc = do_fopen(&param, fname, &fd, PIO_CREATE | PIO_WRITE);
316 
317     VRFY((hrc == SUCCESS), "do_fopen failed");
318 
319     set_time(res.timers, HDF5_FINE_WRITE_FIXED_DIMS, TSTART);
320     hrc = do_write(&res, &fd, &param, ndsets, nbytes, buf_size, buffer);
321     set_time(res.timers, HDF5_FINE_WRITE_FIXED_DIMS, TSTOP);
322 
323     VRFY((hrc == SUCCESS), "do_write failed");
324 
325     /* Close file for write */
326     hrc = do_fclose(iot, &fd);
327 
328     set_time(res.timers, HDF5_GROSS_WRITE_FIXED_DIMS, TSTOP);
329     VRFY((hrc == SUCCESS), "do_fclose failed");
330 
331     if (!param.h5_write_only) {
332         /*
333          * Read performance measurement
334          */
335         /* Need barrier to make sure everyone is done writing and has
336          * closed the file.  Also to make sure everyone starts reading
337          * at the same time.
338          */
339         MPI_Barrier(pio_comm_g);
340 
341         /* Open file for read */
342         set_time(res.timers, HDF5_GROSS_READ_FIXED_DIMS, TSTART);
343         hrc = do_fopen(&param, fname, &fd, PIO_READ);
344 
345         VRFY((hrc == SUCCESS), "do_fopen failed");
346 
347         set_time(res.timers, HDF5_FINE_READ_FIXED_DIMS, TSTART);
348         hrc = do_read(&res, &fd, &param, ndsets, nbytes, buf_size, buffer);
349         set_time(res.timers, HDF5_FINE_READ_FIXED_DIMS, TSTOP);
350         VRFY((hrc == SUCCESS), "do_read failed");
351 
352         /* Close file for read */
353         hrc = do_fclose(iot, &fd);
354 
355         set_time(res.timers, HDF5_GROSS_READ_FIXED_DIMS, TSTOP);
356         VRFY((hrc == SUCCESS), "do_fclose failed");
357     }
358 
359     /* Need barrier to make sure everyone is done with the file */
360     /* before it may be removed by do_cleanupfile */
361     MPI_Barrier(pio_comm_g);
362     do_cleanupfile(iot, fname);
363     }
364 
365 done:
366     /* clean up */
367     /* release HDF5 objects */
368 
369     /* close any opened files */
370     /* no remove(fname) because that should have happened normally. */
371     switch (iot) {
372         case POSIXIO:
373             if (fd.posixfd != -1)
374             hrc = do_fclose(iot, &fd);
375             break;
376         case MPIO:
377             if (fd.mpifd != MPI_FILE_NULL)
378             hrc = do_fclose(iot, &fd);
379             break;
380         case PHDF5:
381             if (fd.h5fd != -1)
382             hrc = do_fclose(iot, &fd);
383             break;
384     }
385 
386     /* release generic resources */
387     if(buffer)
388         HDfree(buffer);
389     res.ret_code = ret_code;
390     return res;
391 }
392 
393 /*
394  * Function:    pio_create_filename
395  * Purpose:     Create a new filename to write to. Determine the correct
396  *              suffix to append to the filename by the type of I/O we're
397  *              doing. Also, place in the /tmp/{$USER,$LOGIN} directory if
398  *              USER or LOGIN are specified in the environment.
399  * Return:      Pointer to filename or NULL
400  * Programmer:  Bill Wendling, 21. November 2001
401  * Modifications:
402  */
403     static char *
pio_create_filename(iotype iot,const char * base_name,char * fullname,size_t size)404 pio_create_filename(iotype iot, const char *base_name, char *fullname, size_t size)
405 {
406     const char *prefix, *suffix = "";
407     char *ptr, last = '\0';
408     size_t i, j;
409 
410     if (!base_name || !fullname || size < 1)
411         return NULL;
412 
413     HDmemset(fullname, 0, size);
414 
415     switch (iot) {
416         case POSIXIO:
417             suffix = ".posix";
418             break;
419         case MPIO:
420             suffix = ".mpio";
421             break;
422         case PHDF5:
423             suffix = ".h5";
424             break;
425     }
426 
427     /* First use the environment variable and then try the constant */
428     prefix = HDgetenv("HDF5_PARAPREFIX");
429 
430 #ifdef HDF5_PARAPREFIX
431     if (!prefix)
432         prefix = HDF5_PARAPREFIX;
433 #endif  /* HDF5_PARAPREFIX */
434 
435     /* Prepend the prefix value to the base name */
436     if (prefix && *prefix) {
437         /* If the prefix specifies the HDF5_PARAPREFIX directory, then
438          * default to using the "/tmp/$USER" or "/tmp/$LOGIN"
439          * directory instead. */
440         register char *user, *login, *subdir;
441 
442         user = HDgetenv("USER");
443         login = HDgetenv("LOGIN");
444         subdir = (user ? user : login);
445 
446         if (subdir) {
447             for (i = 0; i < size-1 && prefix[i]; i++)
448                 fullname[i] = prefix[i];
449 
450             fullname[i++] = '/';
451 
452             for (j = 0; i < size && subdir[j]; i++, j++)
453                 fullname[i] = subdir[j];
454         }
455         else {
456             /* We didn't append the prefix yet */
457             HDstrncpy(fullname, prefix, size);
458             fullname[size - 1] = '\0';
459         }
460 
461         if ((HDstrlen(fullname) + HDstrlen(base_name) + 1) < size) {
462             /* Append the base_name with a slash first. Multiple slashes are
463              * handled below. */
464             h5_stat_t buf;
465 
466             if (HDstat(fullname, &buf) < 0)
467                 /* The directory doesn't exist just yet */
468                 if (HDmkdir(fullname, (mode_t) 0755) < 0 && errno != EEXIST) {
469                     /* We couldn't make the "/tmp/${USER,LOGIN}" subdirectory.
470                      * Default to PREFIX's original prefix value. */
471                     HDstrcpy(fullname, prefix);
472                 }
473 
474             HDstrcat(fullname, "/");
475             HDstrcat(fullname, base_name);
476         }
477         else {
478             /* Buffer is too small */
479             return NULL;
480         }
481     }
482     else if (HDstrlen(base_name) >= size) {
483         /* Buffer is too small */
484         return NULL;
485     }
486     else {
487         HDstrcpy(fullname, base_name);
488     }
489 
490     /* Append a suffix */
491     if (suffix) {
492         if (HDstrlen(fullname) + HDstrlen(suffix) >= size)
493             return NULL;
494 
495         HDstrcat(fullname, suffix);
496     }
497 
498     /* Remove any double slashes in the filename */
499     for (ptr = fullname, i = j = 0; ptr && i < size; i++, ptr++) {
500         if (*ptr != '/' || last != '/')
501             fullname[j++] = *ptr;
502 
503         last = *ptr;
504     }
505 
506     return fullname;
507 }
508 
509 /*
510  * Function:        do_write
511  * Purpose:         Write the required amount of data to the file.
512  * Return:          SUCCESS or FAIL
513  * Programmer:      Albert Cheng, Bill Wendling, 2001/12/13
514  * Modifications:
515  *  Added 2D testing (Christian Chilan, 10. August 2005)
516  */
517     static herr_t
do_write(results * res,file_descr * fd,parameters * parms,long ndsets,off_t nbytes,size_t buf_size,void * buffer)518 do_write(results *res, file_descr *fd, parameters *parms, long ndsets,
519     off_t nbytes, size_t buf_size, void *buffer)
520 {
521     int         ret_code = SUCCESS;
522     int         rc;             /*routine return code                   */
523     long        ndset;
524     size_t      blk_size;       /* The block size to subdivide the xfer buffer into */
525     off_t       nbytes_xfer;    /* Total number of bytes transferred so far */
526     size_t      nbytes_xfer_advance; /* Number of bytes transferred in a single I/O operation */
527     size_t      nbytes_toxfer;  /* Number of bytes to transfer a particular time */
528     char        dname[64];
529     off_t       dset_offset=0;  /*dataset offset in a file              */
530     off_t       bytes_begin[2];    /*first elmt this process transfer      */
531     off_t       bytes_count;    /*number of elmts this process transfer */
532     off_t       snbytes=0;  /*size of a side of the dataset square  */
533     unsigned char *buf_p;       /* Current buffer pointer               */
534 
535     /* POSIX variables */
536     off_t       file_offset;    /* File offset of the next transfer     */
537     off_t       file_offset_advance;    /* File offset advance after each I/O operation */
538     off_t       posix_file_offset;    /* Base file offset of the next transfer      */
539 
540     /* MPI variables */
541     MPI_Offset  mpi_file_offset;    /* Base file offset of the next transfer*/
542     MPI_Offset  mpi_offset;         /* Offset in MPI file                   */
543     MPI_Offset  mpi_offset_advance; /* Offset advance after each I/O operation */
544     MPI_Datatype mpi_file_type;     /* MPI derived type for 1D file            */
545     MPI_Datatype mpi_blk_type;      /* MPI derived type for 1D buffer          */
546     MPI_Datatype mpi_cont_type;     /* MPI derived type for 2D contiguous file */
547     MPI_Datatype mpi_partial_buffer_cont;       /* MPI derived type for partial 2D contiguous buffer */
548     MPI_Datatype mpi_inter_type;    /* MPI derived type for 2D interleaved file  */
549     MPI_Datatype mpi_partial_buffer_inter;      /* MPI derived type for partial 2D interleaved buffer */
550     MPI_Datatype mpi_full_buffer;       /* MPI derived type for 2D full buffer */
551     MPI_Datatype mpi_full_chunk;        /* MPI derived type for 2D full chunk */
552     MPI_Datatype mpi_chunk_inter_type;  /* MPI derived type for 2D chunk interleaved file */
553     MPI_Datatype mpi_collective_type;   /* Generic MPI derived type for 2D collective access */
554     MPI_Status  mpi_status;
555     int         mrc;                /* MPI return code                      */
556 
557     /* HDF5 variables */
558     herr_t      hrc;                    /*HDF5 return code              */
559     hsize_t     h5dims[2];              /*dataset dim sizes             */
560     hid_t       h5dset_space_id = -1;   /*dataset space ID              */
561     hid_t       h5mem_space_id = -1;    /*memory dataspace ID           */
562     hid_t       h5ds_id = -1;           /*dataset handle                */
563     hsize_t     h5block[2];     /*dataspace selection           */
564     hsize_t     h5stride[2];
565     hsize_t     h5count[2];
566     hsize_t     h5start[2];
567     hssize_t    h5offset[2];            /* Selection offset within dataspace */
568     hid_t       h5dcpl = -1;            /* Dataset creation property list */
569     hid_t       h5dxpl = -1;            /* Dataset transfer property list */
570 
571     /* Get the parameters from the parameter block */
572     blk_size=parms->blk_size;
573 
574     /* There are two kinds of transfer patterns, contiguous and interleaved.
575      * Let 0,1,2,...,n be data accessed by process 0,1,2,...,n
576      *     where n is rank of the last process.
577      * In contiguous pattern, data are accessed as
578      *    000...111...222...nnn...
579      * In interleaved pattern, data are accessed as
580      *    012...n012...n...
581      * These are all in the scope of one dataset.
582      */
583 
584     /* 1D dataspace */
585     if (!parms->dim2d){
586         /* Contiguous Pattern: */
587         if (!parms->interleaved) {
588             bytes_begin[0] = (off_t)(((double)nbytes*pio_mpi_rank_g)/pio_mpi_nprocs_g);
589         } /* end if */
590         /* Interleaved Pattern: */
591         else {
592             bytes_begin[0] = (off_t)(blk_size*pio_mpi_rank_g);
593         } /* end else */
594 
595         /* Prepare buffer for verifying data */
596         if (parms->verify)
597             memset(buffer,pio_mpi_rank_g+1,buf_size);
598     }/* end if */
599     /* 2D dataspace */
600     else {
601         /* nbytes is always the number of bytes per dataset (1D or 2D). If the
602            dataspace is 2D, snbytes is the size of a side of the dataset square.
603          */
604         snbytes = (off_t)sqrt(nbytes);
605 
606         /* Contiguous Pattern: */
607         if (!parms->interleaved) {
608             bytes_begin[0] = (off_t)((double)snbytes*pio_mpi_rank_g / pio_mpi_nprocs_g);
609             bytes_begin[1] = 0;
610         } /* end if */
611         /* Interleaved Pattern: */
612         else {
613             bytes_begin[0] = 0;
614 
615             if(!parms->h5_use_chunks || parms->io_type==PHDF5)
616                 bytes_begin[1] = (off_t)(blk_size*pio_mpi_rank_g);
617             else
618                 bytes_begin[1] = (off_t)(blk_size*blk_size*pio_mpi_rank_g);
619         } /* end else */
620 
621         /* Prepare buffer for verifying data */
622         if (parms->verify)
623             HDmemset(buffer,pio_mpi_rank_g+1,buf_size*blk_size);
624     } /* end else */
625 
626 
627     /* Calculate the total number of bytes (bytes_count) to be
628      * transferred by this process. It may be different for different
629      * transfer pattern due to rounding to integral values.
630      */
631     /*
632      * Calculate the beginning bytes of this process and the next.
633      * bytes_count is the difference between these two beginnings.
634      * This way, it eliminates any rounding errors.
635      * (This is tricky, don't mess with the formula, rounding errors
636      * can easily get introduced) */
637     bytes_count = (off_t)(((double)nbytes*(pio_mpi_rank_g+1)) / pio_mpi_nprocs_g)
638     - (off_t)(((double)nbytes*pio_mpi_rank_g) / pio_mpi_nprocs_g);
639 
640     /* debug */
641     if (pio_debug_level >= 4) {
642         HDprint_rank(output);
643         if (!parms->dim2d) {
644         HDfprintf(output, "Debug(do_write): "
645             "buf_size=%zu, bytes_begin=%" H5_PRINTF_LL_WIDTH "d, bytes_count=%" H5_PRINTF_LL_WIDTH "d\n",
646             buf_size, (long long)bytes_begin[0],
647             (long long)bytes_count);
648         } else {
649         HDfprintf(output, "Debug(do_write): "
650             "linear buf_size=%zu, bytes_begin=(%" H5_PRINTF_LL_WIDTH "d,%" H5_PRINTF_LL_WIDTH "d), bytes_count=%" H5_PRINTF_LL_WIDTH "d\n",
651             buf_size*blk_size, (long long)bytes_begin[0],
652             (long long)bytes_begin[1], (long long)bytes_count);
653         }
654     }
655 
656     /* I/O Access specific setup */
657     switch (parms->io_type) {
658     case POSIXIO:
659         /* No extra setup */
660         break;
661 
662     case MPIO: /* MPI-I/O setup */
663         /* 1D dataspace */
664         if (!parms->dim2d){
665             /* Build block's derived type */
666             mrc = MPI_Type_contiguous((int)blk_size,
667                 MPI_BYTE, &mpi_blk_type);
668             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE");
669 
670             /* Build file's derived type */
671             mrc = MPI_Type_vector((int)(buf_size/blk_size), (int)1,
672                 (int)pio_mpi_nprocs_g, mpi_blk_type, &mpi_file_type);
673             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE");
674 
675             /* Commit file type */
676             mrc = MPI_Type_commit( &mpi_file_type );
677             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT");
678 
679             /* Commit buffer type */
680             mrc = MPI_Type_commit( &mpi_blk_type );
681             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT");
682         } /* end if */
683         /* 2D dataspace */
684         else {
685             /* Build partial buffer derived type for contiguous access */
686 
687             mrc = MPI_Type_contiguous((int)buf_size, MPI_BYTE,
688                 &mpi_partial_buffer_cont);
689             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE");
690 
691             /* Commit partial buffer derived type */
692             mrc = MPI_Type_commit(&mpi_partial_buffer_cont);
693             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT");
694 
695             /* Build contiguous file's derived type */
696             mrc = MPI_Type_vector((int)blk_size, (int)1, (int)(snbytes/buf_size),
697                 mpi_partial_buffer_cont, &mpi_cont_type);
698             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE");
699 
700             /* Commit contiguous file type */
701             mrc = MPI_Type_commit(&mpi_cont_type);
702             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT");
703 
704             /* Build partial buffer derived type for interleaved access */
705             mrc = MPI_Type_contiguous((int)blk_size, MPI_BYTE,
706                 &mpi_partial_buffer_inter);
707             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE");
708 
709             /* Commit partial buffer derived type */
710             mrc = MPI_Type_commit(&mpi_partial_buffer_inter);
711             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT");
712 
713             /* Build interleaved file's derived type */
714             mrc = MPI_Type_vector((int)buf_size, (int)1, (int)(snbytes/blk_size),
715                 mpi_partial_buffer_inter, &mpi_inter_type);
716             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE");
717 
718             /* Commit interleaved file type */
719             mrc = MPI_Type_commit(&mpi_inter_type);
720             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT");
721 
722             /* Build full buffer derived type */
723             mrc = MPI_Type_contiguous((int)(blk_size*buf_size), MPI_BYTE,
724                 &mpi_full_buffer);
725             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE");
726 
727             /* Commit full buffer derived type */
728             mrc = MPI_Type_commit(&mpi_full_buffer);
729             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT");
730 
731             /* Build full chunk derived type */
732             mrc = MPI_Type_contiguous((int)(blk_size*blk_size), MPI_BYTE,
733                 &mpi_full_chunk);
734             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE");
735 
736             /* Commit full chunk derived type */
737             mrc = MPI_Type_commit(&mpi_full_chunk);
738             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT");
739 
740             /* Build chunk interleaved file's derived type */
741             mrc = MPI_Type_vector((int)(buf_size/blk_size), (int)1, (int)(snbytes/blk_size),
742                 mpi_full_chunk, &mpi_chunk_inter_type);
743             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE");
744 
745             /* Commit chunk interleaved file type */
746             mrc = MPI_Type_commit(&mpi_chunk_inter_type);
747             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT");
748 
749         } /* end else */
750         break;
751 
752     case PHDF5: /* HDF5 setup */
753         /* 1D dataspace */
754         if (!parms->dim2d){
755             if(nbytes>0) {
756                 /* define a contiguous dataset of nbytes native bytes */
757                 h5dims[0] = nbytes;
758                 h5dset_space_id = H5Screate_simple(1, h5dims, NULL);
759                 VRFY((h5dset_space_id >= 0), "H5Screate_simple");
760 
761                 /* Set up the file dset space id to select the pattern to access */
762                 if (!parms->interleaved){
763                 /* Contiguous pattern */
764                 h5start[0] = bytes_begin[0];
765                 h5stride[0] = h5block[0] = blk_size;
766                 h5count[0] = buf_size/blk_size;
767                 } /* end if */
768                 else {
769                 /* Interleaved access pattern */
770                 /* Skip offset over blocks of other processes */
771                 h5start[0] = bytes_begin[0];
772                 h5stride[0] = blk_size*pio_mpi_nprocs_g;
773                 h5block[0] = blk_size;
774                 h5count[0] = buf_size/blk_size;
775                 } /* end else */
776                 hrc = H5Sselect_hyperslab(h5dset_space_id, H5S_SELECT_SET,
777                     h5start, h5stride, h5count, h5block);
778                 VRFY((hrc >= 0), "H5Sselect_hyperslab");
779             } /* end if */
780             else {
781                 h5dset_space_id = H5Screate(H5S_SCALAR);
782                 VRFY((h5dset_space_id >= 0), "H5Screate");
783             } /* end else */
784 
785             /* Create the memory dataspace that corresponds to the xfer buffer */
786             if(buf_size>0) {
787                 h5dims[0] = buf_size;
788                 h5mem_space_id = H5Screate_simple(1, h5dims, NULL);
789                 VRFY((h5mem_space_id >= 0), "H5Screate_simple");
790             } /* end if */
791             else {
792                 h5mem_space_id = H5Screate(H5S_SCALAR);
793                 VRFY((h5mem_space_id >= 0), "H5Screate");
794             } /* end else */
795         } /* end if */
796         /* 2D dataspace */
797         else {
798             if(nbytes>0) {
799                 /* define a contiguous dataset of nbytes native bytes */
800                 h5dims[0] = snbytes;
801                 h5dims[1] = snbytes;
802                 h5dset_space_id = H5Screate_simple(2, h5dims, NULL);
803                 VRFY((h5dset_space_id >= 0), "H5Screate_simple");
804 
805                 /* Set up the file dset space id to select the pattern to access */
806                 if (!parms->interleaved){
807                 /* Contiguous pattern */
808                 h5start[0] = bytes_begin[0];
809                 h5start[1] = bytes_begin[1];
810                 h5stride[0] = 1;
811                 h5stride[1] = h5block[0] = h5block[1] = blk_size;
812                 h5count[0] = 1;
813                 h5count[1] = buf_size/blk_size;
814                 } /* end if */
815                 else {
816                 /* Interleaved access pattern */
817                 /* Skip offset over blocks of other processes */
818                 h5start[0] = bytes_begin[0];
819                 h5start[1] = bytes_begin[1];
820                 h5stride[0] = blk_size;
821                 h5stride[1] = blk_size*pio_mpi_nprocs_g;
822                 h5block[0] = h5block[1] = blk_size;
823                 h5count[0] = buf_size/blk_size;
824                 h5count[1] = 1;
825                 } /* end else */
826                 hrc = H5Sselect_hyperslab(h5dset_space_id, H5S_SELECT_SET,
827                     h5start, h5stride, h5count, h5block);
828                 VRFY((hrc >= 0), "H5Sselect_hyperslab");
829             } /* end if */
830             else {
831                 h5dset_space_id = H5Screate(H5S_SCALAR);
832                 VRFY((h5dset_space_id >= 0), "H5Screate");
833             } /* end else */
834 
835             /* Create the memory dataspace that corresponds to the xfer buffer */
836             if(buf_size>0) {
837                 if (!parms->interleaved){
838                 h5dims[0] = blk_size;
839                 h5dims[1] = buf_size;
840                 }else{
841                 h5dims[0] = buf_size;
842                 h5dims[1] = blk_size;
843                 }
844                 h5mem_space_id = H5Screate_simple(2, h5dims, NULL);
845                 VRFY((h5mem_space_id >= 0), "H5Screate_simple");
846             } /* end if */
847             else {
848                 h5mem_space_id = H5Screate(H5S_SCALAR);
849                 VRFY((h5mem_space_id >= 0), "H5Screate");
850             } /* end else */
851         } /* end else */
852 
853         /* Create the dataset transfer property list */
854         h5dxpl = H5Pcreate(H5P_DATASET_XFER);
855         if (h5dxpl < 0) {
856         HDfprintf(stderr, "HDF5 Property List Create failed\n");
857         GOTOERROR(FAIL);
858         }
859 
860         /* Change to collective I/O, if asked */
861         if(parms->collective) {
862             hrc = H5Pset_dxpl_mpio(h5dxpl, H5FD_MPIO_COLLECTIVE);
863             if (hrc < 0) {
864                 HDfprintf(stderr, "HDF5 Property List Set failed\n");
865                 GOTOERROR(FAIL);
866             } /* end if */
867         } /* end if */
868         break;
869     } /* end switch */
870 
871     for (ndset = 1; ndset <= ndsets; ++ndset) {
872 
873     /* Calculate dataset offset within a file */
874 
875     /* create dataset */
876     switch (parms->io_type) {
877         case POSIXIO:
878         case MPIO:
879             /* both posix and mpi io just need dataset offset in file*/
880             dset_offset = (ndset - 1) * nbytes;
881             break;
882 
883         case PHDF5:
884             h5dcpl = H5Pcreate(H5P_DATASET_CREATE);
885             if (h5dcpl < 0) {
886                 HDfprintf(stderr, "HDF5 Property List Create failed\n");
887                 GOTOERROR(FAIL);
888             }
889             /* 1D dataspace */
890             if (!parms->dim2d){
891                 /* Make the dataset chunked if asked */
892                 if(parms->h5_use_chunks) {
893                 /* Set the chunk size to be the same as the buffer size */
894                 h5dims[0] = blk_size;
895                 hrc = H5Pset_chunk(h5dcpl, 1, h5dims);
896                 if (hrc < 0) {
897                     HDfprintf(stderr, "HDF5 Property List Set failed\n");
898                     GOTOERROR(FAIL);
899                 } /* end if */
900                 } /* end if */
901             }/* end if */
902             else{
903                 /* 2D dataspace */
904                 if(parms->h5_use_chunks) {
905                 /* Set the chunk size to be the same as the block size */
906                 h5dims[0] = blk_size;
907                 h5dims[1] = blk_size;
908                 hrc = H5Pset_chunk(h5dcpl, 2, h5dims);
909                 if (hrc < 0) {
910                     HDfprintf(stderr, "HDF5 Property List Set failed\n");
911                     GOTOERROR(FAIL);
912                 } /* end if */
913                 } /* end if */
914             }/* end else */
915 
916             HDsprintf(dname, "Dataset_%ld", ndset);
917             h5ds_id = H5DCREATE(fd->h5fd, dname, ELMT_H5_TYPE,
918                 h5dset_space_id, h5dcpl);
919 
920             if (h5ds_id < 0) {
921                 HDfprintf(stderr, "HDF5 Dataset Create failed\n");
922                 GOTOERROR(FAIL);
923             }
924 
925             hrc = H5Pclose(h5dcpl);
926             /* verifying the close of the dcpl */
927             if (hrc < 0) {
928                 HDfprintf(stderr, "HDF5 Property List Close failed\n");
929                 GOTOERROR(FAIL);
930             }
931             break;
932     }
933 
934     /* The task is to transfer bytes_count bytes, starting at
935      * bytes_begin position, using transfer buffer of buf_size bytes.
936      * If interleaved, select buf_size at a time, in round robin
937      * fashion, according to number of process. Otherwise, select
938      * all bytes_count in contiguous.
939      */
940     nbytes_xfer = 0 ;
941 
942     /* 1D dataspace */
943     if (!parms->dim2d){
944         /* Set base file offset for all I/O patterns and POSIX access */
945         posix_file_offset = dset_offset + bytes_begin[0];
946 
947         /* Set base file offset for all I/O patterns and MPI access */
948         mpi_file_offset = (MPI_Offset)(dset_offset + bytes_begin[0]);
949     } /* end if */
950     else {
951         /* Set base file offset for all I/O patterns and POSIX access */
952         posix_file_offset=dset_offset + bytes_begin[0]*snbytes+
953         bytes_begin[1];
954 
955         /* Set base file offset for all I/O patterns and MPI access */
956         mpi_file_offset=(MPI_Offset)(dset_offset + bytes_begin[0]*snbytes+
957             bytes_begin[1]);
958     } /* end else */
959 
960     /* Start "raw data" write timer */
961     set_time(res->timers, HDF5_RAW_WRITE_FIXED_DIMS, TSTART);
962 
963     while (nbytes_xfer < bytes_count){
964         /* Write */
965         /* Calculate offset of write within a dataset/file */
966         switch (parms->io_type) {
967         case POSIXIO:
968             /* 1D dataspace */
969             if (!parms->dim2d){
970                 /* Contiguous pattern */
971                 if (!parms->interleaved) {
972                     /* Compute file offset */
973                     file_offset = posix_file_offset + (off_t)nbytes_xfer;
974 
975                     /* only care if seek returns error */
976                     rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? -1 : 0;
977                     VRFY((rc==0), "POSIXSEEK");
978 
979                     /* check if all bytes are written */
980                     rc = ((ssize_t)buf_size ==
981                         POSIXWRITE(fd->posixfd, buffer, buf_size));
982                     VRFY((rc != 0), "POSIXWRITE");
983 
984                     /* Advance global offset in dataset */
985                     nbytes_xfer+=buf_size;
986                 } /* end if */
987                 /* Interleaved access pattern */
988                 else {
989                     /* Set the base of user's buffer */
990                     buf_p=(unsigned char *)buffer;
991 
992                     /* Set the number of bytes to transfer this time */
993                     nbytes_toxfer = buf_size;
994 
995                     /* Loop over the buffers to write */
996                     while(nbytes_toxfer>0) {
997                         /* Skip offset over blocks of other processes */
998                         file_offset = posix_file_offset +
999                             (off_t)(nbytes_xfer*pio_mpi_nprocs_g);
1000 
1001                         /* only care if seek returns error */
1002                         rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? -1 : 0;
1003                         VRFY((rc==0), "POSIXSEEK");
1004 
1005                         /* check if all bytes are written */
1006                         rc = ((ssize_t)blk_size ==
1007                             POSIXWRITE(fd->posixfd, buf_p, blk_size));
1008                         VRFY((rc != 0), "POSIXWRITE");
1009 
1010                         /* Advance location in buffer */
1011                         buf_p+=blk_size;
1012 
1013                         /* Advance global offset in dataset */
1014                         nbytes_xfer+=blk_size;
1015 
1016                         /* Decrement number of bytes left this time */
1017                         nbytes_toxfer-=blk_size;
1018                     } /* end while */
1019                 } /* end else */
1020             } /* end if */
1021             /* 2D dataspace */
1022             else {
1023                 /* Contiguous storage */
1024                 if (!parms->h5_use_chunks) {
1025                     /* Contiguous access pattern */
1026                     if (!parms->interleaved) {
1027                         /* Compute file offset */
1028                         file_offset=posix_file_offset+(off_t)(((nbytes_xfer/blk_size)
1029                             /snbytes)*(blk_size*snbytes)+((nbytes_xfer/blk_size)%snbytes));
1030 
1031                         /* Number of bytes to be transferred per I/O operation */
1032                         nbytes_xfer_advance = buf_size;
1033 
1034                         /* Global offset advance after each I/O operation */
1035                         file_offset_advance = (off_t)snbytes;
1036                     } /* end if */
1037                     /* Interleaved access pattern */
1038                     else {
1039                         /* Compute file offset */
1040                         file_offset=posix_file_offset+(off_t)((((nbytes_xfer/buf_size)
1041                             *pio_mpi_nprocs_g)/snbytes)*(buf_size*snbytes)
1042                             +((nbytes_xfer/buf_size)*pio_mpi_nprocs_g)%snbytes);
1043 
1044                         /* Number of bytes to be transferred per I/O operation */
1045                         nbytes_xfer_advance = blk_size;
1046 
1047                         /* Global offset advance after each I/O operation */
1048                         file_offset_advance = (off_t)snbytes;
1049                     } /* end else */
1050                 } /* end if */
1051                 /* Chunked storage */
1052                 else {
1053                     /*Contiguous access pattern */
1054                     if (!parms->interleaved) {
1055                         /* Compute file offset */
1056                         file_offset=posix_file_offset+(off_t)nbytes_xfer;
1057 
1058                         /* Number of bytes to be transferred per I/O operation */
1059                         nbytes_xfer_advance = blk_size * buf_size;
1060 
1061                         /* Global offset advance after each I/O operation */
1062                         file_offset_advance = 0;
1063                     } /* end if */
1064                     /*Interleaved access pattern */
1065                     else {
1066                         /* Compute file offset */
1067                         /* Before simplification */
1068                         /* file_offset=posix_file_offset+(off_t)((nbytes_xfer/(buf_size/blk_size)
1069                         *pio_mpi_nprocs_g)/(snbytes/blk_size*(blk_size*blk_size))*(buf_size/blk_size
1070                         *snbytes/blk_size*(blk_size*blk_size))+((nbytes_xfer/(buf_size/blk_size))
1071                         *pio_mpi_nprocs_g)%(snbytes/blk_size*(blk_size*blk_size))); */
1072 
1073                         file_offset=posix_file_offset+(off_t)(((nbytes_xfer/(buf_size/blk_size)
1074                         *pio_mpi_nprocs_g)/(snbytes*blk_size))*(buf_size*snbytes)+((nbytes_xfer/(buf_size/blk_size))
1075                         *pio_mpi_nprocs_g)%(snbytes*blk_size));
1076 
1077                         /* Number of bytes to be transferred per I/O operation */
1078                         nbytes_xfer_advance = blk_size * blk_size;
1079 
1080                         /* Global offset advance after each I/O operation */
1081                         /* file_offset_advance = (off_t)(snbytes/blk_size*(blk_size*blk_size)); */
1082                         file_offset_advance = (off_t)(snbytes*blk_size);
1083                     } /* end else */
1084                 } /* end else */
1085 
1086                 /* Common code for file access */
1087 
1088                 /* Set the base of user's buffer */
1089                 buf_p = (unsigned char *)buffer;
1090 
1091                 /* Set the number of bytes to transfer this time */
1092                 nbytes_toxfer = buf_size*blk_size;
1093 
1094                 /* Loop over portions of the buffer to write */
1095                 while(nbytes_toxfer>0){
1096                     /* only care if seek returns error */
1097                     rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? -1 : 0;
1098                     VRFY((rc==0), "POSIXSEEK");
1099 
1100                     /* check if all bytes are written */
1101                     rc = ((ssize_t)nbytes_xfer_advance ==
1102                         POSIXWRITE(fd->posixfd, buf_p, nbytes_xfer_advance));
1103                     VRFY((rc != 0), "POSIXWRITE");
1104 
1105                     /* Advance location in buffer */
1106                     buf_p+=nbytes_xfer_advance;
1107 
1108                     /* Advance global offset in dataset */
1109                     nbytes_xfer+=nbytes_xfer_advance;
1110 
1111                     /* Decrement number of bytes left this time */
1112                     nbytes_toxfer-=nbytes_xfer_advance;
1113 
1114                     /* Partially advance file offset */
1115                     file_offset+=file_offset_advance;
1116                 } /* end while */
1117 
1118             } /* end else */
1119 
1120             break;
1121 
1122         case MPIO:
1123             /* 1D dataspace */
1124             if (!parms->dim2d){
1125                 /* Independent file access */
1126                 if(!parms->collective) {
1127                     /* Contiguous pattern */
1128                     if (!parms->interleaved){
1129                         /* Compute offset in file */
1130                         mpi_offset = mpi_file_offset +
1131                             nbytes_xfer;
1132 
1133                         /* Perform independent write */
1134                         mrc = MPI_File_write_at(fd->mpifd, mpi_offset, buffer,
1135                             (int)(buf_size/blk_size), mpi_blk_type,
1136                             &mpi_status);
1137                         VRFY((mrc==MPI_SUCCESS), "MPIO_WRITE");
1138 
1139                         /* Advance global offset in dataset */
1140                         nbytes_xfer+=buf_size;
1141                     } /* end if */
1142                     /* Interleaved access pattern */
1143                     else {
1144                         /* Set the base of user's buffer */
1145                         buf_p=(unsigned char *)buffer;
1146 
1147                         /* Set the number of bytes to transfer this time */
1148                         nbytes_toxfer = buf_size;
1149 
1150                         /* Loop over the buffers to write */
1151                         while(nbytes_toxfer>0) {
1152                             /* Skip offset over blocks of other processes */
1153                             mpi_offset = mpi_file_offset +
1154                             (nbytes_xfer*pio_mpi_nprocs_g);
1155 
1156                             /* Perform independent write */
1157                             mrc = MPI_File_write_at(fd->mpifd, mpi_offset, buf_p,
1158                                 (int)1, mpi_blk_type, &mpi_status);
1159                             VRFY((mrc==MPI_SUCCESS), "MPIO_WRITE");
1160 
1161                             /* Advance location in buffer */
1162                             buf_p+=blk_size;
1163 
1164                             /* Advance global offset in dataset */
1165                             nbytes_xfer+=blk_size;
1166 
1167                             /* Decrement number of bytes left this time */
1168                             nbytes_toxfer-=blk_size;
1169                         } /* end while */
1170                     } /* end else */
1171                 } /* end if */
1172                 /* Collective file access */
1173                 else {
1174                     /* Contiguous access pattern */
1175                     if (!parms->interleaved){
1176                         /* Compute offset in file */
1177                         mpi_offset = mpi_file_offset +
1178                             nbytes_xfer;
1179 
1180                         /* Perform independent write */
1181                         mrc = MPI_File_write_at_all(fd->mpifd, mpi_offset, buffer,
1182                             (int)(buf_size/blk_size), mpi_blk_type, &mpi_status);
1183                         VRFY((mrc==MPI_SUCCESS), "MPIO_WRITE");
1184 
1185                         /* Advance global offset in dataset */
1186                         nbytes_xfer+=buf_size;
1187                     } /* end if */
1188                     /* Interleaved access pattern */
1189                     else {
1190                         /* Compute offset in file */
1191                         mpi_offset = mpi_file_offset +
1192                             (nbytes_xfer*pio_mpi_nprocs_g);
1193 
1194                         /* Set the file view */
1195                         mrc = MPI_File_set_view(fd->mpifd, mpi_offset, mpi_blk_type,
1196                             mpi_file_type, (char*)"native",  h5_io_info_g);
1197                         VRFY((mrc==MPI_SUCCESS), "MPIO_VIEW");
1198 
1199                         /* Perform write */
1200                         mrc = MPI_File_write_at_all(fd->mpifd, 0, buffer,
1201                             (int)(buf_size/blk_size), mpi_blk_type, &mpi_status);
1202                         VRFY((mrc==MPI_SUCCESS), "MPIO_WRITE");
1203 
1204                         /* Advance global offset in dataset */
1205                         nbytes_xfer+=buf_size;
1206                     } /* end else */
1207                 } /* end else */
1208             } /* end if */
1209             /* 2D dataspace */
1210             else {
1211                 /* Contiguous storage */
1212                 if (!parms->h5_use_chunks) {
1213                     /* Contiguous access pattern */
1214                     if (!parms->interleaved) {
1215                         /* Compute offset in file */
1216                         mpi_offset=mpi_file_offset+((nbytes_xfer/blk_size)/snbytes)*
1217                             (blk_size*snbytes)+((nbytes_xfer/blk_size)%snbytes);
1218 
1219                         /* Number of bytes to be transferred per I/O operation */
1220                         nbytes_xfer_advance = buf_size;
1221 
1222                         /* Global offset advance after each I/O operation */
1223                         mpi_offset_advance = snbytes;
1224 
1225                         /* MPI type to be used for collective access */
1226                         mpi_collective_type = mpi_cont_type;
1227                     } /* end if */
1228                     /* Interleaved access pattern */
1229                     else {
1230                         /* Compute offset in file */
1231                         mpi_offset=mpi_file_offset+(((nbytes_xfer/buf_size)*pio_mpi_nprocs_g)/snbytes)*
1232                             (buf_size*snbytes)+((nbytes_xfer/buf_size)*pio_mpi_nprocs_g)%snbytes;
1233 
1234                         /* Number of bytes to be transferred per I/O operation */
1235                         nbytes_xfer_advance = blk_size;
1236 
1237                         /* Global offset advance after each I/O operation */
1238                         mpi_offset_advance = snbytes;
1239 
1240                         /* MPI type to be used for collective access */
1241                         mpi_collective_type = mpi_inter_type;
1242                     } /* end else */
1243                 } /* end if */
1244                 /* Chunked storage */
1245                 else {
1246                     /*Contiguous access pattern */
1247                     if (!parms->interleaved) {
1248                         /* Compute offset in file */
1249                         mpi_offset=mpi_file_offset+nbytes_xfer;
1250 
1251                         /* Number of bytes to be transferred per I/O operation */
1252                         nbytes_xfer_advance = blk_size * buf_size;
1253 
1254                         /* Global offset advance after each I/O operation */
1255                         mpi_offset_advance = 0;
1256 
1257                         /* MPI type to be used for collective access */
1258                         mpi_collective_type = mpi_full_buffer;
1259                     } /* end if */
1260                     /*Interleaved access pattern */
1261                     else {
1262                         /* Compute offset in file */
1263                         /* Before simplification */
1264                         /* mpi_offset=mpi_file_offset+(nbytes_xfer/(buf_size/blk_size)
1265                             *pio_mpi_nprocs_g)/(snbytes/blk_size*(blk_size*blk_size))*
1266                             (buf_size/blk_size*snbytes/blk_size*(blk_size*blk_size))+
1267                             ((nbytes_xfer/(buf_size/blk_size))*pio_mpi_nprocs_g)%(snbytes
1268                             /blk_size*(blk_size*blk_size)); */
1269                         mpi_offset=mpi_file_offset+((nbytes_xfer/(buf_size/blk_size)
1270                             *pio_mpi_nprocs_g)/(snbytes*blk_size))*(buf_size*snbytes)
1271                             +((nbytes_xfer/(buf_size/blk_size))*pio_mpi_nprocs_g)%(snbytes*blk_size);
1272 
1273                         /* Number of bytes to be transferred per I/O operation */
1274                         nbytes_xfer_advance = blk_size * blk_size;
1275 
1276                         /* Global offset advance after each I/O operation */
1277                         /* mpi_offset_advance = (MPI_Offset)(snbytes/blk_size*(blk_size*blk_size)); */
1278                         mpi_offset_advance = (MPI_Offset)(snbytes*blk_size);
1279 
1280                         /* MPI type to be used for collective access */
1281                         mpi_collective_type = mpi_chunk_inter_type;
1282                     } /* end else */
1283                 } /* end else */
1284 
1285                 /* Common code for independent file access */
1286                 if (!parms->collective) {
1287                     /* Set the base of user's buffer */
1288                     buf_p = (unsigned char *)buffer;
1289 
1290                     /* Set the number of bytes to transfer this time */
1291                     nbytes_toxfer = buf_size * blk_size;
1292 
1293                     /* Loop over portions of the buffer to write */
1294                     while(nbytes_toxfer>0){
1295                         /* Perform independent write */
1296                         mrc = MPI_File_write_at(fd->mpifd, mpi_offset, buf_p,
1297                             (int)nbytes_xfer_advance, MPI_BYTE, &mpi_status);
1298                         VRFY((mrc==MPI_SUCCESS), "MPIO_WRITE");
1299 
1300                         /* Advance location in buffer */
1301                         buf_p+=nbytes_xfer_advance;
1302 
1303                         /* Advance global offset in dataset */
1304                         nbytes_xfer+=nbytes_xfer_advance;
1305 
1306                         /* Decrement number of bytes left this time */
1307                         nbytes_toxfer-=nbytes_xfer_advance;
1308 
1309                         /* Partially advance global offset in dataset */
1310                         mpi_offset+=mpi_offset_advance;
1311                     } /* end while */
1312                 } /* end if */
1313 
1314                 /* Common code for collective file access */
1315                 else {
1316                     /* Set the file view */
1317                     mrc = MPI_File_set_view(fd->mpifd, mpi_offset, MPI_BYTE,
1318                         mpi_collective_type, (char *)"native", h5_io_info_g);
1319                     VRFY((mrc==MPI_SUCCESS), "MPIO_VIEW");
1320 
1321                     /* Perform write */
1322                     MPI_File_write_at_all(fd->mpifd, 0, buffer,(int)(buf_size*blk_size),
1323                         MPI_BYTE, &mpi_status);
1324                     VRFY((mrc==MPI_SUCCESS), "MPIO_WRITE");
1325 
1326                     /* Advance global offset in dataset */
1327                     nbytes_xfer+=buf_size*blk_size;
1328                 } /* end else */
1329 
1330             } /* end else */
1331 
1332             break;
1333 
1334         case PHDF5:
1335             /* 1D dataspace */
1336             if (!parms->dim2d){
1337             /* Set up the file dset space id to move the selection to process */
1338             if (!parms->interleaved){
1339                 /* Contiguous pattern */
1340                 h5offset[0] = nbytes_xfer;
1341             } /* end if */
1342             else {
1343                 /* Interleaved access pattern */
1344                 /* Skip offset over blocks of other processes */
1345                 h5offset[0] = (nbytes_xfer*pio_mpi_nprocs_g);
1346             } /* end else */
1347             hrc = H5Soffset_simple(h5dset_space_id, h5offset);
1348             VRFY((hrc >= 0), "H5Soffset_simple");
1349 
1350             /* Write the buffer out */
1351             hrc = H5Dwrite(h5ds_id, ELMT_H5_TYPE, h5mem_space_id,
1352                 h5dset_space_id, h5dxpl, buffer);
1353             VRFY((hrc >= 0), "H5Dwrite");
1354 
1355             /* Increment number of bytes transferred */
1356             nbytes_xfer += buf_size;
1357             } /* end if */
1358             /* 2D dataspace */
1359             else {
1360             /* Set up the file dset space id to move the selection to process */
1361             if (!parms->interleaved){
1362                 /* Contiguous pattern */
1363                 h5offset[0] = (nbytes_xfer/(snbytes*blk_size))*blk_size;
1364                 h5offset[1] = (nbytes_xfer%(snbytes*blk_size))/blk_size;
1365 
1366             } /* end if */
1367             else {
1368                 /* Interleaved access pattern */
1369                 /* Skip offset over blocks of other processes */
1370                 h5offset[0] = ((nbytes_xfer*pio_mpi_nprocs_g)/(snbytes*buf_size))*buf_size;
1371                 h5offset[1] = ((nbytes_xfer*pio_mpi_nprocs_g)%(snbytes*buf_size))/buf_size;
1372 
1373             } /* end else */
1374             hrc = H5Soffset_simple(h5dset_space_id, h5offset);
1375             VRFY((hrc >= 0), "H5Soffset_simple");
1376 
1377             /* Write the buffer out */
1378             hrc = H5Dwrite(h5ds_id, ELMT_H5_TYPE, h5mem_space_id,
1379                 h5dset_space_id, h5dxpl, buffer);
1380             VRFY((hrc >= 0), "H5Dwrite");
1381 
1382             /* Increment number of bytes transferred */
1383             nbytes_xfer += buf_size*blk_size;
1384 
1385             } /* end else */
1386 
1387             break;
1388         } /* switch (parms->io_type) */
1389     } /* end while */
1390 
1391     /* Stop "raw data" write timer */
1392     set_time(res->timers, HDF5_RAW_WRITE_FIXED_DIMS, TSTOP);
1393 
1394     /* Calculate write time */
1395 
1396     /* Close dataset. Only HDF5 needs to do an explicit close. */
1397     if (parms->io_type == PHDF5) {
1398         hrc = H5Dclose(h5ds_id);
1399 
1400         if (hrc < 0) {
1401         HDfprintf(stderr, "HDF5 Dataset Close failed\n");
1402         GOTOERROR(FAIL);
1403         }
1404 
1405         h5ds_id = -1;
1406     } /* end if */
1407     } /* end for */
1408 
1409 done:
1410     /* release MPI-I/O objects */
1411     if (parms->io_type == MPIO) {
1412         /* 1D dataspace */
1413         if (!parms->dim2d){
1414             /* Free file type */
1415             mrc = MPI_Type_free( &mpi_file_type );
1416             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE");
1417 
1418             /* Free buffer type */
1419             mrc = MPI_Type_free( &mpi_blk_type );
1420             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE");
1421         } /* end if */
1422         /* 2D dataspace */
1423         else {
1424             /* Free partial buffer type for contiguous access */
1425             mrc = MPI_Type_free( &mpi_partial_buffer_cont );
1426             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE");
1427 
1428             /* Free contiguous file type */
1429             mrc = MPI_Type_free( &mpi_cont_type );
1430             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE");
1431 
1432             /* Free partial buffer type for interleaved access */
1433             mrc = MPI_Type_free( &mpi_partial_buffer_inter );
1434             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE");
1435 
1436             /* Free interleaved file type */
1437             mrc = MPI_Type_free( &mpi_inter_type );
1438             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE");
1439 
1440             /* Free full buffer type */
1441             mrc = MPI_Type_free(&mpi_full_buffer);
1442             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE");
1443 
1444             /* Free full chunk type */
1445             mrc = MPI_Type_free(&mpi_full_chunk);
1446             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE");
1447 
1448             /* Free chunk interleaved file type */
1449             mrc = MPI_Type_free(&mpi_chunk_inter_type);
1450             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE");
1451         } /* end else */
1452     } /* end if */
1453 
1454     /* release HDF5 objects */
1455     if (h5dset_space_id != -1) {
1456     hrc = H5Sclose(h5dset_space_id);
1457     if (hrc < 0){
1458         HDfprintf(stderr, "HDF5 Dataset Space Close failed\n");
1459         ret_code = FAIL;
1460     } else {
1461         h5dset_space_id = -1;
1462     }
1463     }
1464 
1465     if (h5mem_space_id != -1) {
1466     hrc = H5Sclose(h5mem_space_id);
1467     if (hrc < 0) {
1468         HDfprintf(stderr, "HDF5 Memory Space Close failed\n");
1469         ret_code = FAIL;
1470     } else {
1471         h5mem_space_id = -1;
1472     }
1473     }
1474 
1475     if (h5dxpl != -1) {
1476     hrc = H5Pclose(h5dxpl);
1477     if (hrc < 0) {
1478         HDfprintf(stderr, "HDF5 Dataset Transfer Property List Close failed\n");
1479         ret_code = FAIL;
1480     } else {
1481         h5dxpl = -1;
1482     }
1483     }
1484 
1485     return ret_code;
1486 }
1487 
1488 /*
1489  * Function:        do_read
1490  * Purpose:         read the required amount of data from the file.
1491  * Return:          SUCCESS or FAIL
1492  * Programmer:      Albert Cheng 2001/12/13
1493  * Modifications:
1494  *  Added 2D testing (Christian Chilan, 10. August 2005)
1495  */
1496     static herr_t
do_read(results * res,file_descr * fd,parameters * parms,long ndsets,off_t nbytes,size_t buf_size,void * buffer)1497 do_read(results *res, file_descr *fd, parameters *parms, long ndsets,
1498     off_t nbytes, size_t buf_size, void *buffer /*out*/)
1499 {
1500     int         ret_code = SUCCESS;
1501     int         rc;             /*routine return code                   */
1502     long        ndset;
1503     size_t      blk_size;       /* The block size to subdivide the xfer buffer into */
1504     size_t      bsize;          /* Size of the actual buffer */
1505     off_t       nbytes_xfer;    /* Total number of bytes transferred so far */
1506     size_t      nbytes_xfer_advance; /* Number of bytes transferred in a single I/O operation */
1507     size_t      nbytes_toxfer;  /* Number of bytes to transfer a particular time */
1508     char        dname[64];
1509     off_t       dset_offset=0;  /*dataset offset in a file              */
1510     off_t       bytes_begin[2];    /*first elmt this process transfer      */
1511     off_t       bytes_count;    /*number of elmts this process transfer */
1512     off_t       snbytes=0;      /*size of a side of the dataset square */
1513     unsigned char *buf_p;       /* Current buffer pointer               */
1514 
1515     /* POSIX variables */
1516     off_t       file_offset;    /* File offset of the next transfer    */
1517     off_t       file_offset_advance; /* File offset advance after each I/O operation */
1518     off_t       posix_file_offset;    /* Base file offset of the next transfer      */
1519 
1520     /* MPI variables */
1521     MPI_Offset  mpi_file_offset;/* Base file offset of the next transfer*/
1522     MPI_Offset  mpi_offset;     /* Offset in MPI file                   */
1523     MPI_Offset  mpi_offset_advance; /* Offset advance after each I/O operation */
1524     MPI_Datatype mpi_file_type; /* MPI derived type for 1D file            */
1525     MPI_Datatype mpi_blk_type;  /* MPI derived type for 1D buffer          */
1526     MPI_Datatype mpi_cont_type; /* MPI derived type for 2D contiguous file */
1527     MPI_Datatype mpi_partial_buffer_cont;       /* MPI derived type for partial 2D contiguous buffer */
1528     MPI_Datatype mpi_inter_type;    /* MPI derived type for 2D interleaved file  */
1529     MPI_Datatype mpi_partial_buffer_inter;      /* MPI derived type for partial 2D interleaved buffer */
1530     MPI_Datatype mpi_full_buffer;       /* MPI derived type for 2D full buffer */
1531     MPI_Datatype mpi_full_chunk;        /* MPI derived type for 2D full chunk */
1532     MPI_Datatype mpi_chunk_inter_type;  /* MPI derived type for 2D chunk interleaved file */
1533     MPI_Datatype mpi_collective_type;   /* Generic MPI derived type for 2D collective access */
1534     MPI_Status  mpi_status;
1535     int         mrc;            /* MPI return code                      */
1536 
1537     /* HDF5 variables */
1538     herr_t      hrc;                    /*HDF5 return code              */
1539     hsize_t     h5dims[2];              /*dataset dim sizes             */
1540     hid_t       h5dset_space_id = -1;   /*dataset space ID              */
1541     hid_t       h5mem_space_id = -1;    /*memory dataspace ID           */
1542     hid_t       h5ds_id = -1;           /*dataset handle                */
1543     hsize_t h5block[2];     /*dataspace selection           */
1544     hsize_t h5stride[2];
1545     hsize_t h5count[2];
1546     hsize_t h5start[2];
1547     hssize_t    h5offset[2];            /* Selection offset within dataspace */
1548     hid_t       h5dxpl = -1;            /* Dataset transfer property list */
1549 
1550     /* Get the parameters from the parameter block */
1551     blk_size=parms->blk_size;
1552 
1553     /* There are two kinds of transfer patterns, contiguous and interleaved.
1554      * Let 0,1,2,...,n be data accessed by process 0,1,2,...,n
1555      *     where n is rank of the last process.
1556      * In contiguous pattern, data are accessed as
1557      *    000...111...222...nnn...
1558      * In interleaved pattern, data are accessed as
1559      *    012...n012...n...
1560      * These are all in the scope of one dataset.
1561      */
1562 
1563     /* 1D dataspace */
1564     if (!parms->dim2d){
1565         bsize = buf_size;
1566         /* Contiguous Pattern: */
1567         if (!parms->interleaved) {
1568              bytes_begin[0] = (off_t)(((double)nbytes*pio_mpi_rank_g)/pio_mpi_nprocs_g);
1569         } /* end if */
1570         /* Interleaved Pattern: */
1571         else {
1572             bytes_begin[0] = (off_t)(blk_size*pio_mpi_rank_g);
1573         } /* end else */
1574     }/* end if */
1575     /* 2D dataspace */
1576     else {
1577         /* nbytes is always the number of bytes per dataset (1D or 2D). If the
1578            dataspace is 2D, snbytes is the size of a side of the 'dataset square'.
1579          */
1580         snbytes = (off_t)sqrt(nbytes);
1581 
1582         bsize = buf_size * blk_size;
1583 
1584         /* Contiguous Pattern: */
1585         if (!parms->interleaved) {
1586             bytes_begin[0] = (off_t)((double)snbytes*pio_mpi_rank_g / pio_mpi_nprocs_g);
1587             bytes_begin[1] = 0;
1588         } /* end if */
1589         /* Interleaved Pattern: */
1590         else {
1591             bytes_begin[0] = 0;
1592 
1593             if (!parms->h5_use_chunks || parms->io_type==PHDF5)
1594                 bytes_begin[1] = (off_t)(blk_size*pio_mpi_rank_g);
1595             else
1596                 bytes_begin[1] = (off_t)(blk_size*blk_size*pio_mpi_rank_g);
1597         } /* end else */
1598     } /* end else */
1599 
1600     /* Calculate the total number of bytes (bytes_count) to be
1601      * transferred by this process. It may be different for different
1602      * transfer pattern due to rounding to integral values.
1603      */
1604     /*
1605      * Calculate the beginning bytes of this process and the next.
1606      * bytes_count is the difference between these two beginnings.
1607      * This way, it eliminates any rounding errors.
1608      * (This is tricky, don't mess with the formula, rounding errors
1609      * can easily get introduced) */
1610     bytes_count = (off_t)(((double)nbytes*(pio_mpi_rank_g+1)) / pio_mpi_nprocs_g)
1611     - (off_t)(((double)nbytes*pio_mpi_rank_g) / pio_mpi_nprocs_g);
1612 
1613     /* debug */
1614     if (pio_debug_level >= 4) {
1615         HDprint_rank(output);
1616         if (!parms->dim2d) {
1617         HDfprintf(output, "Debug(do_write): "
1618             "buf_size=%zu, bytes_begin=%" H5_PRINTF_LL_WIDTH "d, bytes_count=%" H5_PRINTF_LL_WIDTH "d\n",
1619             buf_size, (long long)bytes_begin[0],
1620             (long long)bytes_count);
1621         } else {
1622         HDfprintf(output, "Debug(do_write): "
1623             "linear buf_size=%zu, bytes_begin=(%" H5_PRINTF_LL_WIDTH "d,%" H5_PRINTF_LL_WIDTH "d), bytes_count=%" H5_PRINTF_LL_WIDTH "d\n",
1624             buf_size*blk_size, (long long)bytes_begin[0],
1625             (long long)bytes_begin[1], (long long)bytes_count);
1626         }
1627     }
1628 
1629     /* I/O Access specific setup */
1630     switch (parms->io_type) {
1631     case POSIXIO:
1632         /* No extra setup */
1633         break;
1634 
1635     case MPIO: /* MPI-I/O setup */
1636         /* 1D dataspace */
1637         if (!parms->dim2d){
1638             /* Build block's derived type */
1639             mrc = MPI_Type_contiguous((int)blk_size,
1640                 MPI_BYTE, &mpi_blk_type);
1641             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE");
1642 
1643             /* Build file's derived type */
1644             mrc = MPI_Type_vector((int)(buf_size/blk_size), (int)1,
1645                 (int)pio_mpi_nprocs_g, mpi_blk_type, &mpi_file_type);
1646             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE");
1647 
1648             /* Commit file type */
1649             mrc = MPI_Type_commit( &mpi_file_type );
1650             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT");
1651 
1652             /* Commit buffer type */
1653             mrc = MPI_Type_commit( &mpi_blk_type );
1654             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT");
1655         } /* end if */
1656         /* 2D dataspace */
1657         else {
1658             /* Build partial buffer derived type for contiguous access */
1659             mrc = MPI_Type_contiguous((int)buf_size, MPI_BYTE,
1660                 &mpi_partial_buffer_cont);
1661             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE");
1662 
1663             /* Commit partial buffer derived type */
1664             mrc = MPI_Type_commit(&mpi_partial_buffer_cont);
1665             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT");
1666 
1667             /* Build contiguous file's derived type */
1668             mrc = MPI_Type_vector((int)blk_size, (int)1, (int)(snbytes/buf_size),
1669                 mpi_partial_buffer_cont, &mpi_cont_type);
1670             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE");
1671 
1672             /* Commit contiguous file type */
1673             mrc = MPI_Type_commit(&mpi_cont_type);
1674             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT");
1675 
1676             /* Build partial buffer derived type for interleaved access */
1677             mrc = MPI_Type_contiguous((int)blk_size, MPI_BYTE,
1678                 &mpi_partial_buffer_inter);
1679             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE");
1680 
1681             /* Commit partial buffer derived type */
1682             mrc = MPI_Type_commit(&mpi_partial_buffer_inter);
1683             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT");
1684 
1685             /* Build interleaved file's derived type */
1686             mrc = MPI_Type_vector((int)buf_size, (int)1, (int)(snbytes/blk_size),
1687                 mpi_partial_buffer_inter, &mpi_inter_type);
1688             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE");
1689 
1690             /* Commit interleaved file type */
1691             mrc = MPI_Type_commit(&mpi_inter_type);
1692             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT");
1693 
1694             /* Build full buffer derived type */
1695             mrc = MPI_Type_contiguous((int)(blk_size*buf_size), MPI_BYTE,
1696                 &mpi_full_buffer);
1697             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE");
1698 
1699             /* Commit full buffer derived type */
1700             mrc = MPI_Type_commit(&mpi_full_buffer);
1701             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT");
1702 
1703             /* Build full chunk derived type */
1704             mrc = MPI_Type_contiguous((int)(blk_size*blk_size), MPI_BYTE,
1705                 &mpi_full_chunk);
1706             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE");
1707 
1708             /* Commit full chunk derived type */
1709             mrc = MPI_Type_commit(&mpi_full_chunk);
1710             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT");
1711 
1712             /* Build chunk interleaved file's derived type */
1713             mrc = MPI_Type_vector((int)(buf_size/blk_size), (int)1, (int)(snbytes/blk_size),
1714                 mpi_full_chunk, &mpi_chunk_inter_type);
1715             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_CREATE");
1716 
1717             /* Commit chunk interleaved file type */
1718             mrc = MPI_Type_commit(&mpi_chunk_inter_type);
1719             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_COMMIT");
1720         } /* end else */
1721         break;
1722 
1723     case PHDF5: /* HDF5 setup */
1724         /* 1D dataspace */
1725         if (!parms->dim2d){
1726         if(nbytes>0) {
1727             /* define a contiguous dataset of nbytes native bytes */
1728             h5dims[0] = nbytes;
1729             h5dset_space_id = H5Screate_simple(1, h5dims, NULL);
1730             VRFY((h5dset_space_id >= 0), "H5Screate_simple");
1731 
1732             /* Set up the file dset space id to select the pattern to access */
1733             if (!parms->interleaved){
1734             /* Contiguous pattern */
1735             h5start[0] = bytes_begin[0];
1736             h5stride[0] = h5block[0] = blk_size;
1737             h5count[0] = buf_size/blk_size;
1738             } /* end if */
1739             else {
1740             /* Interleaved access pattern */
1741             /* Skip offset over blocks of other processes */
1742             h5start[0] = bytes_begin[0];
1743             h5stride[0] = blk_size*pio_mpi_nprocs_g;
1744             h5block[0] = blk_size;
1745             h5count[0] = buf_size/blk_size;
1746             } /* end else */
1747             hrc = H5Sselect_hyperslab(h5dset_space_id, H5S_SELECT_SET,
1748                 h5start, h5stride, h5count, h5block);
1749             VRFY((hrc >= 0), "H5Sselect_hyperslab");
1750         } /* end if */
1751         else {
1752             h5dset_space_id = H5Screate(H5S_SCALAR);
1753             VRFY((h5dset_space_id >= 0), "H5Screate");
1754         } /* end else */
1755 
1756         /* Create the memory dataspace that corresponds to the xfer buffer */
1757         if(buf_size>0) {
1758             h5dims[0] = buf_size;
1759             h5mem_space_id = H5Screate_simple(1, h5dims, NULL);
1760             VRFY((h5mem_space_id >= 0), "H5Screate_simple");
1761         } /* end if */
1762         else {
1763             h5mem_space_id = H5Screate(H5S_SCALAR);
1764             VRFY((h5mem_space_id >= 0), "H5Screate");
1765         } /* end else */
1766         } /* end if */
1767         /* 2D dataspace */
1768         else {
1769         if(nbytes>0) {
1770             /* define a contiguous dataset of nbytes native bytes */
1771             h5dims[0] = snbytes;
1772             h5dims[1] = snbytes;
1773             h5dset_space_id = H5Screate_simple(2, h5dims, NULL);
1774             VRFY((h5dset_space_id >= 0), "H5Screate_simple");
1775 
1776             /* Set up the file dset space id to select the pattern to access */
1777             if (!parms->interleaved){
1778             /* Contiguous pattern */
1779             h5start[0] = bytes_begin[0];
1780             h5start[1] = bytes_begin[1];
1781             h5stride[0] = 1;
1782             h5stride[1] = h5block[0] = h5block[1] = blk_size;
1783             h5count[0] = 1;
1784             h5count[1] = buf_size/blk_size;
1785             } /* end if */
1786             else {
1787             /* Interleaved access pattern */
1788             /* Skip offset over blocks of other processes */
1789             h5start[0] = bytes_begin[0];
1790             h5start[1] = bytes_begin[1];
1791             h5stride[0] = blk_size;
1792             h5stride[1] = blk_size*pio_mpi_nprocs_g;
1793             h5block[0] = h5block[1] = blk_size;
1794             h5count[0] = buf_size/blk_size;
1795             h5count[1] = 1;
1796             } /* end else */
1797             hrc = H5Sselect_hyperslab(h5dset_space_id, H5S_SELECT_SET,
1798                 h5start, h5stride, h5count, h5block);
1799             VRFY((hrc >= 0), "H5Sselect_hyperslab");
1800         } /* end if */
1801         else {
1802             h5dset_space_id = H5Screate(H5S_SCALAR);
1803             VRFY((h5dset_space_id >= 0), "H5Screate");
1804         } /* end else */
1805 
1806         /* Create the memory dataspace that corresponds to the xfer buffer */
1807         if(buf_size>0) {
1808             if (!parms->interleaved){
1809             h5dims[0] = blk_size;
1810             h5dims[1] = buf_size;
1811             }else{
1812             h5dims[0] = buf_size;
1813             h5dims[1] = blk_size;
1814             }
1815             h5mem_space_id = H5Screate_simple(2, h5dims, NULL);
1816             VRFY((h5mem_space_id >= 0), "H5Screate_simple");
1817         } /* end if */
1818         else {
1819             h5mem_space_id = H5Screate(H5S_SCALAR);
1820             VRFY((h5mem_space_id >= 0), "H5Screate");
1821         } /* end else */
1822         } /* end else */
1823 
1824         /* Create the dataset transfer property list */
1825         h5dxpl = H5Pcreate(H5P_DATASET_XFER);
1826         if (h5dxpl < 0) {
1827         HDfprintf(stderr, "HDF5 Property List Create failed\n");
1828         GOTOERROR(FAIL);
1829         }
1830 
1831         /* Change to collective I/O, if asked */
1832         if(parms->collective) {
1833         hrc = H5Pset_dxpl_mpio(h5dxpl, H5FD_MPIO_COLLECTIVE);
1834         if (hrc < 0) {
1835             HDfprintf(stderr, "HDF5 Property List Set failed\n");
1836             GOTOERROR(FAIL);
1837         } /* end if */
1838         } /* end if */
1839         break;
1840     } /* end switch */
1841 
1842     for (ndset = 1; ndset <= ndsets; ++ndset) {
1843 
1844     /* Calculate dataset offset within a file */
1845 
1846     /* create dataset */
1847     switch (parms->io_type) {
1848         case POSIXIO:
1849         case MPIO:
1850         /* both posix and mpi io just need dataset offset in file*/
1851         dset_offset = (ndset - 1) * nbytes;
1852         break;
1853 
1854         case PHDF5:
1855             HDsprintf(dname, "Dataset_%ld", ndset);
1856         h5ds_id = H5DOPEN(fd->h5fd, dname);
1857         if (h5ds_id < 0) {
1858             HDfprintf(stderr, "HDF5 Dataset open failed\n");
1859             GOTOERROR(FAIL);
1860         }
1861 
1862         break;
1863     }
1864 
1865     /* The task is to transfer bytes_count bytes, starting at
1866      * bytes_begin position, using transfer buffer of buf_size bytes.
1867      * If interleaved, select buf_size at a time, in round robin
1868      * fashion, according to number of process. Otherwise, select
1869      * all bytes_count in contiguous.
1870      */
1871     nbytes_xfer = 0 ;
1872 
1873     /* 1D dataspace */
1874     if (!parms->dim2d){
1875         /* Set base file offset for all I/O patterns and POSIX access */
1876         posix_file_offset = dset_offset + bytes_begin[0];
1877 
1878         /* Set base file offset for all I/O patterns and MPI access */
1879         mpi_file_offset = (MPI_Offset)(dset_offset + bytes_begin[0]);
1880     } /* end if */
1881     else {
1882         /* Set base file offset for all I/O patterns and POSIX access */
1883         posix_file_offset=dset_offset + bytes_begin[0]*snbytes+
1884         bytes_begin[1];
1885 
1886         /* Set base file offset for all I/O patterns and MPI access */
1887         mpi_file_offset=(MPI_Offset)(dset_offset + bytes_begin[0]*snbytes+
1888             bytes_begin[1]);
1889     } /* end else */
1890 
1891     /* Start "raw data" read timer */
1892     set_time(res->timers, HDF5_RAW_READ_FIXED_DIMS, TSTART);
1893 
1894     while (nbytes_xfer < bytes_count){
1895         /* Read */
1896         /* Calculate offset of read within a dataset/file */
1897         switch (parms->io_type) {
1898         case POSIXIO:
1899             /* 1D dataspace */
1900             if (!parms->dim2d){
1901                 /* Contiguous pattern */
1902                 if (!parms->interleaved) {
1903                     /* Compute file offset */
1904                     file_offset = posix_file_offset + (off_t)nbytes_xfer;
1905 
1906                     /* only care if seek returns error */
1907                     rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? -1 : 0;
1908                     VRFY((rc==0), "POSIXSEEK");
1909 
1910                     /* check if all bytes are read */
1911                     rc = ((ssize_t)buf_size ==
1912                         POSIXREAD(fd->posixfd, buffer, buf_size));
1913                     VRFY((rc != 0), "POSIXREAD");
1914 
1915                     /* Advance global offset in dataset */
1916                     nbytes_xfer+=buf_size;
1917                 } /* end if */
1918                 /* Interleaved access pattern */
1919                 else {
1920                     /* Set the base of user's buffer */
1921                     buf_p=(unsigned char *)buffer;
1922 
1923                     /* Set the number of bytes to transfer this time */
1924                     nbytes_toxfer = buf_size;
1925 
1926                     /* Loop over the buffers to read */
1927                     while(nbytes_toxfer>0) {
1928                     /* Skip offset over blocks of other processes */
1929                     file_offset = posix_file_offset +
1930                         (off_t)(nbytes_xfer*pio_mpi_nprocs_g);
1931 
1932                     /* only care if seek returns error */
1933                     rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? -1 : 0;
1934                     VRFY((rc==0), "POSIXSEEK");
1935 
1936                     /* check if all bytes are read */
1937                     rc = ((ssize_t)blk_size ==
1938                         POSIXREAD(fd->posixfd, buf_p, blk_size));
1939                     VRFY((rc != 0), "POSIXREAD");
1940 
1941                     /* Advance location in buffer */
1942                     buf_p+=blk_size;
1943 
1944                     /* Advance global offset in dataset */
1945                     nbytes_xfer+=blk_size;
1946 
1947                     /* Decrement number of bytes left this time */
1948                     nbytes_toxfer-=blk_size;
1949                     } /* end while */
1950                 } /* end else */
1951             } /* end if */
1952             /* 2D dataspace */
1953             else {
1954                 /* Contiguous storage */
1955                 if (!parms->h5_use_chunks) {
1956                     /* Contiguous access pattern */
1957                     if (!parms->interleaved) {
1958                         /* Compute file offset */
1959                         file_offset=posix_file_offset+(off_t)(((nbytes_xfer/blk_size)
1960                             /snbytes)*(blk_size*snbytes)+((nbytes_xfer/blk_size)%snbytes));
1961 
1962                         /* Number of bytes to be transferred per I/O operation */
1963                         nbytes_xfer_advance = buf_size;
1964 
1965                         /* Global offset advance after each I/O operation */
1966                         file_offset_advance = (off_t)snbytes;
1967                     } /* end if */
1968                     /* Interleaved access pattern */
1969                     else {
1970                         /* Compute file offset */
1971                         file_offset=posix_file_offset+(off_t)((((nbytes_xfer/buf_size)
1972                             *pio_mpi_nprocs_g)/snbytes)*(buf_size*snbytes)
1973                             +((nbytes_xfer/buf_size)*pio_mpi_nprocs_g)%snbytes);
1974 
1975                         /* Number of bytes to be transferred per I/O operation */
1976                         nbytes_xfer_advance = blk_size;
1977 
1978                         /* Global offset advance after each I/O operation */
1979                         file_offset_advance = (off_t)snbytes;
1980                     } /* end else */
1981                 } /* end if */
1982                 /* Chunked storage */
1983                 else {
1984                     /*Contiguous access pattern */
1985                     if (!parms->interleaved) {
1986                         /* Compute file offset */
1987                         file_offset=posix_file_offset+(off_t)nbytes_xfer;
1988 
1989                         /* Number of bytes to be transferred per I/O operation */
1990                         nbytes_xfer_advance = blk_size * buf_size;
1991 
1992                         /* Global offset advance after each I/O operation */
1993                         file_offset_advance = 0;
1994                     } /* end if */
1995                     /*Interleaved access pattern */
1996                     else {
1997                         /* Compute file offset */
1998                         /* Before simplification */
1999                         /* file_offset=posix_file_offset+(off_t)((nbytes_xfer/(buf_size/blk_size)
2000                         *pio_mpi_nprocs_g)/(snbytes/blk_size*(blk_size*blk_size))*(buf_size/blk_size
2001                         *snbytes/blk_size*(blk_size*blk_size))+((nbytes_xfer/(buf_size/blk_size))
2002                         *pio_mpi_nprocs_g)%(snbytes/blk_size*(blk_size*blk_size))); */
2003 
2004                         file_offset=posix_file_offset+(off_t)(((nbytes_xfer/(buf_size/blk_size)
2005                         *pio_mpi_nprocs_g)/(snbytes*blk_size))*(buf_size*snbytes)+((nbytes_xfer/(buf_size/blk_size))
2006                         *pio_mpi_nprocs_g)%(snbytes*blk_size));
2007 
2008                         /* Number of bytes to be transferred per I/O operation */
2009                         nbytes_xfer_advance = blk_size * blk_size;
2010 
2011                         /* Global offset advance after each I/O operation */
2012                         /* file_offset_advance = (off_t)(snbytes/blk_size*(blk_size*blk_size)); */
2013                         file_offset_advance = (off_t)(snbytes*blk_size);
2014                     } /* end else */
2015                 } /* end else */
2016 
2017                 /* Common code for file access */
2018 
2019                 /* Set the base of user's buffer */
2020                 buf_p = (unsigned char *)buffer;
2021 
2022                 /* Set the number of bytes to transfer this time */
2023                 nbytes_toxfer = buf_size*blk_size;
2024 
2025                 /* Loop over portions of the buffer to read */
2026                 while(nbytes_toxfer>0){
2027                     /* only care if seek returns error */
2028                     rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? -1 : 0;
2029                     VRFY((rc==0), "POSIXSEEK");
2030 
2031                     /* check if all bytes are read */
2032                     rc = ((ssize_t)nbytes_xfer_advance ==
2033                         POSIXREAD(fd->posixfd, buf_p, nbytes_xfer_advance));
2034                     VRFY((rc != 0), "POSIXREAD");
2035 
2036                     /* Advance location in buffer */
2037                     buf_p+=nbytes_xfer_advance;
2038 
2039                     /* Advance global offset in dataset */
2040                     nbytes_xfer+=nbytes_xfer_advance;
2041 
2042                     /* Decrement number of bytes left this time */
2043                     nbytes_toxfer-=nbytes_xfer_advance;
2044 
2045                     /* Partially advance file offset */
2046                     file_offset+=file_offset_advance;
2047                 } /* end while */
2048 
2049             } /* end else */
2050             break;
2051 
2052         case MPIO:
2053             /* 1D dataspace */
2054             if (!parms->dim2d){
2055                 /* Independent file access */
2056                 if(!parms->collective) {
2057                     /* Contiguous pattern */
2058                     if (!parms->interleaved){
2059                         /* Compute offset in file */
2060                         mpi_offset = mpi_file_offset +
2061                             nbytes_xfer;
2062 
2063                         /* Perform independent read */
2064                         mrc = MPI_File_read_at(fd->mpifd, mpi_offset, buffer,
2065                             (int)(buf_size/blk_size), mpi_blk_type,
2066                             &mpi_status);
2067                         VRFY((mrc==MPI_SUCCESS), "MPIO_READ");
2068 
2069                         /* Advance global offset in dataset */
2070                         nbytes_xfer+=buf_size;
2071                     } /* end if */
2072                     /* Interleaved access pattern */
2073                     else {
2074                         /* Set the base of user's buffer */
2075                         buf_p=(unsigned char *)buffer;
2076 
2077                         /* Set the number of bytes to transfer this time */
2078                         nbytes_toxfer = buf_size;
2079 
2080                         /* Loop over the buffers to read */
2081                         while(nbytes_toxfer>0) {
2082                             /* Skip offset over blocks of other processes */
2083                             mpi_offset = mpi_file_offset +
2084                             (nbytes_xfer*pio_mpi_nprocs_g);
2085 
2086                             /* Perform independent read */
2087                             mrc = MPI_File_read_at(fd->mpifd, mpi_offset, buf_p,
2088                                 (int)1, mpi_blk_type, &mpi_status);
2089                             VRFY((mrc==MPI_SUCCESS), "MPIO_READ");
2090 
2091                             /* Advance location in buffer */
2092                             buf_p+=blk_size;
2093 
2094                             /* Advance global offset in dataset */
2095                             nbytes_xfer+=blk_size;
2096 
2097                             /* Decrement number of bytes left this time */
2098                             nbytes_toxfer-=blk_size;
2099                         } /* end while */
2100                     } /* end else */
2101                 } /* end if */
2102                 /* Collective file access */
2103                 else {
2104                     /* Contiguous access pattern */
2105                     if (!parms->interleaved){
2106                         /* Compute offset in file */
2107                         mpi_offset = mpi_file_offset +
2108                             nbytes_xfer;
2109 
2110                         /* Perform collective read */
2111                         mrc = MPI_File_read_at_all(fd->mpifd, mpi_offset, buffer,
2112                             (int)(buf_size/blk_size), mpi_blk_type, &mpi_status);
2113                         VRFY((mrc==MPI_SUCCESS), "MPIO_READ");
2114 
2115                         /* Advance global offset in dataset */
2116                         nbytes_xfer+=buf_size;
2117                     } /* end if */
2118                     /* Interleaved access pattern */
2119                     else {
2120                         /* Compute offset in file */
2121                         mpi_offset = mpi_file_offset +
2122                             (nbytes_xfer*pio_mpi_nprocs_g);
2123 
2124                         /* Set the file view */
2125                         mrc = MPI_File_set_view(fd->mpifd, mpi_offset, mpi_blk_type,
2126                             mpi_file_type, (char*)"native",  h5_io_info_g);
2127                         VRFY((mrc==MPI_SUCCESS), "MPIO_VIEW");
2128 
2129                         /* Perform collective read */
2130                         mrc = MPI_File_read_at_all(fd->mpifd, 0, buffer,
2131                             (int)(buf_size/blk_size), mpi_blk_type, &mpi_status);
2132                         VRFY((mrc==MPI_SUCCESS), "MPIO_READ");
2133 
2134                         /* Advance global offset in dataset */
2135                         nbytes_xfer+=buf_size;
2136                     } /* end else */
2137                 } /* end else */
2138             } /* end if */
2139             /* 2D dataspace */
2140             else {
2141                 /* Contiguous storage */
2142                 if (!parms->h5_use_chunks) {
2143                     /* Contiguous access pattern */
2144                     if (!parms->interleaved) {
2145                         /* Compute offset in file */
2146                         mpi_offset=mpi_file_offset+((nbytes_xfer/blk_size)/snbytes)*
2147                             (blk_size*snbytes)+((nbytes_xfer/blk_size)%snbytes);
2148 
2149                         /* Number of bytes to be transferred per I/O operation */
2150                         nbytes_xfer_advance = buf_size;
2151 
2152                         /* Global offset advance after each I/O operation */
2153                         mpi_offset_advance = snbytes;
2154 
2155                         /* MPI type to be used for collective access */
2156                         mpi_collective_type = mpi_cont_type;
2157                     } /* end if */
2158                     /* Interleaved access pattern */
2159                     else {
2160                         /* Compute offset in file */
2161                         mpi_offset=mpi_file_offset+(((nbytes_xfer/buf_size)*pio_mpi_nprocs_g)/snbytes)*
2162                             (buf_size*snbytes)+((nbytes_xfer/buf_size)*pio_mpi_nprocs_g)%snbytes;
2163 
2164                         /* Number of bytes to be transferred per I/O operation */
2165                         nbytes_xfer_advance = blk_size;
2166 
2167                         /* Global offset advance after each I/O operation */
2168                         mpi_offset_advance = snbytes;
2169 
2170                         /* MPI type to be used for collective access */
2171                         mpi_collective_type = mpi_inter_type;
2172                     } /* end else */
2173                 } /* end if */
2174                 /* Chunked storage */
2175                 else {
2176                     /*Contiguous access pattern */
2177                     if (!parms->interleaved) {
2178                         /* Compute offset in file */
2179                         mpi_offset=mpi_file_offset+nbytes_xfer;
2180 
2181                         /* Number of bytes to be transferred per I/O operation */
2182                         nbytes_xfer_advance = blk_size * buf_size;
2183 
2184                         /* Global offset advance after each I/O operation */
2185                         mpi_offset_advance = 0;
2186 
2187                         /* MPI type to be used for collective access */
2188                         mpi_collective_type = mpi_full_buffer;
2189                     } /* end if */
2190                     /*Interleaved access pattern */
2191                     else {
2192                         /* Compute offset in file */
2193                         /* Before simplification */
2194                         /* mpi_offset=mpi_file_offset+(nbytes_xfer/(buf_size/blk_size)
2195                             *pio_mpi_nprocs_g)/(snbytes/blk_size*(blk_size*blk_size))*
2196                             (buf_size/blk_size*snbytes/blk_size*(blk_size*blk_size))+
2197                             ((nbytes_xfer/(buf_size/blk_size))*pio_mpi_nprocs_g)%(snbytes
2198                             /blk_size*(blk_size*blk_size)); */
2199                         mpi_offset=mpi_file_offset+((nbytes_xfer/(buf_size/blk_size)
2200                             *pio_mpi_nprocs_g)/(snbytes*blk_size))*(buf_size*snbytes)
2201                             +((nbytes_xfer/(buf_size/blk_size))*pio_mpi_nprocs_g)%(snbytes*blk_size);
2202 
2203                         /* Number of bytes to be transferred per I/O operation */
2204                         nbytes_xfer_advance = blk_size * blk_size;
2205 
2206                         /* Global offset advance after each I/O operation */
2207                         /* mpi_offset_advance = (MPI_Offset)(snbytes/blk_size*(blk_size*blk_size)); */
2208                         mpi_offset_advance = (MPI_Offset)(snbytes*blk_size);
2209 
2210                         /* MPI type to be used for collective access */
2211                         mpi_collective_type = mpi_chunk_inter_type;
2212                     } /* end else */
2213                 } /* end else */
2214 
2215                 /* Common code for independent file access */
2216                 if (!parms->collective) {
2217                     /* Set the base of user's buffer */
2218                     buf_p = (unsigned char *)buffer;
2219 
2220                     /* Set the number of bytes to transfer this time */
2221                     nbytes_toxfer = buf_size * blk_size;
2222 
2223                     /* Loop over portions of the buffer to read */
2224                     while(nbytes_toxfer>0){
2225                         /* Perform independent read */
2226                         mrc = MPI_File_read_at(fd->mpifd, mpi_offset, buf_p,
2227                             (int)nbytes_xfer_advance, MPI_BYTE, &mpi_status);
2228                         VRFY((mrc==MPI_SUCCESS), "MPIO_READ");
2229 
2230                         /* Advance location in buffer */
2231                         buf_p+=nbytes_xfer_advance;
2232 
2233                         /* Advance global offset in dataset */
2234                         nbytes_xfer+=nbytes_xfer_advance;
2235 
2236                         /* Decrement number of bytes left this time */
2237                         nbytes_toxfer-=nbytes_xfer_advance;
2238 
2239                         /* Partially advance global offset in dataset */
2240                         mpi_offset+=mpi_offset_advance;
2241                     } /* end while */
2242                 } /* end if */
2243 
2244                 /* Common code for collective file access */
2245                 else {
2246                     /* Set the file view */
2247                     mrc = MPI_File_set_view(fd->mpifd, mpi_offset, MPI_BYTE,
2248                         mpi_collective_type, (char *)"native", h5_io_info_g);
2249                     VRFY((mrc==MPI_SUCCESS), "MPIO_VIEW");
2250 
2251                     /* Perform read */
2252                     MPI_File_read_at_all(fd->mpifd, 0, buffer,(int)(buf_size*blk_size),
2253                         MPI_BYTE, &mpi_status);
2254                     VRFY((mrc==MPI_SUCCESS), "MPIO_READ");
2255 
2256                     /* Advance global offset in dataset */
2257                     nbytes_xfer+=buf_size*blk_size;
2258                 } /* end else */
2259 
2260             } /* end else */
2261             break;
2262 
2263         case PHDF5:
2264             /* 1D dataspace */
2265             if (!parms->dim2d){
2266             /* Set up the file dset space id to move the selection to process */
2267             if (!parms->interleaved){
2268                 /* Contiguous pattern */
2269                 h5offset[0] = nbytes_xfer;
2270             } /* end if */
2271             else {
2272                 /* Interleaved access pattern */
2273                 /* Skip offset over blocks of other processes */
2274                 h5offset[0] = (nbytes_xfer*pio_mpi_nprocs_g);
2275             } /* end else */
2276             hrc = H5Soffset_simple(h5dset_space_id, h5offset);
2277             VRFY((hrc >= 0), "H5Soffset_simple");
2278 
2279             /* Read the buffer in */
2280             hrc = H5Dread(h5ds_id, ELMT_H5_TYPE, h5mem_space_id,
2281                 h5dset_space_id, h5dxpl, buffer);
2282             VRFY((hrc >= 0), "H5Dread");
2283 
2284             /* Increment number of bytes transferred */
2285             nbytes_xfer += buf_size;
2286             } /* end if */
2287             /* 2D dataspace */
2288             else {
2289             /* Set up the file dset space id to move the selection to process */
2290             if (!parms->interleaved){
2291                 /* Contiguous pattern */
2292                 h5offset[0] = (nbytes_xfer/(snbytes*blk_size))*blk_size;
2293                 h5offset[1] = (nbytes_xfer%(snbytes*blk_size))/blk_size;
2294             } /* end if */
2295             else {
2296                 /* Interleaved access pattern */
2297                 /* Skip offset over blocks of other processes */
2298                 h5offset[0] = ((nbytes_xfer*pio_mpi_nprocs_g)/(snbytes*buf_size))*buf_size;
2299                 h5offset[1] = ((nbytes_xfer*pio_mpi_nprocs_g)%(snbytes*buf_size))/buf_size;
2300 
2301             } /* end else */
2302             hrc = H5Soffset_simple(h5dset_space_id, h5offset);
2303             VRFY((hrc >= 0), "H5Soffset_simple");
2304 
2305             /* Write the buffer out */
2306             hrc = H5Dread(h5ds_id, ELMT_H5_TYPE, h5mem_space_id,
2307                 h5dset_space_id, h5dxpl, buffer);
2308             VRFY((hrc >= 0), "H5Dread");
2309 
2310             /* Increment number of bytes transferred */
2311             nbytes_xfer += buf_size*blk_size;
2312 
2313             } /* end else */
2314             break;
2315         } /* switch (parms->io_type) */
2316 
2317         /* Verify raw data, if asked */
2318         if (parms->verify) {
2319         /* Verify data read */
2320         unsigned char *ucharptr = (unsigned char *)buffer;
2321         size_t i;
2322         int nerror=0;
2323 
2324         for (i = 0; i < bsize; ++i){
2325             if (*ucharptr++ != pio_mpi_rank_g+1) {
2326             if (++nerror < 20){
2327                 /* report at most 20 errors */
2328                 HDprint_rank(output);
2329                 HDfprintf(output, "read data error, expected (%d), "
2330                     "got (%d)\n",
2331                     pio_mpi_rank_g+1,
2332                     (int)*(ucharptr-1));
2333             } /* end if */
2334             } /* end if */
2335         } /* end for */
2336         if (nerror >= 20) {
2337             HDprint_rank(output);
2338             HDfprintf(output, "...");
2339             HDfprintf(output, "total read data errors=%d\n",
2340                 nerror);
2341         } /* end if */
2342         }   /* if (parms->verify) */
2343 
2344     } /* end while */
2345 
2346     /* Stop "raw data" read timer */
2347     set_time(res->timers, HDF5_RAW_READ_FIXED_DIMS, TSTOP);
2348 
2349     /* Calculate read time */
2350 
2351     /* Close dataset. Only HDF5 needs to do an explicit close. */
2352     if (parms->io_type == PHDF5) {
2353         hrc = H5Dclose(h5ds_id);
2354 
2355         if (hrc < 0) {
2356         HDfprintf(stderr, "HDF5 Dataset Close failed\n");
2357         GOTOERROR(FAIL);
2358         }
2359 
2360         h5ds_id = -1;
2361     } /* end if */
2362     } /* end for */
2363 
2364 done:
2365     /* release MPI-I/O objects */
2366     if (parms->io_type == MPIO) {
2367         /* 1D dataspace */
2368         if (!parms->dim2d){
2369             /* Free file type */
2370             mrc = MPI_Type_free( &mpi_file_type );
2371             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE");
2372 
2373             /* Free buffer type */
2374             mrc = MPI_Type_free( &mpi_blk_type );
2375             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE");
2376         } /* end if */
2377         /* 2D dataspace */
2378         else {
2379             /* Free partial buffer type for contiguous access */
2380             mrc = MPI_Type_free( &mpi_partial_buffer_cont );
2381             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE");
2382 
2383             /* Free contiguous file type */
2384             mrc = MPI_Type_free( &mpi_cont_type );
2385             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE");
2386 
2387             /* Free partial buffer type for interleaved access */
2388             mrc = MPI_Type_free( &mpi_partial_buffer_inter );
2389             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE");
2390 
2391             /* Free interleaved file type */
2392             mrc = MPI_Type_free( &mpi_inter_type );
2393             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE");
2394 
2395             /* Free full buffer type */
2396             mrc = MPI_Type_free(&mpi_full_buffer);
2397             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE");
2398 
2399             /* Free full chunk type */
2400             mrc = MPI_Type_free(&mpi_full_chunk);
2401             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE");
2402 
2403             /* Free chunk interleaved file type */
2404             mrc = MPI_Type_free(&mpi_chunk_inter_type);
2405             VRFY((mrc==MPI_SUCCESS), "MPIO_TYPE_FREE");
2406         } /* end else */
2407     } /* end if */
2408 
2409     /* release HDF5 objects */
2410     if (h5dset_space_id != -1) {
2411     hrc = H5Sclose(h5dset_space_id);
2412     if (hrc < 0){
2413         HDfprintf(stderr, "HDF5 Dataset Space Close failed\n");
2414         ret_code = FAIL;
2415     } else {
2416         h5dset_space_id = -1;
2417     }
2418     }
2419 
2420     if (h5mem_space_id != -1) {
2421     hrc = H5Sclose(h5mem_space_id);
2422     if (hrc < 0) {
2423         HDfprintf(stderr, "HDF5 Memory Space Close failed\n");
2424         ret_code = FAIL;
2425     } else {
2426         h5mem_space_id = -1;
2427     }
2428     }
2429 
2430     if (h5dxpl != -1) {
2431     hrc = H5Pclose(h5dxpl);
2432     if (hrc < 0) {
2433         HDfprintf(stderr, "HDF5 Dataset Transfer Property List Close failed\n");
2434         ret_code = FAIL;
2435     } else {
2436         h5dxpl = -1;
2437     }
2438     }
2439 
2440     return ret_code;
2441 }
2442 
2443 /*
2444  * Function:    do_fopen
2445  * Purpose:     Open the specified file.
2446  * Return:      SUCCESS or FAIL
2447  * Programmer:  Albert Cheng, Bill Wendling, 2001/12/13
2448  * Modifications:
2449  */
2450     static herr_t
do_fopen(parameters * param,char * fname,file_descr * fd,int flags)2451 do_fopen(parameters *param, char *fname, file_descr *fd /*out*/, int flags)
2452 {
2453     int ret_code = SUCCESS, mrc;
2454     hid_t acc_tpl = -1;         /* file access templates */
2455 
2456     switch (param->io_type) {
2457         case POSIXIO:
2458             if (flags & (PIO_CREATE | PIO_WRITE))
2459                 fd->posixfd = POSIXCREATE(fname);
2460             else
2461                 fd->posixfd = POSIXOPEN(fname, O_RDONLY);
2462 
2463             if (fd->posixfd < 0 ) {
2464                 HDfprintf(stderr, "POSIX File Open failed(%s)\n", fname);
2465                 GOTOERROR(FAIL);
2466             }
2467 
2468 
2469             /* The perils of POSIX I/O in a parallel environment. The problem is:
2470              *
2471              *      - Process n opens a file with truncation and then starts
2472              *        writing to the file.
2473              *      - Process m also opens the file with truncation, but after
2474              *        process n has already started to write to the file. Thus,
2475              *        all of the stuff process n wrote is now lost.
2476              */
2477             MPI_Barrier(pio_comm_g);
2478 
2479             break;
2480 
2481         case MPIO:
2482             if (flags & (PIO_CREATE | PIO_WRITE)) {
2483                 MPI_File_delete(fname, h5_io_info_g);
2484                 mrc = MPI_File_open(pio_comm_g, fname, MPI_MODE_CREATE | MPI_MODE_RDWR,
2485                     h5_io_info_g, &fd->mpifd);
2486 
2487                 if (mrc != MPI_SUCCESS) {
2488                     HDfprintf(stderr, "MPI File Open failed(%s)\n", fname);
2489                     GOTOERROR(FAIL);
2490                 }
2491 
2492                 /*since MPI_File_open with MPI_MODE_CREATE does not truncate  */
2493                 /*filesize , set size to 0 explicitedly.    */
2494                 mrc = MPI_File_set_size(fd->mpifd, (MPI_Offset)0);
2495                 if (mrc != MPI_SUCCESS) {
2496                     HDfprintf(stderr, "MPI_File_set_size failed\n");
2497                     GOTOERROR(FAIL);
2498                 }
2499             } else {
2500                 mrc = MPI_File_open(pio_comm_g, fname, MPI_MODE_RDONLY, h5_io_info_g, &fd->mpifd);
2501                 if (mrc != MPI_SUCCESS) {
2502                     HDfprintf(stderr, "MPI File Open failed(%s)\n", fname);
2503                     GOTOERROR(FAIL);
2504                 }
2505             }
2506 
2507             break;
2508 
2509         case PHDF5:
2510             if ((acc_tpl = H5Pcreate(H5P_FILE_ACCESS)) < 0) {
2511                 HDfprintf(stderr, "HDF5 Property List Create failed\n");
2512                 GOTOERROR(FAIL);
2513             }
2514 
2515             /* Set the file driver to the MPI-IO driver */
2516             if (H5Pset_fapl_mpio(acc_tpl, pio_comm_g, h5_io_info_g) < 0) {
2517                 HDfprintf(stderr, "HDF5 Property List Set failed\n");
2518                 GOTOERROR(FAIL);
2519             }
2520 
2521             /* Set the alignment of objects in HDF5 file */
2522             if (H5Pset_alignment(acc_tpl, param->h5_thresh, param->h5_align) < 0) {
2523                 HDfprintf(stderr, "HDF5 Property List Set failed\n");
2524                 GOTOERROR(FAIL);
2525             }
2526 
2527             /* create the parallel file */
2528             if (flags & (PIO_CREATE | PIO_WRITE))
2529                 fd->h5fd = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, acc_tpl);
2530             else
2531                 fd->h5fd = H5Fopen(fname, H5F_ACC_RDONLY, acc_tpl);
2532             if (fd->h5fd < 0) {
2533                 HDfprintf(stderr, "HDF5 File Create failed(%s)\n", fname);
2534                 GOTOERROR(FAIL);
2535             }
2536 
2537             /* verifying the close of the acc_tpl */
2538             if (H5Pclose(acc_tpl) < 0) {
2539                 HDfprintf(stderr, "HDF5 Property List Close failed\n");
2540                 GOTOERROR(FAIL);
2541             }
2542 
2543             break;
2544     }
2545 
2546 done:
2547     return ret_code;
2548 }
2549 
2550 /*
2551  * Function:    do_fclose
2552  * Purpose:     Close the specified file descriptor.
2553  * Return:      SUCCESS or FAIL
2554  * Programmer:  Albert Cheng, Bill Wendling, 2001/12/13
2555  * Modifications:
2556  */
2557     static herr_t
do_fclose(iotype iot,file_descr * fd)2558 do_fclose(iotype iot, file_descr *fd /*out*/)
2559 {
2560     herr_t ret_code = SUCCESS, hrc;
2561     int mrc = 0, rc = 0;
2562 
2563     switch (iot) {
2564     case POSIXIO:
2565         rc = POSIXCLOSE(fd->posixfd);
2566 
2567         if (rc != 0){
2568         HDfprintf(stderr, "POSIX File Close failed\n");
2569         GOTOERROR(FAIL);
2570         }
2571 
2572         fd->posixfd = -1;
2573         break;
2574 
2575     case MPIO:
2576         mrc = MPI_File_close(&fd->mpifd);
2577 
2578         if (mrc != MPI_SUCCESS){
2579         HDfprintf(stderr, "MPI File close failed\n");
2580         GOTOERROR(FAIL);
2581         }
2582 
2583         fd->mpifd = MPI_FILE_NULL;
2584         break;
2585 
2586     case PHDF5:
2587         hrc = H5Fclose(fd->h5fd);
2588 
2589         if (hrc < 0) {
2590         HDfprintf(stderr, "HDF5 File Close failed\n");
2591         GOTOERROR(FAIL);
2592         }
2593 
2594         fd->h5fd = -1;
2595         break;
2596     }
2597 
2598 done:
2599     return ret_code;
2600 }
2601 
2602 
2603 /*
2604  * Function:    do_fclose
2605  * Purpose:     Cleanup temporary file unless HDF5_NOCLEANUP is set.
2606  *      Only Proc 0 of the PIO communicator will do the cleanup.
2607  *      Other processes just return.
2608  * Return:      void
2609  * Programmer:  Albert Cheng 2001/12/12
2610  * Modifications:
2611  */
2612     static void
do_cleanupfile(iotype iot,char * fname)2613 do_cleanupfile(iotype iot, char *fname)
2614 {
2615     if (pio_mpi_rank_g != 0)
2616     return;
2617 
2618     if (clean_file_g == -1)
2619     clean_file_g = (getenv("HDF5_NOCLEANUP")==NULL) ? 1 : 0;
2620 
2621     if (clean_file_g){
2622     switch (iot){
2623         case POSIXIO:
2624             HDremove(fname);
2625         break;
2626         case MPIO:
2627         case PHDF5:
2628         MPI_File_delete(fname, h5_io_info_g);
2629         break;
2630     }
2631     }
2632 }
2633 
2634 #ifdef TIME_MPI
2635 /* instrument the MPI_File_wrirte_xxx and read_xxx calls to measure
2636  * pure time spent in MPI_File code.
2637  */
MPI_File_read_at(MPI_File fh,MPI_Offset offset,void * buf,int count,MPI_Datatype datatype,MPI_Status * status)2638 int MPI_File_read_at(MPI_File fh, MPI_Offset offset, void *buf,
2639     int count, MPI_Datatype datatype, MPI_Status *status)
2640 {
2641     int err;
2642     set_time(timer_g, HDF5_MPI_READ, TSTART);
2643     err=PMPI_File_read_at(fh, offset, buf, count, datatype, status);
2644     set_time(timer_g, HDF5_MPI_READ, TSTOP);
2645     return err;
2646 }
2647 
2648 
MPI_File_read_at_all(MPI_File fh,MPI_Offset offset,void * buf,int count,MPI_Datatype datatype,MPI_Status * status)2649 int MPI_File_read_at_all(MPI_File fh, MPI_Offset offset, void *buf,
2650     int count, MPI_Datatype datatype, MPI_Status *status)
2651 {
2652     int err;
2653     set_time(timer_g, HDF5_MPI_READ, TSTART);
2654     err=PMPI_File_read_at_all(fh, offset, buf, count, datatype, status);
2655     set_time(timer_g, HDF5_MPI_READ, TSTOP);
2656     return err;
2657 }
2658 
MPI_File_write_at(MPI_File fh,MPI_Offset offset,void * buf,int count,MPI_Datatype datatype,MPI_Status * status)2659 int MPI_File_write_at(MPI_File fh, MPI_Offset offset, void *buf,
2660     int count, MPI_Datatype datatype, MPI_Status *status)
2661 {
2662     int err;
2663     set_time(timer_g, HDF5_MPI_WRITE, TSTART);
2664     err=PMPI_File_write_at(fh, offset, buf, count, datatype, status);
2665     set_time(timer_g, HDF5_MPI_WRITE, TSTOP);
2666     return err;
2667 }
2668 
MPI_File_write_at_all(MPI_File fh,MPI_Offset offset,void * buf,int count,MPI_Datatype datatype,MPI_Status * status)2669 int MPI_File_write_at_all(MPI_File fh, MPI_Offset offset, void *buf,
2670     int count, MPI_Datatype datatype, MPI_Status *status)
2671 {
2672     int err;
2673     set_time(timer_g, HDF5_MPI_WRITE, TSTART);
2674     err=PMPI_File_write_at_all(fh, offset, buf, count, datatype, status);
2675     set_time(timer_g, HDF5_MPI_WRITE, TSTOP);
2676     return err;
2677 }
2678 
2679 #endif  /* TIME_MPI */
2680 #endif /* H5_HAVE_PARALLEL */
2681 
2682 
2683 
2684 
2685 
2686