1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  * Copyright by The HDF Group.                                               *
3  * Copyright by the Board of Trustees of the University of Illinois.         *
4  * All rights reserved.                                                      *
5  *                                                                           *
6  * This file is part of HDF5.  The full HDF5 copyright notice, including     *
7  * terms governing use, modification, and redistribution, is contained in    *
8  * the COPYING file, which can be found at the root of the source code       *
9  * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases.  *
10  * If you do not have access to either file, you may request a copy from     *
11  * help@hdfgroup.org.                                                        *
12  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
13 
14 /*-------------------------------------------------------------------------
15  *
16  * Created:             H5ACmpio.c
17  *                      Jun 20 2015
18  *                      Quincey Koziol <koziol@hdfgroup.org>
19  *
20  * Purpose:             Functions in this file implement support for parallel
21  *                      I/O cache functionality
22  *
23  *-------------------------------------------------------------------------
24  */
25 
26 /****************/
27 /* Module Setup */
28 /****************/
29 
30 #include "H5ACmodule.h"         /* This source code file is part of the H5AC module */
31 #define H5F_FRIEND		/*suppress error about including H5Fpkg	  */
32 
33 
34 /***********/
35 /* Headers */
36 /***********/
37 #include "H5private.h"		/* Generic Functions			*/
38 #include "H5ACpkg.h"		/* Metadata cache			*/
39 #include "H5Cprivate.h"		/* Cache                                */
40 #include "H5Eprivate.h"		/* Error handling		  	*/
41 #include "H5Fpkg.h"		/* Files				*/
42 #include "H5MMprivate.h"        /* Memory management                    */
43 
44 #ifdef H5_HAVE_PARALLEL
45 
46 /****************/
47 /* Local Macros */
48 /****************/
49 
50 
51 /******************/
52 /* Local Typedefs */
53 /******************/
54 
55 /****************************************************************************
56  *
57  * structure H5AC_slist_entry_t
58  *
59  * The dirty entry list maintained via the d_slist_ptr field of H5AC_aux_t
60  * and the cleaned entry list maintained via the c_slist_ptr field of
61  * H5AC_aux_t are just lists of the file offsets of the dirty/cleaned
62  * entries.  Unfortunately, the slist code makes us define a dynamically
63  * allocated structure to store these offsets in.  This structure serves
64  * that purpose.  Its fields are as follows:
65  *
66  * addr:	file offset of a metadata entry.  Entries are added to this
67  *		list (if they aren't there already) when they are marked
68  *		dirty in an unprotect, inserted, or moved.  They are
69  *		removed when they appear in a clean entries broadcast.
70  *
71  ****************************************************************************/
72 typedef struct H5AC_slist_entry_t
73 {
74     haddr_t     addr;
75 } H5AC_slist_entry_t;
76 
77 /* User data for address list building callbacks */
78 typedef struct H5AC_addr_list_ud_t
79 {
80     H5AC_aux_t    * aux_ptr;        /* 'Auxiliary' parallel cache info */
81     haddr_t       * addr_buf_ptr;   /* Array to store addresses */
82     unsigned        u;              /* Counter for position in array */
83 } H5AC_addr_list_ud_t;
84 
85 
86 /********************/
87 /* Local Prototypes */
88 /********************/
89 
90 static herr_t H5AC__broadcast_candidate_list(H5AC_t *cache_ptr,
91     unsigned *num_entries_ptr, haddr_t **haddr_buf_ptr_ptr);
92 static herr_t H5AC__broadcast_clean_list(H5AC_t *cache_ptr);
93 static herr_t H5AC__construct_candidate_list(H5AC_t *cache_ptr,
94     H5AC_aux_t *aux_ptr, int sync_point_op);
95 static herr_t H5AC__copy_candidate_list_to_buffer(const H5AC_t *cache_ptr,
96     unsigned *num_entries_ptr, haddr_t **haddr_buf_ptr_ptr);
97 static herr_t H5AC__propagate_and_apply_candidate_list(H5F_t  *f, hid_t dxpl_id);
98 static herr_t H5AC__propagate_flushed_and_still_clean_entries_list(H5F_t  *f,
99     hid_t dxpl_id);
100 static herr_t H5AC__receive_haddr_list(MPI_Comm mpi_comm, unsigned *num_entries_ptr,
101     haddr_t **haddr_buf_ptr_ptr);
102 static herr_t H5AC__receive_candidate_list(const H5AC_t *cache_ptr,
103     unsigned *num_entries_ptr, haddr_t **haddr_buf_ptr_ptr);
104 static herr_t H5AC__receive_and_apply_clean_list(H5F_t *f, hid_t dxpl_id);
105 static herr_t H5AC__tidy_cache_0_lists(H5AC_t *cache_ptr, unsigned num_candidates,
106     haddr_t *candidates_list_ptr);
107 static herr_t H5AC__rsp__dist_md_write__flush(H5F_t *f, hid_t dxpl_id);
108 static herr_t H5AC__rsp__dist_md_write__flush_to_min_clean(H5F_t *f, hid_t dxpl_id);
109 static herr_t H5AC__rsp__p0_only__flush(H5F_t *f, hid_t dxpl_id);
110 static herr_t H5AC__rsp__p0_only__flush_to_min_clean(H5F_t *f, hid_t dxpl_id);
111 
112 
113 /*********************/
114 /* Package Variables */
115 /*********************/
116 
117 /* Declare a free list to manage the H5AC_aux_t struct */
118 H5FL_DEFINE(H5AC_aux_t);
119 
120 
121 /*****************************/
122 /* Library Private Variables */
123 /*****************************/
124 
125 
126 /*******************/
127 /* Local Variables */
128 /*******************/
129 
130 /* Declare a free list to manage the H5AC_slist_entry_t struct */
131 H5FL_DEFINE_STATIC(H5AC_slist_entry_t);
132 
133 
134 
135 /*-------------------------------------------------------------------------
136  * Function:    H5AC__set_sync_point_done_callback
137  *
138  * Purpose:     Set the value of the sync_point_done callback.  This
139  *		callback is used by the parallel test code to verify
140  *		that the expected writes and only the expected writes
141  *		take place during a sync point.
142  *
143  * Return:      Non-negative on success/Negative on failure
144  *
145  * Programmer:  John Mainzer
146  *              5/9/10
147  *
148  *-------------------------------------------------------------------------
149  */
150 herr_t
H5AC__set_sync_point_done_callback(H5C_t * cache_ptr,void (* sync_point_done)(unsigned num_writes,haddr_t * written_entries_tbl))151 H5AC__set_sync_point_done_callback(H5C_t * cache_ptr,
152     void (* sync_point_done)(unsigned num_writes, haddr_t * written_entries_tbl))
153 {
154     H5AC_aux_t * aux_ptr;
155 
156     FUNC_ENTER_PACKAGE_NOERR
157 
158     /* Sanity checks */
159     HDassert(cache_ptr);
160     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
161     HDassert(aux_ptr != NULL);
162     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
163 
164     aux_ptr->sync_point_done = sync_point_done;
165 
166     FUNC_LEAVE_NOAPI(SUCCEED)
167 } /* H5AC__set_sync_point_done_callback() */
168 
169 
170 /*-------------------------------------------------------------------------
171  * Function:    H5AC__set_write_done_callback
172  *
173  * Purpose:     Set the value of the write_done callback.  This callback
174  *              is used to improve performance of the parallel test bed
175  *              for the cache.
176  *
177  * Return:      Non-negative on success/Negative on failure
178  *
179  * Programmer:  John Mainzer
180  *              5/11/06
181  *
182  *-------------------------------------------------------------------------
183  */
184 herr_t
H5AC__set_write_done_callback(H5C_t * cache_ptr,void (* write_done)(void))185 H5AC__set_write_done_callback(H5C_t * cache_ptr, void (* write_done)(void))
186 {
187     H5AC_aux_t * aux_ptr;
188 
189     FUNC_ENTER_PACKAGE_NOERR
190 
191     /* Sanity checks */
192     HDassert(cache_ptr);
193     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
194     HDassert(aux_ptr != NULL);
195     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
196 
197     aux_ptr->write_done = write_done;
198 
199     FUNC_LEAVE_NOAPI(SUCCEED)
200 } /* H5AC__set_write_done_callback() */
201 
202 
203 /*-------------------------------------------------------------------------
204  * Function:    H5AC_add_candidate()
205  *
206  * Purpose:     Add the supplied metadata entry address to the candidate
207  *		list.  Verify that each entry added does not appear in
208  *		the list prior to its insertion.
209  *
210  *		This function is intended for used in constructing list
211  *		of entried to be flushed during sync points.  It shouldn't
212  *		be called anywhere else.
213  *
214  * Return:      Non-negative on success/Negative on failure
215  *
216  * Programmer:  John Mainzer
217  *              3/17/10
218  *
219  *-------------------------------------------------------------------------
220  */
221 herr_t
H5AC_add_candidate(H5AC_t * cache_ptr,haddr_t addr)222 H5AC_add_candidate(H5AC_t * cache_ptr, haddr_t addr)
223 {
224     H5AC_aux_t         * aux_ptr;
225     H5AC_slist_entry_t * slist_entry_ptr = NULL;
226     herr_t               ret_value = SUCCEED;    /* Return value */
227 
228     FUNC_ENTER_NOAPI(FAIL)
229 
230     /* Sanity checks */
231     HDassert(cache_ptr != NULL);
232     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
233     HDassert(aux_ptr != NULL);
234     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
235     HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED);
236     HDassert(aux_ptr->candidate_slist_ptr != NULL);
237 
238     /* Construct an entry for the supplied address, and insert
239      * it into the candidate slist.
240      */
241     if(NULL == (slist_entry_ptr = H5FL_MALLOC(H5AC_slist_entry_t)))
242         HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "Can't allocate candidate slist entry")
243     slist_entry_ptr->addr  = addr;
244 
245     if(H5SL_insert(aux_ptr->candidate_slist_ptr, slist_entry_ptr, &(slist_entry_ptr->addr)) < 0)
246         HGOTO_ERROR(H5E_CACHE, H5E_CANTINSERT, FAIL, "can't insert entry into dirty entry slist")
247 
248 done:
249     /* Clean up on error */
250     if(ret_value < 0)
251         if(slist_entry_ptr)
252             slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
253 
254     FUNC_LEAVE_NOAPI(ret_value)
255 } /* H5AC_add_candidate() */
256 
257 
258 /*-------------------------------------------------------------------------
259  *
260  * Function:    H5AC__broadcast_candidate_list()
261  *
262  * Purpose:     Broadcast the contents of the process 0 candidate entry
263  *		slist.  In passing, also remove all entries from said
264  *		list.  As the application of this will be handled by
265  *		the same functions on all processes, construct and
266  *		return a copy of the list in the same format as that
267  *		received by the other processes.  Note that if this
268  *		copy is returned in *haddr_buf_ptr_ptr, the caller
269  *		must free it.
270  *
271  *		This function must only be called by the process with
272  *		MPI_rank 0.
273  *
274  *		Return SUCCEED on success, and FAIL on failure.
275  *
276  * Return:      Non-negative on success/Negative on failure.
277  *
278  * Programmer:  John Mainzer, 7/1/05
279  *
280  *-------------------------------------------------------------------------
281  */
282 static herr_t
H5AC__broadcast_candidate_list(H5AC_t * cache_ptr,unsigned * num_entries_ptr,haddr_t ** haddr_buf_ptr_ptr)283 H5AC__broadcast_candidate_list(H5AC_t *cache_ptr, unsigned *num_entries_ptr,
284     haddr_t **haddr_buf_ptr_ptr)
285 {
286     H5AC_aux_t         * aux_ptr = NULL;
287     haddr_t            * haddr_buf_ptr = NULL;
288     int                  mpi_result;
289     unsigned		 num_entries;
290     herr_t               ret_value = SUCCEED;    /* Return value */
291 
292     FUNC_ENTER_STATIC
293 
294     /* Sanity checks */
295     HDassert(cache_ptr != NULL);
296     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
297     HDassert(aux_ptr != NULL);
298     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
299     HDassert(aux_ptr->mpi_rank == 0);
300     HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED);
301     HDassert(aux_ptr->candidate_slist_ptr != NULL);
302     HDassert(num_entries_ptr != NULL);
303     HDassert(*num_entries_ptr == 0);
304     HDassert(haddr_buf_ptr_ptr != NULL);
305     HDassert(*haddr_buf_ptr_ptr == NULL);
306 
307     /* First broadcast the number of entries in the list so that the
308      * receivers can set up buffers to receive them.  If there aren't
309      * any, we are done.
310      */
311     num_entries = (unsigned)H5SL_count(aux_ptr->candidate_slist_ptr);
312     if(MPI_SUCCESS != (mpi_result = MPI_Bcast(&num_entries, 1, MPI_UNSIGNED, 0, aux_ptr->mpi_comm)))
313         HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result)
314 
315     if(num_entries > 0) {
316         size_t		 buf_size = 0;
317         unsigned	 chk_num_entries = 0;
318 
319         /* convert the candidate list into the format we
320          * are used to receiving from process 0, and also load it
321          * into a buffer for transmission.
322          */
323         if(H5AC__copy_candidate_list_to_buffer(cache_ptr, &chk_num_entries, &haddr_buf_ptr) < 0)
324             HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate buffer.")
325         HDassert(chk_num_entries == num_entries);
326         HDassert(haddr_buf_ptr != NULL);
327 
328         /* Now broadcast the list of candidate entries */
329         buf_size = sizeof(haddr_t) * num_entries;
330         if(MPI_SUCCESS != (mpi_result = MPI_Bcast((void *)haddr_buf_ptr, (int)buf_size, MPI_BYTE, 0, aux_ptr->mpi_comm)))
331             HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result)
332     } /* end if */
333 
334     /* Pass the number of entries and the buffer pointer
335      * back to the caller.  Do this so that we can use the same code
336      * to apply the candidate list to all the processes.
337      */
338     *num_entries_ptr = num_entries;
339     *haddr_buf_ptr_ptr = haddr_buf_ptr;
340 
341 done:
342     if(ret_value < 0)
343         if(haddr_buf_ptr)
344             haddr_buf_ptr = (haddr_t *)H5MM_xfree((void *)haddr_buf_ptr);
345 
346     FUNC_LEAVE_NOAPI(ret_value)
347 } /* H5AC__broadcast_candidate_list() */
348 
349 
350 /*-------------------------------------------------------------------------
351  *
352  * Function:    H5AC__broadcast_clean_list_cb()
353  *
354  * Purpose:     Skip list callback for building array of addresses for
355  *              broadcasting the clean list.
356  *
357  * Return:      Non-negative on success/Negative on failure.
358  *
359  * Programmer:  Quincey Koziol, 6/12/15
360  *
361  *-------------------------------------------------------------------------
362  */
363 static herr_t
H5AC__broadcast_clean_list_cb(void * _item,void H5_ATTR_UNUSED * _key,void * _udata)364 H5AC__broadcast_clean_list_cb(void *_item, void H5_ATTR_UNUSED *_key,
365     void *_udata)
366 {
367     H5AC_slist_entry_t    * slist_entry_ptr = (H5AC_slist_entry_t *)_item;  /* Address of item */
368     H5AC_addr_list_ud_t   * udata = (H5AC_addr_list_ud_t *)_udata;      /* Context for callback */
369     haddr_t		    addr;
370 
371     FUNC_ENTER_STATIC_NOERR
372 
373     /* Sanity checks */
374     HDassert(slist_entry_ptr);
375     HDassert(udata);
376 
377     /* Store the entry's address in the buffer */
378     addr = slist_entry_ptr->addr;
379     udata->addr_buf_ptr[udata->u] = addr;
380     udata->u++;
381 
382     /* now release the entry */
383     slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
384 
385     /* and also remove the matching entry from the dirtied list
386      * if it exists.
387      */
388     if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(udata->aux_ptr->d_slist_ptr, (void *)(&addr))))
389         slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
390 
391     FUNC_LEAVE_NOAPI(SUCCEED)
392 } /* H5AC__broadcast_clean_list_cb() */
393 
394 
395 /*-------------------------------------------------------------------------
396  *
397  * Function:    H5AC__broadcast_clean_list()
398  *
399  * Purpose:     Broadcast the contents of the process 0 cleaned entry
400  *		slist.  In passing, also remove all entries from said
401  *		list, and also remove any matching entries from the dirtied
402  *		slist.
403  *
404  *		This function must only be called by the process with
405  *		MPI_rank 0.
406  *
407  *		Return SUCCEED on success, and FAIL on failure.
408  *
409  * Return:      Non-negative on success/Negative on failure.
410  *
411  * Programmer:  John Mainzer, 7/1/05
412  *
413  *-------------------------------------------------------------------------
414  */
415 static herr_t
H5AC__broadcast_clean_list(H5AC_t * cache_ptr)416 H5AC__broadcast_clean_list(H5AC_t * cache_ptr)
417 {
418     haddr_t	       * addr_buf_ptr = NULL;
419     H5AC_aux_t         * aux_ptr;
420     int                  mpi_result;
421     unsigned		 num_entries = 0;
422     herr_t               ret_value = SUCCEED;    /* Return value */
423 
424     FUNC_ENTER_STATIC
425 
426     /* Sanity checks */
427     HDassert(cache_ptr != NULL);
428     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
429     HDassert(aux_ptr != NULL);
430     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
431     HDassert(aux_ptr->mpi_rank == 0);
432     HDassert(aux_ptr->c_slist_ptr != NULL);
433 
434     /* First broadcast the number of entries in the list so that the
435      * receives can set up a buffer to receive them.  If there aren't
436      * any, we are done.
437      */
438     num_entries = (unsigned)H5SL_count(aux_ptr->c_slist_ptr);
439     if(MPI_SUCCESS != (mpi_result = MPI_Bcast(&num_entries, 1, MPI_UNSIGNED, 0, aux_ptr->mpi_comm)))
440         HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result)
441 
442     if(num_entries > 0) {
443         H5AC_addr_list_ud_t udata;
444         size_t		 buf_size;
445 
446         /* allocate a buffer to store the list of entry base addresses in */
447         buf_size = sizeof(haddr_t) * num_entries;
448         if(NULL == (addr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size)))
449             HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for addr buffer")
450 
451         /* Set up user data for callback */
452         udata.aux_ptr = aux_ptr;
453         udata.addr_buf_ptr = addr_buf_ptr;
454         udata.u = 0;
455 
456         /* Free all the clean list entries, building the address list in the callback */
457         /* (Callback also removes the matching entries from the dirtied list) */
458         if(H5SL_free(aux_ptr->c_slist_ptr, H5AC__broadcast_clean_list_cb, &udata) < 0)
459             HGOTO_ERROR(H5E_CACHE, H5E_CANTFREE, FAIL, "Can't build address list for clean entries")
460 
461         /* Now broadcast the list of cleaned entries */
462         if(MPI_SUCCESS != (mpi_result = MPI_Bcast((void *)addr_buf_ptr, (int)buf_size, MPI_BYTE, 0, aux_ptr->mpi_comm)))
463             HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result)
464     } /* end if */
465 
466     /* if it is defined, call the sync point done callback.  Note
467      * that this callback is defined purely for testing purposes,
468      * and should be undefined under normal operating circumstances.
469      */
470     if(aux_ptr->sync_point_done)
471         (aux_ptr->sync_point_done)(num_entries, addr_buf_ptr);
472 
473 done:
474     if(addr_buf_ptr)
475         addr_buf_ptr = (haddr_t *)H5MM_xfree((void *)addr_buf_ptr);
476 
477     FUNC_LEAVE_NOAPI(ret_value)
478 } /* H5AC__broadcast_clean_list() */
479 
480 
481 /*-------------------------------------------------------------------------
482  * Function:    H5AC__construct_candidate_list()
483  *
484  * Purpose:     In the parallel case when the metadata_write_strategy is
485  *		H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED, process 0 uses
486  *		this function to construct the list of cache entries to
487  *		be flushed.  This list is then propagated to the other
488  *		caches, and then flushed in a distributed fashion.
489  *
490  *		The sync_point_op parameter is used to determine the extent
491  *		of the flush.
492  *
493  * Return:      Non-negative on success/Negative on failure
494  *
495  * Programmer:  John Mainzer
496  *              3/17/10
497  *
498  *-------------------------------------------------------------------------
499  */
500 static herr_t
H5AC__construct_candidate_list(H5AC_t * cache_ptr,H5AC_aux_t * aux_ptr,int sync_point_op)501 H5AC__construct_candidate_list(H5AC_t *cache_ptr, H5AC_aux_t *aux_ptr,
502     int sync_point_op)
503 {
504     herr_t ret_value = SUCCEED;    /* Return value */
505 
506     FUNC_ENTER_STATIC
507 
508     /* Sanity checks */
509     HDassert(cache_ptr != NULL);
510     HDassert(aux_ptr != NULL);
511     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
512     HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED);
513     HDassert((sync_point_op == H5AC_SYNC_POINT_OP__FLUSH_CACHE) || (aux_ptr->mpi_rank == 0));
514     HDassert(aux_ptr->d_slist_ptr != NULL);
515     HDassert(aux_ptr->c_slist_ptr != NULL);
516     HDassert(H5SL_count(aux_ptr->c_slist_ptr) == 0);
517     HDassert(aux_ptr->candidate_slist_ptr != NULL);
518     HDassert(H5SL_count(aux_ptr->candidate_slist_ptr) == 0);
519     HDassert((sync_point_op == H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN) || (sync_point_op == H5AC_SYNC_POINT_OP__FLUSH_CACHE));
520 
521     switch(sync_point_op) {
522 	case H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN:
523             if(H5C_construct_candidate_list__min_clean((H5C_t *)cache_ptr) < 0)
524 		HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "H5C_construct_candidate_list__min_clean() failed.")
525 	    break;
526 
527 	case H5AC_SYNC_POINT_OP__FLUSH_CACHE:
528             if(H5C_construct_candidate_list__clean_cache((H5C_t *)cache_ptr) < 0)
529 		HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "H5C_construct_candidate_list__clean_cache() failed.")
530 	    break;
531 
532         default:
533 	    HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "unknown sync point operation.")
534 	    break;
535     } /* end switch */
536 
537 done:
538     FUNC_LEAVE_NOAPI(ret_value)
539 } /* H5AC__construct_candidate_list() */
540 
541 
542 /*-------------------------------------------------------------------------
543  *
544  * Function:    H5AC__copy_candidate_list_to_buffer_cb
545  *
546  * Purpose:     Skip list callback for building array of addresses for
547  *              broadcasting the candidate list.
548  *
549  * Return:	Return SUCCEED on success, and FAIL on failure.
550  *
551  * Programmer:  Quincey Koziol, 6/12/15
552  *
553  *-------------------------------------------------------------------------
554  */
555 static herr_t
H5AC__copy_candidate_list_to_buffer_cb(void * _item,void H5_ATTR_UNUSED * _key,void * _udata)556 H5AC__copy_candidate_list_to_buffer_cb(void *_item, void H5_ATTR_UNUSED *_key,
557     void *_udata)
558 {
559     H5AC_slist_entry_t    * slist_entry_ptr = (H5AC_slist_entry_t *)_item;  /* Address of item */
560     H5AC_addr_list_ud_t   * udata = (H5AC_addr_list_ud_t *)_udata;      /* Context for callback */
561 
562     FUNC_ENTER_STATIC_NOERR
563 
564     /* Sanity checks */
565     HDassert(slist_entry_ptr);
566     HDassert(udata);
567 
568     /* Store the entry's address in the buffer */
569     udata->addr_buf_ptr[udata->u] = slist_entry_ptr->addr;
570     udata->u++;
571 
572     /* now release the entry */
573     slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
574 
575     FUNC_LEAVE_NOAPI(SUCCEED)
576 } /* H5AC__copy_candidate_list_to_buffer_cb() */
577 
578 
579 /*-------------------------------------------------------------------------
580  *
581  * Function:    H5AC__copy_candidate_list_to_buffer
582  *
583  * Purpose:     Allocate buffer(s) and copy the contents of the candidate
584  *		entry slist into it (them).  In passing, remove all
585  *		entries from the candidate slist.  Note that the
586  *		candidate slist must not be empty.
587  *
588  *		If MPI_Offset_buf_ptr_ptr is not NULL, allocate a buffer
589  *		of MPI_Offset, copy the contents of the candidate
590  *		entry list into it with the appropriate conversions,
591  *		and return the base address of the buffer in
592  *		*MPI_Offset_buf_ptr.  Note that this is the buffer
593  *		used by process 0 to transmit the list of entries to
594  *		be flushed to all other processes (in this file group).
595  *
596  *		Similarly, allocate a buffer of haddr_t, load the contents
597  *		of the candidate list into this buffer, and return its
598  *		base address in *haddr_buf_ptr_ptr.  Note that this
599  *		latter buffer is constructed unconditionally.
600  *
601  *		In passing, also remove all entries from the candidate
602  *		entry slist.
603  *
604  * Return:	Return SUCCEED on success, and FAIL on failure.
605  *
606  * Programmer:  John Mainzer, 4/19/10
607  *
608  *-------------------------------------------------------------------------
609  */
610 static herr_t
H5AC__copy_candidate_list_to_buffer(const H5AC_t * cache_ptr,unsigned * num_entries_ptr,haddr_t ** haddr_buf_ptr_ptr)611 H5AC__copy_candidate_list_to_buffer(const H5AC_t *cache_ptr, unsigned *num_entries_ptr,
612     haddr_t **haddr_buf_ptr_ptr)
613 {
614     H5AC_aux_t         * aux_ptr = NULL;
615     H5AC_addr_list_ud_t  udata;
616     haddr_t            * haddr_buf_ptr = NULL;
617     size_t		 buf_size;
618     unsigned		 num_entries = 0;
619     herr_t               ret_value = SUCCEED;    /* Return value */
620 
621     FUNC_ENTER_STATIC
622 
623     /* Sanity checks */
624     HDassert(cache_ptr != NULL);
625     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
626     HDassert(aux_ptr != NULL);
627     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
628     HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED);
629     HDassert(aux_ptr->candidate_slist_ptr != NULL);
630     HDassert(H5SL_count(aux_ptr->candidate_slist_ptr) > 0);
631     HDassert(num_entries_ptr != NULL);
632     HDassert(*num_entries_ptr == 0);
633     HDassert(haddr_buf_ptr_ptr != NULL);
634     HDassert(*haddr_buf_ptr_ptr == NULL);
635 
636     num_entries = (unsigned)H5SL_count(aux_ptr->candidate_slist_ptr);
637 
638     /* allocate a buffer(s) to store the list of candidate entry
639      * base addresses in
640      */
641     buf_size = sizeof(haddr_t) * num_entries;
642     if(NULL == (haddr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size)))
643         HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for haddr buffer")
644 
645     /* Set up user data for callback */
646     udata.aux_ptr = aux_ptr;
647     udata.addr_buf_ptr = haddr_buf_ptr;
648     udata.u = 0;
649 
650     /* Free all the candidate list entries, building the address list in the callback */
651     if(H5SL_free(aux_ptr->candidate_slist_ptr, H5AC__copy_candidate_list_to_buffer_cb, &udata) < 0)
652         HGOTO_ERROR(H5E_CACHE, H5E_CANTFREE, FAIL, "Can't build address list for candidate entries")
653 
654     /* Pass the number of entries and the buffer pointer
655      * back to the caller.
656      */
657     *num_entries_ptr = num_entries;
658     *haddr_buf_ptr_ptr = haddr_buf_ptr;
659 
660 done:
661     if(ret_value < 0)
662         if(haddr_buf_ptr)
663             haddr_buf_ptr = (haddr_t *)H5MM_xfree((void *)haddr_buf_ptr);
664 
665     FUNC_LEAVE_NOAPI(ret_value)
666 } /* H5AC__copy_candidate_list_to_buffer() */
667 
668 
669 /*-------------------------------------------------------------------------
670  *
671  * Function:    H5AC__log_deleted_entry()
672  *
673  * Purpose:     Log an entry which has been deleted.
674  *
675  *		Only called for mpi_rank 0. We must make sure that the entry
676  *              doesn't appear in the cleaned or dirty entry lists.
677  *
678  *		Return SUCCEED on success, and FAIL on failure.
679  *
680  * Return:      Non-negative on success/Negative on failure.
681  *
682  * Programmer:  John Mainzer, 6/29/05
683  *
684  *-------------------------------------------------------------------------
685  */
686 herr_t
H5AC__log_deleted_entry(const H5AC_info_t * entry_ptr)687 H5AC__log_deleted_entry(const H5AC_info_t *entry_ptr)
688 {
689     H5AC_t             * cache_ptr;
690     H5AC_aux_t         * aux_ptr;
691     H5AC_slist_entry_t * slist_entry_ptr = NULL;
692     haddr_t              addr;
693 
694     FUNC_ENTER_PACKAGE_NOERR
695 
696     /* Sanity checks */
697     HDassert(entry_ptr);
698     addr = entry_ptr->addr;
699     cache_ptr = entry_ptr->cache_ptr;
700     HDassert(cache_ptr != NULL);
701     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
702     HDassert(aux_ptr != NULL);
703     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
704     HDassert(aux_ptr->mpi_rank == 0);
705     HDassert(aux_ptr->d_slist_ptr != NULL);
706     HDassert(aux_ptr->c_slist_ptr != NULL);
707 
708     /* if the entry appears in the dirtied entry slist, remove it. */
709     if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->d_slist_ptr, (void *)(&addr))))
710         slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
711 
712     /* if the entry appears in the cleaned entry slist, remove it. */
713     if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->c_slist_ptr, (void *)(&addr))))
714         slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
715 
716     FUNC_LEAVE_NOAPI(SUCCEED)
717 } /* H5AC__log_deleted_entry() */
718 
719 
720 /*-------------------------------------------------------------------------
721  *
722  * Function:    H5AC__log_dirtied_entry()
723  *
724  * Purpose:     Update the dirty_bytes count for a newly dirtied entry.
725  *
726  *		If mpi_rank isn't 0, this simply means adding the size
727  *		of the entries to the dirty_bytes count.
728  *
729  *		If mpi_rank is 0, we must first check to see if the entry
730  *		appears in the dirty entries slist.  If it is, do nothing.
731  *		If it isn't, add the size to the dirty_bytes count, add the
732  *		entry to the dirty entries slist, and remove it from the
733  *		cleaned list (if it is present there).
734  *
735  * Return:      Non-negative on success/Negative on failure.
736  *
737  * Programmer:  John Mainzer, 6/29/05
738  *
739  *-------------------------------------------------------------------------
740  */
741 herr_t
H5AC__log_dirtied_entry(const H5AC_info_t * entry_ptr)742 H5AC__log_dirtied_entry(const H5AC_info_t *entry_ptr)
743 {
744     H5AC_t             * cache_ptr;
745     H5AC_aux_t         * aux_ptr;
746     herr_t               ret_value = SUCCEED;    /* Return value */
747 
748     FUNC_ENTER_PACKAGE
749 
750     /* Sanity checks */
751     HDassert(entry_ptr);
752     HDassert(entry_ptr->is_dirty == FALSE);
753     cache_ptr = entry_ptr->cache_ptr;
754     HDassert(cache_ptr != NULL);
755     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
756     HDassert(aux_ptr != NULL);
757     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
758 
759     if(aux_ptr->mpi_rank == 0) {
760         H5AC_slist_entry_t *slist_entry_ptr;
761         haddr_t addr = entry_ptr->addr;
762 
763         /* Sanity checks */
764         HDassert(aux_ptr->d_slist_ptr != NULL);
765         HDassert(aux_ptr->c_slist_ptr != NULL);
766 
767         if(NULL == H5SL_search(aux_ptr->d_slist_ptr, (void *)(&addr))) {
768             /* insert the address of the entry in the dirty entry list, and
769              * add its size to the dirty_bytes count.
770              */
771             if(NULL == (slist_entry_ptr = H5FL_MALLOC(H5AC_slist_entry_t)))
772                 HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "Can't allocate dirty slist entry .")
773             slist_entry_ptr->addr  = addr;
774 
775             if(H5SL_insert(aux_ptr->d_slist_ptr, slist_entry_ptr, &(slist_entry_ptr->addr)) < 0)
776                 HGOTO_ERROR(H5E_CACHE, H5E_CANTINSERT, FAIL, "can't insert entry into dirty entry slist.")
777 
778             aux_ptr->dirty_bytes += entry_ptr->size;
779 #if H5AC_DEBUG_DIRTY_BYTES_CREATION
780 	    aux_ptr->unprotect_dirty_bytes += entry_ptr->size;
781 	    aux_ptr->unprotect_dirty_bytes_updates += 1;
782 #endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */
783         } /* end if */
784 
785         /* the entry is dirty.  If it exists on the cleaned entries list,
786          * remove it.
787          */
788         if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->c_slist_ptr, (void *)(&addr))))
789             slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
790     } /* end if */
791     else {
792         aux_ptr->dirty_bytes += entry_ptr->size;
793 #if H5AC_DEBUG_DIRTY_BYTES_CREATION
794         aux_ptr->unprotect_dirty_bytes += entry_size;
795         aux_ptr->unprotect_dirty_bytes_updates += 1;
796 #endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */
797     } /* end else */
798 
799 done:
800     FUNC_LEAVE_NOAPI(ret_value)
801 } /* H5AC__log_dirtied_entry() */
802 
803 
804 /*-------------------------------------------------------------------------
805  *
806  * Function:    H5AC__log_cleaned_entry()
807  *
808  * Purpose:     Treat this operation as a 'clear' and remove the entry
809  * 		from both the cleaned and dirtied lists if it is present.
810  *		Reduces the dirty_bytes count by the size of the entry.
811  *
812  * Return:      Non-negative on success/Negative on failure.
813  *
814  * Programmer:  Quincey Koziol
815  *              7/23/16
816  *
817  *-------------------------------------------------------------------------
818  */
819 herr_t
H5AC__log_cleaned_entry(const H5AC_info_t * entry_ptr)820 H5AC__log_cleaned_entry(const H5AC_info_t *entry_ptr)
821 {
822     H5AC_t             * cache_ptr;
823     H5AC_aux_t         * aux_ptr;
824     herr_t               ret_value = SUCCEED;    /* Return value */
825 
826     FUNC_ENTER_PACKAGE
827 
828     /* Sanity check */
829     HDassert(entry_ptr);
830     HDassert(entry_ptr->is_dirty == FALSE);
831     cache_ptr = entry_ptr->cache_ptr;
832     HDassert(cache_ptr != NULL);
833     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
834     HDassert(aux_ptr != NULL);
835     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
836 
837     if(aux_ptr->mpi_rank == 0) {
838         H5AC_slist_entry_t *slist_entry_ptr;
839         haddr_t addr = entry_ptr->addr;
840 
841         /* Sanity checks */
842         HDassert(aux_ptr->d_slist_ptr != NULL);
843         HDassert(aux_ptr->c_slist_ptr != NULL);
844 
845         /* Remove it from both the cleaned list and the dirtied list.  */
846         if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->c_slist_ptr, (void *)(&addr))))
847             slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
848         if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->d_slist_ptr, (void *)(&addr))))
849             slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
850 
851     } /* end if */
852 
853     /* Decrement the dirty byte count */
854     aux_ptr->dirty_bytes -= entry_ptr->size;
855 
856 done:
857     FUNC_LEAVE_NOAPI(ret_value)
858 } /* H5AC__log_cleaned_entry() */
859 
860 
861 /*-------------------------------------------------------------------------
862  *
863  * Function:    H5AC__log_flushed_entry()
864  *
865  * Purpose:     Update the clean entry slist for the flush of an entry --
866  *		specifically, if the entry has been cleared, remove it
867  * 		from both the cleaned and dirtied lists if it is present.
868  *		Otherwise, if the entry was dirty, insert the indicated
869  *		entry address in the clean slist if it isn't there already.
870  *
871  *		This function is only used in PHDF5, and should only
872  *		be called for the process with mpi rank 0.
873  *
874  *		Return SUCCEED on success, and FAIL on failure.
875  *
876  * Return:      Non-negative on success/Negative on failure.
877  *
878  * Programmer:  John Mainzer, 6/29/05
879  *
880  *-------------------------------------------------------------------------
881  */
882 herr_t
H5AC__log_flushed_entry(H5C_t * cache_ptr,haddr_t addr,hbool_t was_dirty,unsigned flags)883 H5AC__log_flushed_entry(H5C_t *cache_ptr, haddr_t addr, hbool_t was_dirty,
884     unsigned flags)
885 {
886     hbool_t		 cleared;
887     H5AC_aux_t         * aux_ptr;
888     H5AC_slist_entry_t * slist_entry_ptr = NULL;
889     herr_t               ret_value = SUCCEED;    /* Return value */
890 
891     FUNC_ENTER_PACKAGE
892 
893     /* Sanity check */
894     HDassert(cache_ptr != NULL);
895     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
896     HDassert(aux_ptr != NULL);
897     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
898     HDassert(aux_ptr->mpi_rank == 0);
899     HDassert(aux_ptr->c_slist_ptr != NULL);
900 
901     /* Set local flags */
902     cleared = ((flags & H5C__FLUSH_CLEAR_ONLY_FLAG) != 0);
903 
904     if(cleared) {
905         /* If the entry has been cleared, must remove it from both the
906          * cleaned list and the dirtied list.
907          */
908         if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->c_slist_ptr, (void *)(&addr))))
909             slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
910         if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->d_slist_ptr, (void *)(&addr))))
911             slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
912     } /* end if */
913     else if(was_dirty) {
914         if(NULL == H5SL_search(aux_ptr->c_slist_ptr, (void *)(&addr))) {
915             /* insert the address of the entry in the clean entry list. */
916             if(NULL == (slist_entry_ptr = H5FL_MALLOC(H5AC_slist_entry_t)))
917                 HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "Can't allocate clean slist entry .")
918             slist_entry_ptr->addr = addr;
919 
920             if(H5SL_insert(aux_ptr->c_slist_ptr, slist_entry_ptr, &(slist_entry_ptr->addr)) < 0)
921                 HGOTO_ERROR(H5E_CACHE, H5E_CANTINSERT, FAIL, "can't insert entry into clean entry slist.")
922         } /* end if */
923     } /* end else-if */
924 
925 done:
926     FUNC_LEAVE_NOAPI(ret_value)
927 } /* H5AC__log_flushed_entry() */
928 
929 
930 /*-------------------------------------------------------------------------
931  *
932  * Function:    H5AC__log_inserted_entry()
933  *
934  * Purpose:     Update the dirty_bytes count for a newly inserted entry.
935  *
936  *		If mpi_rank isn't 0, this simply means adding the size
937  *		of the entry to the dirty_bytes count.
938  *
939  *		If mpi_rank is 0, we must also add the entry to the
940  *		dirty entries slist.
941  *
942  *		Return SUCCEED on success, and FAIL on failure.
943  *
944  * Return:      Non-negative on success/Negative on failure.
945  *
946  * Programmer:  John Mainzer, 6/30/05
947  *
948  *-------------------------------------------------------------------------
949  */
950 herr_t
H5AC__log_inserted_entry(const H5AC_info_t * entry_ptr)951 H5AC__log_inserted_entry(const H5AC_info_t *entry_ptr)
952 {
953     H5AC_t             * cache_ptr;
954     H5AC_aux_t         * aux_ptr;
955     herr_t               ret_value = SUCCEED;    /* Return value */
956 
957     FUNC_ENTER_PACKAGE
958 
959     /* Sanity checks */
960     HDassert(entry_ptr);
961     cache_ptr = entry_ptr->cache_ptr;
962     HDassert(cache_ptr != NULL);
963     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
964     HDassert(aux_ptr != NULL);
965     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
966 
967     if(aux_ptr->mpi_rank == 0) {
968         H5AC_slist_entry_t *slist_entry_ptr;
969 
970         HDassert(aux_ptr->d_slist_ptr != NULL);
971         HDassert(aux_ptr->c_slist_ptr != NULL);
972 
973         /* Entry to insert should not be in dirty list currently */
974         if(NULL != H5SL_search(aux_ptr->d_slist_ptr, (const void *)(&entry_ptr->addr)))
975             HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Inserted entry already in dirty slist.")
976 
977         /* insert the address of the entry in the dirty entry list, and
978          * add its size to the dirty_bytes count.
979          */
980         if(NULL == (slist_entry_ptr = H5FL_MALLOC(H5AC_slist_entry_t)))
981             HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "Can't allocate dirty slist entry .")
982         slist_entry_ptr->addr  = entry_ptr->addr;
983         if(H5SL_insert(aux_ptr->d_slist_ptr, slist_entry_ptr, &(slist_entry_ptr->addr)) < 0)
984             HGOTO_ERROR(H5E_CACHE, H5E_CANTINSERT, FAIL, "can't insert entry into dirty entry slist.")
985 
986         /* Entry to insert should not be in clean list either */
987         if(NULL != H5SL_search(aux_ptr->c_slist_ptr, (const void *)(&entry_ptr->addr)))
988             HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Inserted entry in clean slist.")
989     } /* end if */
990 
991     aux_ptr->dirty_bytes += entry_ptr->size;
992 
993 #if H5AC_DEBUG_DIRTY_BYTES_CREATION
994     aux_ptr->insert_dirty_bytes += size;
995     aux_ptr->insert_dirty_bytes_updates += 1;
996 #endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */
997 
998 done:
999     FUNC_LEAVE_NOAPI(ret_value)
1000 } /* H5AC__log_inserted_entry() */
1001 
1002 
1003 /*-------------------------------------------------------------------------
1004  *
1005  * Function:    H5AC__log_moved_entry()
1006  *
1007  * Purpose:     Update the dirty_bytes count for a moved entry.
1008  *
1009  *		WARNING
1010  *
1011  *		At present, the way that the move call is used ensures
1012  *		that the moved entry is present in all caches by
1013  *		moving in a collective operation and immediately after
1014  *		unprotecting the target entry.
1015  *
1016  *		This function uses this invariant, and will cause arcane
1017  *		failures if it is not met.  If maintaining this invariant
1018  *		becomes impossible, we will have to rework this function
1019  *		extensively, and likely include a bit of IPC for
1020  *		synchronization.  A better option might be to subsume
1021  *		move in the unprotect operation.
1022  *
1023  *		Given that the target entry is in all caches, the function
1024  *		proceeds as follows:
1025  *
1026  *		For processes with mpi rank other than 0, it simply checks to
1027  *		see if the entry was clean prior to the move, and if so adds
1028  *		the entry's size to the dirty bytes count.
1029  *
1030  *		In the process with mpi rank 0, the function first checks
1031  *		to see if the entry was dirty prior to the move.  If it
1032  *		was, and if the entry doesn't appear in the dirtied list
1033  *		under its old address, it adds the entry's size to the
1034  *		dirty bytes count.
1035  *
1036  *		The rank 0 process then removes any references to the
1037  *		entry under its old address from the cleaned and dirtied
1038  *		lists, and inserts an entry in the dirtied list under the
1039  *		new address.
1040  *
1041  *		Return SUCCEED on success, and FAIL on failure.
1042  *
1043  * Return:      Non-negative on success/Negative on failure.
1044  *
1045  * Programmer:  John Mainzer, 6/30/05
1046  *
1047  *-------------------------------------------------------------------------
1048  */
1049 herr_t
H5AC__log_moved_entry(const H5F_t * f,haddr_t old_addr,haddr_t new_addr)1050 H5AC__log_moved_entry(const H5F_t *f, haddr_t old_addr, haddr_t new_addr)
1051 {
1052     H5AC_t             * cache_ptr;
1053     H5AC_aux_t         * aux_ptr;
1054     hbool_t		 entry_in_cache;
1055     hbool_t		 entry_dirty;
1056     size_t               entry_size;
1057     herr_t               ret_value = SUCCEED;    /* Return value */
1058 
1059     FUNC_ENTER_PACKAGE
1060 
1061     /* Sanity checks */
1062     HDassert(f);
1063     HDassert(f->shared);
1064     cache_ptr = (H5AC_t *)f->shared->cache;
1065     HDassert(cache_ptr);
1066     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
1067     HDassert(aux_ptr != NULL);
1068     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
1069 
1070     /* get entry status, size, etc here */
1071     if(H5C_get_entry_status(f, old_addr, &entry_size, &entry_in_cache,
1072             &entry_dirty, NULL, NULL, NULL, NULL, NULL, NULL) < 0)
1073         HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't get entry status.")
1074     if(!entry_in_cache)
1075         HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "entry not in cache.")
1076 
1077     if(aux_ptr->mpi_rank == 0) {
1078         H5AC_slist_entry_t * slist_entry_ptr;
1079 
1080         HDassert(aux_ptr->d_slist_ptr != NULL);
1081         HDassert(aux_ptr->c_slist_ptr != NULL);
1082 
1083         /* if the entry appears in the cleaned entry slist, under its old
1084          * address, remove it.
1085          */
1086         if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->c_slist_ptr, (void *)(&old_addr))))
1087             slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, slist_entry_ptr);
1088 
1089         /* if the entry appears in the dirtied entry slist under its old
1090          * address, remove it, but don't free it. Set addr to new_addr.
1091          */
1092         if(NULL != (slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->d_slist_ptr, (void *)(&old_addr))))
1093             slist_entry_ptr->addr = new_addr;
1094         else {
1095              /* otherwise, allocate a new entry that is ready
1096               * for insertion, and increment dirty_bytes.
1097               *
1098               * Note that the fact that the entry wasn't in the dirtied
1099               * list under its old address implies that it must have
1100               * been clean to start with.
1101               */
1102             HDassert(!entry_dirty);
1103             if(NULL == (slist_entry_ptr = H5FL_MALLOC(H5AC_slist_entry_t)))
1104                 HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "Can't allocate dirty slist entry .")
1105             slist_entry_ptr->addr = new_addr;
1106 
1107             aux_ptr->dirty_bytes += entry_size;
1108 
1109 #if H5AC_DEBUG_DIRTY_BYTES_CREATION
1110             aux_ptr->move_dirty_bytes += entry_size;
1111             aux_ptr->move_dirty_bytes_updates += 1;
1112 #endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */
1113         } /* end else */
1114 
1115         /* insert / reinsert the entry in the dirty slist */
1116         if(H5SL_insert(aux_ptr->d_slist_ptr, slist_entry_ptr, &(slist_entry_ptr->addr)) < 0)
1117             HGOTO_ERROR(H5E_CACHE, H5E_CANTINSERT, FAIL, "can't insert entry into dirty entry slist.")
1118     } /* end if */
1119     else if(!entry_dirty) {
1120         aux_ptr->dirty_bytes += entry_size;
1121 
1122 #if H5AC_DEBUG_DIRTY_BYTES_CREATION
1123         aux_ptr->move_dirty_bytes += entry_size;
1124         aux_ptr->move_dirty_bytes_updates += 1;
1125 #endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */
1126     } /* end else-if */
1127 
1128 done:
1129     FUNC_LEAVE_NOAPI(ret_value)
1130 } /* H5AC__log_moved_entry() */
1131 
1132 
1133 /*-------------------------------------------------------------------------
1134  * Function:    H5AC__propagate_and_apply_candidate_list
1135  *
1136  * Purpose:     Prior to the addition of support for multiple metadata
1137  *		write strategies, in PHDF5, only the metadata cache with
1138  *		mpi rank 0 was allowed to write to file.  All other
1139  *		metadata caches on processes with rank greater than 0
1140  *		were required to retain dirty entries until they were
1141  *		notified that the entry was clean.
1142  *
1143  *		This constraint is relaxed with the distributed
1144  *		metadata write strategy, in which a list of candidate
1145  *		metadata cache entries is constructed by the process 0
1146  *		cache and then distributed to the caches of all the other
1147  *		processes.  Once the list is distributed, many (if not
1148  *		all) processes write a unique subset of the
1149  *		entries, and marking the remainder clean.  The subsets
1150  *		are chosen so that each entry in the list of candidates
1151  *		is written by exactly one cache, and all entries are
1152  *		marked as being clean in all caches.
1153  *
1154  *		While the list of candidate cache entries is prepared
1155  *		elsewhere, this function is the main routine for distributing
1156  *		and applying the list.  It must be run simultaneously on
1157  *		all processes that have the relevant file open.  To ensure
1158  *		proper synchronization, there is a barrier at the beginning
1159  *		of this function.
1160  *
1161  *		At present, this function is called under one of two
1162  *		circumstances:
1163  *
1164  *		1) Dirty byte creation exceeds some user specified value.
1165  *
1166  *		   While metadata reads may occur independently, all
1167  *		   operations writing metadata must be collective.  Thus
1168  *		   all metadata caches see the same sequence of operations,
1169  *                 and therefore the same dirty data creation.
1170  *
1171  *		   This fact is used to synchronize the caches for purposes
1172  *                 of propagating the list of candidate entries, by simply
1173  *		   calling this function from all caches whenever some user
1174  *		   specified threshold on dirty data is exceeded.  (the
1175  *		   process 0 cache creates the candidate list just before
1176  *		   calling this function).
1177  *
1178  *		2) Under direct user control -- this operation must be
1179  *		   collective.
1180  *
1181  *              The operations to be managed by this function are as
1182  * 		follows:
1183  *
1184  *		All processes:
1185  *
1186  *		1) Participate in an opening barrier.
1187  *
1188  *		For the process with mpi rank 0:
1189  *
1190  *		1) Load the contents of the candidate list
1191  *		   (candidate_slist_ptr) into a buffer, and broadcast that
1192  *		   buffer to all the other caches.  Clear the candidate
1193  *		   list in passing.
1194  *
1195  *		If there is a positive number of candidates, proceed with
1196  *		the following:
1197  *
1198  *		2) Apply the candidate entry list.
1199  *
1200  *		3) Participate in a closing barrier.
1201  *
1202  *		4) Remove from the dirty list (d_slist_ptr) and from the
1203  *		   flushed and still clean entries list (c_slist_ptr),
1204  *                 all addresses that appeared in the candidate list, as
1205  *		   these entries are now clean.
1206  *
1207  *
1208  *		For all processes with mpi rank greater than 0:
1209  *
1210  *		1) Receive the candidate entry list broadcast
1211  *
1212  *		If there is a positive number of candidates, proceed with
1213  *		the following:
1214  *
1215  *		2) Apply the candidate entry list.
1216  *
1217  *		3) Participate in a closing barrier.
1218  *
1219  * Return:      Success:        non-negative
1220  *
1221  *              Failure:        negative
1222  *
1223  * Programmer:  John Mainzer
1224  *              3/17/10
1225  *
1226  *-------------------------------------------------------------------------
1227  */
1228 static herr_t
H5AC__propagate_and_apply_candidate_list(H5F_t * f,hid_t dxpl_id)1229 H5AC__propagate_and_apply_candidate_list(H5F_t  *f, hid_t dxpl_id)
1230 {
1231     H5AC_t             * cache_ptr;
1232     H5AC_aux_t         * aux_ptr;
1233     haddr_t            * candidates_list_ptr = NULL;
1234     int		         mpi_result;
1235     unsigned	         num_candidates = 0;
1236     herr_t               ret_value = SUCCEED;   /* Return value */
1237 
1238     FUNC_ENTER_STATIC
1239 
1240     /* Sanity checks */
1241     HDassert(f != NULL);
1242     cache_ptr = f->shared->cache;
1243     HDassert(cache_ptr != NULL);
1244     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
1245     HDassert(aux_ptr != NULL);
1246     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
1247     HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED);
1248 
1249     /* to prevent "messages from the future" we must synchronize all
1250      * processes before we write any entries.
1251      */
1252     if(MPI_SUCCESS != (mpi_result = MPI_Barrier(aux_ptr->mpi_comm)))
1253         HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_result)
1254 
1255     if(aux_ptr->mpi_rank == 0) {
1256         if(H5AC__broadcast_candidate_list(cache_ptr, &num_candidates, &candidates_list_ptr) < 0)
1257             HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't broadcast candidate slist.")
1258 
1259         HDassert(H5SL_count(aux_ptr->candidate_slist_ptr) == 0);
1260     } /* end if */
1261     else {
1262         if(H5AC__receive_candidate_list(cache_ptr, &num_candidates, &candidates_list_ptr) < 0)
1263             HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't receive candidate broadcast.")
1264     } /* end else */
1265 
1266     if(num_candidates > 0) {
1267         herr_t	         result;
1268 
1269         /* all processes apply the candidate list.
1270          * H5C_apply_candidate_list() handles the details of
1271          * distributing the writes across the processes.
1272          */
1273 
1274         /* Enable writes during this operation */
1275         aux_ptr->write_permitted = TRUE;
1276 
1277         /* Apply the candidate list */
1278         result = H5C_apply_candidate_list(f, dxpl_id, cache_ptr, num_candidates,
1279             candidates_list_ptr, aux_ptr->mpi_rank, aux_ptr->mpi_size);
1280 
1281         /* Disable writes again */
1282         aux_ptr->write_permitted = FALSE;
1283 
1284         /* Check for error on the write operation */
1285         if(result < 0)
1286             HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't apply candidate list.")
1287 
1288         /* this code exists primarily for the test bed -- it allows us to
1289          * enforce posix semantics on the server that pretends to be a
1290          * file system in our parallel tests.
1291          */
1292 	if(aux_ptr->write_done)
1293 	    (aux_ptr->write_done)();
1294 
1295         /* to prevent "messages from the past" we must synchronize all
1296          * processes again before we go on.
1297          */
1298         if(MPI_SUCCESS != (mpi_result = MPI_Barrier(aux_ptr->mpi_comm)))
1299             HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_result)
1300 
1301 	/* if this is process zero, tidy up the dirtied,
1302          * and flushed and still clean lists.
1303          */
1304         if(aux_ptr->mpi_rank == 0)
1305             if(H5AC__tidy_cache_0_lists(cache_ptr, num_candidates, candidates_list_ptr) < 0)
1306                 HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't tidy up process 0 lists.")
1307     } /* end if */
1308 
1309     /* if it is defined, call the sync point done callback.  Note
1310      * that this callback is defined purely for testing purposes,
1311      * and should be undefined under normal operating circumstances.
1312      */
1313     if(aux_ptr->sync_point_done)
1314         (aux_ptr->sync_point_done)(num_candidates, candidates_list_ptr);
1315 
1316 done:
1317     if(candidates_list_ptr)
1318         candidates_list_ptr = (haddr_t *)H5MM_xfree((void *)candidates_list_ptr);
1319 
1320     FUNC_LEAVE_NOAPI(ret_value)
1321 } /* H5AC__propagate_and_apply_candidate_list() */
1322 
1323 
1324 /*-------------------------------------------------------------------------
1325  * Function:    H5AC__propagate_flushed_and_still_clean_entries_list
1326  *
1327  * Purpose:     In PHDF5, if the process 0 only metadata write strategy
1328  *		is selected, only the metadata cache with mpi rank 0 is
1329  *		allowed to write to file.  All other metadata caches on
1330  *		processes with rank greater than 0 must retain dirty
1331  *		entries until they are notified that the entry is now
1332  *		clean.
1333  *
1334  *		This function is the main routine for handling this
1335  *		notification procedure.  It must be called
1336  *		simultaneously on all processes that have the relevant
1337  *		file open.  To this end, it is called only during a
1338  *		sync point, with a barrier prior to the call.
1339  *
1340  *		Note that any metadata entry writes by process 0 will
1341  *		occur after the barrier and just before this call.
1342  *
1343  *		Typically, calls to this function will be triggered in
1344  *		one of two ways:
1345  *
1346  *		1) Dirty byte creation exceeds some user specified value.
1347  *
1348  *		   While metadata reads may occur independently, all
1349  *		   operations writing metadata must be collective.  Thus
1350  *		   all metadata caches see the same sequence of operations,
1351  *                 and therefore the same dirty data creation.
1352  *
1353  *		   This fact is used to synchronize the caches for purposes
1354  *                 of propagating the list of flushed and still clean
1355  *		   entries, by simply calling this function from all
1356  *		   caches whenever some user specified threshold on dirty
1357  *		   data is exceeded.
1358  *
1359  *		2) Under direct user control -- this operation must be
1360  *		   collective.
1361  *
1362  *              The operations to be managed by this function are as
1363  * 		follows:
1364  *
1365  *		For the process with mpi rank 0:
1366  *
1367  *		1) Load the contents of the flushed and still clean entries
1368  *		   list (c_slist_ptr) into a buffer, and broadcast that
1369  *		   buffer to all the other caches.
1370  *
1371  *		2) Clear the flushed and still clean entries list
1372  *                 (c_slist_ptr).
1373  *
1374  *
1375  *		For all processes with mpi rank greater than 0:
1376  *
1377  *		1) Receive the flushed and still clean entries list broadcast
1378  *
1379  *		2) Mark the specified entries as clean.
1380  *
1381  *
1382  *		For all processes:
1383  *
1384  *		1) Reset the dirtied bytes count to 0.
1385  *
1386  * Return:      Success:        non-negative
1387  *
1388  *              Failure:        negative
1389  *
1390  * Programmer:  John Mainzer
1391  *              July 5, 2005
1392  *
1393  *-------------------------------------------------------------------------
1394  */
1395 static herr_t
H5AC__propagate_flushed_and_still_clean_entries_list(H5F_t * f,hid_t dxpl_id)1396 H5AC__propagate_flushed_and_still_clean_entries_list(H5F_t  *f, hid_t dxpl_id)
1397 {
1398     H5AC_t     * cache_ptr;
1399     H5AC_aux_t * aux_ptr;
1400     herr_t	 ret_value = SUCCEED;   /* Return value */
1401 
1402     FUNC_ENTER_STATIC
1403 
1404     /* Sanity checks */
1405     HDassert(f != NULL);
1406     cache_ptr = f->shared->cache;
1407     HDassert(cache_ptr != NULL);
1408     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
1409     HDassert(aux_ptr != NULL);
1410     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
1411     HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY);
1412 
1413     if(aux_ptr->mpi_rank == 0) {
1414         if(H5AC__broadcast_clean_list(cache_ptr) < 0)
1415             HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't broadcast clean slist.")
1416         HDassert(H5SL_count(aux_ptr->c_slist_ptr) == 0);
1417     } /* end if */
1418     else {
1419         if(H5AC__receive_and_apply_clean_list(f, dxpl_id) < 0)
1420             HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't receive and/or process clean slist broadcast.")
1421     } /* end else */
1422 
1423 done:
1424     FUNC_LEAVE_NOAPI(ret_value)
1425 } /* H5AC__propagate_flushed_and_still_clean_entries_list() */
1426 
1427 
1428 /*-------------------------------------------------------------------------
1429  *
1430  * Function:    H5AC__receive_haddr_list()
1431  *
1432  * Purpose:     Receive the list of entry addresses from process 0,
1433  *		and return it in a buffer pointed to by *haddr_buf_ptr_ptr.
1434  *		Note that the caller must free this buffer if it is
1435  *		returned.
1436  *
1437  *		This function must only be called by the process with
1438  *		MPI_rank greater than 0.
1439  *
1440  *		Return SUCCEED on success, and FAIL on failure.
1441  *
1442  * Return:      Non-negative on success/Negative on failure.
1443  *
1444  * Programmer:  Quincey Koziol, 6/11/2015
1445  *
1446  *-------------------------------------------------------------------------
1447  */
1448 static herr_t
H5AC__receive_haddr_list(MPI_Comm mpi_comm,unsigned * num_entries_ptr,haddr_t ** haddr_buf_ptr_ptr)1449 H5AC__receive_haddr_list(MPI_Comm mpi_comm, unsigned *num_entries_ptr,
1450     haddr_t **haddr_buf_ptr_ptr)
1451 {
1452     haddr_t	       * haddr_buf_ptr = NULL;
1453     int                  mpi_result;
1454     unsigned		 num_entries;
1455     herr_t               ret_value = SUCCEED;    /* Return value */
1456 
1457     FUNC_ENTER_STATIC
1458 
1459     /* Sanity checks */
1460     HDassert(num_entries_ptr != NULL);
1461     HDassert(*num_entries_ptr == 0);
1462     HDassert(haddr_buf_ptr_ptr != NULL);
1463     HDassert(*haddr_buf_ptr_ptr == NULL);
1464 
1465     /* First receive the number of entries in the list so that we
1466      * can set up a buffer to receive them.  If there aren't
1467      * any, we are done.
1468      */
1469     if(MPI_SUCCESS != (mpi_result = MPI_Bcast(&num_entries, 1, MPI_UNSIGNED, 0, mpi_comm)))
1470         HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result)
1471 
1472     if(num_entries > 0) {
1473         size_t buf_size;
1474 
1475         /* allocate buffers to store the list of entry base addresses in */
1476         buf_size = sizeof(haddr_t) * num_entries;
1477         if(NULL == (haddr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size)))
1478             HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for haddr buffer")
1479 
1480         /* Now receive the list of candidate entries */
1481         if(MPI_SUCCESS != (mpi_result = MPI_Bcast((void *)haddr_buf_ptr, (int)buf_size, MPI_BYTE, 0, mpi_comm)))
1482             HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_result)
1483     } /* end if */
1484 
1485     /* finally, pass the number of entries and the buffer pointer
1486      * back to the caller.
1487      */
1488     *num_entries_ptr = num_entries;
1489     *haddr_buf_ptr_ptr = haddr_buf_ptr;
1490 
1491 done:
1492     if(ret_value < 0)
1493         if(haddr_buf_ptr)
1494             haddr_buf_ptr = (haddr_t *)H5MM_xfree((void *)haddr_buf_ptr);
1495 
1496     FUNC_LEAVE_NOAPI(ret_value)
1497 } /* H5AC_receive_haddr_list() */
1498 
1499 
1500 /*-------------------------------------------------------------------------
1501  *
1502  * Function:    H5AC__receive_and_apply_clean_list()
1503  *
1504  * Purpose:     Receive the list of cleaned entries from process 0,
1505  *		and mark the specified entries as clean.
1506  *
1507  *		This function must only be called by the process with
1508  *		MPI_rank greater than 0.
1509  *
1510  *		Return SUCCEED on success, and FAIL on failure.
1511  *
1512  * Return:      Non-negative on success/Negative on failure.
1513  *
1514  * Programmer:  John Mainzer, 7/4/05
1515  *
1516  *-------------------------------------------------------------------------
1517  */
1518 static herr_t
H5AC__receive_and_apply_clean_list(H5F_t * f,hid_t dxpl_id)1519 H5AC__receive_and_apply_clean_list(H5F_t *f, hid_t dxpl_id)
1520 {
1521     H5AC_t             * cache_ptr;
1522     H5AC_aux_t         * aux_ptr;
1523     haddr_t	       * haddr_buf_ptr = NULL;
1524     unsigned		 num_entries = 0;
1525     herr_t               ret_value = SUCCEED;    /* Return value */
1526 
1527     FUNC_ENTER_STATIC
1528 
1529     /* Sanity check */
1530     HDassert(f != NULL);
1531     cache_ptr = f->shared->cache;
1532     HDassert(cache_ptr != NULL);
1533     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
1534     HDassert(aux_ptr != NULL);
1535     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
1536     HDassert(aux_ptr->mpi_rank != 0);
1537 
1538     /* Retrieve the clean list from process 0 */
1539     if(H5AC__receive_haddr_list(aux_ptr->mpi_comm, &num_entries, &haddr_buf_ptr) < 0)
1540         HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "can't receive clean list")
1541 
1542     if(num_entries > 0)
1543         /* mark the indicated entries as clean */
1544         if(H5C_mark_entries_as_clean(f, dxpl_id, num_entries, haddr_buf_ptr) < 0)
1545             HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't mark entries clean.")
1546 
1547     /* if it is defined, call the sync point done callback.  Note
1548      * that this callback is defined purely for testing purposes,
1549      * and should be undefined under normal operating circumstances.
1550      */
1551     if(aux_ptr->sync_point_done)
1552         (aux_ptr->sync_point_done)(num_entries, haddr_buf_ptr);
1553 
1554 done:
1555     if(haddr_buf_ptr)
1556         haddr_buf_ptr = (haddr_t *)H5MM_xfree((void *)haddr_buf_ptr);
1557 
1558     FUNC_LEAVE_NOAPI(ret_value)
1559 } /* H5AC__receive_and_apply_clean_list() */
1560 
1561 
1562 /*-------------------------------------------------------------------------
1563  *
1564  * Function:    H5AC__receive_candidate_list()
1565  *
1566  * Purpose:     Receive the list of candidate entries from process 0,
1567  *		and return it in a buffer pointed to by *haddr_buf_ptr_ptr.
1568  *		Note that the caller must free this buffer if it is
1569  *		returned.
1570  *
1571  *		This function must only be called by the process with
1572  *		MPI_rank greater than 0.
1573  *
1574  *		Return SUCCEED on success, and FAIL on failure.
1575  *
1576  * Return:      Non-negative on success/Negative on failure.
1577  *
1578  * Programmer:  John Mainzer, 3/17/10
1579  *
1580  *-------------------------------------------------------------------------
1581  */
1582 static herr_t
H5AC__receive_candidate_list(const H5AC_t * cache_ptr,unsigned * num_entries_ptr,haddr_t ** haddr_buf_ptr_ptr)1583 H5AC__receive_candidate_list(const H5AC_t *cache_ptr, unsigned *num_entries_ptr,
1584     haddr_t **haddr_buf_ptr_ptr)
1585 {
1586     H5AC_aux_t         * aux_ptr;
1587     herr_t               ret_value = SUCCEED;    /* Return value */
1588 
1589     FUNC_ENTER_STATIC
1590 
1591     /* Sanity checks */
1592     HDassert(cache_ptr != NULL);
1593     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
1594     HDassert(aux_ptr != NULL);
1595     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
1596     HDassert(aux_ptr->mpi_rank != 0);
1597     HDassert(aux_ptr-> metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED);
1598     HDassert(num_entries_ptr != NULL);
1599     HDassert(*num_entries_ptr == 0);
1600     HDassert(haddr_buf_ptr_ptr != NULL);
1601     HDassert(*haddr_buf_ptr_ptr == NULL);
1602 
1603     /* Retrieve the candidate list from process 0 */
1604     if(H5AC__receive_haddr_list(aux_ptr->mpi_comm, num_entries_ptr, haddr_buf_ptr_ptr) < 0)
1605         HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "can't receive clean list")
1606 
1607 done:
1608     FUNC_LEAVE_NOAPI(ret_value)
1609 } /* H5AC__receive_candidate_list() */
1610 
1611 
1612 /*-------------------------------------------------------------------------
1613  * Function:    H5AC__rsp__dist_md_write__flush
1614  *
1615  * Purpose:     Routine for handling the details of running a sync point
1616  *		that is triggered by a flush -- which in turn must have been
1617  *		triggered by either a flush API call or a file close --
1618  *		when the distributed metadata write strategy is selected.
1619  *
1620  *		Upon entry, each process generates it own candidate list,
1621  *              being a sorted list of all dirty metadata entries currently
1622  *		in the metadata cache.  Note that this list must be idendical
1623  *		across all processes, as all processes see the same stream
1624  *		of dirty metadata coming in, and use the same lists of
1625  *		candidate entries at each sync point.  (At first glance, this
1626  *		argument sounds circular, but think of it in the sense of
1627  *		a recursive proof).
1628  *
1629  *		If this this list is empty, we are done, and the function
1630  *		returns
1631  *
1632  *		Otherwise, after the sorted list dirty metadata entries is
1633  *		constructed, each process uses the same algorithm to assign
1634  *		each entry on the candidate list to exactly one process for
1635  *		flushing.
1636  *
1637  *		At this point, all processes participate in a barrier to
1638  *		avoid messages from the past/future bugs.
1639  *
1640  *		Each process then flushes the entries assigned to it, and
1641  *		marks all other entries on the candidate list as clean.
1642  *
1643  *		Finally, all processes participate in a second barrier to
1644  *		avoid messages from the past/future bugs.
1645  *
1646  *		At the end of this process, process 0 and only process 0
1647  *		must tidy up its lists of dirtied and cleaned entries.
1648  *		These lists are not used in the distributed metadata write
1649  *		strategy, but they must be maintained should we shift
1650  *		to a strategy that uses them.
1651  *
1652  * Return:      Success:        non-negative
1653  *
1654  *              Failure:        negative
1655  *
1656  * Programmer:  John Mainzer
1657  *              April 28, 2010
1658  *
1659  *-------------------------------------------------------------------------
1660  */
1661 static herr_t
H5AC__rsp__dist_md_write__flush(H5F_t * f,hid_t dxpl_id)1662 H5AC__rsp__dist_md_write__flush(H5F_t *f, hid_t dxpl_id)
1663 {
1664     H5AC_t     * cache_ptr;
1665     H5AC_aux_t * aux_ptr;
1666     haddr_t    * haddr_buf_ptr = NULL;
1667     int		 mpi_result;
1668     unsigned     num_entries = 0;
1669     herr_t	 ret_value = SUCCEED;   /* Return value */
1670 
1671     FUNC_ENTER_STATIC
1672 
1673     /* Sanity checks */
1674     HDassert(f != NULL);
1675     cache_ptr = f->shared->cache;
1676     HDassert(cache_ptr != NULL);
1677     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
1678     HDassert(aux_ptr != NULL);
1679     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
1680     HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED);
1681 
1682     /* first construct the candidate list -- initially, this will be in the
1683      * form of a skip list.  We will convert it later.
1684      */
1685     if(H5C_construct_candidate_list__clean_cache(cache_ptr) < 0)
1686         HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate list.")
1687 
1688     if(H5SL_count(aux_ptr->candidate_slist_ptr) > 0) {
1689         herr_t	 result;
1690 
1691         /* convert the candidate list into the format we
1692          * are used to receiving from process 0.
1693          */
1694         if(H5AC__copy_candidate_list_to_buffer(cache_ptr, &num_entries, &haddr_buf_ptr) < 0)
1695             HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate buffer.")
1696 
1697         /* initial sync point barrier */
1698         if(MPI_SUCCESS != (mpi_result = MPI_Barrier(aux_ptr->mpi_comm)))
1699             HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_result)
1700 
1701         /* Enable writes during this operation */
1702         aux_ptr->write_permitted = TRUE;
1703 
1704         /* Apply the candidate list */
1705         result = H5C_apply_candidate_list(f, dxpl_id, cache_ptr, num_entries,
1706             haddr_buf_ptr, aux_ptr->mpi_rank, aux_ptr->mpi_size);
1707 
1708         /* Disable writes again */
1709         aux_ptr->write_permitted = FALSE;
1710 
1711         /* Check for error on the write operation */
1712         if(result < 0)
1713             HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't apply candidate list.")
1714 
1715         /* this code exists primarily for the test bed -- it allows us to
1716          * enforce posix semantics on the server that pretends to be a
1717          * file system in our parallel tests.
1718          */
1719         if(aux_ptr->write_done)
1720             (aux_ptr->write_done)();
1721 
1722         /* final sync point barrier */
1723         if(MPI_SUCCESS != (mpi_result = MPI_Barrier(aux_ptr->mpi_comm)))
1724             HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_result)
1725 
1726 	/* if this is process zero, tidy up the dirtied,
1727          * and flushed and still clean lists.
1728          */
1729         if(aux_ptr->mpi_rank == 0)
1730             if(H5AC__tidy_cache_0_lists(cache_ptr, num_entries, haddr_buf_ptr) < 0)
1731                 HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Can't tidy up process 0 lists.")
1732     } /* end if */
1733 
1734     /* if it is defined, call the sync point done callback.  Note
1735      * that this callback is defined purely for testing purposes,
1736      * and should be undefined under normal operating circumstances.
1737      */
1738     if(aux_ptr->sync_point_done)
1739         (aux_ptr->sync_point_done)(num_entries, haddr_buf_ptr);
1740 
1741 done:
1742     if(haddr_buf_ptr)
1743         haddr_buf_ptr = (haddr_t *)H5MM_xfree((void *)haddr_buf_ptr);
1744 
1745     FUNC_LEAVE_NOAPI(ret_value)
1746 } /* H5AC__rsp__dist_md_write__flush() */
1747 
1748 
1749 /*-------------------------------------------------------------------------
1750  * Function:    H5AC__rsp__dist_md_write__flush_to_min_clean
1751  *
1752  * Purpose:     Routine for handling the details of running a sync point
1753  *		triggered by the accumulation of dirty metadata (as
1754  *		opposed to a flush call to the API) when the distributed
1755  *		metadata write strategy is selected.
1756  *
1757  *		After invocation and initial sanity checking this function
1758  *		first checks to see if evictions are enabled -- if they
1759  *		are not, the function does nothing and returns.
1760  *
1761  *		Otherwise, process zero constructs a list of entries to
1762  *		be flushed in order to bring the process zero cache back
1763  *		within its min clean requirement.  Note that this list
1764  *		(the candidate list) may be empty.
1765  *
1766  *              Then, all processes participate in a barrier.
1767  *
1768  *		After the barrier, process 0 broadcasts the number of
1769  *		entries in the candidate list prepared above, and all
1770  *		other processes receive this number.
1771  *
1772  *		If this number is zero, we are done, and the function
1773  *		returns without further action.
1774  *
1775  *		Otherwise, process 0 broadcasts the sorted list of
1776  *		candidate entries, and all other processes receive it.
1777  *
1778  *		Then, each process uses the same algorithm to assign
1779  *		each entry on the candidate list to exactly one process
1780  *		for flushing.
1781  *
1782  *		Each process then flushes the entries assigned to it, and
1783  *		marks all other entries on the candidate list as clean.
1784  *
1785  *		Finally, all processes participate in a second barrier to
1786  *		avoid messages from the past/future bugs.
1787  *
1788  *		At the end of this process, process 0 and only process 0
1789  *		must tidy up its lists of dirtied and cleaned entries.
1790  *		These lists are not used in the distributed metadata write
1791  *		strategy, but they must be maintained should we shift
1792  *		to a strategy that uses them.
1793  *
1794  * Return:      Success:        non-negative
1795  *
1796  *              Failure:        negative
1797  *
1798  * Programmer:  John Mainzer
1799  *              April 28, 2010
1800  *
1801  *-------------------------------------------------------------------------
1802  */
1803 static herr_t
H5AC__rsp__dist_md_write__flush_to_min_clean(H5F_t * f,hid_t dxpl_id)1804 H5AC__rsp__dist_md_write__flush_to_min_clean(H5F_t *f, hid_t dxpl_id)
1805 {
1806     H5AC_t     * cache_ptr;
1807     H5AC_aux_t * aux_ptr;
1808     hbool_t 	 evictions_enabled;
1809     herr_t	 ret_value = SUCCEED;   /* Return value */
1810 
1811     FUNC_ENTER_STATIC
1812 
1813     /* Sanity checks */
1814     HDassert(f != NULL);
1815     cache_ptr = f->shared->cache;
1816     HDassert(cache_ptr != NULL);
1817     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
1818     HDassert(aux_ptr != NULL);
1819     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
1820     HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED);
1821 
1822     /* Query if evictions are allowed */
1823     if(H5C_get_evictions_enabled((const H5C_t *)cache_ptr, &evictions_enabled) < 0)
1824         HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5C_get_evictions_enabled() failed.")
1825 
1826     if(evictions_enabled) {
1827         /* construct candidate list -- process 0 only */
1828         if(aux_ptr->mpi_rank == 0)
1829             if(H5AC__construct_candidate_list(cache_ptr, aux_ptr, H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN) < 0)
1830                 HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate list.")
1831 
1832         /* propagate and apply candidate list -- all processes */
1833         if(H5AC__propagate_and_apply_candidate_list(f, dxpl_id) < 0)
1834             HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't propagate and apply candidate list.")
1835     } /* evictions enabled */
1836 
1837 done:
1838     FUNC_LEAVE_NOAPI(ret_value)
1839 } /* H5AC__rsp__dist_md_write__flush_to_min_clean() */
1840 
1841 
1842 /*-------------------------------------------------------------------------
1843  * Function:    H5AC__rsp__p0_only__flush
1844  *
1845  * Purpose:     Routine for handling the details of running a sync point
1846  *		that is triggered a flush -- which in turn must have been
1847  *		triggered by either a flush API call or a file close --
1848  *		when the process 0 only metadata write strategy is selected.
1849  *
1850  *              First, all processes participate in a barrier.
1851  *
1852  *		Then process zero flushes all dirty entries, and broadcasts
1853  *		they number of clean entries (if any) to all the other
1854  *		caches.
1855  *
1856  *		If this number is zero, we are done.
1857  *
1858  *		Otherwise, process 0 broadcasts the list of cleaned
1859  *		entries, and all other processes which are part of this
1860  *		file group receive it, and mark the listed entries as
1861  *		clean in their caches.
1862  *
1863  *		Since all processes have the same set of dirty
1864  *		entries at the beginning of the sync point, and all
1865  *		entries that will be written are written before
1866  *		process zero broadcasts the number of cleaned entries,
1867  *		there is no need for a closing barrier.
1868  *
1869  * Return:      Success:        non-negative
1870  *
1871  *              Failure:        negative
1872  *
1873  * Programmer:  John Mainzer
1874  *              April 28, 2010
1875  *
1876  *-------------------------------------------------------------------------
1877  */
1878 static herr_t
H5AC__rsp__p0_only__flush(H5F_t * f,hid_t dxpl_id)1879 H5AC__rsp__p0_only__flush(H5F_t *f, hid_t dxpl_id)
1880 {
1881     H5AC_t     * cache_ptr;
1882     H5AC_aux_t * aux_ptr;
1883     int		 mpi_result;
1884     herr_t	 ret_value = SUCCEED;   /* Return value */
1885 
1886     FUNC_ENTER_STATIC
1887 
1888     /* Sanity checks */
1889     HDassert(f != NULL);
1890     cache_ptr = f->shared->cache;
1891     HDassert(cache_ptr != NULL);
1892     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
1893     HDassert(aux_ptr != NULL);
1894     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
1895     HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY);
1896 
1897     /* to prevent "messages from the future" we must
1898      * synchronize all processes before we start the flush.
1899      * Hence the following barrier.
1900      */
1901     if(MPI_SUCCESS != (mpi_result = MPI_Barrier(aux_ptr->mpi_comm)))
1902         HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_result)
1903 
1904     /* Flush data to disk, from rank 0 process */
1905     if(aux_ptr->mpi_rank == 0) {
1906         herr_t        result;
1907 
1908         /* Enable writes during this operation */
1909         aux_ptr->write_permitted = TRUE;
1910 
1911         /* Flush the cache */
1912         result = H5C_flush_cache(f, dxpl_id, H5AC__NO_FLAGS_SET);
1913 
1914         /* Disable writes again */
1915         aux_ptr->write_permitted = FALSE;
1916 
1917         /* Check for error on the write operation */
1918         if(result < 0)
1919             HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush.")
1920 
1921         /* this code exists primarily for the test bed -- it allows us to
1922          * enforce posix semantics on the server that pretends to be a
1923          * file system in our parallel tests.
1924          */
1925         if(aux_ptr->write_done)
1926             (aux_ptr->write_done)();
1927     } /* end if */
1928 
1929     /* Propagate cleaned entries to other ranks. */
1930     if(H5AC__propagate_flushed_and_still_clean_entries_list(f, dxpl_id) < 0)
1931         HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't propagate clean entries list.")
1932 
1933 done:
1934     FUNC_LEAVE_NOAPI(ret_value)
1935 } /* H5AC__rsp__p0_only__flush() */
1936 
1937 
1938 /*-------------------------------------------------------------------------
1939  * Function:    H5AC__rsp__p0_only__flush_to_min_clean
1940  *
1941  * Purpose:     Routine for handling the details of running a sync point
1942  *		triggered by the accumulation of dirty metadata (as
1943  *		opposed to a flush call to the API) when the process 0
1944  *		only metadata write strategy is selected.
1945  *
1946  *		After invocation and initial sanity checking this function
1947  *		first checks to see if evictions are enabled -- if they
1948  *		are not, the function does nothing and returns.
1949  *
1950  *              Otherwise, all processes participate in a barrier.
1951  *
1952  *		After the barrier, if this is process 0, the function
1953  *		causes the cache to flush sufficient entries to get the
1954  *		cache back within its minimum clean fraction, and broadcast
1955  *		the number of entries which have been flushed since
1956  *		the last sync point, and are still clean.
1957  *
1958  *		If this number is zero, we are done.
1959  *
1960  *		Otherwise, process 0 broadcasts the list of cleaned
1961  *		entries, and all other processes which are part of this
1962  *		file group receive it, and mark the listed entries as
1963  *		clean in their caches.
1964  *
1965  *		Since all processes have the same set of dirty
1966  *		entries at the beginning of the sync point, and all
1967  *		entries that will be written are written before
1968  *		process zero broadcasts the number of cleaned entries,
1969  *		there is no need for a closing barrier.
1970  *
1971  * Return:      Success:        non-negative
1972  *
1973  *              Failure:        negative
1974  *
1975  * Programmer:  John Mainzer
1976  *              April 28, 2010
1977  *
1978  *-------------------------------------------------------------------------
1979  */
1980 static herr_t
H5AC__rsp__p0_only__flush_to_min_clean(H5F_t * f,hid_t dxpl_id)1981 H5AC__rsp__p0_only__flush_to_min_clean(H5F_t *f, hid_t dxpl_id)
1982 {
1983     H5AC_t     * cache_ptr;
1984     H5AC_aux_t * aux_ptr;
1985     hbool_t 	 evictions_enabled;
1986     herr_t	 ret_value = SUCCEED;   /* Return value */
1987 
1988     FUNC_ENTER_STATIC
1989 
1990     /* Sanity checks */
1991     HDassert(f != NULL);
1992     cache_ptr = f->shared->cache;
1993     HDassert(cache_ptr != NULL);
1994     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
1995     HDassert(aux_ptr != NULL);
1996     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
1997     HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY);
1998 
1999     /* Query if evictions are allowed */
2000     if(H5C_get_evictions_enabled((const H5C_t *)cache_ptr, &evictions_enabled) < 0)
2001         HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5C_get_evictions_enabled() failed.")
2002 
2003     /* Flush if evictions are allowed -- following call
2004      * will cause process 0 to flush to min clean size,
2005      * and then propagate the newly clean entries to the
2006      * other processes.
2007      *
2008      * Otherwise, do nothing.
2009      */
2010     if(evictions_enabled) {
2011         int          mpi_result;
2012 
2013         /* to prevent "messages from the future" we must synchronize all
2014          * processes before we start the flush.  This synchronization may
2015          * already be done -- hence the do_barrier parameter.
2016          */
2017         if(MPI_SUCCESS != (mpi_result = MPI_Barrier(aux_ptr->mpi_comm)))
2018             HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_result)
2019 
2020         if(0 == aux_ptr->mpi_rank) {
2021             herr_t	 result;
2022 
2023             /* here, process 0 flushes as many entries as necessary to
2024              * comply with the currently specified min clean size.
2025              * Note that it is quite possible that no entries will be
2026              * flushed.
2027              */
2028 
2029             /* Enable writes during this operation */
2030             aux_ptr->write_permitted = TRUE;
2031 
2032             /* Flush the cache */
2033             result = H5C_flush_to_min_clean(f, dxpl_id);
2034 
2035             /* Disable writes again */
2036             aux_ptr->write_permitted = FALSE;
2037 
2038             /* Check for error on the write operation */
2039             if(result < 0)
2040                 HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "H5C_flush_to_min_clean() failed.")
2041 
2042             /* this call exists primarily for the test code -- it is used
2043  	     * to enforce POSIX semantics on the process used to simulate
2044  	     * reads and writes in t_cache.c.
2045              */
2046             if(aux_ptr->write_done)
2047                 (aux_ptr->write_done)();
2048         } /* end if */
2049 
2050         if(H5AC__propagate_flushed_and_still_clean_entries_list(f, dxpl_id) < 0)
2051             HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't propagate clean entries list.")
2052     } /* end if */
2053 
2054 done:
2055     FUNC_LEAVE_NOAPI(ret_value)
2056 } /* H5AC__rsp__p0_only__flush_to_min_clean() */
2057 
2058 
2059 /*-------------------------------------------------------------------------
2060  * Function:    H5AC__run_sync_point
2061  *
2062  * Purpose:     Top level routine for managing a sync point between all
2063  *		meta data caches in the parallel case.  Since all caches
2064  *		see the same sequence of dirty metadata, we simply count
2065  *		bytes of dirty metadata, and run a sync point whenever the
2066  *		number of dirty bytes of metadata seen since the last
2067  *		sync point exceeds a threshold that is common across all
2068  *		processes.  We also run sync points in response to
2069  *		HDF5 API calls triggering either a flush or a file close.
2070  *
2071  *		In earlier versions of PHDF5, only the metadata cache with
2072  *		mpi rank 0 was allowed to write to file.  All other
2073  *		metadata caches on processes with rank greater than 0 were
2074  *		required to retain dirty entries until they were notified
2075  *		that the entry is was clean.
2076  *
2077  *		This function was created to make it easier for us to
2078  *		experiment with other options, as it is a single point
2079  *		for the execution of sync points.
2080  *
2081  * Return:      Success:        non-negative
2082  *
2083  *              Failure:        negative
2084  *
2085  * Programmer:  John Mainzer
2086  *              March 11, 2010
2087  *
2088  *-------------------------------------------------------------------------
2089  */
2090 herr_t
H5AC__run_sync_point(H5F_t * f,hid_t dxpl_id,int sync_point_op)2091 H5AC__run_sync_point(H5F_t *f, hid_t dxpl_id, int sync_point_op)
2092 {
2093     H5AC_t     * cache_ptr;
2094     H5AC_aux_t * aux_ptr;
2095     herr_t	 ret_value = SUCCEED;   /* Return value */
2096 
2097     FUNC_ENTER_PACKAGE
2098 
2099     /* Sanity checks */
2100     HDassert(f != NULL);
2101     cache_ptr = f->shared->cache;
2102     HDassert(cache_ptr != NULL);
2103     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
2104     HDassert(aux_ptr != NULL);
2105     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
2106     HDassert((sync_point_op == H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN) ||
2107         (sync_point_op == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED));
2108 
2109 #if H5AC_DEBUG_DIRTY_BYTES_CREATION
2110 HDfprintf(stdout, "%d:H5AC_propagate...:%u: (u/uu/i/iu/r/ru) = %zu/%u/%zu/%u/%zu/%u\n",
2111     aux_ptr->mpi_rank,
2112     aux_ptr->dirty_bytes_propagations,
2113     aux_ptr->unprotect_dirty_bytes,
2114     aux_ptr->unprotect_dirty_bytes_updates,
2115     aux_ptr->insert_dirty_bytes,
2116     aux_ptr->insert_dirty_bytes_updates,
2117     aux_ptr->rename_dirty_bytes,
2118     aux_ptr->rename_dirty_bytes_updates);
2119 #endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */
2120 
2121     /* clear collective access flag on half of the entries in the
2122        cache and mark them as independent in case they need to be
2123        evicted later. All ranks are guranteed to mark the same entries
2124        since we don't modify the order of the collectively accessed
2125        entries except through collective access. */
2126     if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
2127         HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5C_clear_coll_entries() failed.")
2128 
2129     switch(aux_ptr->metadata_write_strategy) {
2130         case H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY:
2131 	    switch(sync_point_op) {
2132                 case H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN:
2133 	            if(H5AC__rsp__p0_only__flush_to_min_clean(f, dxpl_id) < 0)
2134                         HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5AC__rsp__p0_only__flush_to_min_clean() failed.")
2135 		    break;
2136 
2137 		case H5AC_SYNC_POINT_OP__FLUSH_CACHE:
2138 	            if(H5AC__rsp__p0_only__flush(f, dxpl_id) < 0)
2139                         HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5AC__rsp__p0_only__flush() failed.")
2140 		    break;
2141 
2142 		default:
2143                     HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "unknown flush op");
2144 		    break;
2145 	    } /* end switch */
2146 	    break;
2147 
2148 	case H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED:
2149 	    switch(sync_point_op) {
2150                 case H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN:
2151 	            if(H5AC__rsp__dist_md_write__flush_to_min_clean(f, dxpl_id) < 0)
2152                         HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5AC__rsp__dist_md_write__flush_to_min_clean() failed.")
2153 		    break;
2154 
2155 		case H5AC_SYNC_POINT_OP__FLUSH_CACHE:
2156 	            if(H5AC__rsp__dist_md_write__flush(f, dxpl_id) < 0)
2157                         HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, FAIL, "H5AC__rsp__dist_md_write__flush() failed.")
2158 		    break;
2159 
2160 		default:
2161                     HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "unknown flush op");
2162 		    break;
2163 	    } /* end switch */
2164 	    break;
2165 
2166 	default:
2167             HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "Unknown metadata write strategy.")
2168 	    break;
2169     } /* end switch */
2170 
2171     /* reset the dirty bytes count */
2172     aux_ptr->dirty_bytes = 0;
2173 
2174 #if H5AC_DEBUG_DIRTY_BYTES_CREATION
2175     aux_ptr->dirty_bytes_propagations     += 1;
2176     aux_ptr->unprotect_dirty_bytes         = 0;
2177     aux_ptr->unprotect_dirty_bytes_updates = 0;
2178     aux_ptr->insert_dirty_bytes            = 0;
2179     aux_ptr->insert_dirty_bytes_updates    = 0;
2180     aux_ptr->rename_dirty_bytes            = 0;
2181     aux_ptr->rename_dirty_bytes_updates    = 0;
2182 #endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */
2183 
2184 done:
2185     FUNC_LEAVE_NOAPI(ret_value)
2186 } /* H5AC__run_sync_point() */
2187 
2188 
2189 /*-------------------------------------------------------------------------
2190  * Function:    H5AC__tidy_cache_0_lists()
2191  *
2192  * Purpose:     In the distributed metadata write strategy, not all dirty
2193  *		entries are written by process 0 -- thus we must tidy
2194  *		up the dirtied, and flushed and still clean lists
2195  *		maintained by process zero after each sync point.
2196  *
2197  *		This procedure exists to tend to this issue.
2198  *
2199  *		At this point, all entries that process 0 cleared should
2200  *		have been removed from both the dirty and flushed and
2201  *		still clean lists, and entries that process 0 has flushed
2202  *		should have been removed from the dirtied list and added
2203  *		to the flushed and still clean list.
2204  *
2205  *		However, since the distributed metadata write strategy
2206  *		doesn't make use of these lists, the objective is simply
2207  *		to maintain these lists in consistent state that allows
2208  *		them to be used should the metadata write strategy change
2209  *		to one that uses these lists.
2210  *
2211  *		Thus for our purposes, all we need to do is remove from
2212  *		the dirtied and flushed and still clean lists all
2213  *		references to entries that appear in the candidate list.
2214  *
2215  * Return:      Success:        non-negative
2216  *
2217  *              Failure:        negative
2218  *
2219  * Programmer:  John Mainzer
2220  *              4/20/10
2221  *
2222  *-------------------------------------------------------------------------
2223  */
2224 static herr_t
H5AC__tidy_cache_0_lists(H5AC_t * cache_ptr,unsigned num_candidates,haddr_t * candidates_list_ptr)2225 H5AC__tidy_cache_0_lists(H5AC_t *cache_ptr, unsigned num_candidates,
2226     haddr_t *candidates_list_ptr)
2227 {
2228     H5AC_aux_t         * aux_ptr;
2229     unsigned             u;
2230 
2231     FUNC_ENTER_STATIC_NOERR
2232 
2233     /* Sanity checks */
2234     HDassert(cache_ptr != NULL);
2235     aux_ptr = (H5AC_aux_t *)H5C_get_aux_ptr(cache_ptr);
2236     HDassert(aux_ptr != NULL);
2237     HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
2238     HDassert(aux_ptr->metadata_write_strategy == H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED);
2239     HDassert(aux_ptr->mpi_rank == 0);
2240     HDassert(num_candidates > 0);
2241     HDassert(candidates_list_ptr != NULL);
2242 
2243     /* clean up dirtied and flushed and still clean lists by removing
2244      * all entries on the candidate list.  Cleared entries should
2245      * have been removed from both the dirty and cleaned lists at
2246      * this point, flushed entries should have been added to the
2247      * cleaned list.  However, for this metadata write strategy,
2248      * we just want to remove all references to the candidate entries.
2249      */
2250     for(u = 0; u < num_candidates; u++) {
2251         H5AC_slist_entry_t * d_slist_entry_ptr;
2252         H5AC_slist_entry_t * c_slist_entry_ptr;
2253         haddr_t              addr;
2254 
2255         addr = candidates_list_ptr[u];
2256 
2257         /* addr may be either on the dirtied list, or on the flushed
2258          * and still clean list.  Remove it.
2259          */
2260         if(NULL != (d_slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->d_slist_ptr, (void *)&addr)))
2261             d_slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, d_slist_entry_ptr);
2262         if(NULL != (c_slist_entry_ptr = (H5AC_slist_entry_t *)H5SL_remove(aux_ptr->c_slist_ptr, (void *)&addr)))
2263             c_slist_entry_ptr = H5FL_FREE(H5AC_slist_entry_t, c_slist_entry_ptr);
2264     } /* end for */
2265 
2266     FUNC_LEAVE_NOAPI(SUCCEED)
2267 } /* H5AC__tidy_cache_0_lists() */
2268 
2269 
2270 /*-------------------------------------------------------------------------
2271  * Function:    H5AC__flush_entries
2272  *
2273  * Purpose:     Flush the metadata cache associated with the specified file,
2274  *              only writing from rank 0, but propagating the cleaned entries
2275  *              to all ranks.
2276  *
2277  * Return:      Non-negative on success/Negative on failure if there was a
2278  *              request to flush all items and something was protected.
2279  *
2280  * Programmer:  Quincey Koziol
2281  *              koziol@hdfgroup.org
2282  *              Aug 22 2009
2283  *
2284  *-------------------------------------------------------------------------
2285  */
2286 herr_t
H5AC__flush_entries(H5F_t * f,hid_t dxpl_id)2287 H5AC__flush_entries(H5F_t *f, hid_t dxpl_id)
2288 {
2289     herr_t        ret_value = SUCCEED;      /* Return value */
2290 
2291     FUNC_ENTER_PACKAGE
2292 
2293     /* Sanity checks */
2294     HDassert(f);
2295     HDassert(f->shared->cache);
2296 
2297     /* Check if we have >1 ranks */
2298     if(H5C_get_aux_ptr(f->shared->cache))
2299         if(H5AC__run_sync_point(f, dxpl_id, H5AC_SYNC_POINT_OP__FLUSH_CACHE) < 0)
2300             HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't run sync point.")
2301 
2302 done:
2303     FUNC_LEAVE_NOAPI(ret_value)
2304 } /* H5AC__flush_entries() */
2305 #endif /* H5_HAVE_PARALLEL */
2306 
2307