1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  * Copyright by The HDF Group.                                               *
3  * Copyright by the Board of Trustees of the University of Illinois.         *
4  * All rights reserved.                                                      *
5  *                                                                           *
6  * This file is part of HDF5.  The full HDF5 copyright notice, including     *
7  * terms governing use, modification, and redistribution, is contained in    *
8  * the COPYING file, which can be found at the root of the source code       *
9  * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases.  *
10  * If you do not have access to either file, you may request a copy from     *
11  * help@hdfgroup.org.                                                        *
12  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
13 
14 /*
15  * Programmer: John Mainzer -- 4/19/06
16  *
17  * Purpose:     This file contains declarations which are normally visible
18  *              only within the H5AC package (just H5AC.c at present).
19  *
20  *		Source files outside the H5AC package should include
21  *		H5ACprivate.h instead.
22  *
23  *		The one exception to this rule is testpar/t_cache.c.  The
24  *		test code is easier to write if it can look at H5AC_aux_t.
25  *		Indeed, this is the main reason why this file was created.
26  *
27  */
28 
29 #if !(defined H5AC_FRIEND || defined H5AC_MODULE)
30 #error "Do not include this file outside the H5AC package!"
31 #endif
32 
33 #ifndef _H5ACpkg_H
34 #define _H5ACpkg_H
35 
36 /* Get package's private header */
37 #include "H5ACprivate.h"	/* Metadata cache			*/
38 
39 
40 /* Get needed headers */
41 #include "H5Cprivate.h"         /* Cache                                */
42 #include "H5FLprivate.h"        /* Free Lists                           */
43 
/*
 * Package Private Variables
 */

/* Extern declaration of the free list used to manage H5AC_aux_t
 * structures.
 */
H5FL_EXTERN(H5AC_aux_t);
50 
51 
/**************************/
/* Package Private Macros */
/**************************/

/* Compile-time switch.  When non-zero, H5AC_aux_t gains the additional
 * dirty byte creation counters declared inside its
 * "#if H5AC_DEBUG_DIRTY_BYTES_CREATION" section.
 */
#define H5AC_DEBUG_DIRTY_BYTES_CREATION	0

#ifdef H5_HAVE_PARALLEL

/* The following #defines are used to specify the operation required
 * at a sync point.
 */

#define H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN		0
#define H5AC_SYNC_POINT_OP__FLUSH_CACHE			1

#endif /* H5_HAVE_PARALLEL */

/*-------------------------------------------------------------------------
 *  It is a bit difficult to set ranges of allowable values on the
 *  dirty_bytes_threshold field of H5AC_aux_t.  The following are
 *  probably broader than they should be.
 *
 *  The MIN and MAX replacement lists are fully parenthesized so that the
 *  (size_t) cast cannot bind to neighbouring tokens at the point of use
 *  (e.g. "sizeof H5AC__MIN_DIRTY_BYTES_THRESHOLD" must see one expression,
 *  not "sizeof(size_t)" followed by a parenthesized value).
 *-------------------------------------------------------------------------
 */

#define H5AC__MIN_DIRTY_BYTES_THRESHOLD		((size_t)(H5C__MIN_MAX_CACHE_SIZE / 2))
#define H5AC__DEFAULT_DIRTY_BYTES_THRESHOLD	(256 * 1024)
#define H5AC__MAX_DIRTY_BYTES_THRESHOLD		((size_t)(H5C__MAX_MAX_CACHE_SIZE / 4))
81 
82 
83 /****************************************************************************
84  *
85  * structure H5AC_aux_t
86  *
87  * While H5AC has become a wrapper for the cache implemented in H5C.c, there
88  * are some features of the metadata cache that are specific to it, and which
89  * therefore do not belong in the more generic H5C cache code.
90  *
91  * In particular, there is the matter of synchronizing writes from the
92  * metadata cache to disk in the PHDF5 case.
93  *
94  * Prior to this update, the presumption was that all metadata caches would
95  * write the same data at the same time since all operations modifying
96  * metadata must be performed collectively.  Given this assumption, it was
97  * safe to allow only the writes from process 0 to actually make it to disk,
98  * while metadata writes from all other processes were discarded.
99  *
100  * Unfortunately, this presumption is in error as operations that read
101  * metadata need not be collective, but can change the location of dirty
102  * entries in the metadata cache LRU lists.  This can result in the same
103  * metadata write operation triggering writes from the metadata caches on
104  * some processes, but not all (causing a hang), or in different sets of
105  * entries being written from different caches (potentially resulting in
106  * metadata corruption in the file).
107  *
108  * To deal with this issue, I decided to apply a paradigm shift to the way
109  * metadata is written to disk.
110  *
111  * With this set of changes, only the metadata cache on process 0 is able
112  * to write metadata to disk, although metadata caches on all other
113  * processes can read metadata from disk as before.
114  *
115  * To keep all the other caches from getting plugged up with dirty metadata,
116  * process 0 periodically broadcasts a list of entries that it has flushed
117  * since the last notice, and which are currently clean.  The other caches
118  * mark these entries as clean as well, which allows them to evict the
119  * entries as needed.
120  *
121  * One obvious problem in this approach is synchronizing the broadcasts
122  * and receptions, as different caches may see different amounts of
123  * activity.
124  *
125  * The current solution is for the caches to track the number of bytes
126  * of newly generated dirty metadata, and to broadcast and receive
127  * whenever this value exceeds some user specified threshold.
128  *
129  * Maintaining this count is easy for all processes not on process 0 --
130  * all that is necessary is to add the size of the entry to the total
131  * whenever there is an insertion, a move of a previously clean entry,
132  * or whenever a previously clean entry is marked dirty in an unprotect.
133  *
134  * On process 0, we have to be careful not to count dirty bytes twice.
135  * If an entry is marked dirty, flushed, and marked dirty again, all
136  * within a single reporting period, only the first marking should
137  * be added to the dirty bytes generated tally, as that is all that
138  * the other processes will see.
139  *
140  * At present, this structure exists to maintain the fields needed to
141  * implement the above scheme, and thus is only used in the parallel
142  * case.  However, other uses may arise in the future.
143  *
144  * Instances of this structure are associated with metadata caches via
145  * the aux_ptr field of H5C_t (see H5Cpkg.h).  The H5AC code is
146  * responsible for allocating, maintaining, and discarding instances
147  * of H5AC_aux_t.
148  *
149  * The remainder of this header comment documents the individual fields
150  * of the structure.
151  *
152  *                                              JRM - 6/27/05
153  *
154  * Update: When the above was written, I planned to allow the process
155  *	0 metadata cache to write dirty metadata between sync points.
156  *	However, testing indicated that this allowed occasional
157  *	messages from the future to reach the caches on other processes.
158  *
159  *	To resolve this, the code was altered to require that all metadata
160  *	writes take place during sync points -- which solved the problem.
161  *	Initially all writes were performed by the process 0 cache.  This
162  *	approach was later replaced with a distributed write approach
163  *	in which each process writes a subset of the metadata to be
164  *	written.
165  *
166  *	After thinking on the matter for a while, I arrived at the
167  *	conclusion that the process 0 cache could be allowed to write
168  *	dirty metadata between sync points if it restricted itself to
169  *	entries that had been dirty at the time of the previous sync point.
170  *
171  *	To date, there has been no attempt to implement this optimization.
172  *	However, should it be attempted, much of the supporting code
173  *	should still be around.
174  *
175  *						JRM -- 1/6/15
176  *
177  * magic:       Unsigned 32 bit integer always set to
178  *		H5AC__H5AC_AUX_T_MAGIC.  This field is used to validate
179  *		pointers to instances of H5AC_aux_t.
180  *
181  * mpi_comm:	MPI communicator associated with the file for which the
182  *		cache has been created.
183  *
184  * mpi_rank:	MPI rank of this process within mpi_comm.
185  *
186  * mpi_size:	Number of processes in mpi_comm.
187  *
188  * write_permitted:  Boolean flag used to control whether the cache
189  *		is permitted to write to file.
190  *
191  * dirty_bytes_threshold: Integer field containing the dirty bytes
192  *		generation threshold.  Whenever dirty byte creation
193  *		exceeds this value, the metadata cache on process 0
194  *		broadcasts a list of the entries it has flushed since
195  *		the last broadcast (or since the beginning of execution)
196  *		and which are currently clean (if they are still in the
197  *		cache)
198  *
199  *		Similarly, metadata caches on processes other than process
200  *		0 will attempt to receive a list of clean entries whenever
201  *		the threshold is exceeded.
202  *
203  * dirty_bytes:  Integer field containing the number of bytes of dirty
204  *		metadata generated since the beginning of the computation,
205  *		or (more typically) since the last clean entries list
206  *		broadcast.  This field is reset to zero after each such
207  *		broadcast.
208  *
209  * metadata_write_strategy: Integer code indicating how we will be
210  *		writing the metadata.  In the first incarnation of
211  *		this code, all writes were done from process 0.  This
212  *		field exists to facilitate experiments with other
213  *		strategies.
214  *
215  *		At present, this field must be set to either
216  *		H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY or
217  *		H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED.
218  *
219  * dirty_bytes_propagations: This field only exists when the
220  *		H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
221  *
222  *		It is used to track the number of times the cleaned list
223  *		has been propagated from process 0 to the other
224  *		processes.
225  *
226  * unprotect_dirty_bytes:  This field only exists when the
227  *              H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
228  *
229  *		It is used to track the number of dirty bytes created
230  *		via unprotect operations since the last time the cleaned
231  *		list was propagated.
232  *
233  * unprotect_dirty_bytes_updates: This field only exists when the
234  *              H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
235  *
236  *		It is used to track the number of times dirty bytes have
237  *		been created via unprotect operations since the last time
238  *		the cleaned list was propagated.
239  *
240  * insert_dirty_bytes:  This field only exists when the
241  *              H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
242  *
243  *		It is used to track the number of dirty bytes created
244  *		via insert operations since the last time the cleaned
245  *		list was propagated.
246  *
247  * insert_dirty_bytes_updates:  This field only exists when the
248  *              H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
249  *
250  *		It is used to track the number of times dirty bytes have
251  *		been created via insert operations since the last time
252  *		the cleaned list was propagated.
253  *
254  * move_dirty_bytes:  This field only exists when the
255  *              H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
256  *
257  *		It is used to track the number of dirty bytes created
258  *		via move operations since the last time the cleaned
259  *		list was propagated.
260  *
261  * move_dirty_bytes_updates:  This field only exists when the
262  *              H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
263  *
264  *		It is used to track the number of times dirty bytes have
265  *		been created via move operations since the last time
266  *		the cleaned list was propagated.
267  *
268  * Things have changed a bit since the following four fields were defined.
269  * If metadata_write_strategy is H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY,
270  * all comments hold as before -- with the caveat that pending further
271  * coding, the process 0 metadata cache is forbidden to flush entries outside
272  * of a sync point.
273  *
274  * However, for different metadata write strategies, these fields are used
275  * only to maintain the correct dirty byte count on process zero -- and in
276  * most if not all cases, this is redundant, as process zero will be barred
277  * from flushing entries outside of a sync point.
278  *
279  *						JRM -- 3/16/10
280  *
281  * d_slist_ptr:  Pointer to an instance of H5SL_t used to maintain a list
282  *		of entries that have been dirtied since the last time they
283  *		were listed in a clean entries broadcast.  This list is
284  *		only maintained by the metadata cache on process 0 -- it
285  *		is used to maintain a view of the dirty entries as seen
286  *		by the other caches, so as to keep the dirty bytes count
287  *		in synchronization with them.
288  *
289  *		Thus on process 0, the dirty_bytes count is incremented
290  *		only if either
291  *
292  *		1) an entry is inserted in the metadata cache, or
293  *
294  *		2) a previously clean entry is moved, and it does not
295  *		   already appear in the dirty entry list, or
296  *
297  *		3) a previously clean entry is unprotected with the
298  *		   dirtied flag set and the entry does not already appear
299  *		   in the dirty entry list.
300  *
301  *		Entries are added to the dirty entry list whenever they cause
302  *		the dirty bytes count to be increased.  They are removed
303  *		when they appear in a clean entries broadcast.  Note that
304  *		moves must be reflected in the dirty entry list.
305  *
306  *		To reiterate, this field is only used on process 0 -- it
307  *		should be NULL on all other processes.
308  *
309  * c_slist_ptr: Pointer to an instance of H5SL_t used to maintain a list
310  *		of entries that were dirty, have been flushed
311  *		to disk since the last clean entries broadcast, and are
312  *		still clean.  Since only process 0 can write to disk, this
313  *		list only exists on process 0.
314  *
315  *		In essence, this slist is used to assemble the contents of
316  *		the next clean entries broadcast.  The list is emptied after
317  *		each broadcast.
318  *
319  * The following field is used only when metadata_write_strategy
320  * is H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED.
321  *
322  * candidate_slist_ptr: Pointer to an instance of H5SL_t used by process 0
323  *		to construct a list of entries to be flushed at this sync
324  *		point.  This list is then broadcast to the other processes,
325  *		which then either flush or mark clean all entries on it.
326  *
327  * write_done:  In the parallel test bed, it is necessary to ensure that
328  *              all writes to the server process from cache 0 complete
329  *              before it enters the barrier call with the other caches.
330  *
331  *              The write_done callback allows t_cache to do this without
332  *              requiring an ACK on each write.  Since these ACKs greatly
333  *              increase the run time on some platforms, this is a
334  *              significant optimization.
335  *
336  *              This field must be set to NULL when the callback is not
337  *              needed.
338  *
339  *		Note: This field has been extended for use by all processes
340  *		      with the addition of support for the distributed
341  *		      metadata write strategy.
342  *                                                     JRM -- 5/9/10
343  *
344  * sync_point_done:  In the parallel test bed, it is necessary to verify
345  *		that the expected writes, and only the expected writes,
346  *		have taken place at the end of each sync point.
347  *
348  *		The sync_point_done callback allows t_cache to perform
349  *		this verification.  The field is set to NULL when the
350  *		callback is not needed.
351  *
352  * The following field supports the metadata cache image feature.
353  *
354  * p0_image_len: unsigned integer containing the length of the metadata cache
355  *		image constructed by MPI process 0.  This field should be 0
356  *		if the value is unknown, or if cache image is not enabled.
357  *
358  ****************************************************************************/
359 
#ifdef H5_HAVE_PARALLEL

/* Value stored in the magic field of every H5AC_aux_t; used to validate
 * pointers to instances of the structure.  The replacement list is fully
 * parenthesized so the cast cannot bind to neighbouring tokens where the
 * macro is used.
 */
#define H5AC__H5AC_AUX_T_MAGIC        ((unsigned)0x00D0A01)

/* Metadata cache state that is specific to H5AC (parallel case only).
 * The header comment preceding this definition documents each field in
 * detail; the notes below are short reminders only.
 */
typedef struct H5AC_aux_t
{
    /* Always set to H5AC__H5AC_AUX_T_MAGIC */
    uint32_t	magic;

    /* MPI communicator associated with the file */
    MPI_Comm	mpi_comm;

    /* Rank of this process within mpi_comm */
    int		mpi_rank;

    /* Number of processes in mpi_comm */
    int		mpi_size;

    /* Controls whether the cache is permitted to write to file */
    hbool_t	write_permitted;

    /* Dirty bytes generation threshold */
    size_t	dirty_bytes_threshold;

    /* Bytes of dirty metadata generated since the last clean entries
     * list broadcast (or since the beginning of the computation)
     */
    size_t	dirty_bytes;

    /* H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY or
     * H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED
     */
    int32_t	metadata_write_strategy;

#if H5AC_DEBUG_DIRTY_BYTES_CREATION

    /* Number of times the cleaned list has been propagated from
     * process 0 to the other processes
     */
    unsigned	dirty_bytes_propagations;

    /* Dirty bytes created via unprotect operations since the last
     * propagation of the cleaned list, and the number of such updates
     */
    size_t      unprotect_dirty_bytes;
    unsigned    unprotect_dirty_bytes_updates;

    /* Same pair of counters for insert operations */
    size_t      insert_dirty_bytes;
    unsigned    insert_dirty_bytes_updates;

    /* Same pair of counters for move operations */
    size_t      move_dirty_bytes;
    unsigned    move_dirty_bytes_updates;

#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */

    /* Entries dirtied since they were last listed in a clean entries
     * broadcast (process 0 only; NULL on all other processes)
     */
    H5SL_t *	d_slist_ptr;

    /* Entries flushed since the last clean entries broadcast that are
     * still clean (process 0 only)
     */
    H5SL_t *	c_slist_ptr;

    /* Entries to be flushed at this sync point (distributed metadata
     * write strategy)
     */
    H5SL_t *	candidate_slist_ptr;

    /* Parallel test bed callback; NULL when not needed */
    void	(* write_done)(void);

    /* Parallel test bed callback used to verify the writes performed at
     * a sync point; NULL when not needed
     */
    void	(* sync_point_done)(unsigned num_writes,
                                    haddr_t * written_entries_tbl);

    /* Length of the metadata cache image constructed by MPI process 0;
     * 0 if unknown or if cache image is not enabled
     */
    unsigned    p0_image_len;

} H5AC_aux_t; /* struct H5AC_aux_t */
#endif /* H5_HAVE_PARALLEL */
412 
413 
414 /******************************/
415 /* Package Private Prototypes */
416 /******************************/
417 
#ifdef H5_HAVE_PARALLEL
/*
 * Parallel I/O routines -- declared only when H5_HAVE_PARALLEL is defined.
 */

/* Per-entry logging routines */
H5_DLL herr_t H5AC__log_deleted_entry(const H5AC_info_t *entry_ptr);
H5_DLL herr_t H5AC__log_dirtied_entry(const H5AC_info_t *entry_ptr);
H5_DLL herr_t H5AC__log_cleaned_entry(const H5AC_info_t *entry_ptr);
H5_DLL herr_t H5AC__log_flushed_entry(H5C_t *cache_ptr,
                                      haddr_t addr,
                                      hbool_t was_dirty,
                                      unsigned flags);
H5_DLL herr_t H5AC__log_inserted_entry(const H5AC_info_t *entry_ptr);
H5_DLL herr_t H5AC__log_moved_entry(const H5F_t *f,
                                    haddr_t old_addr,
                                    haddr_t new_addr);

/* Flush and sync point routines.
 * NOTE(review): sync_point_op presumably takes one of the
 * H5AC_SYNC_POINT_OP__* codes defined earlier in this header -- confirm.
 */
H5_DLL herr_t H5AC__flush_entries(H5F_t *f);
H5_DLL herr_t H5AC__run_sync_point(H5F_t *f, int sync_point_op);

/* Callback installers.  The callback signatures match the sync_point_done
 * and write_done members of H5AC_aux_t.
 */
H5_DLL herr_t H5AC__set_sync_point_done_callback(H5C_t *cache_ptr,
                                                 void (*sync_point_done)(unsigned num_writes,
                                                                         haddr_t *written_entries_tbl));
H5_DLL herr_t H5AC__set_write_done_callback(H5C_t *cache_ptr,
                                            void (*write_done)(void));
#endif /* H5_HAVE_PARALLEL */
435 
436 /* Trace file routines */
437 H5_DLL herr_t H5AC__close_trace_file(H5AC_t *cache_ptr);
438 H5_DLL herr_t H5AC__open_trace_file(H5AC_t *cache_ptr, const char *trace_file_name);
439 
440 /* Cache logging routines */
441 H5_DLL herr_t H5AC__write_create_cache_log_msg(H5AC_t *cache);
442 H5_DLL herr_t H5AC__write_destroy_cache_log_msg(H5AC_t *cache);
443 H5_DLL herr_t H5AC__write_evict_cache_log_msg(const H5AC_t *cache,
444                                         herr_t fxn_ret_value);
445 H5_DLL herr_t H5AC__write_expunge_entry_log_msg(const H5AC_t *cache,
446                                                 haddr_t address,
447                                                 int type_id,
448                                                 herr_t fxn_ret_value);
449 H5_DLL herr_t H5AC__write_flush_cache_log_msg(const H5AC_t *cache,
450                                               herr_t fxn_ret_value);
451 H5_DLL herr_t H5AC__write_insert_entry_log_msg(const H5AC_t *cache,
452                                                haddr_t address,
453                                                int type_id,
454                                                unsigned flags,
455                                                size_t size,
456                                                herr_t fxn_ret_value);
457 H5_DLL herr_t H5AC__write_mark_dirty_entry_log_msg(const H5AC_t *cache,
458                                                    const H5AC_info_t *entry,
459                                                    herr_t fxn_ret_value);
460 H5_DLL herr_t H5AC__write_mark_clean_entry_log_msg(const H5AC_t *cache,
461     const H5AC_info_t *entry, herr_t fxn_ret_value);
462 H5_DLL herr_t H5AC__write_mark_unserialized_entry_log_msg(const H5AC_t *cache,
463         const H5AC_info_t *entry, herr_t fxn_ret_value);
464 H5_DLL herr_t H5AC__write_mark_serialized_entry_log_msg(const H5AC_t *cache,
465     const H5AC_info_t *entry, herr_t fxn_ret_value);
466 H5_DLL herr_t H5AC__write_move_entry_log_msg(const H5AC_t *cache,
467                                              haddr_t old_addr,
468                                              haddr_t new_addr,
469                                              int type_id,
470                                              herr_t fxn_ret_value);
471 H5_DLL herr_t H5AC__write_pin_entry_log_msg(const H5AC_t *cache,
472                                             const H5AC_info_t *entry,
473                                             herr_t fxn_ret_value);
474 H5_DLL herr_t H5AC__write_create_fd_log_msg(const H5AC_t *cache,
475                                             const H5AC_info_t *parent,
476                                             const H5AC_info_t *child,
477                                             herr_t fxn_ret_value);
478 H5_DLL herr_t H5AC__write_protect_entry_log_msg(const H5AC_t *cache,
479                                                 const H5AC_info_t *entry,
480                                                 unsigned flags,
481                                                 herr_t fxn_ret_value);
482 H5_DLL herr_t H5AC__write_resize_entry_log_msg(const H5AC_t *cache,
483                                                const H5AC_info_t *entry,
484                                                size_t new_size,
485                                                herr_t fxn_ret_value);
486 H5_DLL herr_t H5AC__write_unpin_entry_log_msg(const H5AC_t *cache,
487                                               const H5AC_info_t *entry,
488                                               herr_t fxn_ret_value);
489 H5_DLL herr_t H5AC__write_destroy_fd_log_msg(const H5AC_t *cache,
490                                              const H5AC_info_t *parent,
491                                              const H5AC_info_t *child,
492                                              herr_t fxn_ret_value);
493 H5_DLL herr_t H5AC__write_unprotect_entry_log_msg(const H5AC_t *cache,
494                                                   const H5AC_info_t *entry,
495                                                   int type_id,
496                                                   unsigned flags,
497                                                   herr_t fxn_ret_value);
498 H5_DLL herr_t H5AC__write_set_cache_config_log_msg(const H5AC_t *cache,
499                                                    const H5AC_cache_config_t *config,
500                                                    herr_t fxn_ret_value);
501 H5_DLL herr_t H5AC__write_remove_entry_log_msg(const H5AC_t *cache,
502                                               const H5AC_info_t *entry,
503                                               herr_t fxn_ret_value);
504 
505 #endif /* _H5ACpkg_H */
506 
507