1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 2 * Copyright by The HDF Group. * 3 * Copyright by the Board of Trustees of the University of Illinois. * 4 * All rights reserved. * 5 * * 6 * This file is part of HDF5. The full HDF5 copyright notice, including * 7 * terms governing use, modification, and redistribution, is contained in * 8 * the COPYING file, which can be found at the root of the source code * 9 * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. * 10 * If you do not have access to either file, you may request a copy from * 11 * help@hdfgroup.org. * 12 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 13 14 /* 15 * Programmer: John Mainzer -- 4/19/06 16 * 17 * Purpose: This file contains declarations which are normally visible 18 * only within the H5AC package (just H5AC.c at present). 19 * 20 * Source files outside the H5AC package should include 21 * H5ACprivate.h instead. 22 * 23 * The one exception to this rule is testpar/t_cache.c. The 24 * test code is easier to write if it can look at H5AC_aux_t. 25 * Indeed, this is the main reason why this file was created. 26 * 27 */ 28 29 #if !(defined H5AC_FRIEND || defined H5AC_MODULE) 30 #error "Do not include this file outside the H5AC package!" 
31 #endif 32 33 #ifndef _H5ACpkg_H 34 #define _H5ACpkg_H 35 36 /* Get package's private header */ 37 #include "H5ACprivate.h" /* Metadata cache */ 38 39 40 /* Get needed headers */ 41 #include "H5Cprivate.h" /* Cache */ 42 #include "H5FLprivate.h" /* Free Lists */ 43 44 /*****************************/ 45 /* Package Private Variables */ 46 /*****************************/ 47 48 /* Declare extern the free list to manage the H5AC_aux_t struct */ 49 H5FL_EXTERN(H5AC_aux_t); 50 51 52 /**************************/ 53 /* Package Private Macros */ 54 /**************************/ 55 56 #define H5AC_DEBUG_DIRTY_BYTES_CREATION 0 57 58 #ifdef H5_HAVE_PARALLEL 59 60 /* the following #defined are used to specify the operation required 61 * at a sync point. 62 */ 63 64 #define H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN 0 65 #define H5AC_SYNC_POINT_OP__FLUSH_CACHE 1 66 67 #endif /* H5_HAVE_PARALLEL */ 68 69 /*------------------------------------------------------------------------- 70 * It is a bit difficult to set ranges of allowable values on the 71 * dirty_bytes_threshold field of H5AC_aux_t. The following are 72 * probably broader than they should be. 73 *------------------------------------------------------------------------- 74 */ 75 76 #define H5AC__MIN_DIRTY_BYTES_THRESHOLD (size_t) \ 77 (H5C__MIN_MAX_CACHE_SIZE / 2) 78 #define H5AC__DEFAULT_DIRTY_BYTES_THRESHOLD (256 * 1024) 79 #define H5AC__MAX_DIRTY_BYTES_THRESHOLD (size_t) \ 80 (H5C__MAX_MAX_CACHE_SIZE / 4) 81 82 83 /**************************************************************************** 84 * 85 * structure H5AC_aux_t 86 * 87 * While H5AC has become a wrapper for the cache implemented in H5C.c, there 88 * are some features of the metadata cache that are specific to it, and which 89 * therefore do not belong in the more generic H5C cache code. 90 * 91 * In particular, there is the matter of synchronizing writes from the 92 * metadata cache to disk in the PHDF5 case. 
 *
 * Prior to this update, the presumption was that all metadata caches would
 * write the same data at the same time since all operations modifying
 * metadata must be performed collectively. Given this assumption, it was
 * safe to allow only the writes from process 0 to actually make it to disk,
 * while metadata writes from all other processes were discarded.
 *
 * Unfortunately, this presumption is in error as operations that read
 * metadata need not be collective, but can change the location of dirty
 * entries in the metadata cache LRU lists. This can result in the same
 * metadata write operation triggering writes from the metadata caches on
 * some processes, but not all (causing a hang), or in different sets of
 * entries being written from different caches (potentially resulting in
 * metadata corruption in the file).
 *
 * To deal with this issue, I decided to apply a paradigm shift to the way
 * metadata is written to disk.
 *
 * With this set of changes, only the metadata cache on process 0 is able
 * to write metadata to disk, although metadata caches on all other
 * processes can read metadata from disk as before.
 *
 * To keep all the other caches from getting plugged up with dirty metadata,
 * process 0 periodically broadcasts a list of entries that it has flushed
 * since the last notice, and which are currently clean. The other caches
 * mark these entries as clean as well, which allows them to evict the
 * entries as needed.
 *
 * One obvious problem in this approach is synchronizing the broadcasts
 * and receptions, as different caches may see different amounts of
 * activity.
 *
 * The current solution is for the caches to track the number of bytes
 * of newly generated dirty metadata, and to broadcast and receive
 * whenever this value exceeds some user specified threshold.
 *
 * Maintaining this count is easy for all processes other than process 0 --
 * all that is necessary is to add the size of the entry to the total
 * whenever there is an insertion, a move of a previously clean entry,
 * or whenever a previously clean entry is marked dirty in an unprotect.
 *
 * On process 0, we have to be careful not to count dirty bytes twice.
 * If an entry is marked dirty, flushed, and marked dirty again, all
 * within a single reporting period, only the first marking should
 * be added to the dirty bytes generated tally, as that is all that
 * the other processes will see.
 *
 * At present, this structure exists to maintain the fields needed to
 * implement the above scheme, and thus is only used in the parallel
 * case. However, other uses may arise in the future.
 *
 * Instances of this structure are associated with metadata caches via
 * the aux_ptr field of H5C_t (see H5Cpkg.h). The H5AC code is
 * responsible for allocating, maintaining, and discarding instances
 * of H5AC_aux_t.
 *
 * The remainder of this header comment documents the individual fields
 * of the structure.
 *
 *                                              JRM - 6/27/05
 *
 * Update: When the above was written, I planned to allow the process
 *      0 metadata cache to write dirty metadata between sync points.
 *      However, testing indicated that this allowed occasional
 *      messages from the future to reach the caches on other processes.
 *
 *      To resolve this, the code was altered to require that all metadata
 *      writes take place during sync points -- which solved the problem.
 *      Initially all writes were performed by the process 0 cache. This
 *      approach was later replaced with a distributed write approach
 *      in which each process writes a subset of the metadata to be
 *      written.
 *
 *      After thinking on the matter for a while, I arrived at the
 *      conclusion that the process 0 cache could be allowed to write
 *      dirty metadata between sync points if it restricted itself to
 *      entries that had been dirty at the time of the previous sync point.
 *
 *      To date, there has been no attempt to implement this optimization.
 *      However, should it be attempted, much of the supporting code
 *      should still be around.
 *
 *                                              JRM -- 1/6/15
 *
 * magic:       Unsigned 32 bit integer always set to
 *              H5AC__H5AC_AUX_T_MAGIC. This field is used to validate
 *              pointers to instances of H5AC_aux_t.
 *
 * mpi_comm:    MPI communicator associated with the file for which the
 *              cache has been created.
 *
 * mpi_rank:    MPI rank of this process within mpi_comm.
 *
 * mpi_size:    Number of processes in mpi_comm.
 *
 * write_permitted: Boolean flag used to control whether the cache
 *              is permitted to write to file.
 *
 * dirty_bytes_threshold: Integer field containing the dirty bytes
 *              generation threshold. Whenever dirty byte creation
 *              exceeds this value, the metadata cache on process 0
 *              broadcasts a list of the entries it has flushed since
 *              the last broadcast (or since the beginning of execution)
 *              and which are currently clean (if they are still in the
 *              cache).
 *
 *              Similarly, metadata caches on processes other than process
 *              0 will attempt to receive a list of clean entries whenever
 *              the threshold is exceeded.
 *
 * dirty_bytes: Integer field containing the number of bytes of dirty
 *              metadata generated since the beginning of the computation,
 *              or (more typically) since the last clean entries list
 *              broadcast. This field is reset to zero after each such
 *              broadcast.
 *
 * metadata_write_strategy: Integer code indicating how we will be
 *              writing the metadata. In the first incarnation of
 *              this code, all writes were done from process 0. This
 *              field exists to facilitate experiments with other
 *              strategies.
 *
 *              At present, this field must be set to either
 *              H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY or
 *              H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED.
 *
 * dirty_bytes_propagations: This field only exists when the
 *              H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
 *
 *              It is used to track the number of times the cleaned list
 *              has been propagated from process 0 to the other
 *              processes.
 *
 * unprotect_dirty_bytes: This field only exists when the
 *              H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
 *
 *              It is used to track the number of dirty bytes created
 *              via unprotect operations since the last time the cleaned
 *              list was propagated.
 *
 * unprotect_dirty_bytes_updates: This field only exists when the
 *              H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
 *
 *              It is used to track the number of times dirty bytes have
 *              been created via unprotect operations since the last time
 *              the cleaned list was propagated.
 *
 * insert_dirty_bytes: This field only exists when the
 *              H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
 *
 *              It is used to track the number of dirty bytes created
 *              via insert operations since the last time the cleaned
 *              list was propagated.
 *
 * insert_dirty_bytes_updates: This field only exists when the
 *              H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
 *
 *              It is used to track the number of times dirty bytes have
 *              been created via insert operations since the last time
 *              the cleaned list was propagated.
 *
 * move_dirty_bytes: This field only exists when the
 *              H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
 *
 *              It is used to track the number of dirty bytes created
 *              via move operations since the last time the cleaned
 *              list was propagated.
 *
 * move_dirty_bytes_updates: This field only exists when the
 *              H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE.
 *
 *              It is used to track the number of times dirty bytes have
 *              been created via move operations since the last time
 *              the cleaned list was propagated.
 *
 * Things have changed a bit since the following four fields were defined.
 * If metadata_write_strategy is H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY,
 * all comments hold as before -- with the caveat that pending further
 * coding, the process 0 metadata cache is forbidden to flush entries outside
 * of a sync point.
 *
 * However, for different metadata write strategies, these fields are used
 * only to maintain the correct dirty byte count on process zero -- and in
 * most if not all cases, this is redundant, as process zero will be barred
 * from flushing entries outside of a sync point.
 *
 *                                              JRM -- 3/16/10
 *
 * d_slist_ptr: Pointer to an instance of H5SL_t used to maintain a list
 *              of entries that have been dirtied since the last time they
 *              were listed in a clean entries broadcast. This list is
 *              only maintained by the metadata cache on process 0 -- it
 *              is used to maintain a view of the dirty entries as seen
 *              by the other caches, so as to keep the dirty bytes count
 *              in synchronization with them.
 *
 *              Thus on process 0, the dirty_bytes count is incremented
 *              only if either
 *
 *              1) an entry is inserted in the metadata cache, or
 *
 *              2) a previously clean entry is moved, and it does not
 *                 already appear in the dirty entry list, or
 *
 *              3) a previously clean entry is unprotected with the
 *                 dirtied flag set and the entry does not already appear
 *                 in the dirty entry list.
 *
 *              Entries are added to the dirty entry list whenever they cause
 *              the dirty bytes count to be increased. They are removed
 *              when they appear in a clean entries broadcast. Note that
 *              moves must be reflected in the dirty entry list.
 *
 *              To reiterate, this field is only used on process 0 -- it
 *              should be NULL on all other processes.
 *
 * c_slist_ptr: Pointer to an instance of H5SL_t used to maintain a list
 *              of entries that were dirty, have been flushed
 *              to disk since the last clean entries broadcast, and are
 *              still clean. Since only process 0 can write to disk, this
 *              list only exists on process 0.
 *
 *              In essence, this slist is used to assemble the contents of
 *              the next clean entries broadcast. The list is emptied after
 *              each broadcast.
 *
 * The following two fields are used only when metadata_write_strategy
 * is H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED.
 *
 * candidate_slist_ptr: Pointer to an instance of H5SL_t used by process 0
 *              to construct a list of entries to be flushed at this sync
 *              point. This list is then broadcast to the other processes,
 *              which then either flush or mark clean all entries on it.
 *
 * write_done:  In the parallel test bed, it is necessary to ensure that
 *              all writes to the server process from cache 0 complete
 *              before it enters the barrier call with the other caches.
 *
 *              The write_done callback allows t_cache to do this without
 *              requiring an ACK on each write. Since these ACKs greatly
 *              increase the run time on some platforms, this is a
 *              significant optimization.
 *
 *              This field must be set to NULL when the callback is not
 *              needed.
 *
 *              Note: This field has been extended for use by all processes
 *                    with the addition of support for the distributed
 *                    metadata write strategy.
342 * JRM -- 5/9/10 343 * 344 * sync_point_done: In the parallel test bed, it is necessary to verify 345 * that the expected writes, and only the expected writes, 346 * have taken place at the end of each sync point. 347 * 348 * The sync_point_done callback allows t_cache to perform 349 * this verification. The field is set to NULL when the 350 * callback is not needed. 351 * 352 * The following field supports the metadata cache image feature. 353 * 354 * p0_image_len: unsiged integer containing the length of the metadata cache 355 * image constructed by MPI process 0. This field should be 0 356 * if the value is unknown, or if cache image is not enabled. 357 * 358 ****************************************************************************/ 359 360 #ifdef H5_HAVE_PARALLEL 361 362 #define H5AC__H5AC_AUX_T_MAGIC (unsigned)0x00D0A01 363 364 typedef struct H5AC_aux_t 365 { 366 uint32_t magic; 367 368 MPI_Comm mpi_comm; 369 370 int mpi_rank; 371 372 int mpi_size; 373 374 hbool_t write_permitted; 375 376 size_t dirty_bytes_threshold; 377 378 size_t dirty_bytes; 379 380 int32_t metadata_write_strategy; 381 382 #if H5AC_DEBUG_DIRTY_BYTES_CREATION 383 384 unsigned dirty_bytes_propagations; 385 386 size_t unprotect_dirty_bytes; 387 unsigned unprotect_dirty_bytes_updates; 388 389 size_t insert_dirty_bytes; 390 unsigned insert_dirty_bytes_updates; 391 392 size_t move_dirty_bytes; 393 unsigned move_dirty_bytes_updates; 394 395 #endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */ 396 397 H5SL_t * d_slist_ptr; 398 399 H5SL_t * c_slist_ptr; 400 401 H5SL_t * candidate_slist_ptr; 402 403 void (* write_done)(void); 404 405 void (* sync_point_done)(unsigned num_writes, 406 haddr_t * written_entries_tbl); 407 408 unsigned p0_image_len; 409 410 } H5AC_aux_t; /* struct H5AC_aux_t */ 411 #endif /* H5_HAVE_PARALLEL */ 412 413 414 /******************************/ 415 /* Package Private Prototypes */ 416 /******************************/ 417 418 #ifdef H5_HAVE_PARALLEL 419 /* Parallel I/O routines 
*/ 420 H5_DLL herr_t H5AC__log_deleted_entry(const H5AC_info_t *entry_ptr); 421 H5_DLL herr_t H5AC__log_dirtied_entry(const H5AC_info_t *entry_ptr); 422 H5_DLL herr_t H5AC__log_cleaned_entry(const H5AC_info_t *entry_ptr); 423 H5_DLL herr_t H5AC__log_flushed_entry(H5C_t *cache_ptr, haddr_t addr, 424 hbool_t was_dirty, unsigned flags); 425 H5_DLL herr_t H5AC__log_inserted_entry(const H5AC_info_t *entry_ptr); 426 H5_DLL herr_t H5AC__log_moved_entry(const H5F_t *f, haddr_t old_addr, 427 haddr_t new_addr); 428 H5_DLL herr_t H5AC__flush_entries(H5F_t *f); 429 H5_DLL herr_t H5AC__run_sync_point(H5F_t *f, int sync_point_op); 430 H5_DLL herr_t H5AC__set_sync_point_done_callback(H5C_t *cache_ptr, 431 void (*sync_point_done)(unsigned num_writes, haddr_t *written_entries_tbl)); 432 H5_DLL herr_t H5AC__set_write_done_callback(H5C_t * cache_ptr, 433 void (* write_done)(void)); 434 #endif /* H5_HAVE_PARALLEL */ 435 436 /* Trace file routines */ 437 H5_DLL herr_t H5AC__close_trace_file(H5AC_t *cache_ptr); 438 H5_DLL herr_t H5AC__open_trace_file(H5AC_t *cache_ptr, const char *trace_file_name); 439 440 /* Cache logging routines */ 441 H5_DLL herr_t H5AC__write_create_cache_log_msg(H5AC_t *cache); 442 H5_DLL herr_t H5AC__write_destroy_cache_log_msg(H5AC_t *cache); 443 H5_DLL herr_t H5AC__write_evict_cache_log_msg(const H5AC_t *cache, 444 herr_t fxn_ret_value); 445 H5_DLL herr_t H5AC__write_expunge_entry_log_msg(const H5AC_t *cache, 446 haddr_t address, 447 int type_id, 448 herr_t fxn_ret_value); 449 H5_DLL herr_t H5AC__write_flush_cache_log_msg(const H5AC_t *cache, 450 herr_t fxn_ret_value); 451 H5_DLL herr_t H5AC__write_insert_entry_log_msg(const H5AC_t *cache, 452 haddr_t address, 453 int type_id, 454 unsigned flags, 455 size_t size, 456 herr_t fxn_ret_value); 457 H5_DLL herr_t H5AC__write_mark_dirty_entry_log_msg(const H5AC_t *cache, 458 const H5AC_info_t *entry, 459 herr_t fxn_ret_value); 460 H5_DLL herr_t H5AC__write_mark_clean_entry_log_msg(const H5AC_t *cache, 461 const 
H5AC_info_t *entry, herr_t fxn_ret_value); 462 H5_DLL herr_t H5AC__write_mark_unserialized_entry_log_msg(const H5AC_t *cache, 463 const H5AC_info_t *entry, herr_t fxn_ret_value); 464 H5_DLL herr_t H5AC__write_mark_serialized_entry_log_msg(const H5AC_t *cache, 465 const H5AC_info_t *entry, herr_t fxn_ret_value); 466 H5_DLL herr_t H5AC__write_move_entry_log_msg(const H5AC_t *cache, 467 haddr_t old_addr, 468 haddr_t new_addr, 469 int type_id, 470 herr_t fxn_ret_value); 471 H5_DLL herr_t H5AC__write_pin_entry_log_msg(const H5AC_t *cache, 472 const H5AC_info_t *entry, 473 herr_t fxn_ret_value); 474 H5_DLL herr_t H5AC__write_create_fd_log_msg(const H5AC_t *cache, 475 const H5AC_info_t *parent, 476 const H5AC_info_t *child, 477 herr_t fxn_ret_value); 478 H5_DLL herr_t H5AC__write_protect_entry_log_msg(const H5AC_t *cache, 479 const H5AC_info_t *entry, 480 unsigned flags, 481 herr_t fxn_ret_value); 482 H5_DLL herr_t H5AC__write_resize_entry_log_msg(const H5AC_t *cache, 483 const H5AC_info_t *entry, 484 size_t new_size, 485 herr_t fxn_ret_value); 486 H5_DLL herr_t H5AC__write_unpin_entry_log_msg(const H5AC_t *cache, 487 const H5AC_info_t *entry, 488 herr_t fxn_ret_value); 489 H5_DLL herr_t H5AC__write_destroy_fd_log_msg(const H5AC_t *cache, 490 const H5AC_info_t *parent, 491 const H5AC_info_t *child, 492 herr_t fxn_ret_value); 493 H5_DLL herr_t H5AC__write_unprotect_entry_log_msg(const H5AC_t *cache, 494 const H5AC_info_t *entry, 495 int type_id, 496 unsigned flags, 497 herr_t fxn_ret_value); 498 H5_DLL herr_t H5AC__write_set_cache_config_log_msg(const H5AC_t *cache, 499 const H5AC_cache_config_t *config, 500 herr_t fxn_ret_value); 501 H5_DLL herr_t H5AC__write_remove_entry_log_msg(const H5AC_t *cache, 502 const H5AC_info_t *entry, 503 herr_t fxn_ret_value); 504 505 #endif /* _H5ACpkg_H */ 506 507