1 /*- 2 * Copyright (c) 2014-2018 MongoDB, Inc. 3 * Copyright (c) 2008-2014 WiredTiger, Inc. 4 * All rights reserved. 5 * 6 * See the file LICENSE for redistribution information. 7 */ 8 9 /* 10 * WT_LSM_WORKER_COOKIE -- 11 * State for an LSM worker thread. 12 */ 13 struct __wt_lsm_worker_cookie { 14 WT_LSM_CHUNK **chunk_array; 15 size_t chunk_alloc; 16 u_int nchunks; 17 }; 18 19 /* 20 * WT_LSM_WORKER_ARGS -- 21 * State for an LSM worker thread. 22 */ 23 struct __wt_lsm_worker_args { 24 WT_SESSION_IMPL *session; /* Session */ 25 WT_CONDVAR *work_cond; /* Owned by the manager */ 26 27 wt_thread_t tid; /* Thread id */ 28 bool tid_set; /* Thread id set */ 29 30 u_int id; /* My manager slot id */ 31 uint32_t type; /* Types of operations handled */ 32 33 volatile bool running; /* Worker is running */ 34 }; 35 36 /* 37 * WT_LSM_CURSOR_CHUNK -- 38 * Iterator struct containing all the LSM cursor access points for a chunk. 39 */ 40 struct __wt_lsm_cursor_chunk { 41 WT_BLOOM *bloom; /* Bloom filter handle for each chunk.*/ 42 WT_CURSOR *cursor; /* Cursor handle for each chunk. */ 43 uint64_t count; /* Number of items in chunk */ 44 uint64_t switch_txn; /* Switch txn for each chunk */ 45 }; 46 47 /* 48 * WT_CURSOR_LSM -- 49 * An LSM cursor. 50 */ 51 struct __wt_cursor_lsm { 52 WT_CURSOR iface; 53 54 WT_LSM_TREE *lsm_tree; 55 uint64_t dsk_gen; 56 57 u_int nchunks; /* Number of chunks in the cursor */ 58 u_int nupdates; /* Updates needed (including 59 snapshot isolation checks). */ 60 WT_CURSOR *current; /* The current cursor for iteration */ 61 WT_LSM_CHUNK *primary_chunk; /* The current primary chunk */ 62 63 WT_LSM_CURSOR_CHUNK **chunks; /* Array of LSM cursor units */ 64 size_t chunks_alloc; /* Current size iterators array */ 65 size_t chunks_count; /* Current number of iterators */ 66 67 u_int update_count; /* Updates performed. */ 68 69 /* AUTOMATIC FLAG VALUE GENERATION START */ 70 #define WT_CLSM_ACTIVE 0x001u /* Incremented the session count */ 71 #define WT_CLSM_BULK 0x002u /* Open for snapshot isolation */ 72 #define WT_CLSM_ITERATE_NEXT 0x004u /* Forward iteration */ 73 #define WT_CLSM_ITERATE_PREV 0x008u /* Backward iteration */ 74 #define WT_CLSM_MERGE 0x010u /* Merge cursor, don't update */ 75 #define WT_CLSM_MINOR_MERGE 0x020u /* Minor merge, include tombstones */ 76 #define WT_CLSM_MULTIPLE 0x040u /* Multiple cursors have values */ 77 #define WT_CLSM_OPEN_READ 0x080u /* Open for reads */ 78 #define WT_CLSM_OPEN_SNAPSHOT 0x100u /* Open for snapshot isolation */ 79 /* AUTOMATIC FLAG VALUE GENERATION STOP */ 80 uint32_t flags; 81 }; 82 83 /* 84 * WT_LSM_CHUNK -- 85 * A single chunk (file) in an LSM tree. 86 */ 87 struct __wt_lsm_chunk { 88 const char *uri; /* Data source for this chunk */ 89 const char *bloom_uri; /* URI of Bloom filter, if any */ 90 struct timespec create_time; /* Creation time (for rate limiting) */ 91 uint64_t count; /* Approximate count of records */ 92 uint64_t size; /* Final chunk size */ 93 94 uint64_t switch_txn; /* 95 * Largest transaction that can write 96 * to this chunk, set by a worker 97 * thread when the chunk is switched 98 * out, or by compact to get the most 99 * recent chunk flushed. 100 */ 101 WT_DECL_TIMESTAMP(switch_timestamp)/* 102 * The timestamp used to decide when 103 * updates need to detect conflicts. 104 */ 105 WT_SPINLOCK timestamp_spinlock; 106 107 uint32_t id; /* ID used to generate URIs */ 108 uint32_t generation; /* Merge generation */ 109 uint32_t refcnt; /* Number of worker thread references */ 110 uint32_t bloom_busy; /* Currently creating bloom filter */ 111 uint32_t evict_enabled; /* Eviction allowed on the chunk */ 112 113 int8_t empty; /* 1/0: checkpoint missing */ 114 int8_t evicted; /* 1/0: in-memory chunk was evicted */ 115 uint8_t flushing; /* 1/0: chunk flush in progress */ 116 117 /* AUTOMATIC FLAG VALUE GENERATION START */ 118 #define WT_LSM_CHUNK_BLOOM 0x01u 119 #define WT_LSM_CHUNK_HAS_TIMESTAMP 0x02u 120 #define WT_LSM_CHUNK_MERGING 0x04u 121 #define WT_LSM_CHUNK_ONDISK 0x08u 122 #define WT_LSM_CHUNK_STABLE 0x10u 123 /* AUTOMATIC FLAG VALUE GENERATION STOP */ 124 uint32_t flags; 125 }; 126 127 /* 128 * Different types of work units. Used by LSM worker threads to choose which 129 * type of work they will execute, and by work units to define which action 130 * is required. 131 */ 132 /* AUTOMATIC FLAG VALUE GENERATION START */ 133 #define WT_LSM_WORK_BLOOM 0x01u /* Create a bloom filter */ 134 #define WT_LSM_WORK_DROP 0x02u /* Drop unused chunks */ 135 #define WT_LSM_WORK_ENABLE_EVICT 0x04u /* Create a bloom filter */ 136 #define WT_LSM_WORK_FLUSH 0x08u /* Flush a chunk to disk */ 137 #define WT_LSM_WORK_MERGE 0x10u /* Look for a tree merge */ 138 #define WT_LSM_WORK_SWITCH 0x20u /* Switch the in-memory chunk */ 139 /* AUTOMATIC FLAG VALUE GENERATION STOP */ 140 141 /* Work units that are serviced by general worker threads. */ 142 #define WT_LSM_WORK_GENERAL_OPS \ 143 (WT_LSM_WORK_BLOOM | WT_LSM_WORK_DROP | WT_LSM_WORK_ENABLE_EVICT |\ 144 WT_LSM_WORK_FLUSH | WT_LSM_WORK_SWITCH) 145 146 /* 147 * WT_LSM_WORK_UNIT -- 148 * A definition of maintenance that an LSM tree needs done. 149 */ 150 struct __wt_lsm_work_unit { 151 TAILQ_ENTRY(__wt_lsm_work_unit) q; /* Worker unit queue */ 152 uint32_t type; /* Type of operation */ 153 /* AUTOMATIC FLAG VALUE GENERATION START */ 154 #define WT_LSM_WORK_FORCE 0x1u /* Force operation */ 155 /* AUTOMATIC FLAG VALUE GENERATION STOP */ 156 uint32_t flags; /* Flags for operation */ 157 WT_LSM_TREE *lsm_tree; 158 }; 159 160 /* 161 * WT_LSM_MANAGER -- 162 * A structure that holds resources used to manage any LSM trees in a 163 * database. 164 */ 165 struct __wt_lsm_manager { 166 /* 167 * Queues of work units for LSM worker threads. We maintain three 168 * queues, to allow us to keep each queue FIFO, rather than needing 169 * to manage the order of work by shuffling the queue order. 170 * One queue for switches - since switches should never wait for other 171 * work to be done. 172 * One queue for application requested work. For example flushing 173 * and creating bloom filters. 174 * One queue that is for longer running operations such as merges. 175 */ 176 TAILQ_HEAD(__wt_lsm_work_switch_qh, __wt_lsm_work_unit) switchqh; 177 TAILQ_HEAD(__wt_lsm_work_app_qh, __wt_lsm_work_unit) appqh; 178 TAILQ_HEAD(__wt_lsm_work_manager_qh, __wt_lsm_work_unit) managerqh; 179 WT_SPINLOCK switch_lock; /* Lock for switch queue */ 180 WT_SPINLOCK app_lock; /* Lock for application queue */ 181 WT_SPINLOCK manager_lock; /* Lock for manager queue */ 182 WT_CONDVAR *work_cond; /* Used to notify worker of activity */ 183 uint32_t lsm_workers; /* Current number of LSM workers */ 184 uint32_t lsm_workers_max; 185 #define WT_LSM_MAX_WORKERS 20 186 #define WT_LSM_MIN_WORKERS 3 187 WT_LSM_WORKER_ARGS lsm_worker_cookies[WT_LSM_MAX_WORKERS]; 188 189 /* AUTOMATIC FLAG VALUE GENERATION START */ 190 #define WT_LSM_MANAGER_SHUTDOWN 0x1u /* Manager has shut down */ 191 /* AUTOMATIC FLAG VALUE GENERATION STOP */ 192 uint32_t flags; 193 }; 194 195 /* 196 * The value aggressive needs to get to before it influences how merges 197 * are chosen. The default value translates to enough level 0 chunks being 198 * generated to create a second level merge. 199 */ 200 #define WT_LSM_AGGRESSIVE_THRESHOLD 2 201 202 /* 203 * The minimum size for opening a tree: three chunks, plus one page for each 204 * participant in up to three concurrent merges. 205 */ 206 #define WT_LSM_TREE_MINIMUM_SIZE(chunk_size, merge_max, maxleafpage) \ 207 (3 * (chunk_size) + 3 * ((merge_max) * (maxleafpage))) 208 209 /* 210 * WT_LSM_TREE -- 211 * An LSM tree. 212 */ 213 struct __wt_lsm_tree { 214 const char *name, *config, *filename; 215 const char *key_format, *value_format; 216 const char *bloom_config, *file_config; 217 218 uint32_t custom_generation; /* Level at which a custom data source 219 should be used for merges. */ 220 const char *custom_prefix; /* Prefix for custom data source */ 221 const char *custom_suffix; /* Suffix for custom data source */ 222 223 WT_COLLATOR *collator; 224 const char *collator_name; 225 int collator_owned; 226 227 uint32_t refcnt; /* Number of users of the tree */ 228 WT_SESSION_IMPL *excl_session; /* Session has exclusive lock */ 229 230 #define LSM_TREE_MAX_QUEUE 100 231 uint32_t queue_ref; 232 WT_RWLOCK rwlock; 233 TAILQ_ENTRY(__wt_lsm_tree) q; 234 235 uint64_t dsk_gen; 236 237 uint64_t ckpt_throttle; /* Rate limiting due to checkpoints */ 238 uint64_t merge_throttle; /* Rate limiting due to merges */ 239 uint64_t chunk_fill_ms; /* Estimate of time to fill a chunk */ 240 struct timespec last_flush_time;/* Time last flush finished */ 241 uint64_t chunks_flushed; /* Count of chunks flushed since open */ 242 struct timespec merge_aggressive_time;/* Time for merge aggression */ 243 uint64_t merge_progressing; /* Bumped when merges are active */ 244 uint32_t merge_syncing; /* Bumped when merges are syncing */ 245 struct timespec last_active; /* Time last work unit added */ 246 uint64_t mgr_work_count; /* Manager work count */ 247 uint64_t work_count; /* Work units added */ 248 249 /* Configuration parameters */ 250 uint32_t bloom_bit_count; 251 uint32_t bloom_hash_count; 252 uint32_t chunk_count_limit; /* Limit number of chunks */ 253 uint64_t chunk_size; 254 uint64_t chunk_max; /* Maximum chunk a merge creates */ 255 u_int merge_min, merge_max; 256 257 /* AUTOMATIC FLAG VALUE GENERATION START */ 258 #define WT_LSM_BLOOM_MERGED 0x1u 259 #define WT_LSM_BLOOM_OFF 0x2u 260 #define WT_LSM_BLOOM_OLDEST 0x4u 261 /* AUTOMATIC FLAG VALUE GENERATION STOP */ 262 uint32_t bloom; /* Bloom creation policy */ 263 264 WT_LSM_CHUNK **chunk; /* Array of active LSM chunks */ 265 size_t chunk_alloc; /* Space allocated for chunks */ 266 uint32_t nchunks; /* Number of active chunks */ 267 uint32_t last; /* Last allocated ID */ 268 bool modified; /* Have there been updates? */ 269 270 WT_LSM_CHUNK **old_chunks; /* Array of old LSM chunks */ 271 size_t old_alloc; /* Space allocated for old chunks */ 272 u_int nold_chunks; /* Number of old chunks */ 273 uint32_t freeing_old_chunks; /* Whether chunks are being freed */ 274 uint32_t merge_aggressiveness; /* Increase amount of work per merge */ 275 276 /* 277 * We maintain a set of statistics outside of the normal statistics 278 * area, copying them into place when a statistics cursor is created. 279 */ 280 #define WT_LSM_TREE_STAT_INCR(session, fld) do { \ 281 if (WT_STAT_ENABLED(session)) \ 282 ++(fld); \ 283 } while (0) 284 #define WT_LSM_TREE_STAT_INCRV(session, fld, v) do { \ 285 if (WT_STAT_ENABLED(session)) \ 286 (fld) += (int64_t)(v); \ 287 } while (0) 288 int64_t bloom_false_positive; 289 int64_t bloom_hit; 290 int64_t bloom_miss; 291 int64_t lsm_checkpoint_throttle; 292 int64_t lsm_lookup_no_bloom; 293 int64_t lsm_merge_throttle; 294 295 /* 296 * Following fields used to be flags but are susceptible to races. 297 * Don't merge them with flags. 298 */ 299 bool active; /* The tree is open for business */ 300 bool aggressive_timer_enabled; /* Timer for merge aggression enabled */ 301 bool need_switch; /* New chunk needs creating */ 302 303 /* 304 * flags here are not protected for concurrent access, don't put 305 * anything here that is susceptible to races. 306 */ 307 /* AUTOMATIC FLAG VALUE GENERATION START */ 308 #define WT_LSM_TREE_COMPACTING 0x1u /* Tree being compacted */ 309 #define WT_LSM_TREE_MERGES 0x2u /* Tree should run merges */ 310 #define WT_LSM_TREE_OPEN 0x4u /* The tree is open */ 311 #define WT_LSM_TREE_THROTTLE 0x8u /* Throttle updates */ 312 /* AUTOMATIC FLAG VALUE GENERATION STOP */ 313 uint32_t flags; 314 }; 315 316 /* 317 * WT_LSM_DATA_SOURCE -- 318 * Implementation of the WT_DATA_SOURCE interface for LSM. 319 */ 320 struct __wt_lsm_data_source { 321 WT_DATA_SOURCE iface; 322 323 WT_RWLOCK *rwlock; 324 }; 325