1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 /*
10  * WT_LSM_WORKER_COOKIE --
11  *	State for an LSM worker thread.
12  */
13 struct __wt_lsm_worker_cookie {
14 	WT_LSM_CHUNK **chunk_array;
15 	size_t chunk_alloc;
16 	u_int nchunks;
17 };
18 
19 /*
20  * WT_LSM_WORKER_ARGS --
21  *	State for an LSM worker thread.
22  */
23 struct __wt_lsm_worker_args {
24 	WT_SESSION_IMPL	*session;	/* Session */
25 	WT_CONDVAR	*work_cond;	/* Owned by the manager */
26 
27 	wt_thread_t	tid;		/* Thread id */
28 	bool		tid_set;	/* Thread id set */
29 
30 	u_int		id;		/* My manager slot id */
31 	uint32_t	type;		/* Types of operations handled */
32 
33 	volatile bool	running;	/* Worker is running */
34 };
35 
36 /*
37  * WT_LSM_CURSOR_CHUNK --
38  *	Iterator struct containing all the LSM cursor access points for a chunk.
39  */
40 struct __wt_lsm_cursor_chunk {
41 	WT_BLOOM *bloom;		/* Bloom filter handle for each chunk.*/
42 	WT_CURSOR *cursor;		/* Cursor handle for each chunk. */
43 	uint64_t count;			/* Number of items in chunk */
44 	uint64_t switch_txn;		/* Switch txn for each chunk */
45 };
46 
47 /*
48  * WT_CURSOR_LSM --
49  *	An LSM cursor.
50  */
51 struct __wt_cursor_lsm {
52 	WT_CURSOR iface;
53 
54 	WT_LSM_TREE *lsm_tree;
55 	uint64_t dsk_gen;
56 
57 	u_int nchunks;			/* Number of chunks in the cursor */
58 	u_int nupdates;			/* Updates needed (including
59 					   snapshot isolation checks). */
60 	WT_CURSOR *current;		/* The current cursor for iteration */
61 	WT_LSM_CHUNK *primary_chunk;	/* The current primary chunk */
62 
63 	WT_LSM_CURSOR_CHUNK **chunks;	/* Array of LSM cursor units */
64 	size_t chunks_alloc;		/* Current size iterators array */
65 	size_t chunks_count;		/* Current number of iterators */
66 
67 	u_int update_count;		/* Updates performed. */
68 
69 /* AUTOMATIC FLAG VALUE GENERATION START */
70 #define	WT_CLSM_ACTIVE		0x001u	/* Incremented the session count */
71 #define	WT_CLSM_BULK		0x002u	/* Open for snapshot isolation */
72 #define	WT_CLSM_ITERATE_NEXT    0x004u	/* Forward iteration */
73 #define	WT_CLSM_ITERATE_PREV    0x008u	/* Backward iteration */
74 #define	WT_CLSM_MERGE           0x010u	/* Merge cursor, don't update */
75 #define	WT_CLSM_MINOR_MERGE	0x020u	/* Minor merge, include tombstones */
76 #define	WT_CLSM_MULTIPLE        0x040u	/* Multiple cursors have values */
77 #define	WT_CLSM_OPEN_READ	0x080u	/* Open for reads */
78 #define	WT_CLSM_OPEN_SNAPSHOT	0x100u	/* Open for snapshot isolation */
79 /* AUTOMATIC FLAG VALUE GENERATION STOP */
80 	uint32_t flags;
81 };
82 
83 /*
84  * WT_LSM_CHUNK --
85  *	A single chunk (file) in an LSM tree.
86  */
87 struct __wt_lsm_chunk {
88 	const char *uri;		/* Data source for this chunk */
89 	const char *bloom_uri;		/* URI of Bloom filter, if any */
90 	struct timespec create_time;	/* Creation time (for rate limiting) */
91 	uint64_t count;			/* Approximate count of records */
92 	uint64_t size;			/* Final chunk size */
93 
94 	uint64_t switch_txn;		/*
95 					 * Largest transaction that can write
96 					 * to this chunk, set by a worker
97 					 * thread when the chunk is switched
98 					 * out, or by compact to get the most
99 					 * recent chunk flushed.
100 					 */
101 	WT_DECL_TIMESTAMP(switch_timestamp)/*
102 					 * The timestamp used to decide when
103 					 * updates need to detect conflicts.
104 					 */
105 	WT_SPINLOCK timestamp_spinlock;
106 
107 	uint32_t id;			/* ID used to generate URIs */
108 	uint32_t generation;		/* Merge generation */
109 	uint32_t refcnt;		/* Number of worker thread references */
110 	uint32_t bloom_busy;		/* Currently creating bloom filter */
111 	uint32_t evict_enabled;		/* Eviction allowed on the chunk */
112 
113 	int8_t empty;			/* 1/0: checkpoint missing */
114 	int8_t evicted;			/* 1/0: in-memory chunk was evicted */
115 	uint8_t flushing;		/* 1/0: chunk flush in progress */
116 
117 /* AUTOMATIC FLAG VALUE GENERATION START */
118 #define	WT_LSM_CHUNK_BLOOM		0x01u
119 #define	WT_LSM_CHUNK_HAS_TIMESTAMP	0x02u
120 #define	WT_LSM_CHUNK_MERGING		0x04u
121 #define	WT_LSM_CHUNK_ONDISK		0x08u
122 #define	WT_LSM_CHUNK_STABLE		0x10u
123 /* AUTOMATIC FLAG VALUE GENERATION STOP */
124 	uint32_t flags;
125 };
126 
127 /*
128  * Different types of work units. Used by LSM worker threads to choose which
129  * type of work they will execute, and by work units to define which action
130  * is required.
131  */
132 /* AUTOMATIC FLAG VALUE GENERATION START */
133 #define	WT_LSM_WORK_BLOOM		0x01u	/* Create a bloom filter */
134 #define	WT_LSM_WORK_DROP		0x02u	/* Drop unused chunks */
135 #define	WT_LSM_WORK_ENABLE_EVICT	0x04u	/* Create a bloom filter */
136 #define	WT_LSM_WORK_FLUSH		0x08u	/* Flush a chunk to disk */
137 #define	WT_LSM_WORK_MERGE		0x10u	/* Look for a tree merge */
138 #define	WT_LSM_WORK_SWITCH		0x20u	/* Switch the in-memory chunk */
139 /* AUTOMATIC FLAG VALUE GENERATION STOP */
140 
141 /* Work units that are serviced by general worker threads. */
142 #define	WT_LSM_WORK_GENERAL_OPS	 					\
143 	(WT_LSM_WORK_BLOOM | WT_LSM_WORK_DROP | WT_LSM_WORK_ENABLE_EVICT |\
144 	WT_LSM_WORK_FLUSH | WT_LSM_WORK_SWITCH)
145 
146 /*
147  * WT_LSM_WORK_UNIT --
148  *	A definition of maintenance that an LSM tree needs done.
149  */
150 struct __wt_lsm_work_unit {
151 	TAILQ_ENTRY(__wt_lsm_work_unit) q;	/* Worker unit queue */
152 	uint32_t	type;			/* Type of operation */
153 /* AUTOMATIC FLAG VALUE GENERATION START */
154 #define	WT_LSM_WORK_FORCE	0x1u		/* Force operation */
155 /* AUTOMATIC FLAG VALUE GENERATION STOP */
156 	uint32_t	flags;			/* Flags for operation */
157 	WT_LSM_TREE *lsm_tree;
158 };
159 
160 /*
161  * WT_LSM_MANAGER --
162  *	A structure that holds resources used to manage any LSM trees in a
163  *	database.
164  */
165 struct __wt_lsm_manager {
166 	/*
167 	 * Queues of work units for LSM worker threads. We maintain three
168 	 * queues, to allow us to keep each queue FIFO, rather than needing
169 	 * to manage the order of work by shuffling the queue order.
170 	 * One queue for switches - since switches should never wait for other
171 	 *   work to be done.
172 	 * One queue for application requested work. For example flushing
173 	 *   and creating bloom filters.
174 	 * One queue that is for longer running operations such as merges.
175 	 */
176 	TAILQ_HEAD(__wt_lsm_work_switch_qh, __wt_lsm_work_unit)  switchqh;
177 	TAILQ_HEAD(__wt_lsm_work_app_qh, __wt_lsm_work_unit)	  appqh;
178 	TAILQ_HEAD(__wt_lsm_work_manager_qh, __wt_lsm_work_unit) managerqh;
179 	WT_SPINLOCK	switch_lock;	/* Lock for switch queue */
180 	WT_SPINLOCK	app_lock;	/* Lock for application queue */
181 	WT_SPINLOCK	manager_lock;	/* Lock for manager queue */
182 	WT_CONDVAR     *work_cond;	/* Used to notify worker of activity */
183 	uint32_t	lsm_workers;	/* Current number of LSM workers */
184 	uint32_t	lsm_workers_max;
185 #define	WT_LSM_MAX_WORKERS	20
186 #define	WT_LSM_MIN_WORKERS	3
187 	WT_LSM_WORKER_ARGS lsm_worker_cookies[WT_LSM_MAX_WORKERS];
188 
189 /* AUTOMATIC FLAG VALUE GENERATION START */
190 #define	WT_LSM_MANAGER_SHUTDOWN	0x1u	/* Manager has shut down */
191 /* AUTOMATIC FLAG VALUE GENERATION STOP */
192 	uint32_t flags;
193 };
194 
195 /*
196  * The value aggressive needs to get to before it influences how merges
197  * are chosen. The default value translates to enough level 0 chunks being
198  * generated to create a second level merge.
199  */
200 #define	WT_LSM_AGGRESSIVE_THRESHOLD	2
201 
202 /*
203  * The minimum size for opening a tree: three chunks, plus one page for each
204  * participant in up to three concurrent merges.
205  */
206 #define	WT_LSM_TREE_MINIMUM_SIZE(chunk_size, merge_max, maxleafpage)	\
207 	(3 * (chunk_size) + 3 * ((merge_max) * (maxleafpage)))
208 
209 /*
210  * WT_LSM_TREE --
211  *	An LSM tree.
212  */
213 struct __wt_lsm_tree {
214 	const char *name, *config, *filename;
215 	const char *key_format, *value_format;
216 	const char *bloom_config, *file_config;
217 
218 	uint32_t custom_generation;	/* Level at which a custom data source
219 					   should be used for merges. */
220 	const char *custom_prefix;	/* Prefix for custom data source */
221 	const char *custom_suffix;	/* Suffix for custom data source */
222 
223 	WT_COLLATOR *collator;
224 	const char *collator_name;
225 	int collator_owned;
226 
227 	uint32_t refcnt;		/* Number of users of the tree */
228 	WT_SESSION_IMPL *excl_session;	/* Session has exclusive lock */
229 
230 #define	LSM_TREE_MAX_QUEUE	100
231 	uint32_t queue_ref;
232 	WT_RWLOCK rwlock;
233 	TAILQ_ENTRY(__wt_lsm_tree) q;
234 
235 	uint64_t dsk_gen;
236 
237 	uint64_t ckpt_throttle;		/* Rate limiting due to checkpoints */
238 	uint64_t merge_throttle;	/* Rate limiting due to merges */
239 	uint64_t chunk_fill_ms;		/* Estimate of time to fill a chunk */
240 	struct timespec last_flush_time;/* Time last flush finished */
241 	uint64_t chunks_flushed;	/* Count of chunks flushed since open */
242 	struct timespec merge_aggressive_time;/* Time for merge aggression */
243 	uint64_t merge_progressing;	/* Bumped when merges are active */
244 	uint32_t merge_syncing;		/* Bumped when merges are syncing */
245 	struct timespec last_active;	/* Time last work unit added */
246 	uint64_t mgr_work_count;	/* Manager work count */
247 	uint64_t work_count;		/* Work units added */
248 
249 	/* Configuration parameters */
250 	uint32_t bloom_bit_count;
251 	uint32_t bloom_hash_count;
252 	uint32_t chunk_count_limit;	/* Limit number of chunks */
253 	uint64_t chunk_size;
254 	uint64_t chunk_max;		/* Maximum chunk a merge creates */
255 	u_int merge_min, merge_max;
256 
257 /* AUTOMATIC FLAG VALUE GENERATION START */
258 #define	WT_LSM_BLOOM_MERGED	0x1u
259 #define	WT_LSM_BLOOM_OFF	0x2u
260 #define	WT_LSM_BLOOM_OLDEST	0x4u
261 /* AUTOMATIC FLAG VALUE GENERATION STOP */
262 	uint32_t bloom;			/* Bloom creation policy */
263 
264 	WT_LSM_CHUNK **chunk;		/* Array of active LSM chunks */
265 	size_t chunk_alloc;		/* Space allocated for chunks */
266 	uint32_t nchunks;		/* Number of active chunks */
267 	uint32_t last;			/* Last allocated ID */
268 	bool modified;			/* Have there been updates? */
269 
270 	WT_LSM_CHUNK **old_chunks;	/* Array of old LSM chunks */
271 	size_t old_alloc;		/* Space allocated for old chunks */
272 	u_int nold_chunks;		/* Number of old chunks */
273 	uint32_t freeing_old_chunks;	/* Whether chunks are being freed */
274 	uint32_t merge_aggressiveness;	/* Increase amount of work per merge */
275 
276 	/*
277 	 * We maintain a set of statistics outside of the normal statistics
278 	 * area, copying them into place when a statistics cursor is created.
279 	 */
280 #define	WT_LSM_TREE_STAT_INCR(session, fld) do {			\
281 	if (WT_STAT_ENABLED(session))					\
282 		++(fld);						\
283 } while (0)
284 #define	WT_LSM_TREE_STAT_INCRV(session, fld, v) do {			\
285 	if (WT_STAT_ENABLED(session))					\
286 		(fld) += (int64_t)(v);					\
287 } while (0)
288 	int64_t bloom_false_positive;
289 	int64_t bloom_hit;
290 	int64_t bloom_miss;
291 	int64_t lsm_checkpoint_throttle;
292 	int64_t lsm_lookup_no_bloom;
293 	int64_t lsm_merge_throttle;
294 
295 	/*
296 	 * Following fields used to be flags but are susceptible to races.
297 	 * Don't merge them with flags.
298 	 */
299 	bool active;			/* The tree is open for business */
300 	bool aggressive_timer_enabled;	/* Timer for merge aggression enabled */
301 	bool need_switch;		/* New chunk needs creating */
302 
303 	/*
304 	 * flags here are not protected for concurrent access, don't put
305 	 * anything here that is susceptible to races.
306 	 */
307 /* AUTOMATIC FLAG VALUE GENERATION START */
308 #define	WT_LSM_TREE_COMPACTING		0x1u	/* Tree being compacted */
309 #define	WT_LSM_TREE_MERGES		0x2u	/* Tree should run merges */
310 #define	WT_LSM_TREE_OPEN		0x4u	/* The tree is open */
311 #define	WT_LSM_TREE_THROTTLE		0x8u	/* Throttle updates */
312 /* AUTOMATIC FLAG VALUE GENERATION STOP */
313 	uint32_t flags;
314 };
315 
316 /*
317  * WT_LSM_DATA_SOURCE --
318  *	Implementation of the WT_DATA_SOURCE interface for LSM.
319  */
320 struct __wt_lsm_data_source {
321 	WT_DATA_SOURCE iface;
322 
323 	WT_RWLOCK *rwlock;
324 };
325