1 /*-
2  * Copyright (c) 2014-2018 MongoDB, Inc.
3  * Copyright (c) 2008-2014 WiredTiger, Inc.
4  *	All rights reserved.
5  *
6  * See the file LICENSE for redistribution information.
7  */
8 
9 #include "wt_internal.h"
10 
/*
 * Session flags set while an operation is working on the lookaside table: the
 * cache is already full so its size is ignored, pages read for the operation
 * should be evicted ahead of application data, and the operation must not
 * re-enter reconciliation.
 */
#define	WT_LAS_SESSION_FLAGS						\
	(WT_SESSION_IGNORE_CACHE_SIZE |					\
	 WT_SESSION_READ_WONT_NEED |					\
	 WT_SESSION_NO_RECONCILE)
20 
21 /*
22  * __las_set_isolation --
23  *	Switch to read-uncommitted.
24  */
25 static void
__las_set_isolation(WT_SESSION_IMPL * session,WT_TXN_ISOLATION * saved_isolationp)26 __las_set_isolation(
27     WT_SESSION_IMPL *session, WT_TXN_ISOLATION *saved_isolationp)
28 {
29 	*saved_isolationp = session->txn.isolation;
30 	session->txn.isolation = WT_ISO_READ_UNCOMMITTED;
31 }
32 
33 /*
34  * __las_restore_isolation --
35  *	Restore isolation.
36  */
37 static void
__las_restore_isolation(WT_SESSION_IMPL * session,WT_TXN_ISOLATION saved_isolation)38 __las_restore_isolation(
39     WT_SESSION_IMPL *session, WT_TXN_ISOLATION saved_isolation)
40 {
41 	session->txn.isolation = saved_isolation;
42 }
43 
44 /*
45  * __las_entry_count --
46  *	Return when there are entries in the lookaside table.
47  */
48 static uint64_t
__las_entry_count(WT_CACHE * cache)49 __las_entry_count(WT_CACHE *cache)
50 {
51 	uint64_t insert_cnt, remove_cnt;
52 
53 	insert_cnt = cache->las_insert_count;
54 	WT_ORDERED_READ(remove_cnt, cache->las_remove_count);
55 
56 	return (insert_cnt > remove_cnt ? insert_cnt - remove_cnt : 0);
57 }
58 
59 /*
60  * __wt_las_config --
61  *	Configure the lookaside table.
62  */
63 int
__wt_las_config(WT_SESSION_IMPL * session,const char ** cfg)64 __wt_las_config(WT_SESSION_IMPL *session, const char **cfg)
65 {
66 	WT_CONFIG_ITEM cval;
67 	WT_CURSOR_BTREE *las_cursor;
68 	WT_SESSION_IMPL *las_session;
69 
70 	WT_RET(__wt_config_gets(
71 	    session, cfg, "cache_overflow.file_max", &cval));
72 
73 	if (cval.val != 0 && cval.val < WT_LAS_FILE_MIN)
74 		WT_RET_MSG(session, EINVAL,
75 		    "max cache overflow size %" PRId64 " below minimum %d",
76 		    cval.val, WT_LAS_FILE_MIN);
77 
78 	/* This is expected for in-memory configurations. */
79 	las_session = S2C(session)->cache->las_session[0];
80 	WT_ASSERT(session,
81 	    las_session != NULL || F_ISSET(S2C(session), WT_CONN_IN_MEMORY));
82 
83 	if (las_session == NULL)
84 		return (0);
85 
86 	/*
87 	 * We need to set file_max on the btree associated with one of the
88 	 * lookaside sessions.
89 	 */
90 	las_cursor = (WT_CURSOR_BTREE *)las_session->las_cursor;
91 	las_cursor->btree->file_max = (uint64_t)cval.val;
92 
93 	WT_STAT_CONN_SET(
94 	    session, cache_lookaside_ondisk_max, las_cursor->btree->file_max);
95 
96 	return (0);
97 }
98 
99 /*
100  * __wt_las_empty --
101  *	Return when there are entries in the lookaside table.
102  */
103 bool
__wt_las_empty(WT_SESSION_IMPL * session)104 __wt_las_empty(WT_SESSION_IMPL *session)
105 {
106 	return (__las_entry_count(S2C(session)->cache) == 0);
107 }
108 
109 /*
110  * __wt_las_stats_update --
111  *	Update the lookaside table statistics for return to the application.
112  */
113 void
__wt_las_stats_update(WT_SESSION_IMPL * session)114 __wt_las_stats_update(WT_SESSION_IMPL *session)
115 {
116 	WT_CACHE *cache;
117 	WT_CONNECTION_IMPL *conn;
118 	WT_CONNECTION_STATS **cstats;
119 	WT_DSRC_STATS **dstats;
120 	int64_t v;
121 
122 	conn = S2C(session);
123 	cache = conn->cache;
124 
125 	/*
126 	 * Lookaside table statistics are copied from the underlying lookaside
127 	 * table data-source statistics. If there's no lookaside table, values
128 	 * remain 0.
129 	 */
130 	if (!F_ISSET(conn, WT_CONN_LOOKASIDE_OPEN))
131 		return;
132 
133 	/* Set the connection-wide statistics. */
134 	cstats = conn->stats;
135 
136 	WT_STAT_SET(session, cstats,
137 	    cache_lookaside_entries, __las_entry_count(cache));
138 
139 	/*
140 	 * We have a cursor, and we need the underlying data handle; we can get
141 	 * to it by way of the underlying btree handle, but it's a little ugly.
142 	 */
143 	dstats = ((WT_CURSOR_BTREE *)
144 	    cache->las_session[0]->las_cursor)->btree->dhandle->stats;
145 
146 	v = WT_STAT_READ(dstats, cursor_update);
147 	WT_STAT_SET(session, cstats, cache_lookaside_insert, v);
148 	v = WT_STAT_READ(dstats, cursor_remove);
149 	WT_STAT_SET(session, cstats, cache_lookaside_remove, v);
150 
151 	/*
152 	 * If we're clearing stats we need to clear the cursor values we just
153 	 * read.  This does not clear the rest of the statistics in the
154 	 * lookaside data source stat cursor, but we own that namespace so we
155 	 * don't have to worry about users seeing inconsistent data source
156 	 * information.
157 	 */
158 	if (FLD_ISSET(conn->stat_flags, WT_STAT_CLEAR)) {
159 		WT_STAT_SET(session, dstats, cursor_insert, 0);
160 		WT_STAT_SET(session, dstats, cursor_remove, 0);
161 	}
162 }
163 
164 /*
165  * __wt_las_create --
166  *	Initialize the database's lookaside store.
167  */
168 int
__wt_las_create(WT_SESSION_IMPL * session,const char ** cfg)169 __wt_las_create(WT_SESSION_IMPL *session, const char **cfg)
170 {
171 	WT_CACHE *cache;
172 	WT_CONNECTION_IMPL *conn;
173 	WT_DECL_RET;
174 	int i;
175 	const char *drop_cfg[] = {
176 	    WT_CONFIG_BASE(session, WT_SESSION_drop), "force=true", NULL };
177 
178 	conn = S2C(session);
179 	cache = conn->cache;
180 
181 	/* Read-only and in-memory configurations don't need the LAS table. */
182 	if (F_ISSET(conn, WT_CONN_IN_MEMORY | WT_CONN_READONLY))
183 		return (0);
184 
185 	/*
186 	 * Done at startup: we cannot do it on demand because we require the
187 	 * schema lock to create and drop the table, and it may not always be
188 	 * available.
189 	 *
190 	 * Discard any previous incarnation of the table.
191 	 */
192 	WT_WITH_SCHEMA_LOCK(session,
193 	    ret = __wt_schema_drop(session, WT_LAS_URI, drop_cfg));
194 	WT_RET(ret);
195 
196 	/* Re-create the table. */
197 	WT_RET(__wt_session_create(session, WT_LAS_URI, WT_LAS_CONFIG));
198 
199 	/*
200 	 * Open a shared internal session and cursor used for the lookaside
201 	 * table. This session should never perform reconciliation.
202 	 */
203 	for (i = 0; i < WT_LAS_NUM_SESSIONS; i++) {
204 		WT_RET(__wt_open_internal_session(conn, "lookaside table",
205 		    true, WT_LAS_SESSION_FLAGS, &cache->las_session[i]));
206 		WT_RET(__wt_las_cursor_open(cache->las_session[i]));
207 	}
208 
209 	WT_RET(__wt_las_config(session, cfg));
210 
211 	/* The statistics server is already running, make sure we don't race. */
212 	WT_WRITE_BARRIER();
213 	F_SET(conn, WT_CONN_LOOKASIDE_OPEN);
214 
215 	return (0);
216 }
217 
218 /*
219  * __wt_las_destroy --
220  *	Destroy the database's lookaside store.
221  */
222 int
__wt_las_destroy(WT_SESSION_IMPL * session)223 __wt_las_destroy(WT_SESSION_IMPL *session)
224 {
225 	WT_CACHE *cache;
226 	WT_CONNECTION_IMPL *conn;
227 	WT_DECL_RET;
228 	WT_SESSION *wt_session;
229 	int i;
230 
231 	conn = S2C(session);
232 	cache = conn->cache;
233 
234 	F_CLR(conn, WT_CONN_LOOKASIDE_OPEN);
235 	if (cache == NULL)
236 		return (0);
237 
238 	for (i = 0; i < WT_LAS_NUM_SESSIONS; i++) {
239 		if (cache->las_session[i] == NULL)
240 			continue;
241 
242 		wt_session = &cache->las_session[i]->iface;
243 		WT_TRET(wt_session->close(wt_session, NULL));
244 		cache->las_session[i] = NULL;
245 	}
246 
247 	__wt_buf_free(session, &cache->las_sweep_key);
248 	__wt_free(session, cache->las_dropped);
249 	__wt_free(session, cache->las_sweep_dropmap);
250 
251 	return (ret);
252 }
253 
254 /*
255  * __wt_las_cursor_open --
256  *	Open a new lookaside table cursor.
257  */
258 int
__wt_las_cursor_open(WT_SESSION_IMPL * session)259 __wt_las_cursor_open(WT_SESSION_IMPL *session)
260 {
261 	WT_BTREE *btree;
262 	WT_CURSOR *cursor;
263 	WT_DECL_RET;
264 	const char *open_cursor_cfg[] = {
265 	    WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL };
266 
267 	WT_WITHOUT_DHANDLE(session, ret = __wt_open_cursor(
268 	    session, WT_LAS_URI, NULL, open_cursor_cfg, &cursor));
269 	WT_RET(ret);
270 
271 	/*
272 	 * Retrieve the btree from the cursor, rather than the session because
273 	 * we don't always switch the LAS handle in to the session before
274 	 * entering this function.
275 	 */
276 	btree = ((WT_CURSOR_BTREE *)cursor)->btree;
277 
278 	/* Track the lookaside file ID. */
279 	if (S2C(session)->cache->las_fileid == 0)
280 		S2C(session)->cache->las_fileid = btree->id;
281 
282 	/*
283 	 * Set special flags for the lookaside table: the lookaside flag (used,
284 	 * for example, to avoid writing records during reconciliation), also
285 	 * turn off checkpoints and logging.
286 	 *
287 	 * Test flags before setting them so updates can't race in subsequent
288 	 * opens (the first update is safe because it's single-threaded from
289 	 * wiredtiger_open).
290 	 */
291 	if (!F_ISSET(btree, WT_BTREE_LOOKASIDE))
292 		F_SET(btree, WT_BTREE_LOOKASIDE);
293 	if (!F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
294 		F_SET(btree, WT_BTREE_NO_CHECKPOINT);
295 	if (!F_ISSET(btree, WT_BTREE_NO_LOGGING))
296 		F_SET(btree, WT_BTREE_NO_LOGGING);
297 
298 	session->las_cursor = cursor;
299 	F_SET(session, WT_SESSION_LOOKASIDE_CURSOR);
300 
301 	return (0);
302 }
303 
304 /*
305  * __wt_las_cursor --
306  *	Return a lookaside cursor.
307  */
308 void
__wt_las_cursor(WT_SESSION_IMPL * session,WT_CURSOR ** cursorp,uint32_t * session_flags)309 __wt_las_cursor(
310     WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags)
311 {
312 	WT_CACHE *cache;
313 	int i;
314 
315 	*cursorp = NULL;
316 
317 	/*
318 	 * We don't want to get tapped for eviction after we start using the
319 	 * lookaside cursor; save a copy of the current eviction state, we'll
320 	 * turn eviction off before we return.
321 	 *
322 	 * Don't cache lookaside table pages, we're here because of eviction
323 	 * problems and there's no reason to believe lookaside pages will be
324 	 * useful more than once.
325 	 */
326 	*session_flags = F_MASK(session, WT_LAS_SESSION_FLAGS);
327 
328 	cache = S2C(session)->cache;
329 
330 	/*
331 	 * Some threads have their own lookaside table cursors, else lock the
332 	 * shared lookaside cursor.
333 	 */
334 	if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR))
335 		*cursorp = session->las_cursor;
336 	else {
337 		for (;;) {
338 			__wt_spin_lock(session, &cache->las_lock);
339 			for (i = 0; i < WT_LAS_NUM_SESSIONS; i++) {
340 				if (!cache->las_session_inuse[i]) {
341 					*cursorp =
342 					    cache->las_session[i]->las_cursor;
343 					cache->las_session_inuse[i] = true;
344 					break;
345 				}
346 			}
347 			__wt_spin_unlock(session, &cache->las_lock);
348 			if (*cursorp != NULL)
349 				break;
350 			/*
351 			 * If all the lookaside sessions are busy, stall.
352 			 *
353 			 * XXX better as a condition variable.
354 			 */
355 			__wt_sleep(0, WT_THOUSAND);
356 			if (F_ISSET(session, WT_SESSION_INTERNAL))
357 				WT_STAT_CONN_INCRV(session,
358 				    cache_lookaside_cursor_wait_internal,
359 				    WT_THOUSAND);
360 			else
361 				WT_STAT_CONN_INCRV(session,
362 				    cache_lookaside_cursor_wait_application,
363 				    WT_THOUSAND);
364 
365 		}
366 	}
367 
368 	/* Configure session to access the lookaside table. */
369 	F_SET(session, WT_LAS_SESSION_FLAGS);
370 }
371 
372 /*
373  * __wt_las_cursor_close --
374  *	Discard a lookaside cursor.
375  */
376 int
__wt_las_cursor_close(WT_SESSION_IMPL * session,WT_CURSOR ** cursorp,uint32_t session_flags)377 __wt_las_cursor_close(
378     WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags)
379 {
380 	WT_CACHE *cache;
381 	WT_CURSOR *cursor;
382 	WT_DECL_RET;
383 	int i;
384 
385 	cache = S2C(session)->cache;
386 
387 	if ((cursor = *cursorp) == NULL)
388 		return (0);
389 	*cursorp = NULL;
390 
391 	/* Reset the cursor. */
392 	ret = cursor->reset(cursor);
393 
394 	/*
395 	 * We turned off caching and eviction while the lookaside cursor was in
396 	 * use, restore the session's flags.
397 	 */
398 	F_CLR(session, WT_LAS_SESSION_FLAGS);
399 	F_SET(session, session_flags);
400 
401 	/*
402 	 * Some threads have their own lookaside table cursors, else unlock the
403 	 * shared lookaside cursor.
404 	 */
405 	if (!F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) {
406 		__wt_spin_lock(session, &cache->las_lock);
407 		for (i = 0; i < WT_LAS_NUM_SESSIONS; i++)
408 			if (cursor->session == &cache->las_session[i]->iface) {
409 				cache->las_session_inuse[i] = false;
410 				break;
411 			}
412 		__wt_spin_unlock(session, &cache->las_lock);
413 		WT_ASSERT(session, i != WT_LAS_NUM_SESSIONS);
414 	}
415 
416 	return (ret);
417 }
418 
419 /*
420  * __wt_las_page_skip_locked --
421  *	 Check if we can skip reading a page with lookaside entries, where
422  * the page is already locked.
423  */
424 bool
__wt_las_page_skip_locked(WT_SESSION_IMPL * session,WT_REF * ref)425 __wt_las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref)
426 {
427 	WT_TXN *txn;
428 
429 	txn = &session->txn;
430 
431 	/*
432 	 * Skip lookaside pages if reading without a timestamp and all the
433 	 * updates in lookaside are in the past.
434 	 *
435 	 * Lookaside eviction preferentially chooses the newest updates when
436 	 * creating page images with no stable timestamp. If a stable timestamp
437 	 * has been set, we have to visit the page because eviction chooses old
438 	 * version of records in that case.
439 	 *
440 	 * One case where we may need to visit the page is if lookaside eviction
441 	 * is active in tree 2 when a checkpoint has started and is working its
442 	 * way through tree 1. In that case, lookaside may have created a page
443 	 * image with updates in the future of the checkpoint.
444 	 *
445 	 * We also need to instantiate a lookaside page if this is an update
446 	 * operation in progress.
447 	 */
448 	if (F_ISSET(txn, WT_TXN_UPDATE))
449 		return (false);
450 
451 	if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))
452 		return (false);
453 
454 	/*
455 	 * If some of the page's history overlaps with the reader's snapshot
456 	 * then we have to read it.  This is only relevant if we chose versions
457 	 * that were unstable when the page was written.
458 	 */
459 	if (ref->page_las->skew_newest &&
460 	    WT_TXNID_LE(txn->snap_min, ref->page_las->unstable_txn))
461 		return (false);
462 
463 	if (!F_ISSET(txn, WT_TXN_HAS_TS_READ))
464 		return (ref->page_las->skew_newest);
465 
466 #ifdef HAVE_TIMESTAMPS
467 	/*
468 	 * Skip lookaside pages if reading as of a timestamp, we evicted new
469 	 * versions of data and all the updates are in the past.
470 	 */
471 	if (ref->page_las->skew_newest &&
472 	    __wt_timestamp_cmp(
473 	    &txn->read_timestamp, &ref->page_las->unstable_timestamp) > 0)
474 		return (true);
475 
476 	/*
477 	 * Skip lookaside pages if reading as of a timestamp, we evicted old
478 	 * versions of data and all the unstable updates are in the future.
479 	 */
480 	if (!ref->page_las->skew_newest &&
481 	    __wt_timestamp_cmp(
482 	    &txn->read_timestamp, &ref->page_las->unstable_timestamp) < 0)
483 		return (true);
484 #endif
485 
486 	return (false);
487 }
488 
489 /*
490  * __wt_las_page_skip --
491  *	 Check if we can skip reading a page with lookaside entries, where the
492  * page needs to be locked before checking.
493  */
494 bool
__wt_las_page_skip(WT_SESSION_IMPL * session,WT_REF * ref)495 __wt_las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
496 {
497 	uint32_t previous_state;
498 	bool skip;
499 
500 	if ((previous_state = ref->state) != WT_REF_LIMBO &&
501 	    previous_state != WT_REF_LOOKASIDE)
502 		return (false);
503 
504 	if (!__wt_atomic_casv32(&ref->state, previous_state, WT_REF_LOCKED))
505 		return (false);
506 
507 	skip = __wt_las_page_skip_locked(session, ref);
508 
509 	/* Restore the state and push the change. */
510 	ref->state = previous_state;
511 	WT_FULL_BARRIER();
512 
513 	return (skip);
514 }
515 
516 /*
517  * __las_remove_block --
518  *	Remove all records for a given page from the lookaside store.
519  */
520 static int
__las_remove_block(WT_CURSOR * cursor,uint64_t pageid,bool lock_wait,uint64_t * remove_cntp)521 __las_remove_block(
522     WT_CURSOR *cursor, uint64_t pageid, bool lock_wait, uint64_t *remove_cntp)
523 {
524 	WT_CONNECTION_IMPL *conn;
525 	WT_DECL_RET;
526 	WT_ITEM las_key;
527 	WT_SESSION_IMPL *session;
528 	WT_TXN_ISOLATION saved_isolation;
529 	uint64_t las_counter, las_pageid;
530 	uint32_t las_id;
531 	bool local_txn;
532 
533 	*remove_cntp = 0;
534 
535 	session = (WT_SESSION_IMPL *)cursor->session;
536 	conn = S2C(session);
537 	local_txn = false;
538 
539 	/* Prevent the sweep thread from removing the block. */
540 	if (lock_wait)
541 		__wt_writelock(session, &conn->cache->las_sweepwalk_lock);
542 	else
543 		WT_RET(__wt_try_writelock(
544 		    session, &conn->cache->las_sweepwalk_lock));
545 
546 	__las_set_isolation(session, &saved_isolation);
547 	WT_ERR(__wt_txn_begin(session, NULL));
548 	local_txn = true;
549 
550 	/*
551 	 * Search for the block's unique btree ID and page ID prefix and step
552 	 * through all matching records, removing them.
553 	 */
554 	for (ret = __wt_las_cursor_position(cursor, pageid);
555 	    ret == 0; ret = cursor->next(cursor)) {
556 		WT_ERR(cursor->get_key(cursor,
557 		    &las_pageid, &las_id, &las_counter, &las_key));
558 
559 		/* Confirm that we have a matching record. */
560 		if (las_pageid != pageid)
561 			break;
562 
563 		WT_ERR(cursor->remove(cursor));
564 		++*remove_cntp;
565 	}
566 	WT_ERR_NOTFOUND_OK(ret);
567 
568 err:	if (local_txn) {
569 		if (ret == 0)
570 			ret = __wt_txn_commit(session, NULL);
571 		else
572 			WT_TRET(__wt_txn_rollback(session, NULL));
573 	}
574 
575 	__las_restore_isolation(session, saved_isolation);
576 	__wt_writeunlock(session, &conn->cache->las_sweepwalk_lock);
577 	return (ret);
578 }
579 
580 /*
581  * __las_insert_block_verbose --
582  *	Display a verbose message once per checkpoint with details about the
583  *	cache state when performing a lookaside table write.
584  */
585 static int
__las_insert_block_verbose(WT_SESSION_IMPL * session,WT_BTREE * btree,WT_MULTI * multi)586 __las_insert_block_verbose(
587     WT_SESSION_IMPL *session, WT_BTREE *btree, WT_MULTI *multi)
588 {
589 	WT_CACHE *cache;
590 	WT_CONNECTION_IMPL *conn;
591 	double pct_dirty, pct_full;
592 	uint64_t ckpt_gen_current, ckpt_gen_last;
593 	uint32_t btree_id;
594 #ifdef HAVE_TIMESTAMPS
595 	char hex_timestamp[2 * WT_TIMESTAMP_SIZE + 1];
596 #endif
597 	const char *ts;
598 
599 	btree_id = btree->id;
600 
601 	if (!WT_VERBOSE_ISSET(session,
602 	    WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY))
603 		return (0);
604 
605 	conn = S2C(session);
606 	cache = conn->cache;
607 	ckpt_gen_current = __wt_gen(session, WT_GEN_CHECKPOINT);
608 	ckpt_gen_last = cache->las_verb_gen_write;
609 
610 	/*
611 	 * Print a message if verbose lookaside, or once per checkpoint if
612 	 * only reporting activity. Avoid an expensive atomic operation as
613 	 * often as possible when the message rate is limited.
614 	 */
615 	if (WT_VERBOSE_ISSET(session, WT_VERB_LOOKASIDE) ||
616 	    (ckpt_gen_current > ckpt_gen_last &&
617 	    __wt_atomic_casv64(&cache->las_verb_gen_write,
618 	    ckpt_gen_last, ckpt_gen_current))) {
619 		(void)__wt_eviction_clean_needed(session, &pct_full);
620 		(void)__wt_eviction_dirty_needed(session, &pct_dirty);
621 
622 #ifdef HAVE_TIMESTAMPS
623 		WT_RET(__wt_timestamp_to_hex_string(session, hex_timestamp,
624 		    &multi->page_las.unstable_timestamp));
625 		ts = hex_timestamp;
626 #else
627 		ts = "disabled";
628 #endif
629 		__wt_verbose(session,
630 		    WT_VERB_LOOKASIDE | WT_VERB_LOOKASIDE_ACTIVITY,
631 		    "Page reconciliation triggered lookaside write "
632 		    "file ID %" PRIu32 ", page ID %" PRIu64 ". "
633 		    "Max txn ID %" PRIu64 ", unstable timestamp %s, %s. "
634 		    "Entries now in lookaside file: %" PRId64 ", "
635 		    "cache dirty: %2.3f%% , "
636 		    "cache use: %2.3f%%",
637 		    btree_id, multi->page_las.las_pageid,
638 		    multi->page_las.max_txn,
639 		    ts,
640 		    multi->page_las.skew_newest ? "newest" : "not newest",
641 		    WT_STAT_READ(conn->stats, cache_lookaside_entries),
642 		    pct_dirty, pct_full);
643 	}
644 
645 	/* Never skip updating the tracked generation */
646 	if (WT_VERBOSE_ISSET(session, WT_VERB_LOOKASIDE))
647 		cache->las_verb_gen_write = ckpt_gen_current;
648 	return (0);
649 }
650 
/*
 * __wt_las_insert_block --
 *	Copy one set of saved updates into the database's lookaside table.
 *
 *	cursor: lookaside table cursor; its session is the one used for the
 *	    enclosing transaction.
 *	btree: tree the updates belong to; its ID becomes part of every key.
 *	page: page the saved updates were taken from.
 *	multi: block whose saved update list (supd) is written; on success,
 *	    if anything was written, its page_las.las_pageid is set to the
 *	    page ID allocated here.
 *	key: caller-supplied item used to build the source key for each record.
 *
 *	All records are written inside a single transaction that is committed
 *	on success and rolled back on error. Returns 0 on success, otherwise
 *	an error code.
 */
int
__wt_las_insert_block(WT_CURSOR *cursor,
    WT_BTREE *btree, WT_PAGE *page, WT_MULTI *multi, WT_ITEM *key)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_DECL_TIMESTAMP(prev_timestamp)
	WT_ITEM las_timestamp, las_value;
	WT_SAVE_UPD *list;
	WT_SESSION_IMPL *session;
	WT_TXN_ISOLATION saved_isolation;
	WT_UPDATE *upd;
	wt_off_t las_size;
	uint64_t insert_cnt;
	uint64_t las_counter, las_pageid, max_las_size;
	uint32_t btree_id, i, slot;
	uint8_t *p;
	bool local_txn;

	session = (WT_SESSION_IMPL *)cursor->session;
	conn = S2C(session);
	WT_CLEAR(las_timestamp);
	WT_CLEAR(las_value);
	insert_cnt = 0;
	btree_id = btree->id;
	local_txn = false;

	__wt_timestamp_set_zero(&prev_timestamp);

	/* Take a page ID for this block from the connection-wide counter. */
	las_pageid = __wt_atomic_add64(&conn->cache->las_pageid, 1);

	/* Test before setting so the flag is only written once. */
	if (!btree->lookaside_entries)
		btree->lookaside_entries = true;

#ifdef HAVE_DIAGNOSTIC
	{
	uint64_t remove_cnt;
	/*
	 * There should never be any entries with the page ID we are about to
	 * use.
	 */
	WT_RET_BUSY_OK(
	    __las_remove_block(cursor, las_pageid, false, &remove_cnt));
	WT_ASSERT(session, remove_cnt == 0);
	}
#endif

	/* Wrap all the updates in a transaction. */
	__las_set_isolation(session, &saved_isolation);
	WT_ERR(__wt_txn_begin(session, NULL));
	local_txn = true;

	/*
	 * Inserts should be on the same page absent a split, search any pinned
	 * leaf page.
	 */
	F_SET(cursor, WT_CURSTD_UPDATE_LOCAL);

	/*
	 * Enter each update in the boundary's list into the lookaside store.
	 * The counter runs across the whole block, so each record written gets
	 * a distinct key.
	 */
	for (las_counter = 0, i = 0,
	    list = multi->supd; i < multi->supd_entries; ++i, ++list) {
		/* Lookaside table key component: source key. */
		switch (page->type) {
		case WT_PAGE_COL_FIX:
		case WT_PAGE_COL_VAR:
			/*
			 * Column-store: pack the record number into the key's
			 * buffer. NOTE(review): the size is computed relative
			 * to key->data, which presumably points at key->mem
			 * here -- confirm against the caller.
			 */
			p = key->mem;
			WT_ERR(
			    __wt_vpack_uint(&p, 0, WT_INSERT_RECNO(list->ins)));
			key->size = WT_PTRDIFF(p, key->data);
			break;
		case WT_PAGE_ROW_LEAF:
			/*
			 * Row-store: with no insert entry the key is built
			 * from the on-page row, otherwise the key references
			 * the insert entry's key directly.
			 */
			if (list->ins == NULL) {
				WT_WITH_BTREE(session, btree,
				    ret = __wt_row_leaf_key(
				    session, page, list->ripcip, key, false));
				WT_ERR(ret);
			} else {
				key->data = WT_INSERT_KEY(list->ins);
				key->size = WT_INSERT_KEY_SIZE(list->ins);
			}
			break;
		WT_ILLEGAL_VALUE_ERR(session, page->type);
		}

		/*
		 * Lookaside table value component: update reference. Updates
		 * come from the row-store insert list (an inserted item), or
		 * update array (an update to an original on-page item), or from
		 * a column-store insert list (column-store format has no update
		 * array, the insert list contains both inserted items and
		 * updates to original on-page items). When rolling forward a
		 * modify update from an original on-page item, we need an
		 * on-page slot so we can find the original on-page item. When
		 * rolling forward from an inserted item, no on-page slot is
		 * possible.
		 */
		slot = UINT32_MAX;			/* Impossible slot */
		if (list->ripcip != NULL)
			slot = page->type == WT_PAGE_ROW_LEAF ?
			    WT_ROW_SLOT(page, list->ripcip) :
			    WT_COL_SLOT(page, list->ripcip);
		upd = list->ins == NULL ?
		    page->modify->mod_row_update[slot] : list->ins->upd;

		/*
		 * Walk the list of updates, storing each key/value pair into
		 * the lookaside table. Skip aborted items (there's no point
		 * to restoring them), and assert we never see a reserved item.
		 *
		 * The first update is used without a NULL check. The continue
		 * statement jumps to the loop condition, which steps to the
		 * next update in the chain.
		 */
		do {
			if (upd->txnid == WT_TXN_ABORTED)
				continue;

			switch (upd->type) {
			case WT_UPDATE_MODIFY:
			case WT_UPDATE_STANDARD:
				las_value.data = upd->data;
				las_value.size = upd->size;
				break;
			case WT_UPDATE_BIRTHMARK:
			case WT_UPDATE_TOMBSTONE:
				/* These update types carry no value bytes. */
				las_value.size = 0;
				break;
			WT_ILLEGAL_VALUE_ERR(session, upd->type);
			}

			/* Key: page ID, btree ID, block counter, source key. */
			cursor->set_key(cursor,
			    las_pageid, btree_id, ++las_counter, key);

#ifdef HAVE_TIMESTAMPS
			las_timestamp.data = &upd->timestamp;
			las_timestamp.size = WT_TIMESTAMP_SIZE;
#endif
			/*
			 * If saving a non-zero length value on the page, save a
			 * birthmark instead of duplicating it in the lookaside
			 * table. (We check the length because row-store doesn't
			 * write zero-length data items.)
			 */
			if (upd == list->onpage_upd &&
			    upd->size > 0 &&
			    (upd->type == WT_UPDATE_STANDARD ||
			    upd->type == WT_UPDATE_MODIFY)) {
				las_value.size = 0;
				cursor->set_value(cursor, upd->txnid,
				    &las_timestamp, upd->prepare_state,
				    WT_UPDATE_BIRTHMARK, &las_value);
			} else
				cursor->set_value(cursor, upd->txnid,
				    &las_timestamp, upd->prepare_state,
				    upd->type, &las_value);

			/*
			 * Using update instead of insert so the page stays
			 * pinned and can be searched before the tree.
			 */
			WT_ERR(cursor->update(cursor));
			++insert_cnt;
		} while ((upd = upd->next) != NULL);
	}

	/*
	 * Publish the lookaside file's size and panic if it has grown past
	 * the configured maximum; a maximum of zero disables the check.
	 */
	WT_ERR(__wt_block_manager_named_size(session, WT_LAS_FILE, &las_size));
	WT_STAT_CONN_SET(session, cache_lookaside_ondisk, las_size);
	max_las_size = ((WT_CURSOR_BTREE *)cursor)->btree->file_max;
	if (max_las_size != 0 && (uint64_t)las_size > max_las_size)
		WT_PANIC_MSG(session, WT_PANIC,
		    "WiredTigerLAS: file size of %" PRIu64 " exceeds maximum "
		    "size %" PRIu64, (uint64_t)las_size, max_las_size);

err:	/* Resolve the transaction. */
	if (local_txn) {
		if (ret == 0)
			ret = __wt_txn_commit(session, NULL);
		else
			WT_TRET(__wt_txn_rollback(session, NULL));

		/* Adjust the entry count, but only if the commit succeeded. */
		if (ret == 0)
			(void)__wt_atomic_add64(
			    &conn->cache->las_insert_count, insert_cnt);
	}

	/* Undo the session and cursor changes made above on every path. */
	__las_restore_isolation(session, saved_isolation);
	F_CLR(cursor, WT_CURSTD_UPDATE_LOCAL);

	/* Record the block's page ID only if something was written. */
	if (ret == 0 && insert_cnt > 0) {
		multi->page_las.las_pageid = las_pageid;
		ret = __las_insert_block_verbose(session, btree, multi);
	}

	return (ret);
}
847 
848 /*
849  * __wt_las_cursor_position --
850  *	Position a lookaside cursor at the beginning of a block.
851  *
852  *	There may be no block of lookaside entries if they have been removed by
853  *	WT_CONNECTION::rollback_to_stable.
854  */
855 int
__wt_las_cursor_position(WT_CURSOR * cursor,uint64_t pageid)856 __wt_las_cursor_position(WT_CURSOR *cursor, uint64_t pageid)
857 {
858 	WT_ITEM las_key;
859 	uint64_t las_counter, las_pageid;
860 	uint32_t las_id;
861 	int exact;
862 
863 	/*
864 	 * When scanning for all pages, start at the beginning of the lookaside
865 	 * table.
866 	 */
867 	if (pageid == 0) {
868 		WT_RET(cursor->reset(cursor));
869 		return (cursor->next(cursor));
870 	}
871 
872 	/*
873 	 * Because of the special visibility rules for lookaside, a new block
874 	 * can appear in between our search and the block of interest.  Keep
875 	 * trying until we find it.
876 	 */
877 	for (;;) {
878 		WT_CLEAR(las_key);
879 		cursor->set_key(cursor,
880 		    pageid, (uint32_t)0, (uint64_t)0, &las_key);
881 		WT_RET(cursor->search_near(cursor, &exact));
882 		if (exact < 0) {
883 			WT_RET(cursor->next(cursor));
884 
885 			/*
886 			 * Because of the special visibility rules for
887 			 * lookaside, a new block can appear in between our
888 			 * search and the block of interest.  Keep trying while
889 			 * we have a key lower than we expect.
890 			 *
891 			 * There may be no block of lookaside entries if they
892 			 * have been removed by
893 			 * WT_CONNECTION::rollback_to_stable.
894 			 */
895 			WT_RET(cursor->get_key(cursor,
896 			    &las_pageid, &las_id, &las_counter, &las_key));
897 			if (las_pageid < pageid)
898 				continue;
899 		}
900 
901 		return (0);
902 	}
903 
904 	/* NOTREACHED */
905 }
906 
907 /*
908  * __wt_las_remove_block --
909  *	Remove all records for a given page from the lookaside table.
910  */
911 int
__wt_las_remove_block(WT_SESSION_IMPL * session,uint64_t pageid,bool lock_wait)912 __wt_las_remove_block(
913     WT_SESSION_IMPL *session, uint64_t pageid, bool lock_wait)
914 {
915 	WT_CONNECTION_IMPL *conn;
916 	WT_CURSOR *cursor;
917 	WT_DECL_RET;
918 	uint64_t remove_cnt;
919 	uint32_t session_flags;
920 
921 	conn = S2C(session);
922 	session_flags = 0;		/* [-Wconditional-uninitialized] */
923 
924 	/*
925 	 * This is an external API for removing records from the lookaside
926 	 * table, first acquiring a lookaside table cursor and enclosing
927 	 * transaction, then calling an underlying function to do the work.
928 	 */
929 	__wt_las_cursor(session, &cursor, &session_flags);
930 
931 	if ((ret = __las_remove_block(
932 	    cursor, pageid, lock_wait, &remove_cnt)) == 0)
933 		(void)__wt_atomic_add64(
934 		    &conn->cache->las_remove_count, remove_cnt);
935 
936 	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
937 	return (ret);
938 }
939 
940 /*
941  * __wt_las_remove_dropped --
942  *	Remove an opened btree ID if it is in the dropped table.
943  */
944 void
__wt_las_remove_dropped(WT_SESSION_IMPL * session)945 __wt_las_remove_dropped(WT_SESSION_IMPL *session)
946 {
947 	WT_BTREE *btree;
948 	WT_CACHE *cache;
949 	u_int i, j;
950 
951 	btree = S2BT(session);
952 	cache = S2C(session)->cache;
953 
954 	__wt_spin_lock(session, &cache->las_sweep_lock);
955 	for (i = 0; i < cache->las_dropped_next &&
956 	    cache->las_dropped[i] != btree->id; i++)
957 		;
958 
959 	if (i < cache->las_dropped_next) {
960 		cache->las_dropped_next--;
961 		for (j = i; j < cache->las_dropped_next; j++)
962 			cache->las_dropped[j] = cache->las_dropped[j + 1];
963 	}
964 	__wt_spin_unlock(session, &cache->las_sweep_lock);
965 }
966 
967 /*
968  * __wt_las_save_dropped --
969  *	Save a dropped btree ID to be swept from the lookaside table.
970  */
971 int
__wt_las_save_dropped(WT_SESSION_IMPL * session)972 __wt_las_save_dropped(WT_SESSION_IMPL *session)
973 {
974 	WT_BTREE *btree;
975 	WT_CACHE *cache;
976 	WT_DECL_RET;
977 
978 	btree = S2BT(session);
979 	cache = S2C(session)->cache;
980 
981 	__wt_spin_lock(session, &cache->las_sweep_lock);
982 	WT_ERR(__wt_realloc_def(session, &cache->las_dropped_alloc,
983 	    cache->las_dropped_next + 1, &cache->las_dropped));
984 	cache->las_dropped[cache->las_dropped_next++] = btree->id;
985 err:	__wt_spin_unlock(session, &cache->las_sweep_lock);
986 	return (ret);
987 }
988 
989 /*
990  * __las_sweep_count --
991  *	Calculate how many records to examine per sweep step.
992  */
993 static inline uint64_t
__las_sweep_count(WT_CACHE * cache)994 __las_sweep_count(WT_CACHE *cache)
995 {
996 	uint64_t las_entry_count;
997 
998 	/*
999 	 * The sweep server is a slow moving thread. Try to review the entire
1000 	 * lookaside table once every 5 minutes.
1001 	 *
1002 	 * The reason is because the lookaside table exists because we're seeing
1003 	 * cache/eviction pressure (it allows us to trade performance and disk
1004 	 * space for cache space), and it's likely lookaside blocks are being
1005 	 * evicted, and reading them back in doesn't help things. A trickier,
1006 	 * but possibly better, alternative might be to review all lookaside
1007 	 * blocks in the cache in order to get rid of them, and slowly review
1008 	 * lookaside blocks that have already been evicted.
1009 	 *
1010 	 * Put upper and lower bounds on the calculation: since reads of pages
1011 	 * with lookaside entries are blocked during sweep, make sure we do
1012 	 * some work but don't block reads for too long.
1013 	 */
1014 	las_entry_count = __las_entry_count(cache);
1015 	return ((uint64_t)WT_MAX(WT_LAS_SWEEP_ENTRIES,
1016 	    las_entry_count / (5 * WT_MINUTE / WT_LAS_SWEEP_SEC)));
1017 }
1018 
1019 /*
1020  * __las_sweep_init --
1021  *	Prepare to start a lookaside sweep.
1022  */
1023 static int
__las_sweep_init(WT_SESSION_IMPL * session)1024 __las_sweep_init(WT_SESSION_IMPL *session)
1025 {
1026 	WT_CACHE *cache;
1027 	WT_DECL_RET;
1028 	u_int i;
1029 
1030 	cache = S2C(session)->cache;
1031 
1032 	__wt_spin_lock(session, &cache->las_sweep_lock);
1033 
1034 	/*
1035 	 * If no files have been dropped and the lookaside file is empty,
1036 	 * there's nothing to do.
1037 	 */
1038 	if (cache->las_dropped_next == 0 && __wt_las_empty(session))
1039 		WT_ERR(WT_NOTFOUND);
1040 
1041 	/*
1042 	 * Record the current page ID: sweep will stop after this point.
1043 	 *
1044 	 * Since the btree IDs we're scanning are closed, any eviction must
1045 	 * have already completed, so we won't miss anything with this
1046 	 * approach.
1047 	 *
1048 	 * Also, if a tree is reopened and there is lookaside activity before
1049 	 * this sweep completes, it will have a higher page ID and should not
1050 	 * be removed.
1051 	 */
1052 	cache->las_sweep_max_pageid = cache->las_pageid;
1053 
1054 	/* Scan the btree IDs to find min/max. */
1055 	cache->las_sweep_dropmin = UINT32_MAX;
1056 	cache->las_sweep_dropmax = 0;
1057 	for (i = 0; i < cache->las_dropped_next; i++) {
1058 		cache->las_sweep_dropmin =
1059 		    WT_MIN(cache->las_sweep_dropmin, cache->las_dropped[i]);
1060 		cache->las_sweep_dropmax =
1061 		    WT_MAX(cache->las_sweep_dropmax, cache->las_dropped[i]);
1062 	}
1063 
1064 	/* Initialize the bitmap. */
1065 	__wt_free(session, cache->las_sweep_dropmap);
1066 	WT_ERR(__bit_alloc(session,
1067 	    1 + cache->las_sweep_dropmax - cache->las_sweep_dropmin,
1068 	    &cache->las_sweep_dropmap));
1069 	for (i = 0; i < cache->las_dropped_next; i++)
1070 		__bit_set(cache->las_sweep_dropmap,
1071 		    cache->las_dropped[i] - cache->las_sweep_dropmin);
1072 
1073 	/* Clear the list of btree IDs. */
1074 	cache->las_dropped_next = 0;
1075 
1076 err:	__wt_spin_unlock(session, &cache->las_sweep_lock);
1077 	return (ret);
1078 }
1079 
/*
 * __wt_las_sweep --
 *	Sweep the lookaside table.
 *
 *	Walks a bounded number of lookaside records (see __las_sweep_count),
 *	removing records that belong to dropped trees and key blocks whose
 *	first record is a globally visible birthmark not written by an
 *	in-progress prepared transaction. All removals are wrapped in a single
 *	transaction. The position at which the walk stopped is saved in
 *	cache->las_sweep_key so the next call can resume from there; an empty
 *	sweep key means the next call starts a new sweep.
 *
 *	Returns zero on success, or an error code.
 */
int
__wt_las_sweep(WT_SESSION_IMPL *session)
{
	WT_CACHE *cache;
	WT_CURSOR *cursor;
	WT_DECL_ITEM(saved_key);
	WT_DECL_RET;
	WT_ITEM las_key, las_timestamp, las_value;
	WT_ITEM *sweep_key;
#ifdef HAVE_TIMESTAMPS
	wt_timestamp_t timestamp, *val_ts;
#else
	wt_timestamp_t *val_ts;
#endif
	uint64_t cnt, remove_cnt, las_pageid, saved_pageid, visit_cnt;
	uint64_t las_counter, las_txnid;
	uint32_t las_id, session_flags;
	uint8_t prepare_state, upd_type;
	int notused;
	bool local_txn, locked, removing_key_block;

	cache = S2C(session)->cache;
	cursor = NULL;
	sweep_key = &cache->las_sweep_key;
	remove_cnt = 0;
	session_flags = 0;		/* [-Werror=maybe-uninitialized] */
	local_txn = locked = removing_key_block = false;

	/*
	 * saved_key and saved_pageid identify the key block currently being
	 * examined, so we can tell when the walk crosses into a new one.
	 */
	WT_RET(__wt_scr_alloc(session, 0, &saved_key));
	saved_pageid = 0;

	/*
	 * Prevent other threads removing entries from underneath the sweep.
	 */
	__wt_writelock(session, &cache->las_sweepwalk_lock);
	locked = true;

	/*
	 * Allocate a cursor and wrap all the updates in a transaction.
	 * We should have our own lookaside cursor.
	 */
	__wt_las_cursor(session, &cursor, &session_flags);
	WT_ASSERT(session, cursor->session == &session->iface);
	WT_ERR(__wt_txn_begin(session, NULL));
	local_txn = true;

	/* Encourage a race */
	__wt_timing_stress(session, WT_TIMING_STRESS_LOOKASIDE_SWEEP);

	/*
	 * When continuing a sweep, position the cursor using the key from the
	 * last call (we don't care if we're before or after the key, either
	 * side is fine).
	 *
	 * Otherwise, we're starting a new sweep, gather the list of trees to
	 * sweep.
	 */
	if (sweep_key->size != 0) {
		__wt_cursor_set_raw_key(cursor, sweep_key);
		ret = cursor->search_near(cursor, &notused);

		/*
		 * Don't search for the same key twice; if we don't set a new
		 * key below, it's because we've reached the end of the table
		 * and we want the next pass to start at the beginning of the
		 * table. Searching for the same key could leave us stuck at
		 * the end of the table, repeatedly checking the same rows.
		 */
		__wt_buf_free(session, sweep_key);
	} else
		ret = __las_sweep_init(session);

	/*
	 * If positioning the cursor or initializing the sweep returned
	 * anything other than success (WT_NOTFOUND included), skip the walk
	 * and go straight to the exit handling.
	 */
	if (ret != 0)
		goto srch_notfound;

	/*
	 * cnt is the remaining budget of records to examine in this call,
	 * visit_cnt is the number of records seen so far.
	 */
	cnt = __las_sweep_count(cache);
	visit_cnt = 0;

	/* Walk the file. */
	while ((ret = cursor->next(cursor)) == 0) {
		WT_ERR(cursor->get_key(cursor,
		    &las_pageid, &las_id, &las_counter, &las_key));

		__wt_verbose(session,
		    WT_VERB_LOOKASIDE_ACTIVITY,
		    "Sweep reviewing lookaside entry with lookaside "
		    "page ID %" PRIu64 " btree ID %" PRIu32
		    " saved key size: %" WT_SIZET_FMT,
		    las_pageid, las_id, saved_key->size);

		/*
		 * Signal to stop if the cache is stuck: we are ignoring the
		 * cache size while scanning the lookaside table, so we're
		 * making things worse.
		 */
		if (__wt_cache_stuck(session))
			cnt = 0;

		/*
		 * Don't go past the end of lookaside from when sweep started.
		 * If a file is reopened, its ID may be reused past this point
		 * so the bitmap we're using is not valid.
		 *
		 * Clearing the sweep key means the next call starts a new
		 * sweep rather than resuming this one.
		 */
		if (las_pageid > cache->las_sweep_max_pageid) {
			__wt_buf_free(session, sweep_key);
			ret = WT_NOTFOUND;
			break;
		}

		/*
		 * We only want to break between key blocks. Stop if we've
		 * processed enough entries either all we wanted or enough
		 * and there is a reader waiting and we're on a key boundary.
		 */
		++visit_cnt;
		if (!removing_key_block && (cnt == 0 ||
		    (visit_cnt > WT_LAS_SWEEP_ENTRIES && cache->las_reader)))
			break;
		/* Consume budget, without letting it wrap below zero. */
		if (cnt > 0)
			--cnt;

		/*
		 * If the entry belongs to a dropped tree, discard it.
		 *
		 * Cursor opened overwrite=true: won't return WT_NOTFOUND
		 * should another thread remove the record before we do (not
		 * expected for dropped trees), and the cursor remains
		 * positioned in that case.
		 */
		if (las_id >= cache->las_sweep_dropmin &&
		    las_id <= cache->las_sweep_dropmax &&
		    __bit_test(cache->las_sweep_dropmap,
		    las_id - cache->las_sweep_dropmin)) {
			WT_ERR(cursor->remove(cursor));
			++remove_cnt;
			/*
			 * Forget the saved key so the next record is treated
			 * as the start of a new key block.
			 */
			saved_key->size = 0;
			/*
			 * Allow sweep to break while removing entries from a
			 * dead file.
			 */
			removing_key_block = false;
			continue;
		}

		/*
		 * Remove all entries for a key once they have aged out and are
		 * no longer needed.
		 */
		WT_ERR(cursor->get_value(cursor, &las_txnid,
		    &las_timestamp, &prepare_state, &upd_type, &las_value));
#ifdef HAVE_TIMESTAMPS
		WT_ASSERT(session, las_timestamp.size == WT_TIMESTAMP_SIZE);
		memcpy(&timestamp, las_timestamp.data, las_timestamp.size);
		val_ts = &timestamp;
#else
		val_ts = NULL;
#endif

		/*
		 * Check to see if the page or key has changed this iteration,
		 * and if they have, setup context for safely removing obsolete
		 * updates.
		 *
		 * It's important to check for page boundaries explicitly
		 * because it is possible for the same key to be at the start
		 * of the next block. See WT-3982 for details.
		 */
		if (las_pageid != saved_pageid ||
		    saved_key->size != las_key.size ||
		    memcmp(saved_key->data, las_key.data, las_key.size) != 0) {
			/* If we've examined enough entries, give up. */
			if (cnt == 0)
				break;

			saved_pageid = las_pageid;
			WT_ERR(__wt_buf_set(
			    session, saved_key, las_key.data, las_key.size));

			/*
			 * There are several conditions that need to be met
			 * before we choose to remove a key block:
			 *  * The entries were written with skew newest.
			 *    Indicated by the first entry being a birthmark.
			 *  * The first entry is globally visible.
			 *  * The entry wasn't from a prepared transaction.
			 */
			if (upd_type == WT_UPDATE_BIRTHMARK &&
			    __wt_txn_visible_all(session, las_txnid, val_ts) &&
			    prepare_state != WT_PREPARE_INPROGRESS)
				removing_key_block = true;
			else
				removing_key_block = false;
		}

		/* Records in a block we aren't removing are left alone. */
		if (!removing_key_block)
			continue;

		__wt_verbose(session,
		    WT_VERB_LOOKASIDE_ACTIVITY,
		    "Sweep removing lookaside entry with "
		    "page ID: %" PRIu64 " btree ID: %" PRIu32
		    " saved key size: %" WT_SIZET_FMT ", record type: %" PRIu8
		    " transaction ID: %" PRIu64,
		    las_pageid, las_id, saved_key->size, upd_type, las_txnid);
		WT_ERR(cursor->remove(cursor));
		++remove_cnt;
	}

	/*
	 * If the loop terminates after completing a work unit (that is, we
	 * broke out of the walk with ret still zero), we will continue the
	 * table sweep next time. Get a local copy of the sweep key so it
	 * doesn't reference memory owned by the cursor: the cursor is closed
	 * below.
	 *
	 * NOTE(review): the original comment here said to take the copy
	 * "before calling cursor.remove", because cursor.remove can discard
	 * our hazard pointer and the page could be evicted from underneath
	 * us; no remove call follows this point in the current code, so that
	 * wording looks stale -- confirm.
	 */
	if (ret == 0) {
		WT_ERR(__wt_cursor_get_raw_key(cursor, sweep_key));
		if (!WT_DATA_IN_ITEM(sweep_key))
			WT_ERR(__wt_buf_set(session, sweep_key,
			    sweep_key->data, sweep_key->size));
	}

srch_notfound:
	/*
	 * Presumably treats WT_NOTFOUND (end of table, or nothing to sweep)
	 * as success and jumps to err for any other failure -- confirm
	 * against the macro definition.
	 */
	WT_ERR_NOTFOUND_OK(ret);

	/*
	 * The block below is only entered through the err label: on error,
	 * discard the saved position so the next call starts a new sweep.
	 */
	if (0) {
err:		__wt_buf_free(session, sweep_key);
	}
	/*
	 * Commit the removals on success, otherwise roll them back. The
	 * remove count is only published if the transaction resolved without
	 * error.
	 */
	if (local_txn) {
		if (ret == 0)
			ret = __wt_txn_commit(session, NULL);
		else
			WT_TRET(__wt_txn_rollback(session, NULL));
		if (ret == 0)
			(void)__wt_atomic_add64(
			    &cache->las_remove_count, remove_cnt);
	}

	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));

	if (locked)
		__wt_writeunlock(session, &cache->las_sweepwalk_lock);

	__wt_scr_free(session, &saved_key);

	return (ret);
}
1330