/*-
 * Copyright (c) 2014-2018 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

/*
 * __wt_ref_is_root --
 *    Return if the page reference is for the root page.
 */
static inline bool
__wt_ref_is_root(WT_REF *ref)
{
    return (ref->home == NULL);
}

/*
 * __wt_page_is_empty --
 *    Return if the page is empty.
 */
static inline bool
__wt_page_is_empty(WT_PAGE *page)
{
    return (page->modify != NULL &&
        page->modify->rec_result == WT_PM_REC_EMPTY);
}

/*
 * __wt_page_evict_clean --
 *    Return if the page can be evicted without dirtying the tree.
 */
static inline bool
__wt_page_evict_clean(WT_PAGE *page)
{
    return (page->modify == NULL ||
        (page->modify->page_state == WT_PAGE_CLEAN &&
        page->modify->rec_result == 0));
}

/*
 * __wt_page_is_modified --
 *    Return if the page is dirty.
 */
static inline bool
__wt_page_is_modified(WT_PAGE *page)
{
    return (page->modify != NULL &&
        page->modify->page_state != WT_PAGE_CLEAN);
}

/*
 * __wt_btree_block_free --
 *    Helper function to free a block from the current tree.
 */
static inline int
__wt_btree_block_free(
    WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
{
    WT_BM *bm;
    WT_BTREE *btree;

    btree = S2BT(session);
    bm = btree->bm;

    return (bm->free(bm, session, addr, addr_size));
}

/*
 * __wt_btree_bytes_inuse --
 *    Return the number of bytes in use.
 */
static inline uint64_t
__wt_btree_bytes_inuse(WT_SESSION_IMPL *session)
{
    WT_BTREE *btree;
    WT_CACHE *cache;

    btree = S2BT(session);
    cache = S2C(session)->cache;

    return (__wt_cache_bytes_plus_overhead(cache, btree->bytes_inmem));
}

/*
 * __wt_btree_bytes_evictable --
 *    Return the number of bytes that can be evicted (i.e., bytes apart from
 *    the pinned root page).
 */
static inline uint64_t
__wt_btree_bytes_evictable(WT_SESSION_IMPL *session)
{
    WT_BTREE *btree;
    WT_CACHE *cache;
    WT_PAGE *root_page;
    uint64_t bytes_inmem, bytes_root;

    btree = S2BT(session);
    cache = S2C(session)->cache;
    root_page = btree->root.page;

    bytes_inmem = btree->bytes_inmem;
    bytes_root = root_page == NULL ? 0 : root_page->memory_footprint;

    return (bytes_inmem <= bytes_root ? 0 :
        __wt_cache_bytes_plus_overhead(cache, bytes_inmem - bytes_root));
}

/*
 * __wt_btree_dirty_inuse --
 *    Return the number of dirty bytes in use.
 */
static inline uint64_t
__wt_btree_dirty_inuse(WT_SESSION_IMPL *session)
{
    WT_BTREE *btree;
    WT_CACHE *cache;

    btree = S2BT(session);
    cache = S2C(session)->cache;

    return (__wt_cache_bytes_plus_overhead(cache,
        btree->bytes_dirty_intl + btree->bytes_dirty_leaf));
}

/*
 * __wt_btree_dirty_leaf_inuse --
 *    Return the number of bytes in use by dirty leaf pages.
 */
static inline uint64_t
__wt_btree_dirty_leaf_inuse(WT_SESSION_IMPL *session)
{
    WT_BTREE *btree;
    WT_CACHE *cache;

    btree = S2BT(session);
    cache = S2C(session)->cache;

    return (__wt_cache_bytes_plus_overhead(cache, btree->bytes_dirty_leaf));
}

/*
 * __wt_cache_page_inmem_incr --
 *    Increment a page's memory footprint in the cache.
 */
static inline void
__wt_cache_page_inmem_incr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
{
    WT_BTREE *btree;
    WT_CACHE *cache;

    WT_ASSERT(session, size < WT_EXABYTE);
    btree = S2BT(session);
    cache = S2C(session)->cache;

    (void)__wt_atomic_add64(&btree->bytes_inmem, size);
    (void)__wt_atomic_add64(&cache->bytes_inmem, size);
    (void)__wt_atomic_addsize(&page->memory_footprint, size);
    if (__wt_page_is_modified(page)) {
        (void)__wt_atomic_addsize(&page->modify->bytes_dirty, size);
        if (WT_PAGE_IS_INTERNAL(page)) {
            (void)__wt_atomic_add64(&btree->bytes_dirty_intl, size);
            (void)__wt_atomic_add64(&cache->bytes_dirty_intl, size);
        } else if (!btree->lsm_primary) {
            (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size);
            (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size);
        }
    }
    /* Track internal size in cache. */
    if (WT_PAGE_IS_INTERNAL(page))
        (void)__wt_atomic_add64(&cache->bytes_internal, size);
}

/*
 * __wt_cache_decr_check_size --
 *    Decrement a size_t cache value and check for underflow.
 */
static inline void
__wt_cache_decr_check_size(
    WT_SESSION_IMPL *session, size_t *vp, size_t v, const char *fld)
{
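    /*
     * Underflow detection relies on unsigned wrap-around: if the
     * decrement is larger than the stored value, the subtraction wraps
     * to a huge result, far above the WT_EXABYTE sanity bound (for
     * example, 10 - 20 wraps to 2^64 - 10 with an 8B size_t).
     */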
    if (v == 0 || __wt_atomic_subsize(vp, v) < WT_EXABYTE)
        return;

    /*
     * It's a bug if this accounting underflowed, but allow the application
     * to proceed; the consequence is we use more cache than configured.
     */
    *vp = 0;
    __wt_errx(session,
        "%s went negative with decrement of %" WT_SIZET_FMT, fld, v);

#ifdef HAVE_DIAGNOSTIC
    __wt_abort(session);
#endif
}

/*
 * __wt_cache_decr_check_uint64 --
 *    Decrement a uint64_t cache value and check for underflow.
 */
static inline void
__wt_cache_decr_check_uint64(
    WT_SESSION_IMPL *session, uint64_t *vp, uint64_t v, const char *fld)
{
    uint64_t orig = *vp;
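    /*
     * The original value is read non-atomically and used only in the
     * diagnostic message below; it may already be stale by the time the
     * atomic subtract executes.
     */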

    if (v == 0 || __wt_atomic_sub64(vp, v) < WT_EXABYTE)
        return;

    /*
     * It's a bug if this accounting underflowed, but allow the application
     * to proceed; the consequence is we use more cache than configured.
     */
    *vp = 0;
    __wt_errx(session,
        "%s was %" PRIu64 ", went negative with decrement of %" PRIu64, fld,
        orig, v);

#ifdef HAVE_DIAGNOSTIC
    __wt_abort(session);
#endif
}

/*
 * __wt_cache_page_byte_dirty_decr --
 *    Decrement the page's dirty byte count, guarding from underflow.
 */
static inline void
__wt_cache_page_byte_dirty_decr(
    WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
{
    WT_BTREE *btree;
    WT_CACHE *cache;
    size_t decr, orig;
    int i;

    btree = S2BT(session);
    cache = S2C(session)->cache;
    decr = 0;        /* [-Wconditional-uninitialized] */

    /*
     * We don't have exclusive access and there are ways of decrementing the
     * page's dirty byte count by a too-large value. For example:
     *    T1: __wt_cache_page_inmem_incr(page, size)
     *        page is clean, don't increment dirty byte count
     *    T2: mark page dirty
     *    T1: __wt_cache_page_inmem_decr(page, size)
     *        page is dirty, decrement dirty byte count
     * and, of course, the reverse where the page is dirty at the increment
     * and clean at the decrement.
     *
     * The page's dirty-byte value always reflects bytes represented in the
     * cache's dirty-byte count; decrement the page/cache as much as we can
     * without underflow. If we can't decrement the dirty byte counts after
     * a few tries, give up: the cache's value will be wrong, but consistent,
     * and we'll fix it the next time this page is marked clean, or evicted.
     */
    for (i = 0; i < 5; ++i) {
        /*
         * Take care to read the dirty-byte count only once in case
         * we're racing with updates.
         */
        WT_ORDERED_READ(orig, page->modify->bytes_dirty);
        decr = WT_MIN(size, orig);
        if (__wt_atomic_cassize(
            &page->modify->bytes_dirty, orig, orig - decr))
            break;
    }

    if (i == 5)
        return;

    if (WT_PAGE_IS_INTERNAL(page)) {
        __wt_cache_decr_check_uint64(session, &btree->bytes_dirty_intl,
            decr, "WT_BTREE.bytes_dirty_intl");
        __wt_cache_decr_check_uint64(session, &cache->bytes_dirty_intl,
            decr, "WT_CACHE.bytes_dirty_intl");
    } else if (!btree->lsm_primary) {
        __wt_cache_decr_check_uint64(session, &btree->bytes_dirty_leaf,
            decr, "WT_BTREE.bytes_dirty_leaf");
        __wt_cache_decr_check_uint64(session, &cache->bytes_dirty_leaf,
            decr, "WT_CACHE.bytes_dirty_leaf");
    }
}

/*
 * __wt_cache_page_inmem_decr --
 *    Decrement a page's memory footprint in the cache.
 */
static inline void
__wt_cache_page_inmem_decr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
{
    WT_CACHE *cache;

    cache = S2C(session)->cache;

    WT_ASSERT(session, size < WT_EXABYTE);

    __wt_cache_decr_check_uint64(
        session, &S2BT(session)->bytes_inmem, size, "WT_BTREE.bytes_inmem");
    __wt_cache_decr_check_uint64(
        session, &cache->bytes_inmem, size, "WT_CACHE.bytes_inmem");
    __wt_cache_decr_check_size(
        session, &page->memory_footprint, size, "WT_PAGE.memory_footprint");
    if (__wt_page_is_modified(page))
        __wt_cache_page_byte_dirty_decr(session, page, size);
    /* Track internal size in cache. */
    if (WT_PAGE_IS_INTERNAL(page))
        __wt_cache_decr_check_uint64(session,
            &cache->bytes_internal, size, "WT_CACHE.bytes_internal");
}

/*
 * __wt_cache_dirty_incr --
 *    Page switch from clean to dirty: increment the cache dirty page/byte
 *    counts.
 */
static inline void
__wt_cache_dirty_incr(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_BTREE *btree;
    WT_CACHE *cache;
    size_t size;

    btree = S2BT(session);
    cache = S2C(session)->cache;

    /*
     * Take care to read the memory_footprint once in case we are racing
     * with updates.
     */
    size = page->memory_footprint;
    if (WT_PAGE_IS_INTERNAL(page)) {
        (void)__wt_atomic_add64(&btree->bytes_dirty_intl, size);
        (void)__wt_atomic_add64(&cache->bytes_dirty_intl, size);
        (void)__wt_atomic_add64(&cache->pages_dirty_intl, 1);
    } else {
        if (!btree->lsm_primary) {
            (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size);
            (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size);
        }
        (void)__wt_atomic_add64(&cache->pages_dirty_leaf, 1);
    }
    (void)__wt_atomic_addsize(&page->modify->bytes_dirty, size);
}

/*
 * __wt_cache_dirty_decr --
 *    Page switch from dirty to clean: decrement the cache dirty page/byte
 *    counts.
 */
static inline void
__wt_cache_dirty_decr(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_CACHE *cache;
    WT_PAGE_MODIFY *modify;

    cache = S2C(session)->cache;

    if (WT_PAGE_IS_INTERNAL(page))
        __wt_cache_decr_check_uint64(session,
            &cache->pages_dirty_intl, 1, "dirty internal page count");
    else
        __wt_cache_decr_check_uint64(session,
            &cache->pages_dirty_leaf, 1, "dirty leaf page count");

    modify = page->modify;
    if (modify != NULL && modify->bytes_dirty != 0)
        __wt_cache_page_byte_dirty_decr(
            session, page, modify->bytes_dirty);
}

/*
 * __wt_cache_page_image_decr --
 *    Remove a page image's size from the cache's image byte count.
 */
static inline void
__wt_cache_page_image_decr(WT_SESSION_IMPL *session, uint32_t size)
{
    WT_CACHE *cache;

    cache = S2C(session)->cache;

    __wt_cache_decr_check_uint64(
        session, &cache->bytes_image, size, "WT_CACHE.image_inmem");
}

/*
 * __wt_cache_page_image_incr --
 *    Add a page image's size to the cache's image byte count.
 */
static inline void
__wt_cache_page_image_incr(WT_SESSION_IMPL *session, uint32_t size)
{
    WT_CACHE *cache;

    cache = S2C(session)->cache;
    (void)__wt_atomic_add64(&cache->bytes_image, size);
}

/*
 * __wt_cache_page_evict --
 *    Update the cache's accounting for a page being evicted.
 */
static inline void
__wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_BTREE *btree;
    WT_CACHE *cache;
    WT_PAGE_MODIFY *modify;

    btree = S2BT(session);
    cache = S2C(session)->cache;
    modify = page->modify;

    /* Update the bytes in-memory to reflect the eviction. */
    __wt_cache_decr_check_uint64(session, &btree->bytes_inmem,
        page->memory_footprint, "WT_BTREE.bytes_inmem");
    __wt_cache_decr_check_uint64(session, &cache->bytes_inmem,
        page->memory_footprint, "WT_CACHE.bytes_inmem");

    /* Update the bytes_internal value to reflect the eviction. */
    if (WT_PAGE_IS_INTERNAL(page))
        __wt_cache_decr_check_uint64(session,
            &cache->bytes_internal,
            page->memory_footprint, "WT_CACHE.bytes_internal");

    /* Update the cache's dirty-byte count. */
    if (modify != NULL && modify->bytes_dirty != 0) {
        if (WT_PAGE_IS_INTERNAL(page)) {
            __wt_cache_decr_check_uint64(session,
                &btree->bytes_dirty_intl,
                modify->bytes_dirty, "WT_BTREE.bytes_dirty_intl");
            __wt_cache_decr_check_uint64(session,
                &cache->bytes_dirty_intl,
                modify->bytes_dirty, "WT_CACHE.bytes_dirty_intl");
        } else if (!btree->lsm_primary) {
            __wt_cache_decr_check_uint64(session,
                &btree->bytes_dirty_leaf,
                modify->bytes_dirty, "WT_BTREE.bytes_dirty_leaf");
            __wt_cache_decr_check_uint64(session,
                &cache->bytes_dirty_leaf,
                modify->bytes_dirty, "WT_CACHE.bytes_dirty_leaf");
        }
    }

    /* Update bytes and pages evicted. */
    (void)__wt_atomic_add64(&cache->bytes_evict, page->memory_footprint);
    (void)__wt_atomic_addv64(&cache->pages_evicted, 1);

    /*
     * Track if eviction makes progress. This is used in various places to
     * determine whether eviction is stuck.
     */
    if (!F_ISSET_ATOMIC(page, WT_PAGE_EVICT_NO_PROGRESS))
        (void)__wt_atomic_addv64(&cache->eviction_progress, 1);
}

/*
 * __wt_update_list_memsize --
 *    The size in memory of a list of updates.
 */
static inline size_t
__wt_update_list_memsize(WT_UPDATE *upd)
{
    size_t upd_size;

    for (upd_size = 0; upd != NULL; upd = upd->next)
        upd_size += WT_UPDATE_MEMSIZE(upd);

    return (upd_size);
}

/*
 * __wt_page_modify_init --
 *    A page is about to be modified; allocate the modification structure.
 */
static inline int
__wt_page_modify_init(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    return (page->modify == NULL ?
        __wt_page_modify_alloc(session, page) : 0);
}

/*
 * __wt_page_only_modify_set --
 *    Mark the page (but only the page) dirty.
 */
static inline void
__wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    uint64_t last_running;

    WT_ASSERT(session, !F_ISSET(session->dhandle, WT_DHANDLE_DEAD));

    last_running = 0;
    if (page->modify->page_state == WT_PAGE_CLEAN)
        last_running = S2C(session)->txn_global.last_running;

    /*
     * We depend on the atomic operation being a write barrier, that is, a
     * barrier to ensure all changes to the page are flushed before updating
     * the page state and/or marking the tree dirty, otherwise checkpoints
     * and/or page reconciliation might be looking at a clean page/tree.
     *
     * Every time the page transitions from clean to dirty, update the cache
     * and transactional information.
     *
     * The page state can only ever be incremented above dirty by the number
     * of concurrently running threads, so the counter will never approach
     * the point where it would wrap.
     */
    if (page->modify->page_state < WT_PAGE_DIRTY &&
        __wt_atomic_add32(&page->modify->page_state, 1) ==
        WT_PAGE_DIRTY_FIRST) {
        __wt_cache_dirty_incr(session, page);
        /*
         * In the event we dirty a page which is flagged for eviction
         * soon, we update its read generation to avoid evicting a
         * dirty page prematurely.
         */
        if (page->read_gen == WT_READGEN_WONT_NEED)
            __wt_cache_read_gen_new(session, page);

        /*
         * We won the race to dirty the page, but another thread could
         * have committed in the meantime, and the last_running field
         * have been updated past it. That is all very unlikely, but not
         * impossible, so we take care to read the global state before
         * the atomic increment.
         *
         * If the page was dirty on entry, then last_running == 0. The
         * page could have become clean since then, if reconciliation
         * completed. In that case, we leave the previous value for
         * first_dirty_txn rather than potentially racing to update it;
         * at worst, we'll unnecessarily write a page in a checkpoint.
         */
        if (last_running != 0)
            page->modify->first_dirty_txn = last_running;
    }

    /* Check if this is the largest transaction ID to update the page. */
    if (WT_TXNID_LT(page->modify->update_txn, session->txn.id))
        page->modify->update_txn = session->txn.id;
}

/*
 * __wt_tree_modify_set --
 *    Mark the tree dirty.
 */
static inline void
__wt_tree_modify_set(WT_SESSION_IMPL *session)
{
    /*
     * Test before setting the dirty flag, it's a hot cache line.
     *
     * The tree's modified flag is cleared by the checkpoint thread: set it
     * and insert a barrier before dirtying the page. (I don't think it's
     * a problem if the tree is marked dirty with all the pages clean, it
     * might result in an extra checkpoint that doesn't do any work but it
     * shouldn't cause problems; regardless, let's play it safe.)
     */
    if (!S2BT(session)->modified) {
        /* Assert we never dirty a checkpoint handle. */
        WT_ASSERT(session, session->dhandle->checkpoint == NULL);

        S2BT(session)->modified = true;
        WT_FULL_BARRIER();
    }

    /*
     * The btree may already be marked dirty while the connection is still
     * clean; mark the connection dirty outside the test of the btree state.
     */
    if (!S2C(session)->modified)
        S2C(session)->modified = true;
}

/*
 * __wt_page_modify_clear --
 *    Clean a modified page.
 */
static inline void
__wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    /*
     * The page must be held exclusive when this call is made, this call
     * can only be used when the page is owned by a single thread.
     *
     * Allow the call to be made on clean pages.
     */
    if (__wt_page_is_modified(page)) {
        /*
         * The only part where ordering matters is during
         * reconciliation where updates on other threads are performing
         * writes to the page state that need to be visible to the
         * reconciliation thread.
         *
         * Since clearing of the page state is not going to be happening
         * during reconciliation on a separate thread, there's no write
         * barrier needed here.
         */
        page->modify->page_state = WT_PAGE_CLEAN;
        __wt_cache_dirty_decr(session, page);
    }
}

/*
 * __wt_page_modify_set --
 *    Mark the page and tree dirty.
 */
static inline void
__wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    /*
     * Mark the tree dirty (even if the page is already marked dirty):
     * newly created pages to support "empty" files are dirty, but the file
     * isn't marked dirty until there's a real change needing to be written.
     */
    __wt_tree_modify_set(session);

    __wt_page_only_modify_set(session, page);
}

/*
 * __wt_page_parent_modify_set --
 *    Mark the parent page, and optionally the tree, dirty.
 */
static inline int
__wt_page_parent_modify_set(
    WT_SESSION_IMPL *session, WT_REF *ref, bool page_only)
{
    WT_PAGE *parent;

    /*
     * This function exists as a place to stash this comment. There are a
     * few places where we need to dirty a page's parent. The trick is the
     * page's parent might split at any point, and the page parent might be
     * the wrong parent at any particular time. We ignore this and dirty
     * whatever page the page's reference structure points to. This is safe
     * because if we're pointing to the wrong parent, that parent must have
     * split, deepening the tree, which implies marking the original parent
     * and all of the newly-created children as dirty. In other words, if
     * we have the wrong parent page, everything was marked dirty already.
     */
    parent = ref->home;
    WT_RET(__wt_page_modify_init(session, parent));
    if (page_only)
        __wt_page_only_modify_set(session, parent);
    else
        __wt_page_modify_set(session, parent);
    return (0);
}

/*
 * __wt_off_page --
 *    Return if a pointer references off-page data.
 */
static inline bool
__wt_off_page(WT_PAGE *page, const void *p)
{
    /*
     * There may be no underlying page, in which case the reference is
     * off-page by definition.
     */
    return (page->dsk == NULL ||
        p < (void *)page->dsk ||
        p >= (void *)((uint8_t *)page->dsk + page->dsk->mem_size));
}

/*
 * __wt_ref_addr_free --
 *    Free the address in a reference, if necessary.
 */
static inline void
__wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref)
{
    if (ref->addr == NULL)
        return;

    if (ref->home == NULL || __wt_off_page(ref->home, ref->addr)) {
        __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
        __wt_free(session, ref->addr);
    }
    ref->addr = NULL;
}

/*
 * __wt_ref_key --
 *    Return a reference to a row-store internal page key as cheaply as
 *    possible.
 */
static inline void
__wt_ref_key(WT_PAGE *page, WT_REF *ref, void *keyp, size_t *sizep)
{
    uintptr_t v;

    /*
     * An internal page key is in one of two places: if we instantiated the
     * key (for example, when reading the page), WT_REF.ref_ikey references
     * a WT_IKEY structure, otherwise WT_REF.ref_ikey references an on-page
     * key offset/length pair.
     *
     * Now the magic: allocated memory must be aligned to store any standard
     * type, and we expect some standard type to require at least quad-byte
     * alignment, so allocated memory should have some clear low-order bits.
     * On-page objects consist of an offset/length pair: the maximum page
     * size currently fits into 29 bits, so we use the low-order bits of the
     * pointer to mark the other bits of the pointer as encoding the key's
     * location and length. This breaks if allocated memory isn't aligned,
     * of course.
     *
     * In this specific case, we use bit 0x01 to mark an on-page key, else
     * it's a WT_IKEY reference. The bit pattern for internal row-store
     * on-page keys is:
     *    32 bits    key length
     *    31 bits    page offset of the key's bytes,
     *     1 bit     flags
     */
#define WT_IK_FLAG                  0x01
#define WT_IK_ENCODE_KEY_LEN(v)     ((uintptr_t)(v) << 32)
#define WT_IK_DECODE_KEY_LEN(v)     ((v) >> 32)
#define WT_IK_ENCODE_KEY_OFFSET(v)  ((uintptr_t)(v) << 1)
#define WT_IK_DECODE_KEY_OFFSET(v)  (((v) & 0xFFFFFFFF) >> 1)
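    /*
     * A worked example (hypothetical values, not from the source): a
     * 7-byte key whose bytes begin at page offset 0x40 encodes as
     *    ((uintptr_t)7 << 32) | (0x40 << 1) | WT_IK_FLAG == 0x700000081
     * and decoding reverses it: 0x700000081 >> 32 == 7 for the length,
     * (0x700000081 & 0xFFFFFFFF) >> 1 == 0x40 for the offset.
     */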
    v = (uintptr_t)ref->ref_ikey;
    if (v & WT_IK_FLAG) {
        *(void **)keyp =
            WT_PAGE_REF_OFFSET(page, WT_IK_DECODE_KEY_OFFSET(v));
        *sizep = WT_IK_DECODE_KEY_LEN(v);
    } else {
        *(void **)keyp = WT_IKEY_DATA(ref->ref_ikey);
        *sizep = ((WT_IKEY *)ref->ref_ikey)->size;
    }
}

/*
 * __wt_ref_key_onpage_set --
 *    Set a WT_REF to reference an on-page key.
 */
static inline void
__wt_ref_key_onpage_set(WT_PAGE *page, WT_REF *ref, WT_CELL_UNPACK *unpack)
{
    uintptr_t v;

    /*
     * See the comment in __wt_ref_key for an explanation of the magic.
     */
    v = WT_IK_ENCODE_KEY_LEN(unpack->size) |
        WT_IK_ENCODE_KEY_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->data)) |
        WT_IK_FLAG;
    ref->ref_ikey = (void *)v;
}

/*
 * __wt_ref_key_instantiated --
 *    Return if a WT_REF key is instantiated.
 */
static inline WT_IKEY *
__wt_ref_key_instantiated(WT_REF *ref)
{
    uintptr_t v;

    /*
     * See the comment in __wt_ref_key for an explanation of the magic.
     */
    v = (uintptr_t)ref->ref_ikey;
    return (v & WT_IK_FLAG ? NULL : ref->ref_ikey);
}

/*
 * __wt_ref_key_clear --
 *    Clear a WT_REF key.
 */
static inline void
__wt_ref_key_clear(WT_REF *ref)
{
    /*
     * The key union has two 8B fields; this is equivalent to:
     *
     *    ref->ref_recno = WT_RECNO_OOB;
     *    ref->ref_ikey = NULL;
     */
    ref->ref_recno = 0;
}

/*
 * __wt_row_leaf_key_info --
 *    Return a row-store leaf page key referenced by a WT_ROW if it can be
 *    had without unpacking a cell, and information about the cell, if the
 *    key isn't cheaply available.
 */
static inline bool
__wt_row_leaf_key_info(WT_PAGE *page, void *copy,
    WT_IKEY **ikeyp, WT_CELL **cellp, void *datap, size_t *sizep)
{
    WT_IKEY *ikey;
    uintptr_t v;

    v = (uintptr_t)copy;

    /*
     * A row-store leaf page key is in one of two places: if instantiated,
     * the WT_ROW pointer references a WT_IKEY structure, otherwise, it
     * references an on-page offset. Further, on-page keys are in one of
     * two states: if the key is a simple key (not an overflow key, prefix
     * compressed or Huffman encoded, all of which are likely), the key's
     * offset/size is encoded in the pointer. Otherwise, the offset is to
     * the key's on-page cell.
     *
     * Now the magic: allocated memory must be aligned to store any standard
     * type, and we expect some standard type to require at least quad-byte
     * alignment, so allocated memory should have some clear low-order bits.
     * On-page objects consist of an offset/length pair: the maximum page
     * size currently fits into 29 bits, so we use the low-order bits of the
     * pointer to mark the other bits of the pointer as encoding the key's
     * location and length. This breaks if allocated memory isn't aligned,
     * of course.
     *
     * In this specific case, we use bit 0x01 to mark an on-page cell, bit
     * 0x02 to mark an on-page key, 0x03 to mark an on-page key/value pair,
     * otherwise it's a WT_IKEY reference. The bit pattern for on-page cells
     * is:
     *    29 bits    page offset of the key's cell,
     *     2 bits    flags
     *
     * The bit pattern for on-page keys is:
     *    32 bits    key length,
     *    29 bits    page offset of the key's bytes,
     *     2 bits    flags
     *
     * But, while that allows us to skip decoding simple key cells, we also
     * want to skip decoding the value cell in the case where the value cell
     * is also simple/short. We use bit 0x03 to mark an encoded on-page key
     * and value pair. The bit pattern for on-page key/value pairs is:
     *     9 bits    key length,
     *    13 bits    value length,
     *    20 bits    page offset of the key's bytes,
     *    20 bits    page offset of the value's bytes,
     *     2 bits    flags
     *
     * These bit patterns are in-memory only, of course, so can be modified
     * (we could even tune for specific workloads). Generally, the fields
     * are larger than the anticipated values being stored (512B keys, 8KB
     * values, 1MB pages), hopefully that won't be necessary.
     *
     * This function returns a list of things about the key (instantiation
     * reference, cell reference and key/length pair). Our callers know
     * the order in which we look things up and the information returned;
     * for example, the cell will never be returned if we are working with
     * an on-page key.
     */
#define WT_CELL_FLAG                    0x01
#define WT_CELL_ENCODE_OFFSET(v)        ((uintptr_t)(v) << 2)
#define WT_CELL_DECODE_OFFSET(v)        (((v) & 0xFFFFFFFF) >> 2)

#define WT_K_FLAG                       0x02
#define WT_K_ENCODE_KEY_LEN(v)          ((uintptr_t)(v) << 32)
#define WT_K_DECODE_KEY_LEN(v)          ((v) >> 32)
#define WT_K_ENCODE_KEY_OFFSET(v)       ((uintptr_t)(v) << 2)
#define WT_K_DECODE_KEY_OFFSET(v)       (((v) & 0xFFFFFFFF) >> 2)

#define WT_KV_FLAG                      0x03
#define WT_KV_ENCODE_KEY_LEN(v)         ((uintptr_t)(v) << 55)
#define WT_KV_DECODE_KEY_LEN(v)         ((v) >> 55)
#define WT_KV_MAX_KEY_LEN               (0x200 - 1)
#define WT_KV_ENCODE_VALUE_LEN(v)       ((uintptr_t)(v) << 42)
#define WT_KV_DECODE_VALUE_LEN(v)       (((v) & 0x007FFC0000000000) >> 42)
#define WT_KV_MAX_VALUE_LEN             (0x2000 - 1)
#define WT_KV_ENCODE_KEY_OFFSET(v)      ((uintptr_t)(v) << 22)
#define WT_KV_DECODE_KEY_OFFSET(v)      (((v) & 0x000003FFFFC00000) >> 22)
#define WT_KV_MAX_KEY_OFFSET            (0x100000 - 1)
#define WT_KV_ENCODE_VALUE_OFFSET(v)    ((uintptr_t)(v) << 2)
#define WT_KV_DECODE_VALUE_OFFSET(v)    (((v) & 0x00000000003FFFFC) >> 2)
#define WT_KV_MAX_VALUE_OFFSET          (0x100000 - 1)
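    /*
     * A worked example of the key/value encoding (hypothetical values, not
     * from the source): a 5-byte key at page offset 0x100 with a 20-byte
     * value at page offset 0x110 encodes as
     *    (5 << 55) | (20 << 42) | (0x100 << 22) | (0x110 << 2) | WT_KV_FLAG
     * which is 0x0280500040000443, and the decode macros mask and shift
     * each field back out, for example WT_KV_DECODE_VALUE_OFFSET yields
     * (0x443 & 0x3FFFFC) >> 2 == 0x110.
     */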
    switch (v & 0x03) {
    case WT_CELL_FLAG:
        /* On-page cell: no instantiated key. */
        if (ikeyp != NULL)
            *ikeyp = NULL;
        if (cellp != NULL)
            *cellp =
                WT_PAGE_REF_OFFSET(page, WT_CELL_DECODE_OFFSET(v));
        return (false);
    case WT_K_FLAG:
        /* Encoded key: no instantiated key, no cell. */
        if (cellp != NULL)
            *cellp = NULL;
        if (ikeyp != NULL)
            *ikeyp = NULL;
        if (datap != NULL) {
            *(void **)datap =
                WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_OFFSET(v));
            *sizep = WT_K_DECODE_KEY_LEN(v);
            return (true);
        }
        return (false);
    case WT_KV_FLAG:
        /* Encoded key/value pair: no instantiated key, no cell. */
        if (cellp != NULL)
            *cellp = NULL;
        if (ikeyp != NULL)
            *ikeyp = NULL;
        if (datap != NULL) {
            *(void **)datap = WT_PAGE_REF_OFFSET(
                page, WT_KV_DECODE_KEY_OFFSET(v));
            *sizep = WT_KV_DECODE_KEY_LEN(v);
            return (true);
        }
        return (false);
    }

    /* Instantiated key. */
    ikey = copy;
    if (ikeyp != NULL)
        *ikeyp = copy;
    if (cellp != NULL)
        *cellp = WT_PAGE_REF_OFFSET(page, ikey->cell_offset);
    if (datap != NULL) {
        *(void **)datap = WT_IKEY_DATA(ikey);
        *sizep = ikey->size;
        return (true);
    }
    return (false);
}

/*
 * __wt_row_leaf_key_set_cell --
 *    Set a WT_ROW to reference an on-page row-store leaf cell.
 */
static inline void
__wt_row_leaf_key_set_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL *cell)
{
    uintptr_t v;

    /*
     * See the comment in __wt_row_leaf_key_info for an explanation of the
     * magic.
     */
    v = WT_CELL_ENCODE_OFFSET(WT_PAGE_DISK_OFFSET(page, cell)) |
        WT_CELL_FLAG;
    WT_ROW_KEY_SET(rip, v);
}

/*
 * __wt_row_leaf_key_set --
 *    Set a WT_ROW to reference an on-page row-store leaf key.
 */
static inline void
__wt_row_leaf_key_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack)
{
    uintptr_t v;

    /*
     * See the comment in __wt_row_leaf_key_info for an explanation of the
     * magic.
     */
    v = WT_K_ENCODE_KEY_LEN(unpack->size) |
        WT_K_ENCODE_KEY_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->data)) |
        WT_K_FLAG;
    WT_ROW_KEY_SET(rip, v);
}

/*
 * __wt_row_leaf_value_set --
 *    Set a WT_ROW to reference an on-page row-store leaf value.
 */
static inline void
__wt_row_leaf_value_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack)
{
    uintptr_t key_len, key_offset, value_offset, v;

    v = (uintptr_t)WT_ROW_KEY_COPY(rip);

    /*
     * See the comment in __wt_row_leaf_key_info for an explanation of the
     * magic.
     */
    if (!(v & WT_K_FLAG))                       /* Already an encoded key */
        return;

    key_len = WT_K_DECODE_KEY_LEN(v);           /* Key length */
    if (key_len > WT_KV_MAX_KEY_LEN)
        return;
    if (unpack->size > WT_KV_MAX_VALUE_LEN)     /* Value length */
        return;

    key_offset = WT_K_DECODE_KEY_OFFSET(v);     /* Page offsets */
    if (key_offset > WT_KV_MAX_KEY_OFFSET)
        return;
    value_offset = WT_PAGE_DISK_OFFSET(page, unpack->data);
    if (value_offset > WT_KV_MAX_VALUE_OFFSET)
        return;

    v = WT_KV_ENCODE_KEY_LEN(key_len) |
        WT_KV_ENCODE_VALUE_LEN(unpack->size) |
        WT_KV_ENCODE_KEY_OFFSET(key_offset) |
        WT_KV_ENCODE_VALUE_OFFSET(value_offset) | WT_KV_FLAG;
    WT_ROW_KEY_SET(rip, v);
}

/*
 * __wt_row_leaf_key --
 *    Set a buffer to reference a row-store leaf page key as cheaply as
 *    possible.
 */
static inline int
__wt_row_leaf_key(WT_SESSION_IMPL *session,
    WT_PAGE *page, WT_ROW *rip, WT_ITEM *key, bool instantiate)
{
    void *copy;

    /*
     * A front-end for __wt_row_leaf_key_work, here to inline fast paths.
     *
     * The row-store key can change underfoot; explicitly take a copy.
     */
    copy = WT_ROW_KEY_COPY(rip);

    /*
     * All we handle here are on-page keys (which should be a common case),
     * and instantiated keys (which start out rare, but become more common
     * as a leaf page is searched, instantiating prefix-compressed keys).
     */
    if (__wt_row_leaf_key_info(
        page, copy, NULL, NULL, &key->data, &key->size))
        return (0);

    /*
     * The alternative is an on-page cell with some kind of compressed or
     * overflow key that's never been instantiated. Call the underlying
     * worker function to figure it out.
     */
    return (__wt_row_leaf_key_work(session, page, rip, key, instantiate));
}

/*
 * __wt_row_leaf_value_cell --
 *    Return the unpacked value for a row-store leaf page key.
 */
static inline void
__wt_row_leaf_value_cell(
    WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *kpack, WT_CELL_UNPACK *vpack)
{
    WT_CELL *kcell, *vcell;
    WT_CELL_UNPACK unpack;
    size_t size;
    void *copy, *key;

    /* If we already have an unpacked key cell, use it. */
    if (kpack != NULL)
        vcell = (WT_CELL *)
            ((uint8_t *)kpack->cell + __wt_cell_total_len(kpack));
    else {
        /*
         * The row-store key can change underfoot; explicitly take a
         * copy.
         */
        copy = WT_ROW_KEY_COPY(rip);

        /*
         * Figure out where the key is, step past it to the value cell.
         * The test for a cell not being set tells us that we have an
         * on-page key, otherwise we're looking at an instantiated key
         * or on-page cell, both of which require an unpack of the key's
         * cell to find the value cell that follows.
         */
        if (__wt_row_leaf_key_info(
            page, copy, NULL, &kcell, &key, &size) && kcell == NULL)
            vcell = (WT_CELL *)((uint8_t *)key + size);
        else {
            __wt_cell_unpack(kcell, &unpack);
            vcell = (WT_CELL *)((uint8_t *)
                unpack.cell + __wt_cell_total_len(&unpack));
        }
    }

    __wt_cell_unpack(__wt_cell_leaf_value_parse(page, vcell), vpack);
}

/*
 * __wt_row_leaf_value --
 *    Return the value for a row-store leaf page encoded key/value pair.
 */
static inline bool
__wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value)
{
    uintptr_t v;

    /* The row-store key can change underfoot; explicitly take a copy. */
    v = (uintptr_t)WT_ROW_KEY_COPY(rip);

    /*
     * See the comment in __wt_row_leaf_key_info for an explanation of the
     * magic.
     */
    if ((v & 0x03) == WT_KV_FLAG) {
        value->data =
            WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_VALUE_OFFSET(v));
        value->size = WT_KV_DECODE_VALUE_LEN(v);
        return (true);
    }
    return (false);
}

/*
 * __wt_ref_info --
 *    Return the addr/size and type triplet for a reference.
 */
static inline void
__wt_ref_info(WT_REF *ref, const uint8_t **addrp, size_t *sizep, u_int *typep)
{
    WT_ADDR *addr;
    WT_CELL_UNPACK *unpack, _unpack;

    addr = ref->addr;
    unpack = &_unpack;

    /*
     * If NULL, there is no location.
     * If off-page, the pointer references a WT_ADDR structure.
     * If on-page, the pointer references a cell.
     *
     * The type is of a limited set: internal, leaf or no-overflow leaf.
     */
    if (addr == NULL) {
        *addrp = NULL;
        *sizep = 0;
        if (typep != NULL)
            *typep = 0;
    } else if (__wt_off_page(ref->home, addr)) {
        *addrp = addr->addr;
        *sizep = addr->size;
        if (typep != NULL)
            switch (addr->type) {
            case WT_ADDR_INT:
                *typep = WT_CELL_ADDR_INT;
                break;
            case WT_ADDR_LEAF:
                *typep = WT_CELL_ADDR_LEAF;
                break;
            case WT_ADDR_LEAF_NO:
                *typep = WT_CELL_ADDR_LEAF_NO;
                break;
            default:
                *typep = 0;
                break;
            }
    } else {
        __wt_cell_unpack((WT_CELL *)addr, unpack);
        *addrp = unpack->data;
        *sizep = unpack->size;
        if (typep != NULL)
            *typep = unpack->type;
    }
}

/*
 * __wt_ref_block_free --
 *    Free the on-disk block for a reference and clear the address.
 */
static inline int
__wt_ref_block_free(WT_SESSION_IMPL *session, WT_REF *ref)
{
    size_t addr_size;
    const uint8_t *addr;

    if (ref->addr == NULL)
        return (0);

    __wt_ref_info(ref, &addr, &addr_size, NULL);
    WT_RET(__wt_btree_block_free(session, addr, addr_size));

    /* Clear the address (so we don't free it twice). */
    __wt_ref_addr_free(session, ref);
    return (0);
}

/*
 * __wt_page_del_active --
 *    Return if a truncate operation is active.
 */
static inline bool
__wt_page_del_active(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
{
    WT_PAGE_DELETED *page_del;
    uint8_t prepare_state;

    if ((page_del = ref->page_del) == NULL)
        return (false);
    if (page_del->txnid == WT_TXN_ABORTED)
        return (false);
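    /*
     * Read the prepare state with ordering; presumably this pairs with a
     * publish when the prepared truncate is resolved. While the operation
     * is in progress or locked, treat the truncate as active.
     */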
    WT_ORDERED_READ(prepare_state, page_del->prepare_state);
    if (prepare_state == WT_PREPARE_INPROGRESS ||
        prepare_state == WT_PREPARE_LOCKED)
        return (true);
    return (visible_all ?
        !__wt_txn_visible_all(session,
        page_del->txnid, WT_TIMESTAMP_NULL(&page_del->timestamp)) :
        !__wt_txn_visible(session,
        page_del->txnid, WT_TIMESTAMP_NULL(&page_del->timestamp)));
}

/*
 * __wt_page_las_active --
 *    Return if lookaside data for a page is still required.
 */
static inline bool
__wt_page_las_active(WT_SESSION_IMPL *session, WT_REF *ref)
{
    WT_PAGE_LOOKASIDE *page_las;

    if ((page_las = ref->page_las) == NULL)
        return (false);
    if (page_las->resolved)
        return (false);
    if (!page_las->skew_newest)
        return (true);
    if (__wt_txn_visible_all(session, page_las->max_txn,
        WT_TIMESTAMP_NULL(&page_las->max_timestamp)))
        return (false);

    return (true);
}

/*
 * __wt_btree_can_evict_dirty --
 *    Check whether eviction of dirty pages or splits are permitted in the
 *    current tree.
 *
 *    We cannot evict dirty pages or split while a checkpoint is in
 *    progress, unless the checkpoint thread is doing the work.
 *
 *    Also, during connection close, if we take a checkpoint as of a
 *    timestamp, eviction should not write dirty pages to avoid updates
 *    newer than the checkpoint timestamp leaking to disk.
 */
static inline bool
__wt_btree_can_evict_dirty(WT_SESSION_IMPL *session)
{
    WT_BTREE *btree;

    btree = S2BT(session);
    return ((!WT_BTREE_SYNCING(btree) || WT_SESSION_BTREE_SYNC(session)) &&
        !F_ISSET(S2C(session), WT_CONN_CLOSING_TIMESTAMP));
}

/*
 * __wt_leaf_page_can_split --
 *    Check whether a page can be split in memory.
 */
static inline bool
__wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_BTREE *btree;
    WT_INSERT *ins;
    WT_INSERT_HEAD *ins_head;
    size_t size;
    int count;

    btree = S2BT(session);

    /*
     * Checkpoints can't do in-memory splits in the tree they are walking:
     * that can lead to corruption when the parent internal page is
     * updated.
     */
    if (WT_SESSION_BTREE_SYNC(session))
        return (false);

    /*
     * Only split a page once; otherwise workloads that update in the middle
     * of the page could continually split without benefit.
     */
    if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT))
        return (false);

    /*
     * Check for pages with append-only workloads. A common application
     * pattern is to have multiple threads frantically appending to the
     * tree. We want to reconcile and evict this page, but we'd like to
     * do it without making the appending threads wait. See if it's worth
     * doing a split to let the threads continue before doing eviction.
     *
     * Ignore anything other than large, dirty leaf pages. We depend on the
     * page being dirty for correctness (the page must be reconciled again
     * before being evicted after the split, information from a previous
     * reconciliation will be wrong, so we can't evict immediately).
     */
    if (page->memory_footprint < btree->splitmempage)
        return (false);
    if (WT_PAGE_IS_INTERNAL(page))
        return (false);
    if (!__wt_page_is_modified(page))
        return (false);

    /*
     * There is no point doing an in-memory split unless there is a lot of
     * data in the last skiplist on the page. Split if there are enough
     * items and the skiplist does not fit within a single disk page.
     */
    ins_head = page->type == WT_PAGE_ROW_LEAF ?
        (page->entries == 0 ?
        WT_ROW_INSERT_SMALLEST(page) :
        WT_ROW_INSERT_SLOT(page, page->entries - 1)) :
        WT_COL_APPEND(page);
    if (ins_head == NULL)
        return (false);

    /*
     * In the extreme case, where the page is much larger than the maximum
     * size, split as soon as there are 5 items on the page.
     */
#define WT_MAX_SPLIT_COUNT    5
    if (page->memory_footprint > (size_t)btree->maxleafpage * 2) {
        for (count = 0, ins = ins_head->head[0];
            ins != NULL;
            ins = ins->next[0]) {
            if (++count < WT_MAX_SPLIT_COUNT)
                continue;

            WT_STAT_CONN_INCR(session, cache_inmem_splittable);
            WT_STAT_DATA_INCR(session, cache_inmem_splittable);
            return (true);
        }

        return (false);
    }

    /*
     * Rather than scanning the whole list, walk a higher level, which
     * gives a sample of the items -- at level 0 we have all the items, at
     * level 1 we have 1/4 and at level 2 we have 1/16th. If we see more
     * than 30 items and more data than would fit in a disk page, split.
     */
#define WT_MIN_SPLIT_DEPTH       2
#define WT_MIN_SPLIT_COUNT       30
#define WT_MIN_SPLIT_MULTIPLIER  16    /* At level 2, we see 1/16th entries */

    for (count = 0, size = 0, ins = ins_head->head[WT_MIN_SPLIT_DEPTH];
        ins != NULL;
        ins = ins->next[WT_MIN_SPLIT_DEPTH]) {
        count += WT_MIN_SPLIT_MULTIPLIER;
        size += WT_MIN_SPLIT_MULTIPLIER *
            (WT_INSERT_KEY_SIZE(ins) + WT_UPDATE_MEMSIZE(ins->upd));
        if (count > WT_MIN_SPLIT_COUNT &&
            size > (size_t)btree->maxleafpage) {
            WT_STAT_CONN_INCR(session, cache_inmem_splittable);
            WT_STAT_DATA_INCR(session, cache_inmem_splittable);
            return (true);
        }
    }
    return (false);
}

/*
 * __wt_page_evict_retry --
 *    Avoid busy-spinning attempting to evict the same page all the time.
 */
static inline bool
__wt_page_evict_retry(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_DECL_TIMESTAMP(pinned_ts)
    WT_PAGE_MODIFY *mod;
    WT_TXN_GLOBAL *txn_global;

    txn_global = &S2C(session)->txn_global;

    /*
     * If the page hasn't been through one round of update/restore, give it
     * a try.
     */
    if ((mod = page->modify) == NULL ||
        !FLD_ISSET(mod->restore_state, WT_PAGE_RS_RESTORED))
        return (true);

    /*
     * Retry if a reasonable amount of eviction time has passed; the
     * choice of 5 eviction passes as a reasonable amount of time is
     * currently pretty arbitrary.
     */
    if (__wt_cache_aggressive(session) ||
        mod->last_evict_pass_gen + 5 < S2C(session)->cache->evict_pass_gen)
        return (true);

    /* Retry if the global transaction state has moved forward. */
    if (txn_global->current == txn_global->oldest_id ||
        mod->last_eviction_id != __wt_txn_oldest_id(session))
        return (true);

#ifdef HAVE_TIMESTAMPS
    if (__wt_timestamp_iszero(&mod->last_eviction_timestamp))
        return (true);

    __wt_txn_pinned_timestamp(session, &pinned_ts);
    if (__wt_timestamp_cmp(&pinned_ts, &mod->last_eviction_timestamp) > 0)
        return (true);
#endif

    return (false);
}

/*
 * __wt_page_can_evict --
 *    Check whether a page can be evicted.
 */
static inline bool
__wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp)
{
    WT_PAGE *page;
    WT_PAGE_MODIFY *mod;
    bool modified;

    if (inmem_splitp != NULL)
        *inmem_splitp = false;

    page = ref->page;
    mod = page->modify;

    /* A truncated page can't be evicted until the truncate completes. */
    if (__wt_page_del_active(session, ref, true))
        return (false);

    /* Otherwise, never-modified pages can always be evicted. */
    if (mod == NULL)
        return (true);

    /*
     * We can't split or evict multiblock row-store pages where the parent's
     * key for the page is an overflow item, because the split into the
     * parent frees the backing blocks for any no-longer-used overflow keys,
     * which will corrupt the checkpoint's block management.
     */
    if (!__wt_btree_can_evict_dirty(session) &&
        F_ISSET_ATOMIC(ref->home, WT_PAGE_OVERFLOW_KEYS))
        return (false);

    /*
     * Check for in-memory splits before other eviction tests. If the page
     * should split in-memory, return success immediately and skip more
     * detailed eviction tests. We don't need further tests since the page
     * won't be written or discarded from the cache.
     */
    if (__wt_leaf_page_can_split(session, page)) {
        if (inmem_splitp != NULL)
            *inmem_splitp = true;
        return (true);
    }

    modified = __wt_page_is_modified(page);

    /*
     * If the file is being checkpointed, other threads can't evict dirty
     * pages: if a page is written and the previous version freed, that
     * previous version might be referenced by an internal page already
     * written in the checkpoint, leaving the checkpoint inconsistent.
     */
    if (modified && !__wt_btree_can_evict_dirty(session)) {
        WT_STAT_CONN_INCR(session, cache_eviction_checkpoint);
        WT_STAT_DATA_INCR(session, cache_eviction_checkpoint);
        return (false);
    }

    /*
     * If a split created new internal pages, those newly created internal
     * pages cannot be evicted until all threads are known to have exited
     * the original parent page's index, because evicting an internal page
     * discards its WT_REF array, and a thread traversing the original
     * parent page index might see a freed WT_REF.
     *
     * One special case where we know this is safe is if the handle is
     * locked exclusive (e.g., when the whole tree is being evicted). In
     * that case, no readers can be looking at an old index.
     */
    if (WT_PAGE_IS_INTERNAL(page) &&
        !F_ISSET(session->dhandle, WT_DHANDLE_EXCLUSIVE) &&
        __wt_gen_active(session, WT_GEN_SPLIT, page->pg_intl_split_gen))
        return (false);

    /*
     * If the page is clean but has modifications that appear too new to
     * evict, skip it.
     */
    if (!modified && !__wt_txn_visible_all(session,
        mod->rec_max_txn, WT_TIMESTAMP_NULL(&mod->rec_max_timestamp)))
        return (false);

    return (true);
}

/*
 * __wt_page_release --
 *    Release a reference to a page.
 */
static inline int
__wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
{
    WT_BTREE *btree;
    WT_PAGE *page;
    bool inmem_split;

    btree = S2BT(session);

    /*
     * Discard our hazard pointer. Ignore pages we don't have and the root
     * page, which sticks in memory, regardless.
     */
    if (ref == NULL || ref->page == NULL || __wt_ref_is_root(ref))
        return (0);

    /*
     * If hazard pointers aren't necessary for this file, we can't be
     * evicting, we're done.
     */
    if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
        return (0);

    /*
     * Attempt to evict pages with the special "oldest" read generation.
     * This is set for pages that grow larger than the configured
     * memory_page_max setting, when we see many deleted items, and when we
     * are attempting to scan without trashing the cache.
     *
     * Checkpoint should not queue pages for urgent eviction if they require
     * dirty eviction: there is a special exemption that allows checkpoint
     * to evict dirty pages in a tree that is being checkpointed, and no
     * other thread can help with that. Checkpoints don't rely on this code
     * for dirty eviction: that is handled explicitly in __wt_sync_file.
     *
     * If the operation has disabled eviction or splitting, or the session
     * is prevented from reconciling, then just queue the page for urgent
     * eviction. Otherwise, attempt to release and evict it.
     */
    page = ref->page;
    if (WT_READGEN_EVICT_SOON(page->read_gen) &&
        btree->evict_disabled == 0 &&
        __wt_page_can_evict(session, ref, &inmem_split) &&
        (!WT_SESSION_IS_CHECKPOINT(session) ||
        __wt_page_evict_clean(page))) {
        if (LF_ISSET(WT_READ_NO_EVICT) ||
            (inmem_split ? LF_ISSET(WT_READ_NO_SPLIT) :
            F_ISSET(session, WT_SESSION_NO_RECONCILE)))
            WT_IGNORE_RET(
                __wt_page_evict_urgent(session, ref));
        else {
            WT_RET_BUSY_OK(
                __wt_page_release_evict(session, ref));
            return (0);
        }
    }

    return (__wt_hazard_clear(session, ref));
}

/*
 * __wt_skip_choose_depth --
 *    Randomly choose a depth for a skiplist insert.
 */
static inline u_int
__wt_skip_choose_depth(WT_SESSION_IMPL *session)
{
    u_int d;

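    /*
     * Draw from a geometric distribution: each additional level is taken
     * while a 32-bit random value falls below WT_SKIP_PROBABILITY, i.e.,
     * with probability WT_SKIP_PROBABILITY / 2^32 per level (one in four,
     * assuming the usual definition of that constant elsewhere in the
     * tree), capped at WT_SKIP_MAXDEPTH.
     */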
    for (d = 1; d < WT_SKIP_MAXDEPTH &&
        __wt_random(&session->rnd) < WT_SKIP_PROBABILITY; d++)
        ;
    return (d);
}

/*
 * __wt_btree_lsm_over_size --
 *    Return if the size of an in-memory tree with a single leaf page is
 *    over a specified maximum. If called on anything other than a simple
 *    tree with a single leaf page, returns true so our LSM caller will
 *    switch to a new tree.
 */
static inline bool
__wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize)
{
    WT_BTREE *btree;
    WT_PAGE *child, *root;
    WT_PAGE_INDEX *pindex;
    WT_REF *first;

    btree = S2BT(session);
    root = btree->root.page;

    /* Check for a non-existent tree. */
    if (root == NULL)
        return (false);

    /* A tree that can be evicted always requires a switch. */
    if (btree->evict_disabled == 0)
        return (true);

    /* Check for a tree with a single leaf page. */
    WT_INTL_INDEX_GET(session, root, pindex);
    if (pindex->entries != 1)             /* > 1 child page, switch */
        return (true);

    first = pindex->index[0];
    if (first->state != WT_REF_MEM)       /* no child page, ignore */
        return (false);

    /*
     * We're reaching down into the page without a hazard pointer, but
     * that's OK because we know that no-eviction is set and so the page
     * cannot disappear.
     */
    child = first->page;
    if (child->type != WT_PAGE_ROW_LEAF)  /* not a single leaf page */
        return (true);

    return (child->memory_footprint > maxsize);
}

/*
 * __wt_split_descent_race --
 *    Return if we raced with an internal page split when descending the
 *    tree.
 */
static inline bool
__wt_split_descent_race(
    WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX *saved_pindex)
{
    WT_PAGE_INDEX *pindex;

    /* No test when starting the descent (there's no home to check). */
    if (__wt_ref_is_root(ref))
        return (false);

    /*
     * A place to hang this comment...
     *
     * There's a page-split race when we walk the tree: if we're splitting
     * an internal page into its parent, we update the parent's page index
     * before updating the split page's page index, and it's not an atomic
     * update. A thread can read the parent page's original page index and
     * then read the split page's replacement index.
     *
     * For example, imagine a search descending the tree.
     *
     * Because internal page splits work by truncating the original page to
     * the initial part of the original page, the result of this race is we
     * will have a search key that points past the end of the current page.
     * This is only an issue when we search past the end of the page; if we
     * find a WT_REF in the page with the namespace we're searching for, we
     * don't care if the WT_REF moved or not while we were searching, we
     * have the correct page.
     *
     * For example, imagine an internal page with 3 child pages, with the
     * namespaces a-f, g-h and i-j; the first child page splits. The parent
     * starts out with the following page-index:
     *
     *    | ... | a | g | i | ... |
     *
     * which changes to this:
     *
     *    | ... | a | c | e | g | i | ... |
     *
     * The child starts out with the following page-index:
     *
     *    | a | b | c | d | e | f |
     *
     * which changes to this:
     *
     *    | a | b |
     *
     * The thread searches the original parent page index for the key "cat"
     * and couples to the "a" child page; if it uses the replacement child
     * page index, it will search past the end of the page and couple to the
     * "b" page, which is wrong.
     *
     * To detect the problem, we remember the parent page's page index used
     * to descend the tree. Whenever we search past the end of a page, we
     * check to see if the parent's page index has changed since our use of
     * it during descent. As the problem only appears if we read the split
     * page's replacement index, the parent page's index must already have
     * changed, ensuring we detect the problem.
     *
     * It's possible for the opposite race to happen (a thread could read
     * the parent page's replacement page index and then read the split
     * page's original index). This isn't a problem because internal splits
     * work by truncating the split page, so the split page search is for
     * content the split page retains after the split, and we ignore this
     * race.
     *
     * This code is a general purpose check for a descent race and we call
     * it in other cases, for example, a cursor traversing backwards through
     * the tree.
     *
     * Presumably we acquired a page index on the child page before calling
     * this code, don't re-order that acquisition with this check.
     */
    WT_BARRIER();
    WT_INTL_INDEX_GET(session, ref->home, pindex);
    return (pindex != saved_pindex);
}

/*
 * __wt_page_swap_func --
 *    Swap one page's hazard pointer for another one when hazard pointer
 *    coupling up/down the tree.
 */
static inline int
__wt_page_swap_func(
    WT_SESSION_IMPL *session, WT_REF *held, WT_REF *want, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
    , const char *func, int line
#endif
    )
{
    WT_DECL_RET;
    bool acquired;

    /*
     * This function is here to simplify the error handling during hazard
     * pointer coupling so we never leave a hazard pointer dangling. The
     * assumption is we're holding a hazard pointer on "held", and want to
     * acquire a hazard pointer on "want", releasing the hazard pointer on
     * "held" when we're done.
     *
     * When walking the tree, we sometimes swap to the same page. Fast-path
     * that to avoid thinking about error handling.
     */
    if (held == want)
        return (0);

    /* Get the wanted page. */
    ret = __wt_page_in_func(session, want, flags
#ifdef HAVE_DIAGNOSTIC
        , func, line
#endif
        );

    /*
     * Expected failures: page not found or restart. Our callers list the
     * errors they're expecting to handle.
     */
    if (LF_ISSET(WT_READ_NOTFOUND_OK) && ret == WT_NOTFOUND)
        return (WT_NOTFOUND);
    if (LF_ISSET(WT_READ_RESTART_OK) && ret == WT_RESTART)
        return (WT_RESTART);

    /* Discard the original held page on either success or error. */
    acquired = ret == 0;
    WT_TRET(__wt_page_release(session, held, flags));

    /* Fast-path expected success. */
    if (ret == 0)
        return (0);

    /*
     * If there was an error at any point that our caller isn't prepared to
     * handle, discard any page we acquired.
     */
    if (acquired)
        WT_TRET(__wt_page_release(session, want, flags));

    /*
     * If we're returning an error, don't let it be one our caller expects
     * to handle as returned by page-in: the expectation includes the held
     * page not having been released, and that's not the case.
     */
    if (LF_ISSET(WT_READ_NOTFOUND_OK) && ret == WT_NOTFOUND)
        WT_RET_MSG(session,
            EINVAL, "page-release WT_NOTFOUND error mapped to EINVAL");
    if (LF_ISSET(WT_READ_RESTART_OK) && ret == WT_RESTART)
        WT_RET_MSG(session,
            EINVAL, "page-release WT_RESTART error mapped to EINVAL");

    return (ret);
}