/*-------------------------------------------------------------------------
 *
 * nbtsearch.c
 *	  Search code for postgres btrees.
 *
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtsearch.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/nbtree.h"
#include "access/relscan.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"


static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
static int	_bt_binsrch_posting(BTScanInsert key, Page page,
								OffsetNumber offnum);
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
						 OffsetNumber offnum);
static void _bt_saveitem(BTScanOpaque so, int itemIndex,
						 OffsetNumber offnum, IndexTuple itup);
static int	_bt_setuppostingitems(BTScanOpaque so, int itemIndex,
								  OffsetNumber offnum, ItemPointer heapTid,
								  IndexTuple itup);
static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
									   OffsetNumber offnum,
									   ItemPointer heapTid, int tupleOffset);
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
								  ScanDirection dir);
static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);


/*
 *	_bt_drop_lock_and_maybe_pin()
 *
 * Unlock the buffer; and if it is safe to release the pin, do that, too.  It
 * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
 * This will prevent vacuum from stalling in a blocked state trying to read a
 * page when a cursor is sitting on it -- at least in many important cases.
 *
 * Set the buffer to invalid if the pin is released, since the buffer may be
 * re-used.  If we need to go back to this block (for example, to apply
 * LP_DEAD hints) we must get a fresh reference to the buffer.  Hopefully it
 * will remain in shared memory for as long as it takes to scan the index
 * buffer page.
 */
static void
_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
{
	_bt_unlockbuf(scan->indexRelation, sp->buf);

	if (IsMVCCSnapshot(scan->xs_snapshot) &&
		RelationNeedsWAL(scan->indexRelation) &&
		!scan->xs_want_itup)
	{
		ReleaseBuffer(sp->buf);
		sp->buf = InvalidBuffer;
	}
}

/*
 *	_bt_search() -- Search the tree for a particular scankey,
 *		or more precisely for the first leaf page it could be on.
 *
 * The passed scankey is an insertion-type scankey (see nbtree/README),
 * but it can omit the rightmost column(s) of the index.
 *
 * Return value is a stack of parent-page pointers (i.e. there is no entry for
 * the leaf level/page).  *bufP is set to the address of the leaf-page buffer,
 * which is locked and pinned.  No locks are held on the parent pages,
 * however!
 *
 * If the snapshot parameter is not NULL, "old snapshot" checking will take
 * place during the descent through the tree.  This is not needed when
 * positioning for an insert or delete, so NULL is used for those cases.
 *
 * The returned buffer is locked according to access parameter.  Additionally,
 * access = BT_WRITE will allow an empty root page to be created and returned.
 * When access = BT_READ, an empty index will result in *bufP being set to
 * InvalidBuffer.  Also, in BT_WRITE mode, any incomplete splits encountered
 * during the search will be finished.
 */
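/*
 * A minimal usage sketch for read-only callers (this mirrors what
 * _bt_first() below does; the insertion scankey is assumed to have been
 * built already, e.g. with _bt_mkscankey()):
 *
 *		stack = _bt_search(rel, &inskey, &buf, BT_READ, snapshot);
 *		if (BufferIsValid(buf))
 *		{
 *			... examine items on the pinned and locked leaf page ...
 *			_bt_relbuf(rel, buf);
 *		}
 *		_bt_freestack(stack);
 */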
BTStack
_bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
		   Snapshot snapshot)
{
	BTStack		stack_in = NULL;
	int			page_access = BT_READ;

	/* Get the root page to start with */
	*bufP = _bt_getroot(rel, access);

	/* If index is empty and access = BT_READ, no root page is created. */
	if (!BufferIsValid(*bufP))
		return (BTStack) NULL;

	/* Loop iterates once per level descended in the tree */
	for (;;)
	{
		Page		page;
		BTPageOpaque opaque;
		OffsetNumber offnum;
		ItemId		itemid;
		IndexTuple	itup;
		BlockNumber child;
		BTStack		new_stack;

		/*
		 * Race -- the page we just grabbed may have split since we read its
		 * downlink in its parent page (or the metapage).  If it has, we may
		 * need to move right to its new sibling.  Do that.
		 *
		 * In write-mode, allow _bt_moveright to finish any incomplete splits
		 * along the way.  Strictly speaking, we'd only need to finish an
		 * incomplete split on the leaf page we're about to insert to, not on
		 * any of the upper levels (internal pages with incomplete splits are
		 * also taken care of in _bt_getstackbuf).  But this is a good
		 * opportunity to finish splits of internal pages too.
		 */
		*bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in,
							  page_access, snapshot);

		/* if this is a leaf page, we're done */
		page = BufferGetPage(*bufP);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (P_ISLEAF(opaque))
			break;

		/*
		 * Find the appropriate pivot tuple on this page.  Its downlink points
		 * to the child page that we're about to descend to.
		 */
		offnum = _bt_binsrch(rel, key, *bufP);
		itemid = PageGetItemId(page, offnum);
		itup = (IndexTuple) PageGetItem(page, itemid);
		Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
		child = BTreeTupleGetDownLink(itup);

		/*
		 * We need to save the location of the pivot tuple we chose in a new
		 * stack entry for this page/level.  If caller ends up splitting a
		 * page one level down, it usually ends up inserting a new pivot
		 * tuple/downlink immediately after the location recorded here.
		 */
		new_stack = (BTStack) palloc(sizeof(BTStackData));
		new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
		new_stack->bts_offset = offnum;
		new_stack->bts_parent = stack_in;

		/*
		 * Page level 1 is the lowest non-leaf level, just above the leaves.
		 * So if we're at level 1 and were asked to lock the leaf page in
		 * write mode, lock the next page down in write mode, since it must
		 * be a leaf.
		 */
		if (opaque->btpo_level == 1 && access == BT_WRITE)
			page_access = BT_WRITE;

		/* drop the read lock on the page, then acquire one on its child */
		*bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);

		/* okay, all set to move down a level */
		stack_in = new_stack;
	}

	/*
	 * If we're asked to lock leaf in write mode, but didn't manage to, then
	 * relock.  This should only happen when the root page is a leaf page (and
	 * the only page in the index other than the metapage).
	 */
	if (access == BT_WRITE && page_access == BT_READ)
	{
		/* trade in our read lock for a write lock */
		_bt_unlockbuf(rel, *bufP);
		_bt_lockbuf(rel, *bufP, BT_WRITE);

		/*
		 * Race -- the leaf page may have split after we dropped the read lock
		 * but before we acquired a write lock.  If it has, we may need to
		 * move right to its new sibling.  Do that.
		 */
		*bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
							  snapshot);
	}

	return stack_in;
}

/*
 *	_bt_moveright() -- move right in the btree if necessary.
 *
 * When we follow a pointer to reach a page, it is possible that
 * the page has changed in the meanwhile.  If this happens, we're
 * guaranteed that the page has "split right" -- that is, that any
 * data that appeared on the page originally is either on the page
 * or strictly to the right of it.
 *
 * This routine decides whether or not we need to move right in the
 * tree by examining the high key entry on the page.  If that entry is
 * strictly less than the scankey, or <= the scankey in the
 * key.nextkey=true case, then we followed the wrong link and we need
 * to move right.
 *
 * The passed insertion-type scankey can omit the rightmost column(s) of the
 * index.  (see nbtree/README)
 *
 * When key.nextkey is false (the usual case), we are looking for the first
 * item >= key.  When key.nextkey is true, we are looking for the first item
 * strictly greater than key.
 *
 * If forupdate is true, we will attempt to finish any incomplete splits
 * that we encounter.  This is required when locking a target page for an
 * insertion, because we don't allow inserting on a page before the split
 * is completed.  'stack' is only used if forupdate is true.
 *
 * On entry, we have the buffer pinned and a lock of the type specified by
 * 'access'.  If we move right, we release the buffer and lock and acquire
 * the same on the right sibling.  Return value is the buffer we stop at.
 *
 * If the snapshot parameter is not NULL, "old snapshot" checking will take
 * place during the descent through the tree.  This is not needed when
 * positioning for an insert or delete, so NULL is used for those cases.
 */
Buffer
_bt_moveright(Relation rel,
			  BTScanInsert key,
			  Buffer buf,
			  bool forupdate,
			  BTStack stack,
			  int access,
			  Snapshot snapshot)
{
	Page		page;
	BTPageOpaque opaque;
	int32		cmpval;

	/*
	 * When nextkey = false (normal case): if the scan key that brought us to
	 * this page is > the high key stored on the page, then the page has split
	 * and we need to move right.  (pg_upgrade'd !heapkeyspace indexes could
	 * have some duplicates to the right as well as the left, but that's
	 * something that's only ever dealt with on the leaf level, after
	 * _bt_search has found an initial leaf page.)
	 *
	 * When nextkey = true: move right if the scan key is >= page's high key.
	 * (Note that key.scantid cannot be set in this case.)
	 *
	 * The page could even have split more than once, so scan as far as
	 * needed.
	 *
	 * We also have to move right if we followed a link that brought us to a
	 * dead page.
	 */
	cmpval = key->nextkey ? 0 : 1;

	for (;;)
	{
		page = BufferGetPage(buf);
		TestForOldSnapshot(snapshot, rel, page);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		if (P_RIGHTMOST(opaque))
			break;

		/*
		 * Finish any incomplete splits we encounter along the way.
		 */
		if (forupdate && P_INCOMPLETE_SPLIT(opaque))
		{
			BlockNumber blkno = BufferGetBlockNumber(buf);

			/* upgrade our lock if necessary */
			if (access == BT_READ)
			{
				_bt_unlockbuf(rel, buf);
				_bt_lockbuf(rel, buf, BT_WRITE);
			}

			if (P_INCOMPLETE_SPLIT(opaque))
				_bt_finish_split(rel, buf, stack);
			else
				_bt_relbuf(rel, buf);

			/* re-acquire the lock in the right mode, and re-check */
			buf = _bt_getbuf(rel, blkno, access);
			continue;
		}

		if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
		{
			/* step right one page */
			buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
			continue;
		}
		else
			break;
	}

	if (P_IGNORE(opaque))
		elog(ERROR, "fell off the end of index \"%s\"",
			 RelationGetRelationName(rel));

	return buf;
}

/*
 *	_bt_binsrch() -- Do a binary search for a key on a particular page.
 *
 * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
 * key >= given scankey, or > scankey if nextkey is true.  (NOTE: in
 * particular, this means it is possible to return a value 1 greater than the
 * number of keys on the page, if the scankey is > all keys on the page.)
 *
 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
 * of the last key < given scankey, or last key <= given scankey if nextkey
 * is true.  (Since _bt_compare treats the first data key of such a page as
 * minus infinity, there will be at least one key < scankey, so the result
 * always points at one of the keys on the page.)  This key indicates the
 * right place to descend to be sure we find all leaf keys >= given scankey
 * (or leaf keys > given scankey when nextkey is true).
 *
 * This procedure is not responsible for walking right, it just examines
 * the given page.  _bt_binsrch() has no lock or refcount side effects
 * on the buffer.
 */
static OffsetNumber
_bt_binsrch(Relation rel,
			BTScanInsert key,
			Buffer buf)
{
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber low,
				high;
	int32		result,
				cmpval;

	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/* Requesting nextkey semantics while using scantid seems nonsensical */
	Assert(!key->nextkey || key->scantid == NULL);
	/* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
	Assert(!P_ISLEAF(opaque) || key->scantid == NULL);

	low = P_FIRSTDATAKEY(opaque);
	high = PageGetMaxOffsetNumber(page);

	/*
	 * If there are no keys on the page, return the first available slot. Note
	 * this covers two cases: the page is really empty (no keys), or it
	 * contains only a high key.  The latter case is possible after vacuuming.
	 * This can never happen on an internal page, however, since they are
	 * never empty (an internal page must have children).
	 */
	if (unlikely(high < low))
		return low;

	/*
	 * Binary search to find the first key on the page >= scan key, or first
	 * key > scankey when nextkey is true.
	 *
	 * For nextkey=false (cmpval=1), the loop invariant is: all slots before
	 * 'low' are < scan key, all slots at or after 'high' are >= scan key.
	 *
	 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
	 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
	 *
	 * We can fall out when high == low.
	 */
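	/*
	 * Worked example: if offsets 1..4 of a leaf page hold the keys
	 * (2, 4, 4, 8), a search for scankey 4 with nextkey=false converges on
	 * offset 2 (the first "4"), while nextkey=true converges on offset 4
	 * (the first key > 4).
	 */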
	high++;						/* establish the loop invariant for high */

	cmpval = key->nextkey ? 0 : 1;	/* select comparison value */

	while (high > low)
	{
		OffsetNumber mid = low + ((high - low) / 2);

		/* We have low <= mid < high, so mid points at a real slot */

		result = _bt_compare(rel, key, page, mid);

		if (result >= cmpval)
			low = mid + 1;
		else
			high = mid;
	}

	/*
	 * At this point we have high == low, but be careful: they could point
	 * past the last slot on the page.
	 *
	 * On a leaf page, we always return the first key >= scan key (resp. >
	 * scan key), which could be the last slot + 1.
	 */
	if (P_ISLEAF(opaque))
		return low;

	/*
	 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
	 * There must be one if _bt_compare() is playing by the rules.
	 */
	Assert(low > P_FIRSTDATAKEY(opaque));

	return OffsetNumberPrev(low);
}

/*
 *	_bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
 *
 * Like _bt_binsrch(), but with support for caching the binary search
 * bounds.  Only used during insertion, and only on the leaf page that it
 * looks like caller will insert tuple on.  Exclusive-locked and pinned
 * leaf page is contained within insertstate.
 *
 * Caches the bounds fields in insertstate so that a subsequent call can
 * reuse the low and strict high bounds of original binary search.  Callers
 * that use these fields directly must be prepared for the case where low
 * and/or stricthigh are not on the same page (one or both exceed maxoff
 * for the page).  The case where there are no items on the page (high <
 * low) makes bounds invalid.
 *
 * Caller is responsible for invalidating bounds when it modifies the page
 * before calling here a second time, and for dealing with posting list
 * tuple matches (callers can use insertstate's postingoff field to
 * determine which existing heap TID will need to be replaced by a posting
 * list split).
 */
OffsetNumber
_bt_binsrch_insert(Relation rel, BTInsertState insertstate)
{
	BTScanInsert key = insertstate->itup_key;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber low,
				high,
				stricthigh;
	int32		result,
				cmpval;

	page = BufferGetPage(insertstate->buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	Assert(P_ISLEAF(opaque));
	Assert(!key->nextkey);
	Assert(insertstate->postingoff == 0);

	if (!insertstate->bounds_valid)
	{
		/* Start new binary search */
		low = P_FIRSTDATAKEY(opaque);
		high = PageGetMaxOffsetNumber(page);
	}
	else
	{
		/* Restore result of previous binary search against same page */
		low = insertstate->low;
		high = insertstate->stricthigh;
	}

	/* If there are no keys on the page, return the first available slot */
	if (unlikely(high < low))
	{
		/* Caller can't reuse bounds */
		insertstate->low = InvalidOffsetNumber;
		insertstate->stricthigh = InvalidOffsetNumber;
		insertstate->bounds_valid = false;
		return low;
	}

	/*
	 * Binary search to find the first key on the page >= scan key. (nextkey
	 * is always false when inserting).
	 *
	 * The loop invariant is: all slots before 'low' are < scan key, all slots
	 * at or after 'high' are >= scan key.  'stricthigh' is > scan key, and is
	 * maintained to save additional search effort for caller.
	 *
	 * We can fall out when high == low.
	 */
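	/*
	 * For example, if a previous call here left low = 7 and stricthigh = 9
	 * in insertstate, a repeat call against the unmodified page restarts the
	 * search with only offsets 7 and 8 left to consider.
	 */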
	if (!insertstate->bounds_valid)
		high++;					/* establish the loop invariant for high */
	stricthigh = high;			/* high initially strictly higher */

	cmpval = 1;					/* !nextkey comparison value */

	while (high > low)
	{
		OffsetNumber mid = low + ((high - low) / 2);

		/* We have low <= mid < high, so mid points at a real slot */

		result = _bt_compare(rel, key, page, mid);

		if (result >= cmpval)
			low = mid + 1;
		else
		{
			high = mid;
			if (result != 0)
				stricthigh = high;
		}

		/*
		 * If tuple at offset located by binary search is a posting list whose
		 * TID range overlaps with caller's scantid, perform posting list
		 * binary search to set postingoff for caller.  Caller must split the
		 * posting list when postingoff is set.  This should happen
		 * infrequently.
		 */
		if (unlikely(result == 0 && key->scantid != NULL))
		{
			/*
			 * postingoff should never be set more than once per leaf page
			 * binary search.  That would mean that there are duplicate table
			 * TIDs in the index, which is never okay.  Check for that here.
			 */
			if (insertstate->postingoff != 0)
				ereport(ERROR,
						(errcode(ERRCODE_INDEX_CORRUPTED),
						 errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
										 ItemPointerGetBlockNumber(key->scantid),
										 ItemPointerGetOffsetNumber(key->scantid),
										 low, stricthigh,
										 BufferGetBlockNumber(insertstate->buf),
										 RelationGetRelationName(rel))));

			insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
		}
	}

	/*
	 * On a leaf page, a binary search always returns the first key >= scan
	 * key (at least in !nextkey case), which could be the last slot + 1. This
	 * is also the lower bound of cached search.
	 *
	 * stricthigh may also be the last slot + 1, which prevents caller from
	 * using bounds directly, but is still useful to us if we're called a
	 * second time with cached bounds (cached low will be < stricthigh when
	 * that happens).
	 */
	insertstate->low = low;
	insertstate->stricthigh = stricthigh;
	insertstate->bounds_valid = true;

	return low;
}

/*----------
 *	_bt_binsrch_posting() -- posting list binary search.
 *
 * Helper routine for _bt_binsrch_insert().
 *
 * Returns offset into posting list where caller's scantid belongs.
 *----------
 */
static int
_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
{
	IndexTuple	itup;
	ItemId		itemid;
	int			low,
				high,
				mid,
				res;

	/*
	 * If this isn't a posting tuple, then the index must be corrupt (if it is
	 * an ordinary non-pivot tuple then there must be an existing tuple with a
	 * heap TID that equals inserter's new heap TID/scantid).  Defensively
	 * check that tuple is a posting list tuple whose posting list range
	 * includes caller's scantid.
	 *
	 * (This is also needed because contrib/amcheck's rootdescend option needs
	 * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
	 */
	itemid = PageGetItemId(page, offnum);
	itup = (IndexTuple) PageGetItem(page, itemid);
	if (!BTreeTupleIsPosting(itup))
		return 0;

	Assert(key->heapkeyspace && key->allequalimage);

	/*
	 * In the event that posting list tuple has LP_DEAD bit set, indicate this
	 * to _bt_binsrch_insert() caller by returning -1, a sentinel value.  A
	 * second call to _bt_binsrch_insert() can take place when its caller has
	 * removed the dead item.
	 */
	if (ItemIdIsDead(itemid))
		return -1;

	/* "high" is past end of posting list for loop invariant */
	low = 0;
	high = BTreeTupleGetNPosting(itup);
	Assert(high >= 2);

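	/*
	 * Example: with posting list TIDs ((1,1), (1,3), (2,1)) and scantid
	 * (1,2), no exact match exists, and the loop below falls out with
	 * low == 1, the posting list offset at which the new TID belongs.
	 */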
	while (high > low)
	{
		mid = low + ((high - low) / 2);
		res = ItemPointerCompare(key->scantid,
								 BTreeTupleGetPostingN(itup, mid));

		if (res > 0)
			low = mid + 1;
		else if (res < 0)
			high = mid;
		else
			return mid;
	}

	/* Exact match not found */
	return low;
}

/*----------
 *	_bt_compare() -- Compare insertion-type scankey to tuple on a page.
 *
 *	page/offnum: location of btree item to be compared to.
 *
 *	This routine returns:
 *		<0 if scankey < tuple at offnum;
 *		 0 if scankey == tuple at offnum;
 *		>0 if scankey > tuple at offnum.
 *
 * NULLs in the keys are treated as sortable values.  Therefore
 * "equality" does not necessarily mean that the item should be returned
 * to the caller as a matching key.  Similarly, an insertion scankey
 * with its scantid set is treated as equal to a posting tuple whose TID
 * range overlaps with its scantid.  There generally won't be a
 * matching TID in the posting tuple, which the caller must handle
 * itself (e.g., by splitting the posting list tuple).
 *
 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
 * "minus infinity": this routine will always claim it is less than the
 * scankey.  The actual key value stored is explicitly truncated to 0
 * attributes (explicitly minus infinity) with version 3+ indexes, but
 * that isn't relied upon.  This allows us to implement the Lehman and
 * Yao convention that the first down-link pointer is before the first
 * key.  See backend/access/nbtree/README for details.
 *----------
 */
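/*
 * For instance, on a leaf page whose offsets 1..3 hold the keys 'a', 'b'
 * and 'c', a scankey for 'b' yields a positive result, zero, and a negative
 * result respectively at those offsets.
 */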
int32
_bt_compare(Relation rel,
			BTScanInsert key,
			Page page,
			OffsetNumber offnum)
{
	TupleDesc	itupdesc = RelationGetDescr(rel);
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	IndexTuple	itup;
	ItemPointer heapTid;
	ScanKey		scankey;
	int			ncmpkey;
	int			ntupatts;
	int32		result;

	Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
	Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
	Assert(key->heapkeyspace || key->scantid == NULL);

	/*
	 * Force result ">" if target item is first data item on an internal page
	 * --- see NOTE above.
	 */
	if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
		return 1;

	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
	ntupatts = BTreeTupleGetNAtts(itup, rel);

	/*
	 * The scan key is set up with the attribute number associated with each
	 * term in the key.  It is important that, if the index is multi-key, the
	 * scan contain the first k key attributes, and that they be in order.  If
	 * you think about how multi-key ordering works, you'll understand why
	 * this is.
	 *
	 * We don't test for violation of this condition here, however.  The
	 * initial setup for the index scan had better have gotten it right (see
	 * _bt_first).
	 */

	ncmpkey = Min(ntupatts, key->keysz);
	Assert(key->heapkeyspace || ncmpkey == key->keysz);
	Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
	scankey = key->scankeys;
	for (int i = 1; i <= ncmpkey; i++)
	{
		Datum		datum;
		bool		isNull;

		datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);

		if (scankey->sk_flags & SK_ISNULL)	/* key is NULL */
		{
			if (isNull)
				result = 0;		/* NULL "=" NULL */
			else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
				result = -1;	/* NULL "<" NOT_NULL */
			else
				result = 1;		/* NULL ">" NOT_NULL */
		}
		else if (isNull)		/* key is NOT_NULL and item is NULL */
		{
			if (scankey->sk_flags & SK_BT_NULLS_FIRST)
				result = 1;		/* NOT_NULL ">" NULL */
			else
				result = -1;	/* NOT_NULL "<" NULL */
		}
		else
		{
			/*
			 * The sk_func needs to be passed the index value as left arg and
			 * the sk_argument as right arg (they might be of different
			 * types).  Since it is convenient for callers to think of
			 * _bt_compare as comparing the scankey to the index item, we have
			 * to flip the sign of the comparison result.  (Unless it's a DESC
			 * column, in which case we *don't* flip the sign.)
			 */
			result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
													 scankey->sk_collation,
													 datum,
													 scankey->sk_argument));

			if (!(scankey->sk_flags & SK_BT_DESC))
				INVERT_COMPARE_RESULT(result);
		}

		/* if the keys are unequal, return the difference */
		if (result != 0)
			return result;

		scankey++;
	}

	/*
	 * All non-truncated attributes (other than heap TID) were found to be
	 * equal.  Treat truncated attributes as minus infinity when scankey has a
	 * key attribute value that would otherwise be compared directly.
	 *
	 * Note: it doesn't matter if ntupatts includes non-key attributes;
	 * scankey won't, so explicitly excluding non-key attributes isn't
	 * necessary.
	 */
	if (key->keysz > ntupatts)
		return 1;

	/*
	 * Use the heap TID attribute and scantid to try to break the tie.  The
	 * rules are the same as any other key attribute -- only the
	 * representation differs.
	 */
	heapTid = BTreeTupleGetHeapTID(itup);
	if (key->scantid == NULL)
	{
		/*
		 * Most searches have a scankey that is considered greater than a
		 * truncated pivot tuple if and when the scankey has equal values for
		 * attributes up to and including the least significant untruncated
		 * attribute in tuple.
		 *
		 * For example, if an index has the minimum two attributes (single
		 * user key attribute, plus heap TID attribute), and a page's high key
		 * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
		 * will not descend to the page to the left.  The search will descend
		 * right instead.  The truncated attribute in pivot tuple means that
		 * all non-pivot tuples on the page to the left are strictly < 'foo',
		 * so it isn't necessary to descend left.  In other words, search
		 * doesn't have to descend left because it isn't interested in a match
		 * that has a heap TID value of -inf.
		 *
		 * However, some searches (pivotsearch searches) actually require that
		 * we descend left when this happens.  -inf is treated as a possible
		 * match for omitted scankey attribute(s).  This is needed by page
		 * deletion, which must re-find leaf pages that are targets for
		 * deletion using their high keys.
		 *
		 * Note: the heap TID part of the test ensures that scankey is being
		 * compared to a pivot tuple with one or more truncated key
		 * attributes.
		 *
		 * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
		 * left here, since they have no heap TID attribute (and cannot have
		 * any -inf key values in any case, since truncation can only remove
		 * non-key attributes).  !heapkeyspace searches must always be
		 * prepared to deal with matches on both sides of the pivot once the
		 * leaf level is reached.
		 */
		if (key->heapkeyspace && !key->pivotsearch &&
			key->keysz == ntupatts && heapTid == NULL)
			return 1;

		/* All provided scankey arguments found to be equal */
		return 0;
	}

	/*
	 * Treat truncated heap TID as minus infinity, since scankey has a key
	 * attribute value (scantid) that would otherwise be compared directly
	 */
	Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
	if (heapTid == NULL)
		return 1;

	/*
	 * Scankey must be treated as equal to a posting list tuple if its scantid
	 * value falls within the range of the posting list.  In all other cases
	 * there can only be a single heap TID value, which is compared directly
	 * with scantid.
	 */
	Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
	result = ItemPointerCompare(key->scantid, heapTid);
	if (result <= 0 || !BTreeTupleIsPosting(itup))
		return result;
	else
	{
		result = ItemPointerCompare(key->scantid,
									BTreeTupleGetMaxHeapTID(itup));
		if (result > 0)
			return 1;
	}

	return 0;
}

/*
 *	_bt_first() -- Find the first item in a scan.
 *
 * We need to be clever about the direction of scan, the search
 * conditions, and the tree ordering.  We find the first item (or,
 * if backwards scan, the last item) in the tree that satisfies the
 * qualifications in the scan key.  On success exit, the page containing
 * the current index tuple is pinned but not locked, and data about
 * the matching tuple(s) on the page has been loaded into so->currPos.
 * scan->xs_ctup.t_self is set to the heap TID of the current tuple,
 * and if requested, scan->xs_itup points to a copy of the index tuple.
 *
 * If there are no matching items in the index, we return false, with no
 * pins or locks held.
 *
 * Note that scan->keyData[], and the so->keyData[] scankey built from it,
 * are both search-type scankeys (see nbtree/README for more about this).
 * Within this routine, we build a temporary insertion-type scankey to use
 * in locating the scan start position.
 */
bool
_bt_first(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Buffer		buf;
	BTStack		stack;
	OffsetNumber offnum;
	StrategyNumber strat;
	bool		nextkey;
	bool		goback;
	BTScanInsertData inskey;
	ScanKey		startKeys[INDEX_MAX_KEYS];
	ScanKeyData notnullkeys[INDEX_MAX_KEYS];
	int			keysCount = 0;
	int			i;
	bool		status;
	StrategyNumber strat_total;
	BTScanPosItem *currItem;
	BlockNumber blkno;

	Assert(!BTScanPosIsValid(so->currPos));

	pgstat_count_index_scan(rel);

	/*
	 * Examine the scan keys and eliminate any redundant keys; also mark the
	 * keys that must be matched to continue the scan.
	 */
	_bt_preprocess_keys(scan);

	/*
	 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
	 * never be satisfied (eg, x == 1 AND x > 2).
	 */
	if (!so->qual_ok)
	{
		/* Notify any other workers that we're done with this scan key. */
		_bt_parallel_done(scan);
		return false;
	}

	/*
	 * For parallel scans, get the starting page from shared state.  If the
	 * scan has not started yet, proceed to find the first leaf page in the
	 * usual way while keeping the other participating processes waiting.  If
	 * the scan has already begun, use the page number from the shared
	 * structure.
	 */
	if (scan->parallel_scan != NULL)
	{
		status = _bt_parallel_seize(scan, &blkno);
		if (!status)
			return false;
		else if (blkno == P_NONE)
		{
			_bt_parallel_done(scan);
			return false;
		}
		else if (blkno != InvalidBlockNumber)
		{
			if (!_bt_parallel_readpage(scan, blkno, dir))
				return false;
			goto readcomplete;
		}
	}

	/*----------
	 * Examine the scan keys to discover where we need to start the scan.
	 *
	 * We want to identify the keys that can be used as starting boundaries;
	 * these are =, >, or >= keys for a forward scan or =, <, <= keys for
	 * a backwards scan.  We can use keys for multiple attributes so long as
	 * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
	 * a > or < boundary or find an attribute with no boundary (which can be
	 * thought of as the same as "> -infinity"), we can't use keys for any
	 * attributes to its right, because it would break our simplistic notion
	 * of what initial positioning strategy to use.
	 *
	 * When the scan keys include cross-type operators, _bt_preprocess_keys
	 * may not be able to eliminate redundant keys; in such cases we will
	 * arbitrarily pick a usable one for each attribute.  This is correct
	 * but possibly not optimal behavior.  (For example, with keys like
	 * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
	 * x=5 would be more efficient.)  Since the situation only arises given
	 * a poorly-worded query plus an incomplete opfamily, live with it.
	 *
	 * When both equality and inequality keys appear for a single attribute
	 * (again, only possible when cross-type operators appear), we *must*
	 * select one of the equality keys for the starting point, because
	 * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
	 * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
	 * start at x=4, we will fail and stop before reaching x=10.  If multiple
	 * equality quals survive preprocessing, however, it doesn't matter which
	 * one we use --- by definition, they are either redundant or
	 * contradictory.
	 *
	 * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
	 * If the index stores nulls at the end of the index we'll be starting
	 * from, and we have no boundary key for the column (which means the key
	 * we deduced NOT NULL from is an inequality key that constrains the other
	 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
	 * use as a boundary key.  If we didn't do this, we might find ourselves
	 * traversing a lot of null entries at the start of the scan.
	 *
	 * In this loop, row-comparison keys are treated the same as keys on their
	 * first (leftmost) columns.  We'll add on lower-order columns of the row
	 * comparison below, if possible.
	 *
	 * The selected scan keys (at most one per index column) are remembered by
	 * storing their addresses into the local startKeys[] array.
	 *----------
	 */
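	/*
	 * For example, given a forward scan with the quals "x = 5 AND y > 10 AND
	 * z = 4", the loop below selects x = 5 and y > 10 as boundary keys and
	 * then stops (strat_total becomes BTGreaterStrategyNumber); the z key
	 * cannot help with initial positioning, though it still filters the
	 * tuples the scan returns.
	 */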
	strat_total = BTEqualStrategyNumber;
	if (so->numberOfKeys > 0)
	{
		AttrNumber	curattr;
		ScanKey		chosen;
		ScanKey		impliesNN;
		ScanKey		cur;

		/*
		 * chosen is the so-far-chosen key for the current attribute, if any.
		 * We don't cast the decision in stone until we reach keys for the
		 * next attribute.
		 */
		curattr = 1;
		chosen = NULL;
		/* Also remember any scankey that implies a NOT NULL constraint */
		impliesNN = NULL;

		/*
		 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
		 * pass to handle after-last-key processing.  Actual exit from the
		 * loop is at one of the "break" statements below.
		 */
		for (cur = so->keyData, i = 0;; cur++, i++)
		{
			if (i >= so->numberOfKeys || cur->sk_attno != curattr)
			{
				/*
				 * Done looking at keys for curattr.  If we didn't find a
				 * usable boundary key, see if we can deduce a NOT NULL key.
				 */
				if (chosen == NULL && impliesNN != NULL &&
					((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
					 ScanDirectionIsForward(dir) :
					 ScanDirectionIsBackward(dir)))
				{
					/* Yes, so build the key in notnullkeys[keysCount] */
					chosen = &notnullkeys[keysCount];
					ScanKeyEntryInitialize(chosen,
										   (SK_SEARCHNOTNULL | SK_ISNULL |
											(impliesNN->sk_flags &
											 (SK_BT_DESC | SK_BT_NULLS_FIRST))),
										   curattr,
										   ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
											BTGreaterStrategyNumber :
											BTLessStrategyNumber),
										   InvalidOid,
										   InvalidOid,
										   InvalidOid,
										   (Datum) 0);
				}

				/*
				 * If we still didn't find a usable boundary key, quit; else
				 * save the boundary key pointer in startKeys.
				 */
				if (chosen == NULL)
					break;
				startKeys[keysCount++] = chosen;

				/*
				 * Adjust strat_total, and quit if we have stored a > or <
				 * key.
				 */
				strat = chosen->sk_strategy;
				if (strat != BTEqualStrategyNumber)
				{
					strat_total = strat;
					if (strat == BTGreaterStrategyNumber ||
						strat == BTLessStrategyNumber)
						break;
				}

				/*
				 * Done if that was the last attribute, or if next key is not
				 * in sequence (implying no boundary key is available for the
				 * next attribute).
				 */
				if (i >= so->numberOfKeys ||
					cur->sk_attno != curattr + 1)
					break;

				/*
				 * Reset for next attr.
				 */
				curattr = cur->sk_attno;
				chosen = NULL;
				impliesNN = NULL;
			}

			/*
			 * Can we use this key as a starting boundary for this attr?
			 *
			 * If not, does it imply a NOT NULL constraint?  (Because
			 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
			 * *any* inequality key works for that; we need not test.)
			 */
			switch (cur->sk_strategy)
			{
				case BTLessStrategyNumber:
				case BTLessEqualStrategyNumber:
					if (chosen == NULL)
					{
						if (ScanDirectionIsBackward(dir))
							chosen = cur;
						else
							impliesNN = cur;
					}
					break;
				case BTEqualStrategyNumber:
					/* override any non-equality choice */
					chosen = cur;
					break;
				case BTGreaterEqualStrategyNumber:
				case BTGreaterStrategyNumber:
					if (chosen == NULL)
					{
						if (ScanDirectionIsForward(dir))
							chosen = cur;
						else
							impliesNN = cur;
					}
					break;
			}
		}
	}

	/*
	 * If we found no usable boundary keys, we have to start from one end of
	 * the tree.  Walk down that edge to the first or last key, and scan from
	 * there.
	 */
	if (keysCount == 0)
	{
		bool		match;

		match = _bt_endpoint(scan, dir);

		if (!match)
		{
			/* No match, so mark (parallel) scan finished */
			_bt_parallel_done(scan);
		}

		return match;
	}

	/*
	 * We want to start the scan somewhere within the index.  Set up an
	 * insertion scankey we can use to search for the boundary point we
	 * identified above.  The insertion scankey is built using the keys
	 * identified by startKeys[].  (Remaining insertion scankey fields are
	 * initialized after initial-positioning strategy is finalized.)
	 */
	Assert(keysCount <= INDEX_MAX_KEYS);
	for (i = 0; i < keysCount; i++)
	{
		ScanKey		cur = startKeys[i];

		Assert(cur->sk_attno == i + 1);

		if (cur->sk_flags & SK_ROW_HEADER)
		{
			/*
			 * Row comparison header: look to the first row member instead.
			 *
			 * The member scankeys are already in insertion format (ie, they
			 * have sk_func = 3-way-comparison function), but we have to watch
			 * out for nulls, which _bt_preprocess_keys didn't check. A null
			 * in the first row member makes the condition unmatchable, just
			 * like qual_ok = false.
			 */
			ScanKey		subkey = (ScanKey) DatumGetPointer(cur->sk_argument);

			Assert(subkey->sk_flags & SK_ROW_MEMBER);
			if (subkey->sk_flags & SK_ISNULL)
			{
				_bt_parallel_done(scan);
				return false;
			}
			memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));

			/*
			 * If the row comparison is the last positioning key we accepted,
			 * try to add additional keys from the lower-order row members.
			 * (If we accepted independent conditions on additional index
			 * columns, we use those instead --- doesn't seem worth trying to
			 * determine which is more restrictive.)  Note that this is OK
			 * even if the row comparison is of ">" or "<" type, because the
			 * condition applied to all but the last row member is effectively
			 * ">=" or "<=", and so the extra keys don't break the positioning
			 * scheme.  But, by the same token, if we aren't able to use all
			 * the row members, then the part of the row comparison that we
			 * did use has to be treated as just a ">=" or "<=" condition, and
			 * so we'd better adjust strat_total accordingly.
			 */
			if (i == keysCount - 1)
			{
				bool		used_all_subkeys = false;

				Assert(!(subkey->sk_flags & SK_ROW_END));
				for (;;)
				{
					subkey++;
					Assert(subkey->sk_flags & SK_ROW_MEMBER);
					if (subkey->sk_attno != keysCount + 1)
						break;	/* out-of-sequence, can't use it */
					if (subkey->sk_strategy != cur->sk_strategy)
						break;	/* wrong direction, can't use it */
					if (subkey->sk_flags & SK_ISNULL)
						break;	/* can't use null keys */
					Assert(keysCount < INDEX_MAX_KEYS);
					memcpy(inskey.scankeys + keysCount, subkey,
						   sizeof(ScanKeyData));
					keysCount++;
					if (subkey->sk_flags & SK_ROW_END)
					{
						used_all_subkeys = true;
						break;
					}
				}
				if (!used_all_subkeys)
				{
					switch (strat_total)
					{
						case BTLessStrategyNumber:
							strat_total = BTLessEqualStrategyNumber;
							break;
						case BTGreaterStrategyNumber:
							strat_total = BTGreaterEqualStrategyNumber;
							break;
					}
				}
				break;			/* done with outer loop */
			}
		}
		else
		{
			/*
			 * Ordinary comparison key.  Transform the search-style scan key
			 * to an insertion scan key by replacing the sk_func with the
			 * appropriate btree comparison function.
			 *
			 * If scankey operator is not a cross-type comparison, we can use
			 * the cached comparison function; otherwise gotta look it up in
			 * the catalogs.  (That can't lead to infinite recursion, since no
			 * indexscan initiated by syscache lookup will use cross-data-type
			 * operators.)
			 *
			 * We support the convention that sk_subtype == InvalidOid means
			 * the opclass input type; this is a hack to simplify life for
			 * ScanKeyInit().
			 */
			if (cur->sk_subtype == rel->rd_opcintype[i] ||
				cur->sk_subtype == InvalidOid)
			{
				FmgrInfo   *procinfo;

				procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
				ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
											   cur->sk_flags,
											   cur->sk_attno,
											   InvalidStrategy,
											   cur->sk_subtype,
											   cur->sk_collation,
											   procinfo,
											   cur->sk_argument);
			}
			else
			{
				RegProcedure cmp_proc;

				cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
											 rel->rd_opcintype[i],
											 cur->sk_subtype,
											 BTORDER_PROC);
				if (!RegProcedureIsValid(cmp_proc))
					elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
						 BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
						 cur->sk_attno, RelationGetRelationName(rel));
				ScanKeyEntryInitialize(inskey.scankeys + i,
									   cur->sk_flags,
									   cur->sk_attno,
									   InvalidStrategy,
									   cur->sk_subtype,
									   cur->sk_collation,
									   cmp_proc,
									   cur->sk_argument);
			}
		}
	}

	/*----------
	 * Examine the selected initial-positioning strategy to determine exactly
	 * where we need to start the scan, and set flag variables to control the
	 * code below.
	 *
	 * If nextkey = false, _bt_search and _bt_binsrch will locate the first
	 * item >= scan key.  If nextkey = true, they will locate the first
	 * item > scan key.
	 *
	 * If goback = true, we will then step back one item, while if
	 * goback = false, we will start the scan on the located item.
	 *----------
	 */
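	/*
	 * For instance, a backward scan on "x <= 5" takes the
	 * BTLessEqualStrategyNumber case: nextkey = true locates the first item
	 * > 5, and goback = true steps back one item to the last item <= 5,
	 * which is where a backward scan must start.
	 */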
	switch (strat_total)
	{
		case BTLessStrategyNumber:

			/*
			 * Find first item >= scankey, then back up one to arrive at last
			 * item < scankey.  (Note: this positioning strategy is only used
			 * for a backward scan, so that is always the correct starting
			 * position.)
			 */
			nextkey = false;
			goback = true;
			break;

		case BTLessEqualStrategyNumber:

			/*
			 * Find first item > scankey, then back up one to arrive at last
			 * item <= scankey.  (Note: this positioning strategy is only used
			 * for a backward scan, so that is always the correct starting
			 * position.)
			 */
			nextkey = true;
			goback = true;
			break;

		case BTEqualStrategyNumber:

			/*
			 * If a backward scan was specified, need to start with last equal
			 * item not first one.
			 */
			if (ScanDirectionIsBackward(dir))
			{
				/*
				 * This is the same as the <= strategy.  We will check at the
				 * end whether the found item is actually =.
				 */
				nextkey = true;
				goback = true;
			}
			else
			{
				/*
				 * This is the same as the >= strategy.  We will check at the
				 * end whether the found item is actually =.
				 */
				nextkey = false;
				goback = false;
			}
			break;

		case BTGreaterEqualStrategyNumber:

			/*
			 * Find first item >= scankey.  (This is only used for forward
			 * scans.)
			 */
			nextkey = false;
			goback = false;
			break;

		case BTGreaterStrategyNumber:

			/*
			 * Find first item > scankey.  (This is only used for forward
			 * scans.)
			 */
			nextkey = true;
			goback = false;
			break;

		default:
			/* can't get here, but keep compiler quiet */
			elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
			return false;
	}

	/* Initialize remaining insertion scan key fields */
	_bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
	inskey.anynullkeys = false; /* unused */
	inskey.nextkey = nextkey;
	inskey.pivotsearch = false;
	inskey.scantid = NULL;
	inskey.keysz = keysCount;

	/*
	 * Use the manufactured insertion scan key to descend the tree and
	 * position ourselves on the target leaf page.
	 */
	stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);

	/* don't need to keep the stack around... */
	_bt_freestack(stack);

	if (!BufferIsValid(buf))
	{
		/*
		 * We only get here if the index is completely empty.  Lock relation
		 * because nothing finer to lock exists.
		 */
		PredicateLockRelation(rel, scan->xs_snapshot);

		/*
		 * mark parallel scan as done, so that all the workers can finish
		 * their scan
		 */
		_bt_parallel_done(scan);
		BTScanPosInvalidate(so->currPos);

		return false;
	}
	else
		PredicateLockPage(rel, BufferGetBlockNumber(buf),
						  scan->xs_snapshot);

	_bt_initialize_more_data(so, dir);

	/* position to the precise item on the page */
	offnum = _bt_binsrch(rel, &inskey, buf);

	/*
	 * If nextkey = false, we are positioned at the first item >= scan key, or
	 * possibly at the end of a page on which all the existing items are less
	 * than the scan key and we know that everything on later pages is greater
	 * than or equal to scan key.
	 *
	 * If nextkey = true, we are positioned at the first item > scan key, or
	 * possibly at the end of a page on which all the existing items are less
	 * than or equal to the scan key and we know that everything on later
	 * pages is greater than scan key.
	 *
	 * The actually desired starting point is either this item or the prior
	 * one, or in the end-of-page case it's the first item on the next page or
	 * the last item on this page.  Adjust the starting offset if needed. (If
	 * this results in an offset before the first item or after the last one,
	 * _bt_readpage will report no items found, and then we'll step to the
	 * next page as needed.)
	 */
	if (goback)
		offnum = OffsetNumberPrev(offnum);

	/* remember which buffer we have pinned, if any */
	Assert(!BTScanPosIsValid(so->currPos));
	so->currPos.buf = buf;

	/*
	 * Now load data from the first page of the scan.
	 */
	if (!_bt_readpage(scan, dir, offnum))
	{
		/*
		 * There's no actually-matching data on this page.  Try to advance to
		 * the next page.  Return false if there's no matching data at all.
		 */
		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
		if (!_bt_steppage(scan, dir))
			return false;
	}
	else
	{
		/* Drop the lock, and maybe the pin, on the current page */
		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
	}

readcomplete:
	/* OK, itemIndex says what to return */
	currItem = &so->currPos.items[so->currPos.itemIndex];
	scan->xs_heaptid = currItem->heapTid;
	if (scan->xs_want_itup)
		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

	return true;
}

/*
 *	_bt_next() -- Get the next item in a scan.
 *
 * On entry, so->currPos describes the current page, which may be pinned
 * but is not locked, and so->currPos.itemIndex identifies which item was
 * previously returned.
 *
 * On successful exit, scan->xs_ctup.t_self is set to the TID of the
 * next heap tuple, and if requested, scan->xs_itup points to a copy of
 * the index tuple.  so->currPos is updated as needed.
 *
 * On failure exit (no more tuples), we release pin and set
 * so->currPos.buf to InvalidBuffer.
 */
bool
_bt_next(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BTScanPosItem *currItem;

	/*
	 * Advance to next tuple on current page; or if there's no more, try to
	 * step to the next page with data.
	 */
	if (ScanDirectionIsForward(dir))
	{
		if (++so->currPos.itemIndex > so->currPos.lastItem)
		{
			if (!_bt_steppage(scan, dir))
				return false;
		}
	}
	else
	{
		if (--so->currPos.itemIndex < so->currPos.firstItem)
		{
			if (!_bt_steppage(scan, dir))
				return false;
		}
	}

	/* OK, itemIndex says what to return */
	currItem = &so->currPos.items[so->currPos.itemIndex];
	scan->xs_heaptid = currItem->heapTid;
	if (scan->xs_want_itup)
		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

	return true;
}

/*
 *	_bt_readpage() -- Load data from current index page into so->currPos
 *
 * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
 * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;
 * they are updated as appropriate.  All other fields of so->currPos are
 * initialized from scratch here.
 *
 * We scan the current page starting at offnum and moving in the indicated
 * direction.  All items matching the scan keys are loaded into currPos.items.
 * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
 * that there can be no more matching tuples in the current scan direction.
 *
 * In the case of a parallel scan, caller must have called _bt_parallel_seize
 * prior to calling this function; this function will invoke
 * _bt_parallel_release before returning.
 *
 * Returns true if any matching items found on the page, false if none.
 */
static bool
_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber minoff;
	OffsetNumber maxoff;
	int			itemIndex;
	bool		continuescan;
	int			indnatts;

	/*
	 * We must have the buffer pinned and locked, but the usual macro can't be
	 * used here; this function is what makes it good for currPos.
	 */
	Assert(BufferIsValid(so->currPos.buf));

	page = BufferGetPage(so->currPos.buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/* allow the next page to be processed by a parallel worker */
	if (scan->parallel_scan)
	{
		if (ScanDirectionIsForward(dir))
			_bt_parallel_release(scan, opaque->btpo_next);
		else
			_bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
	}

	continuescan = true;		/* default assumption */
	indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
	minoff = P_FIRSTDATAKEY(opaque);
	maxoff = PageGetMaxOffsetNumber(page);

	/*
	 * We note the buffer's block number so that we can release the pin later.
	 * This allows us to re-read the buffer if it is needed again for hinting.
	 */
	so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);

	/*
	 * We save the LSN of the page as we read it, so that we know whether it
	 * is safe to apply LP_DEAD hints to the page later.  This allows us to
	 * drop the pin for MVCC scans, which allows vacuum to avoid blocking.
	 */
	so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
1572
1573 /*
1574 * we must save the page's right-link while scanning it; this tells us
1575 * where to step right to after we're done with these items. There is no
1576 * corresponding need for the left-link, since splits always go right.
1577 */
1578 so->currPos.nextPage = opaque->btpo_next;
1579
1580 /* initialize tuple workspace to empty */
1581 so->currPos.nextTupleOffset = 0;
1582
1583 /*
1584 * Now that the current page has been made consistent, the macro should be
1585 * good.
1586 */
1587 Assert(BTScanPosIsPinned(so->currPos));
1588
1589 if (ScanDirectionIsForward(dir))
1590 {
1591 /* load items[] in ascending order */
1592 itemIndex = 0;
1593
1594 offnum = Max(offnum, minoff);
1595
1596 while (offnum <= maxoff)
1597 {
1598 ItemId iid = PageGetItemId(page, offnum);
1599 IndexTuple itup;
1600
1601 /*
1602 * If the scan specifies not to return killed tuples, then we
1603 * treat a killed tuple as not passing the qual
1604 */
1605 if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1606 {
1607 offnum = OffsetNumberNext(offnum);
1608 continue;
1609 }
1610
1611 itup = (IndexTuple) PageGetItem(page, iid);
1612
1613 if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
1614 {
1615 /* tuple passes all scan key conditions */
1616 if (!BTreeTupleIsPosting(itup))
1617 {
1618 /* Remember it */
1619 _bt_saveitem(so, itemIndex, offnum, itup);
1620 itemIndex++;
1621 }
1622 else
1623 {
1624 int tupleOffset;
1625
1626 /*
1627 * Set up state to return posting list, and remember first
1628 * TID
1629 */
1630 tupleOffset =
1631 _bt_setuppostingitems(so, itemIndex, offnum,
1632 BTreeTupleGetPostingN(itup, 0),
1633 itup);
1634 itemIndex++;
1635 /* Remember additional TIDs */
1636 for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1637 {
1638 _bt_savepostingitem(so, itemIndex, offnum,
1639 BTreeTupleGetPostingN(itup, i),
1640 tupleOffset);
1641 itemIndex++;
1642 }
1643 }
1644 }
1645 /* When !continuescan, there can't be any more matches, so stop */
1646 if (!continuescan)
1647 break;
1648
1649 offnum = OffsetNumberNext(offnum);
1650 }

		/*
		 * We don't need to visit the page to the right when the high key
		 * indicates that no more matches will be found there.
		 *
		 * Checking the high key like this works out more often than you might
		 * think.  Leaf page splits pick a split point between the two most
		 * dissimilar tuples (this is weighed against the need to evenly share
		 * free space).  Leaf pages with high key attribute values that can
		 * only appear on non-pivot tuples on the right sibling page are
		 * common.
		 */
		if (continuescan && !P_RIGHTMOST(opaque))
		{
			ItemId		iid = PageGetItemId(page, P_HIKEY);
			IndexTuple	itup = (IndexTuple) PageGetItem(page, iid);
			int			truncatt;

			truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
		}

		if (!continuescan)
			so->currPos.moreRight = false;

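		/*
		 * Forward scans fill items[] from slot 0 upward, so firstItem is
		 * always 0 and itemIndex starts at 0; lastItem is the slot of the
		 * final item saved above.
		 */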
		Assert(itemIndex <= MaxTIDsPerBTreePage);
		so->currPos.firstItem = 0;
		so->currPos.lastItem = itemIndex - 1;
		so->currPos.itemIndex = 0;
	}
	else
	{
		/* load items[] in descending order */
		itemIndex = MaxTIDsPerBTreePage;

		offnum = Min(offnum, maxoff);

		while (offnum >= minoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	itup;
			bool		tuple_alive;
			bool		passes_quals;

			/*
			 * If the scan specifies not to return killed tuples, then we
			 * treat a killed tuple as not passing the qual.  Most of the
			 * time, it's a win to not bother examining the tuple's index
			 * keys, but just skip to the next tuple (previous, actually,
			 * since we're scanning backwards).  However, if this is the first
			 * tuple on the page, we do check the index keys, to prevent
			 * uselessly advancing to the page to the left.  This is similar
			 * to the high key optimization used by forward scans.
			 */
			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
			{
				Assert(offnum >= P_FIRSTDATAKEY(opaque));
				if (offnum > P_FIRSTDATAKEY(opaque))
				{
					offnum = OffsetNumberPrev(offnum);
					continue;
				}

				tuple_alive = false;
			}
			else
				tuple_alive = true;

			itup = (IndexTuple) PageGetItem(page, iid);

			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
										 &continuescan);
			if (passes_quals && tuple_alive)
			{
				/* tuple passes all scan key conditions */
				if (!BTreeTupleIsPosting(itup))
				{
					/* Remember it */
					itemIndex--;
					_bt_saveitem(so, itemIndex, offnum, itup);
				}
				else
				{
					int			tupleOffset;

					/*
					 * Set up state to return posting list, and remember first
					 * TID.
					 *
					 * Note that we deliberately save/return items from
					 * posting lists in ascending heap TID order for backwards
					 * scans.  This allows _bt_killitems() to make a
					 * consistent assumption about the order of items
					 * associated with the same posting list tuple.
					 */
					itemIndex--;
					tupleOffset =
						_bt_setuppostingitems(so, itemIndex, offnum,
											  BTreeTupleGetPostingN(itup, 0),
											  itup);
					/* Remember additional TIDs */
					for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
					{
						itemIndex--;
						_bt_savepostingitem(so, itemIndex, offnum,
											BTreeTupleGetPostingN(itup, i),
											tupleOffset);
					}
				}
			}
			if (!continuescan)
			{
				/* there can't be any more matches, so stop */
				so->currPos.moreLeft = false;
				break;
			}

			offnum = OffsetNumberPrev(offnum);
		}

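		/*
		 * Backward scans fill items[] from the top slot downward, so the
		 * saved items occupy items[itemIndex..MaxTIDsPerBTreePage - 1]; the
		 * scan will consume them by decrementing itemIndex from the top.
		 */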
		Assert(itemIndex >= 0);
		so->currPos.firstItem = itemIndex;
		so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
		so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
	}

	return (so->currPos.firstItem <= so->currPos.lastItem);
}

/* Save an index item into so->currPos.items[itemIndex] */
static void
_bt_saveitem(BTScanOpaque so, int itemIndex,
			 OffsetNumber offnum, IndexTuple itup)
{
	BTScanPosItem *currItem = &so->currPos.items[itemIndex];

	Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));

	currItem->heapTid = itup->t_tid;
	currItem->indexOffset = offnum;
	if (so->currTuples)
	{
		Size		itupsz = IndexTupleSize(itup);

		currItem->tupleOffset = so->currPos.nextTupleOffset;
		memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
		so->currPos.nextTupleOffset += MAXALIGN(itupsz);
	}
}

/*
 * Setup state to save TIDs/items from a single posting list tuple.
 *
 * Saves an index item into so->currPos.items[itemIndex] for the TID that is
 * returned to the scan first.  Second and subsequent TIDs for the posting
 * list should be saved by calling _bt_savepostingitem().
 *
 * Returns the offset into tuple storage space at which the base tuple is
 * stored, when tuples are being saved (zero otherwise).
 */
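/*
 * Illustrative caller pattern (this mirrors the loops in _bt_readpage()
 * above; it is a sketch, not an additional API contract):
 *
 *		tupleOffset = _bt_setuppostingitems(so, itemIndex++, offnum,
 *											BTreeTupleGetPostingN(itup, 0),
 *											itup);
 *		for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
 *			_bt_savepostingitem(so, itemIndex++, offnum,
 *								BTreeTupleGetPostingN(itup, i),
 *								tupleOffset);
 */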
static int
_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
					  ItemPointer heapTid, IndexTuple itup)
{
	BTScanPosItem *currItem = &so->currPos.items[itemIndex];

	Assert(BTreeTupleIsPosting(itup));

	currItem->heapTid = *heapTid;
	currItem->indexOffset = offnum;
	if (so->currTuples)
	{
		/* Save base IndexTuple (truncate posting list) */
		IndexTuple	base;
		Size		itupsz = BTreeTupleGetPostingOffset(itup);

		itupsz = MAXALIGN(itupsz);
		currItem->tupleOffset = so->currPos.nextTupleOffset;
		base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
		memcpy(base, itup, itupsz);
		/* Defensively reduce work area index tuple header size */
		base->t_info &= ~INDEX_SIZE_MASK;
		base->t_info |= itupsz;
		so->currPos.nextTupleOffset += itupsz;

		return currItem->tupleOffset;
	}

	return 0;
}

/*
 * Save an index item into so->currPos.items[itemIndex] for the current
 * posting list tuple.
 *
 * Assumes that _bt_setuppostingitems() has already been called for the
 * current posting list tuple.  Caller passes its return value as tupleOffset.
 */
static inline void
_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
					ItemPointer heapTid, int tupleOffset)
{
	BTScanPosItem *currItem = &so->currPos.items[itemIndex];

	currItem->heapTid = *heapTid;
	currItem->indexOffset = offnum;

	/*
	 * Have index-only scans return the same base IndexTuple for every TID
	 * that originates from the same posting list
	 */
	if (so->currTuples)
		currItem->tupleOffset = tupleOffset;
}

/*
 * _bt_steppage() -- Step to next page containing valid data for scan
 *
 * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
 * if pinned, we'll drop the pin before moving to the next page.  (The buffer
 * is never locked on entry.)
 *
 * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
 * read lock, on that page.  If we do not hold the pin, we set so->currPos.buf
 * to InvalidBuffer.  We return true to indicate success.
 */
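/*
 * A minimal sketch of the expected caller pattern (loosely modeled on
 * _bt_next()'s forward-scan case): once the items saved from the current
 * page are exhausted, step to the next page.
 *
 *		if (++so->currPos.itemIndex > so->currPos.lastItem)
 *		{
 *			if (!_bt_steppage(scan, dir))
 *				return false;
 *		}
 */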
static bool
_bt_steppage(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BlockNumber blkno = InvalidBlockNumber;
	bool		status;

	Assert(BTScanPosIsValid(so->currPos));

	/* Before leaving current page, deal with any killed items */
	if (so->numKilled > 0)
		_bt_killitems(scan);

	/*
	 * Before we modify currPos, make a copy of the page data if there was a
	 * mark position that needs it.
	 */
	if (so->markItemIndex >= 0)
	{
		/* bump pin on current buffer for assignment to mark buffer */
		if (BTScanPosIsPinned(so->currPos))
			IncrBufferRefCount(so->currPos.buf);
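		/*
		 * Copy the fixed fields of currPos plus items[0..lastItem]: the
		 * offsetof(..., items[1]) term covers everything up to and including
		 * items[0], and lastItem further array elements cover the rest.
		 */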
		memcpy(&so->markPos, &so->currPos,
			   offsetof(BTScanPosData, items[1]) +
			   so->currPos.lastItem * sizeof(BTScanPosItem));
		if (so->markTuples)
			memcpy(so->markTuples, so->currTuples,
				   so->currPos.nextTupleOffset);
		so->markPos.itemIndex = so->markItemIndex;
		so->markItemIndex = -1;
	}

	if (ScanDirectionIsForward(dir))
	{
		/* Walk right to the next page with data */
		if (scan->parallel_scan != NULL)
		{
			/*
			 * Seize the scan to get the next block number; if the scan has
			 * ended already, bail out.
			 */
			status = _bt_parallel_seize(scan, &blkno);
			if (!status)
			{
				/* release the previous buffer, if pinned */
				BTScanPosUnpinIfPinned(so->currPos);
				BTScanPosInvalidate(so->currPos);
				return false;
			}
		}
		else
		{
			/* Not parallel, so use the previously-saved nextPage link. */
			blkno = so->currPos.nextPage;
		}

		/* Remember we left a page with data */
		so->currPos.moreLeft = true;

		/* release the previous buffer, if pinned */
		BTScanPosUnpinIfPinned(so->currPos);
	}
	else
	{
		/* Remember we left a page with data */
		so->currPos.moreRight = true;

		if (scan->parallel_scan != NULL)
		{
			/*
			 * Seize the scan to get the current block number; if the scan has
			 * ended already, bail out.
			 */
			status = _bt_parallel_seize(scan, &blkno);
			BTScanPosUnpinIfPinned(so->currPos);
			if (!status)
			{
				BTScanPosInvalidate(so->currPos);
				return false;
			}
		}
		else
		{
			/* Not parallel, so just use our own notion of the current page */
			blkno = so->currPos.currPage;
		}
	}

	if (!_bt_readnextpage(scan, blkno, dir))
		return false;

	/* Drop the lock, and maybe the pin, on the current page */
	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);

	return true;
}

/*
 * _bt_readnextpage() -- Read next page containing valid data for scan
 *
 * On success exit, so->currPos is updated to contain data from the next
 * interesting page.  Caller is responsible for releasing the lock and pin on
 * the buffer on success.  We return true to indicate success.
 *
 * If there are no more matching records in the given direction, we drop all
 * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
 */
static bool
_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Relation	rel;
	Page		page;
	BTPageOpaque opaque;
	bool		status;

	rel = scan->indexRelation;

	if (ScanDirectionIsForward(dir))
	{
		for (;;)
		{
			/*
			 * if we're at end of scan, give up and mark parallel scan as
			 * done, so that all the workers can finish their scan
			 */
			if (blkno == P_NONE || !so->currPos.moreRight)
			{
				_bt_parallel_done(scan);
				BTScanPosInvalidate(so->currPos);
				return false;
			}
			/* check for interrupts while we're not holding any buffer lock */
			CHECK_FOR_INTERRUPTS();
			/* step right one page */
			so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
			page = BufferGetPage(so->currPos.buf);
			TestForOldSnapshot(scan->xs_snapshot, rel, page);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
			/* check for deleted page */
			if (!P_IGNORE(opaque))
			{
				PredicateLockPage(rel, blkno, scan->xs_snapshot);
				/* see if there are any matches on this page */
				/* note that this will clear moreRight if we can stop */
				if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
					break;
			}
			else if (scan->parallel_scan != NULL)
			{
				/* allow the next page to be processed by a parallel worker */
				_bt_parallel_release(scan, opaque->btpo_next);
			}

			/* nope, keep going */
			if (scan->parallel_scan != NULL)
			{
				_bt_relbuf(rel, so->currPos.buf);
				status = _bt_parallel_seize(scan, &blkno);
				if (!status)
				{
					BTScanPosInvalidate(so->currPos);
					return false;
				}
			}
			else
			{
				blkno = opaque->btpo_next;
				_bt_relbuf(rel, so->currPos.buf);
			}
		}
	}
	else
	{
		/*
		 * Should only happen in parallel cases, when some other backend
		 * advanced the scan.
		 */
		if (so->currPos.currPage != blkno)
		{
			BTScanPosUnpinIfPinned(so->currPos);
			so->currPos.currPage = blkno;
		}

		/*
		 * Walk left to the next page with data.  This is much more complex
		 * than the walk-right case because of the possibility that the page
		 * to our left splits while we are in flight to it, plus the
		 * possibility that the page we were on gets deleted after we leave
		 * it.  See nbtree/README for details.
		 *
		 * It might be possible to rearrange this code to have less overhead
		 * in pinning and locking, but that would require capturing the left
		 * sibling's block number when the page is initially read, and using
		 * it here, along with big changes to _bt_walk_left() and the code
		 * below.  It is not clear whether this would be a win, since if the
		 * page immediately to the left splits after we read this page and
		 * before we step left, we would need to visit more pages than with
		 * the current code.
		 *
		 * Note that if we change the code so that we drop the pin for a scan
		 * which uses a non-MVCC snapshot, we will need to modify the code for
		 * walking left, to allow for the possibility that a referenced page
		 * has been deleted.  As long as the buffer is pinned or the snapshot
		 * is MVCC the page cannot move past the half-dead state to fully
		 * deleted.
		 */
		if (BTScanPosIsPinned(so->currPos))
			_bt_lockbuf(rel, so->currPos.buf, BT_READ);
		else
			so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);

		for (;;)
		{
			/* Done if we know there are no matching keys to the left */
			if (!so->currPos.moreLeft)
			{
				_bt_relbuf(rel, so->currPos.buf);
				_bt_parallel_done(scan);
				BTScanPosInvalidate(so->currPos);
				return false;
			}

			/* Step to next physical page */
			so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
											scan->xs_snapshot);

			/* if we're physically at end of index, return failure */
			if (so->currPos.buf == InvalidBuffer)
			{
				_bt_parallel_done(scan);
				BTScanPosInvalidate(so->currPos);
				return false;
			}

			/*
			 * Okay, we managed to move left to a non-deleted page.  Done if
			 * it's not half-dead and contains matching tuples.  Else loop
			 * back and do it all again.
			 */
			page = BufferGetPage(so->currPos.buf);
			TestForOldSnapshot(scan->xs_snapshot, rel, page);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
			if (!P_IGNORE(opaque))
			{
				PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
				/* see if there are any matches on this page */
				/* note that this will clear moreLeft if we can stop */
				if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
					break;
			}
			else if (scan->parallel_scan != NULL)
			{
				/* allow the next page to be processed by a parallel worker */
				_bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
			}

			/*
			 * For parallel scans, get the last page scanned as it is quite
			 * possible that by the time we try to seize the scan, some other
			 * worker has already advanced the scan to a different page.  We
			 * must continue based on the latest page scanned by any worker.
			 */
			if (scan->parallel_scan != NULL)
			{
				_bt_relbuf(rel, so->currPos.buf);
				status = _bt_parallel_seize(scan, &blkno);
				if (!status)
				{
					BTScanPosInvalidate(so->currPos);
					return false;
				}
				so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
			}
		}
	}

	return true;
}

/*
 * _bt_parallel_readpage() -- Read current page containing valid data for scan
 *
 * On success, we release the lock, and maybe the pin, on the buffer, and
 * return true.
 */
static bool
_bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	_bt_initialize_more_data(so, dir);

	if (!_bt_readnextpage(scan, blkno, dir))
		return false;

	/* Drop the lock, and maybe the pin, on the current page */
	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);

	return true;
}

/*
 * _bt_walk_left() -- step left one page, if possible
 *
 * The given buffer must be pinned and read-locked.  This will be dropped
 * before stepping left.  On return, we have pin and read lock on the
 * returned page, instead.
 *
 * Returns InvalidBuffer if there is no page to the left (no lock is held
 * in that case).
 *
 * When working on a non-leaf level, it is possible for the returned page
 * to be half-dead; the caller should check that condition and step left
 * again if it's important.
 */
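/*
 * A sketch of the simple (no concurrent split/delete) case, for orientation:
 * starting from page "orig", we follow orig's btpo_prev link to page L, and
 * succeed immediately if L's btpo_next links straight back to orig:
 *
 *		[ L ] <=> [ orig ]
 *
 * If L split after we fetched orig's left-link, we instead walk right from L
 * (a few hops at most) looking for the page whose right-link points back at
 * orig.  The harder cases, where orig itself got deleted, restart the outer
 * loop below from the page that absorbed orig's keyspace.
 */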
static Buffer
_bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
{
	Page		page;
	BTPageOpaque opaque;

	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	for (;;)
	{
		BlockNumber obknum;
		BlockNumber lblkno;
		BlockNumber blkno;
		int			tries;

		/* if we're at end of tree, release buf and return failure */
		if (P_LEFTMOST(opaque))
		{
			_bt_relbuf(rel, buf);
			break;
		}
		/* remember original page we are stepping left from */
		obknum = BufferGetBlockNumber(buf);
		/* step left */
		blkno = lblkno = opaque->btpo_prev;
		_bt_relbuf(rel, buf);
		/* check for interrupts while we're not holding any buffer lock */
		CHECK_FOR_INTERRUPTS();
		buf = _bt_getbuf(rel, blkno, BT_READ);
		page = BufferGetPage(buf);
		TestForOldSnapshot(snapshot, rel, page);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		/*
		 * If this isn't the page we want, walk right till we find what we
		 * want --- but go no more than four hops (an arbitrary limit).  If we
		 * don't find the correct page by then, the most likely bet is that
		 * the original page got deleted and isn't in the sibling chain at all
		 * anymore, not that its left sibling got split more than four times.
		 *
		 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
		 * because half-dead pages are still in the sibling chain.  Caller
		 * must reject half-dead pages if wanted.
		 */
		tries = 0;
		for (;;)
		{
			if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
			{
				/* Found desired page, return it */
				return buf;
			}
			if (P_RIGHTMOST(opaque) || ++tries > 4)
				break;
			blkno = opaque->btpo_next;
			buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
			page = BufferGetPage(buf);
			TestForOldSnapshot(snapshot, rel, page);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		}

		/* Return to the original page to see what's up */
		buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
		page = BufferGetPage(buf);
		TestForOldSnapshot(snapshot, rel, page);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (P_ISDELETED(opaque))
		{
			/*
			 * It was deleted.  Move right to first nondeleted page (there
			 * must be one); that is the page that has acquired the deleted
			 * one's keyspace, so stepping left from it will take us where we
			 * want to be.
			 */
			for (;;)
			{
				if (P_RIGHTMOST(opaque))
					elog(ERROR, "fell off the end of index \"%s\"",
						 RelationGetRelationName(rel));
				blkno = opaque->btpo_next;
				buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
				page = BufferGetPage(buf);
				TestForOldSnapshot(snapshot, rel, page);
				opaque = (BTPageOpaque) PageGetSpecialPointer(page);
				if (!P_ISDELETED(opaque))
					break;
			}

			/*
			 * Now return to top of loop, resetting obknum to point to this
			 * nondeleted page, and try again.
			 */
		}
		else
		{
			/*
			 * It wasn't deleted; the explanation had better be that the page
			 * to the left got split or deleted.  Without this check, we'd go
			 * into an infinite loop if there's anything wrong.
			 */
			if (opaque->btpo_prev == lblkno)
				elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
					 obknum, RelationGetRelationName(rel));
			/* Okay to try again with new lblkno value */
		}
	}

	return InvalidBuffer;
}

/*
 * _bt_get_endpoint() -- Find the first or last page on a given tree level
 *
 * If the index is empty, we will return InvalidBuffer; any other failure
 * condition causes ereport().  We will not return a dead page.
 *
 * The returned buffer is pinned and read-locked.
 */
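/*
 * For example, _bt_endpoint() below locates the leftmost or rightmost leaf
 * page of the index with:
 *
 *		buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir),
 *							   scan->xs_snapshot);
 */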
Buffer
_bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
				 Snapshot snapshot)
{
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber offnum;
	BlockNumber blkno;
	IndexTuple	itup;

	/*
	 * If we are looking for a leaf page, okay to descend from fast root;
	 * otherwise better descend from true root.  (There is no point in being
	 * smarter about intermediate levels.)
	 */
	if (level == 0)
		buf = _bt_getroot(rel, BT_READ);
	else
		buf = _bt_gettrueroot(rel);

	if (!BufferIsValid(buf))
		return InvalidBuffer;

	page = BufferGetPage(buf);
	TestForOldSnapshot(snapshot, rel, page);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	for (;;)
	{
		/*
		 * If we landed on a deleted page, step right to find a live page
		 * (there must be one).  Also, if we want the rightmost page, step
		 * right if needed to get to it (this could happen if the page split
		 * since we obtained a pointer to it).
		 */
		while (P_IGNORE(opaque) ||
			   (rightmost && !P_RIGHTMOST(opaque)))
		{
			blkno = opaque->btpo_next;
			if (blkno == P_NONE)
				elog(ERROR, "fell off the end of index \"%s\"",
					 RelationGetRelationName(rel));
			buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
			page = BufferGetPage(buf);
			TestForOldSnapshot(snapshot, rel, page);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		}

		/* Done? */
		if (opaque->btpo_level == level)
			break;
		if (opaque->btpo_level < level)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg_internal("btree level %u not found in index \"%s\"",
									 level, RelationGetRelationName(rel))));

		/* Descend to leftmost or rightmost child page */
		if (rightmost)
			offnum = PageGetMaxOffsetNumber(page);
		else
			offnum = P_FIRSTDATAKEY(opaque);

		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
		blkno = BTreeTupleGetDownLink(itup);

		buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
		page = BufferGetPage(buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	}

	return buf;
}

/*
 * _bt_endpoint() -- Find the first or last page in the index, and scan
 * from there to the first key satisfying all the quals.
 *
 * This is used by _bt_first() to set up a scan when we've determined
 * that the scan must start at the beginning or end of the index (for
 * a forward or backward scan respectively).  Exit conditions are the
 * same as for _bt_first().
 */
static bool
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber start;
	BTScanPosItem *currItem;

	/*
	 * Scan down to the leftmost or rightmost leaf page.  This is a simplified
	 * version of _bt_search().  We don't maintain a stack since we know we
	 * won't need it.
	 */
	buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);

	if (!BufferIsValid(buf))
	{
		/*
		 * Empty index.  Lock the whole relation, as nothing finer to lock
		 * exists.
		 */
		PredicateLockRelation(rel, scan->xs_snapshot);
		BTScanPosInvalidate(so->currPos);
		return false;
	}

	PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	Assert(P_ISLEAF(opaque));

	if (ScanDirectionIsForward(dir))
	{
		/* There could be dead pages to the left, so not this: */
		/* Assert(P_LEFTMOST(opaque)); */

		start = P_FIRSTDATAKEY(opaque);
	}
	else if (ScanDirectionIsBackward(dir))
	{
		Assert(P_RIGHTMOST(opaque));

		start = PageGetMaxOffsetNumber(page);
	}
	else
	{
		elog(ERROR, "invalid scan direction: %d", (int) dir);
		start = 0;				/* keep compiler quiet */
	}

	/* remember which buffer we have pinned */
	so->currPos.buf = buf;

	_bt_initialize_more_data(so, dir);

	/*
	 * Now load data from the first page of the scan.
	 */
	if (!_bt_readpage(scan, dir, start))
	{
		/*
		 * There's no actually-matching data on this page.  Try to advance to
		 * the next page.  Return false if there's no matching data at all.
		 */
		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
		if (!_bt_steppage(scan, dir))
			return false;
	}
	else
	{
		/* Drop the lock, and maybe the pin, on the current page */
		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
	}

	/* OK, itemIndex says what to return */
	currItem = &so->currPos.items[so->currPos.itemIndex];
	scan->xs_heaptid = currItem->heapTid;
	if (scan->xs_want_itup)
		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

	return true;
}

/*
 * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
 * for scan direction
 */
static inline void
_bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
{
	/* initialize moreLeft/moreRight appropriately for scan direction */
	if (ScanDirectionIsForward(dir))
	{
		so->currPos.moreLeft = false;
		so->currPos.moreRight = true;
	}
	else
	{
		so->currPos.moreLeft = true;
		so->currPos.moreRight = false;
	}
	so->numKilled = 0;			/* just paranoia */
	so->markItemIndex = -1;		/* ditto */
}