1 /*-------------------------------------------------------------------------
2 *
3 * nbtsearch.c
4 * Search code for postgres btrees.
5 *
6 *
7 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * IDENTIFICATION
11 * src/backend/access/nbtree/nbtsearch.c
12 *
13 *-------------------------------------------------------------------------
14 */
15
16 #include "postgres.h"
17
18 #include "access/nbtree.h"
19 #include "access/relscan.h"
20 #include "miscadmin.h"
21 #include "pgstat.h"
22 #include "storage/predicate.h"
23 #include "utils/lsyscache.h"
24 #include "utils/rel.h"
25
26
27 static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
28 static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
29 static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
30 OffsetNumber offnum);
31 static void _bt_saveitem(BTScanOpaque so, int itemIndex,
32 OffsetNumber offnum, IndexTuple itup);
33 static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
34 static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
35 static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
36 ScanDirection dir);
37 static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
38 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
39 static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
40
41
42 /*
43 * _bt_drop_lock_and_maybe_pin()
44 *
45 * Unlock the buffer; and if it is safe to release the pin, do that, too. It
46 * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
47 * This will prevent vacuum from stalling in a blocked state trying to read a
48 * page when a cursor is sitting on it -- at least in many important cases.
49 *
50 * Set the buffer to invalid if the pin is released, since the buffer may be
51 * re-used. If we need to go back to this block (for example, to apply
52 * LP_DEAD hints) we must get a fresh reference to the buffer. Hopefully it
53 * will remain in shared memory for as long as it takes to scan the index
54 * buffer page.
55 */
56 static void
_bt_drop_lock_and_maybe_pin(IndexScanDesc scan,BTScanPos sp)57 _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
58 {
59 LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
60
61 if (IsMVCCSnapshot(scan->xs_snapshot) &&
62 RelationNeedsWAL(scan->indexRelation) &&
63 !scan->xs_want_itup)
64 {
65 ReleaseBuffer(sp->buf);
66 sp->buf = InvalidBuffer;
67 }
68 }
69
70 /*
71 * _bt_search() -- Search the tree for a particular scankey,
72 * or more precisely for the first leaf page it could be on.
73 *
74 * The passed scankey is an insertion-type scankey (see nbtree/README),
75 * but it can omit the rightmost column(s) of the index.
76 *
77 * Return value is a stack of parent-page pointers. *bufP is set to the
78 * address of the leaf-page buffer, which is read-locked and pinned.
79 * No locks are held on the parent pages, however!
80 *
81 * If the snapshot parameter is not NULL, "old snapshot" checking will take
82 * place during the descent through the tree. This is not needed when
83 * positioning for an insert or delete, so NULL is used for those cases.
84 *
85 * The returned buffer is locked according to access parameter. Additionally,
86 * access = BT_WRITE will allow an empty root page to be created and returned.
87 * When access = BT_READ, an empty index will result in *bufP being set to
88 * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered
89 * during the search will be finished.
90 */
91 BTStack
_bt_search(Relation rel,BTScanInsert key,Buffer * bufP,int access,Snapshot snapshot)92 _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
93 Snapshot snapshot)
94 {
95 BTStack stack_in = NULL;
96 int page_access = BT_READ;
97
98 /* Get the root page to start with */
99 *bufP = _bt_getroot(rel, access);
100
101 /* If index is empty and access = BT_READ, no root page is created. */
102 if (!BufferIsValid(*bufP))
103 return (BTStack) NULL;
104
105 /* Loop iterates once per level descended in the tree */
106 for (;;)
107 {
108 Page page;
109 BTPageOpaque opaque;
110 OffsetNumber offnum;
111 ItemId itemid;
112 IndexTuple itup;
113 BlockNumber blkno;
114 BlockNumber par_blkno;
115 BTStack new_stack;
116
117 /*
118 * Race -- the page we just grabbed may have split since we read its
119 * pointer in the parent (or metapage). If it has, we may need to
120 * move right to its new sibling. Do that.
121 *
122 * In write-mode, allow _bt_moveright to finish any incomplete splits
123 * along the way. Strictly speaking, we'd only need to finish an
124 * incomplete split on the leaf page we're about to insert to, not on
125 * any of the upper levels (they are taken care of in _bt_getstackbuf,
126 * if the leaf page is split and we insert to the parent page). But
127 * this is a good opportunity to finish splits of internal pages too.
128 */
129 *bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in,
130 page_access, snapshot);
131
132 /* if this is a leaf page, we're done */
133 page = BufferGetPage(*bufP);
134 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
135 if (P_ISLEAF(opaque))
136 break;
137
138 /*
139 * Find the appropriate item on the internal page, and get the child
140 * page that it points to.
141 */
142 offnum = _bt_binsrch(rel, key, *bufP);
143 itemid = PageGetItemId(page, offnum);
144 itup = (IndexTuple) PageGetItem(page, itemid);
145 blkno = BTreeInnerTupleGetDownLink(itup);
146 par_blkno = BufferGetBlockNumber(*bufP);
147
148 /*
149 * We need to save the location of the index entry we chose in the
150 * parent page on a stack. In case we split the tree, we'll use the
151 * stack to work back up to the parent page. We also save the actual
152 * downlink (block) to uniquely identify the index entry, in case it
153 * moves right while we're working lower in the tree. See the paper
154 * by Lehman and Yao for how this is detected and handled. (We use the
155 * child link during the second half of a page split -- if caller ends
156 * up splitting the child it usually ends up inserting a new pivot
157 * tuple for child's new right sibling immediately after the original
158 * bts_offset offset recorded here. The downlink block will be needed
159 * to check if bts_offset remains the position of this same pivot
160 * tuple.)
161 */
162 new_stack = (BTStack) palloc(sizeof(BTStackData));
163 new_stack->bts_blkno = par_blkno;
164 new_stack->bts_offset = offnum;
165 new_stack->bts_btentry = blkno;
166 new_stack->bts_parent = stack_in;
167
168 /*
169 * Page level 1 is lowest non-leaf page level prior to leaves. So, if
170 * we're on the level 1 and asked to lock leaf page in write mode,
171 * then lock next page in write mode, because it must be a leaf.
172 */
173 if (opaque->btpo.level == 1 && access == BT_WRITE)
174 page_access = BT_WRITE;
175
176 /* drop the read lock on the parent page, acquire one on the child */
177 *bufP = _bt_relandgetbuf(rel, *bufP, blkno, page_access);
178
179 /* okay, all set to move down a level */
180 stack_in = new_stack;
181 }
182
183 /*
184 * If we're asked to lock leaf in write mode, but didn't manage to, then
185 * relock. This should only happen when the root page is a leaf page (and
186 * the only page in the index other than the metapage).
187 */
188 if (access == BT_WRITE && page_access == BT_READ)
189 {
190 /* trade in our read lock for a write lock */
191 LockBuffer(*bufP, BUFFER_LOCK_UNLOCK);
192 LockBuffer(*bufP, BT_WRITE);
193
194 /*
195 * If the page was split between the time that we surrendered our read
196 * lock and acquired our write lock, then this page may no longer be
197 * the right place for the key we want to insert. In this case, we
198 * need to move right in the tree. See Lehman and Yao for an
199 * excruciatingly precise description.
200 */
201 *bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
202 snapshot);
203 }
204
205 return stack_in;
206 }
207
208 /*
209 * _bt_moveright() -- move right in the btree if necessary.
210 *
211 * When we follow a pointer to reach a page, it is possible that
212 * the page has changed in the meanwhile. If this happens, we're
213 * guaranteed that the page has "split right" -- that is, that any
214 * data that appeared on the page originally is either on the page
215 * or strictly to the right of it.
216 *
217 * This routine decides whether or not we need to move right in the
218 * tree by examining the high key entry on the page. If that entry is
219 * strictly less than the scankey, or <= the scankey in the
220 * key.nextkey=true case, then we followed the wrong link and we need
221 * to move right.
222 *
223 * The passed insertion-type scankey can omit the rightmost column(s) of the
224 * index. (see nbtree/README)
225 *
226 * When key.nextkey is false (the usual case), we are looking for the first
227 * item >= key. When key.nextkey is true, we are looking for the first item
228 * strictly greater than key.
229 *
230 * If forupdate is true, we will attempt to finish any incomplete splits
231 * that we encounter. This is required when locking a target page for an
232 * insertion, because we don't allow inserting on a page before the split
233 * is completed. 'stack' is only used if forupdate is true.
234 *
235 * On entry, we have the buffer pinned and a lock of the type specified by
236 * 'access'. If we move right, we release the buffer and lock and acquire
237 * the same on the right sibling. Return value is the buffer we stop at.
238 *
239 * If the snapshot parameter is not NULL, "old snapshot" checking will take
240 * place during the descent through the tree. This is not needed when
241 * positioning for an insert or delete, so NULL is used for those cases.
242 */
243 Buffer
_bt_moveright(Relation rel,BTScanInsert key,Buffer buf,bool forupdate,BTStack stack,int access,Snapshot snapshot)244 _bt_moveright(Relation rel,
245 BTScanInsert key,
246 Buffer buf,
247 bool forupdate,
248 BTStack stack,
249 int access,
250 Snapshot snapshot)
251 {
252 Page page;
253 BTPageOpaque opaque;
254 int32 cmpval;
255
256 /*
257 * When nextkey = false (normal case): if the scan key that brought us to
258 * this page is > the high key stored on the page, then the page has split
259 * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
260 * have some duplicates to the right as well as the left, but that's
261 * something that's only ever dealt with on the leaf level, after
262 * _bt_search has found an initial leaf page.)
263 *
264 * When nextkey = true: move right if the scan key is >= page's high key.
265 * (Note that key.scantid cannot be set in this case.)
266 *
267 * The page could even have split more than once, so scan as far as
268 * needed.
269 *
270 * We also have to move right if we followed a link that brought us to a
271 * dead page.
272 */
273 cmpval = key->nextkey ? 0 : 1;
274
275 for (;;)
276 {
277 page = BufferGetPage(buf);
278 TestForOldSnapshot(snapshot, rel, page);
279 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
280
281 if (P_RIGHTMOST(opaque))
282 break;
283
284 /*
285 * Finish any incomplete splits we encounter along the way.
286 */
287 if (forupdate && P_INCOMPLETE_SPLIT(opaque))
288 {
289 BlockNumber blkno = BufferGetBlockNumber(buf);
290
291 /* upgrade our lock if necessary */
292 if (access == BT_READ)
293 {
294 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
295 LockBuffer(buf, BT_WRITE);
296 }
297
298 if (P_INCOMPLETE_SPLIT(opaque))
299 _bt_finish_split(rel, buf, stack);
300 else
301 _bt_relbuf(rel, buf);
302
303 /* re-acquire the lock in the right mode, and re-check */
304 buf = _bt_getbuf(rel, blkno, access);
305 continue;
306 }
307
308 if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
309 {
310 /* step right one page */
311 buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
312 continue;
313 }
314 else
315 break;
316 }
317
318 if (P_IGNORE(opaque))
319 elog(ERROR, "fell off the end of index \"%s\"",
320 RelationGetRelationName(rel));
321
322 return buf;
323 }
324
325 /*
326 * _bt_binsrch() -- Do a binary search for a key on a particular page.
327 *
328 * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
329 * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
330 * particular, this means it is possible to return a value 1 greater than the
331 * number of keys on the page, if the scankey is > all keys on the page.)
332 *
333 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
334 * of the last key < given scankey, or last key <= given scankey if nextkey
335 * is true. (Since _bt_compare treats the first data key of such a page as
336 * minus infinity, there will be at least one key < scankey, so the result
337 * always points at one of the keys on the page.) This key indicates the
338 * right place to descend to be sure we find all leaf keys >= given scankey
339 * (or leaf keys > given scankey when nextkey is true).
340 *
341 * This procedure is not responsible for walking right, it just examines
342 * the given page. _bt_binsrch() has no lock or refcount side effects
343 * on the buffer.
344 */
345 static OffsetNumber
_bt_binsrch(Relation rel,BTScanInsert key,Buffer buf)346 _bt_binsrch(Relation rel,
347 BTScanInsert key,
348 Buffer buf)
349 {
350 Page page;
351 BTPageOpaque opaque;
352 OffsetNumber low,
353 high;
354 int32 result,
355 cmpval;
356
357 /* Requesting nextkey semantics while using scantid seems nonsensical */
358 Assert(!key->nextkey || key->scantid == NULL);
359
360 page = BufferGetPage(buf);
361 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
362
363 low = P_FIRSTDATAKEY(opaque);
364 high = PageGetMaxOffsetNumber(page);
365
366 /*
367 * If there are no keys on the page, return the first available slot. Note
368 * this covers two cases: the page is really empty (no keys), or it
369 * contains only a high key. The latter case is possible after vacuuming.
370 * This can never happen on an internal page, however, since they are
371 * never empty (an internal page must have children).
372 */
373 if (unlikely(high < low))
374 return low;
375
376 /*
377 * Binary search to find the first key on the page >= scan key, or first
378 * key > scankey when nextkey is true.
379 *
380 * For nextkey=false (cmpval=1), the loop invariant is: all slots before
381 * 'low' are < scan key, all slots at or after 'high' are >= scan key.
382 *
383 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
384 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
385 *
386 * We can fall out when high == low.
387 */
388 high++; /* establish the loop invariant for high */
389
390 cmpval = key->nextkey ? 0 : 1; /* select comparison value */
391
392 while (high > low)
393 {
394 OffsetNumber mid = low + ((high - low) / 2);
395
396 /* We have low <= mid < high, so mid points at a real slot */
397
398 result = _bt_compare(rel, key, page, mid);
399
400 if (result >= cmpval)
401 low = mid + 1;
402 else
403 high = mid;
404 }
405
406 /*
407 * At this point we have high == low, but be careful: they could point
408 * past the last slot on the page.
409 *
410 * On a leaf page, we always return the first key >= scan key (resp. >
411 * scan key), which could be the last slot + 1.
412 */
413 if (P_ISLEAF(opaque))
414 return low;
415
416 /*
417 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
418 * There must be one if _bt_compare() is playing by the rules.
419 */
420 Assert(low > P_FIRSTDATAKEY(opaque));
421
422 return OffsetNumberPrev(low);
423 }
424
425 /*
426 *
427 * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
428 *
429 * Like _bt_binsrch(), but with support for caching the binary search
430 * bounds. Only used during insertion, and only on the leaf page that it
431 * looks like caller will insert tuple on. Exclusive-locked and pinned
432 * leaf page is contained within insertstate.
433 *
434 * Caches the bounds fields in insertstate so that a subsequent call can
435 * reuse the low and strict high bounds of original binary search. Callers
436 * that use these fields directly must be prepared for the case where low
437 * and/or stricthigh are not on the same page (one or both exceed maxoff
438 * for the page). The case where there are no items on the page (high <
439 * low) makes bounds invalid.
440 *
441 * Caller is responsible for invalidating bounds when it modifies the page
442 * before calling here a second time.
443 */
444 OffsetNumber
_bt_binsrch_insert(Relation rel,BTInsertState insertstate)445 _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
446 {
447 BTScanInsert key = insertstate->itup_key;
448 Page page;
449 BTPageOpaque opaque;
450 OffsetNumber low,
451 high,
452 stricthigh;
453 int32 result,
454 cmpval;
455
456 page = BufferGetPage(insertstate->buf);
457 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
458
459 Assert(P_ISLEAF(opaque));
460 Assert(!key->nextkey);
461
462 if (!insertstate->bounds_valid)
463 {
464 /* Start new binary search */
465 low = P_FIRSTDATAKEY(opaque);
466 high = PageGetMaxOffsetNumber(page);
467 }
468 else
469 {
470 /* Restore result of previous binary search against same page */
471 low = insertstate->low;
472 high = insertstate->stricthigh;
473 }
474
475 /* If there are no keys on the page, return the first available slot */
476 if (unlikely(high < low))
477 {
478 /* Caller can't reuse bounds */
479 insertstate->low = InvalidOffsetNumber;
480 insertstate->stricthigh = InvalidOffsetNumber;
481 insertstate->bounds_valid = false;
482 return low;
483 }
484
485 /*
486 * Binary search to find the first key on the page >= scan key. (nextkey
487 * is always false when inserting).
488 *
489 * The loop invariant is: all slots before 'low' are < scan key, all slots
490 * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
491 * maintained to save additional search effort for caller.
492 *
493 * We can fall out when high == low.
494 */
495 if (!insertstate->bounds_valid)
496 high++; /* establish the loop invariant for high */
497 stricthigh = high; /* high initially strictly higher */
498
499 cmpval = 1; /* !nextkey comparison value */
500
501 while (high > low)
502 {
503 OffsetNumber mid = low + ((high - low) / 2);
504
505 /* We have low <= mid < high, so mid points at a real slot */
506
507 result = _bt_compare(rel, key, page, mid);
508
509 if (result >= cmpval)
510 low = mid + 1;
511 else
512 {
513 high = mid;
514 if (result != 0)
515 stricthigh = high;
516 }
517 }
518
519 /*
520 * On a leaf page, a binary search always returns the first key >= scan
521 * key (at least in !nextkey case), which could be the last slot + 1. This
522 * is also the lower bound of cached search.
523 *
524 * stricthigh may also be the last slot + 1, which prevents caller from
525 * using bounds directly, but is still useful to us if we're called a
526 * second time with cached bounds (cached low will be < stricthigh when
527 * that happens).
528 */
529 insertstate->low = low;
530 insertstate->stricthigh = stricthigh;
531 insertstate->bounds_valid = true;
532
533 return low;
534 }
535
536 /*----------
537 * _bt_compare() -- Compare insertion-type scankey to tuple on a page.
538 *
539 * page/offnum: location of btree item to be compared to.
540 *
541 * This routine returns:
542 * <0 if scankey < tuple at offnum;
543 * 0 if scankey == tuple at offnum;
544 * >0 if scankey > tuple at offnum.
545 * NULLs in the keys are treated as sortable values. Therefore
546 * "equality" does not necessarily mean that the item should be
547 * returned to the caller as a matching key!
548 *
549 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
550 * "minus infinity": this routine will always claim it is less than the
551 * scankey. The actual key value stored is explicitly truncated to 0
552 * attributes (explicitly minus infinity) with version 3+ indexes, but
553 * that isn't relied upon. This allows us to implement the Lehman and
554 * Yao convention that the first down-link pointer is before the first
555 * key. See backend/access/nbtree/README for details.
556 *----------
557 */
558 int32
_bt_compare(Relation rel,BTScanInsert key,Page page,OffsetNumber offnum)559 _bt_compare(Relation rel,
560 BTScanInsert key,
561 Page page,
562 OffsetNumber offnum)
563 {
564 TupleDesc itupdesc = RelationGetDescr(rel);
565 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
566 IndexTuple itup;
567 ItemPointer heapTid;
568 ScanKey scankey;
569 int ncmpkey;
570 int ntupatts;
571
572 Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
573 Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
574 Assert(key->heapkeyspace || key->scantid == NULL);
575
576 /*
577 * Force result ">" if target item is first data item on an internal page
578 * --- see NOTE above.
579 */
580 if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
581 return 1;
582
583 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
584 ntupatts = BTreeTupleGetNAtts(itup, rel);
585
586 /*
587 * The scan key is set up with the attribute number associated with each
588 * term in the key. It is important that, if the index is multi-key, the
589 * scan contain the first k key attributes, and that they be in order. If
590 * you think about how multi-key ordering works, you'll understand why
591 * this is.
592 *
593 * We don't test for violation of this condition here, however. The
594 * initial setup for the index scan had better have gotten it right (see
595 * _bt_first).
596 */
597
598 ncmpkey = Min(ntupatts, key->keysz);
599 Assert(key->heapkeyspace || ncmpkey == key->keysz);
600 scankey = key->scankeys;
601 for (int i = 1; i <= ncmpkey; i++)
602 {
603 Datum datum;
604 bool isNull;
605 int32 result;
606
607 datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
608
609 /* see comments about NULLs handling in btbuild */
610 if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
611 {
612 if (isNull)
613 result = 0; /* NULL "=" NULL */
614 else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
615 result = -1; /* NULL "<" NOT_NULL */
616 else
617 result = 1; /* NULL ">" NOT_NULL */
618 }
619 else if (isNull) /* key is NOT_NULL and item is NULL */
620 {
621 if (scankey->sk_flags & SK_BT_NULLS_FIRST)
622 result = 1; /* NOT_NULL ">" NULL */
623 else
624 result = -1; /* NOT_NULL "<" NULL */
625 }
626 else
627 {
628 /*
629 * The sk_func needs to be passed the index value as left arg and
630 * the sk_argument as right arg (they might be of different
631 * types). Since it is convenient for callers to think of
632 * _bt_compare as comparing the scankey to the index item, we have
633 * to flip the sign of the comparison result. (Unless it's a DESC
634 * column, in which case we *don't* flip the sign.)
635 */
636 result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
637 scankey->sk_collation,
638 datum,
639 scankey->sk_argument));
640
641 if (!(scankey->sk_flags & SK_BT_DESC))
642 INVERT_COMPARE_RESULT(result);
643 }
644
645 /* if the keys are unequal, return the difference */
646 if (result != 0)
647 return result;
648
649 scankey++;
650 }
651
652 /*
653 * All non-truncated attributes (other than heap TID) were found to be
654 * equal. Treat truncated attributes as minus infinity when scankey has a
655 * key attribute value that would otherwise be compared directly.
656 *
657 * Note: it doesn't matter if ntupatts includes non-key attributes;
658 * scankey won't, so explicitly excluding non-key attributes isn't
659 * necessary.
660 */
661 if (key->keysz > ntupatts)
662 return 1;
663
664 /*
665 * Use the heap TID attribute and scantid to try to break the tie. The
666 * rules are the same as any other key attribute -- only the
667 * representation differs.
668 */
669 heapTid = BTreeTupleGetHeapTID(itup);
670 if (key->scantid == NULL)
671 {
672 /*
673 * Most searches have a scankey that is considered greater than a
674 * truncated pivot tuple if and when the scankey has equal values for
675 * attributes up to and including the least significant untruncated
676 * attribute in tuple.
677 *
678 * For example, if an index has the minimum two attributes (single
679 * user key attribute, plus heap TID attribute), and a page's high key
680 * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
681 * will not descend to the page to the left. The search will descend
682 * right instead. The truncated attribute in pivot tuple means that
683 * all non-pivot tuples on the page to the left are strictly < 'foo',
684 * so it isn't necessary to descend left. In other words, search
685 * doesn't have to descend left because it isn't interested in a match
686 * that has a heap TID value of -inf.
687 *
688 * However, some searches (pivotsearch searches) actually require that
689 * we descend left when this happens. -inf is treated as a possible
690 * match for omitted scankey attribute(s). This is needed by page
691 * deletion, which must re-find leaf pages that are targets for
692 * deletion using their high keys.
693 *
694 * Note: the heap TID part of the test ensures that scankey is being
695 * compared to a pivot tuple with one or more truncated key
696 * attributes.
697 *
698 * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
699 * left here, since they have no heap TID attribute (and cannot have
700 * any -inf key values in any case, since truncation can only remove
701 * non-key attributes). !heapkeyspace searches must always be
702 * prepared to deal with matches on both sides of the pivot once the
703 * leaf level is reached.
704 */
705 if (key->heapkeyspace && !key->pivotsearch &&
706 key->keysz == ntupatts && heapTid == NULL)
707 return 1;
708
709 /* All provided scankey arguments found to be equal */
710 return 0;
711 }
712
713 /*
714 * Treat truncated heap TID as minus infinity, since scankey has a key
715 * attribute value (scantid) that would otherwise be compared directly
716 */
717 Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
718 if (heapTid == NULL)
719 return 1;
720
721 Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
722 return ItemPointerCompare(key->scantid, heapTid);
723 }
724
725 /*
726 * _bt_first() -- Find the first item in a scan.
727 *
728 * We need to be clever about the direction of scan, the search
729 * conditions, and the tree ordering. We find the first item (or,
730 * if backwards scan, the last item) in the tree that satisfies the
731 * qualifications in the scan key. On success exit, the page containing
732 * the current index tuple is pinned but not locked, and data about
733 * the matching tuple(s) on the page has been loaded into so->currPos.
734 * scan->xs_ctup.t_self is set to the heap TID of the current tuple,
735 * and if requested, scan->xs_itup points to a copy of the index tuple.
736 *
737 * If there are no matching items in the index, we return false, with no
738 * pins or locks held.
739 *
740 * Note that scan->keyData[], and the so->keyData[] scankey built from it,
741 * are both search-type scankeys (see nbtree/README for more about this).
742 * Within this routine, we build a temporary insertion-type scankey to use
743 * in locating the scan start position.
744 */
745 bool
_bt_first(IndexScanDesc scan,ScanDirection dir)746 _bt_first(IndexScanDesc scan, ScanDirection dir)
747 {
748 Relation rel = scan->indexRelation;
749 BTScanOpaque so = (BTScanOpaque) scan->opaque;
750 Buffer buf;
751 BTStack stack;
752 OffsetNumber offnum;
753 StrategyNumber strat;
754 bool nextkey;
755 bool goback;
756 BTScanInsertData inskey;
757 ScanKey startKeys[INDEX_MAX_KEYS];
758 ScanKeyData notnullkeys[INDEX_MAX_KEYS];
759 int keysCount = 0;
760 int i;
761 bool status = true;
762 StrategyNumber strat_total;
763 BTScanPosItem *currItem;
764 BlockNumber blkno;
765
766 Assert(!BTScanPosIsValid(so->currPos));
767
768 pgstat_count_index_scan(rel);
769
770 /*
771 * Examine the scan keys and eliminate any redundant keys; also mark the
772 * keys that must be matched to continue the scan.
773 */
774 _bt_preprocess_keys(scan);
775
776 /*
777 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
778 * never be satisfied (eg, x == 1 AND x > 2).
779 */
780 if (!so->qual_ok)
781 {
782 /* Notify any other workers that we're done with this scan key. */
783 _bt_parallel_done(scan);
784 return false;
785 }
786
787 /*
788 * For parallel scans, get the starting page from shared state. If the
789 * scan has not started, proceed to find out first leaf page in the usual
790 * way while keeping other participating processes waiting. If the scan
791 * has already begun, use the page number from the shared structure.
792 */
793 if (scan->parallel_scan != NULL)
794 {
795 status = _bt_parallel_seize(scan, &blkno);
796 if (!status)
797 return false;
798 else if (blkno == P_NONE)
799 {
800 _bt_parallel_done(scan);
801 return false;
802 }
803 else if (blkno != InvalidBlockNumber)
804 {
805 if (!_bt_parallel_readpage(scan, blkno, dir))
806 return false;
807 goto readcomplete;
808 }
809 }
810
811 /*----------
812 * Examine the scan keys to discover where we need to start the scan.
813 *
814 * We want to identify the keys that can be used as starting boundaries;
815 * these are =, >, or >= keys for a forward scan or =, <, <= keys for
816 * a backwards scan. We can use keys for multiple attributes so long as
817 * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
818 * a > or < boundary or find an attribute with no boundary (which can be
819 * thought of as the same as "> -infinity"), we can't use keys for any
820 * attributes to its right, because it would break our simplistic notion
821 * of what initial positioning strategy to use.
822 *
823 * When the scan keys include cross-type operators, _bt_preprocess_keys
824 * may not be able to eliminate redundant keys; in such cases we will
825 * arbitrarily pick a usable one for each attribute. This is correct
826 * but possibly not optimal behavior. (For example, with keys like
827 * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
828 * x=5 would be more efficient.) Since the situation only arises given
829 * a poorly-worded query plus an incomplete opfamily, live with it.
830 *
831 * When both equality and inequality keys appear for a single attribute
832 * (again, only possible when cross-type operators appear), we *must*
833 * select one of the equality keys for the starting point, because
834 * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
835 * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
836 * start at x=4, we will fail and stop before reaching x=10. If multiple
837 * equality quals survive preprocessing, however, it doesn't matter which
838 * one we use --- by definition, they are either redundant or
839 * contradictory.
840 *
841 * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
842 * If the index stores nulls at the end of the index we'll be starting
843 * from, and we have no boundary key for the column (which means the key
844 * we deduced NOT NULL from is an inequality key that constrains the other
845 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
846 * use as a boundary key. If we didn't do this, we might find ourselves
847 * traversing a lot of null entries at the start of the scan.
848 *
849 * In this loop, row-comparison keys are treated the same as keys on their
850 * first (leftmost) columns. We'll add on lower-order columns of the row
851 * comparison below, if possible.
852 *
853 * The selected scan keys (at most one per index column) are remembered by
854 * storing their addresses into the local startKeys[] array.
855 *----------
856 */
857 strat_total = BTEqualStrategyNumber;
858 if (so->numberOfKeys > 0)
859 {
860 AttrNumber curattr;
861 ScanKey chosen;
862 ScanKey impliesNN;
863 ScanKey cur;
864
865 /*
866 * chosen is the so-far-chosen key for the current attribute, if any.
867 * We don't cast the decision in stone until we reach keys for the
868 * next attribute.
869 */
870 curattr = 1;
871 chosen = NULL;
872 /* Also remember any scankey that implies a NOT NULL constraint */
873 impliesNN = NULL;
874
875 /*
876 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
877 * pass to handle after-last-key processing. Actual exit from the
878 * loop is at one of the "break" statements below.
879 */
880 for (cur = so->keyData, i = 0;; cur++, i++)
881 {
882 if (i >= so->numberOfKeys || cur->sk_attno != curattr)
883 {
884 /*
885 * Done looking at keys for curattr. If we didn't find a
886 * usable boundary key, see if we can deduce a NOT NULL key.
887 */
888 if (chosen == NULL && impliesNN != NULL &&
889 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
890 ScanDirectionIsForward(dir) :
891 ScanDirectionIsBackward(dir)))
892 {
893 /* Yes, so build the key in notnullkeys[keysCount] */
894 					chosen = &notnullkeys[keysCount];
895 ScanKeyEntryInitialize(chosen,
896 (SK_SEARCHNOTNULL | SK_ISNULL |
897 (impliesNN->sk_flags &
898 (SK_BT_DESC | SK_BT_NULLS_FIRST))),
899 curattr,
900 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
901 BTGreaterStrategyNumber :
902 BTLessStrategyNumber),
903 InvalidOid,
904 InvalidOid,
905 InvalidOid,
906 (Datum) 0);
907 }
908
909 /*
910 * If we still didn't find a usable boundary key, quit; else
911 * save the boundary key pointer in startKeys.
912 */
913 if (chosen == NULL)
914 break;
915 startKeys[keysCount++] = chosen;
916
917 /*
918 * Adjust strat_total, and quit if we have stored a > or <
919 * key.
920 */
921 strat = chosen->sk_strategy;
922 if (strat != BTEqualStrategyNumber)
923 {
924 strat_total = strat;
925 if (strat == BTGreaterStrategyNumber ||
926 strat == BTLessStrategyNumber)
927 break;
928 }
929
930 /*
931 * Done if that was the last attribute, or if next key is not
932 * in sequence (implying no boundary key is available for the
933 * next attribute).
934 */
935 if (i >= so->numberOfKeys ||
936 cur->sk_attno != curattr + 1)
937 break;
938
939 /*
940 * Reset for next attr.
941 */
942 curattr = cur->sk_attno;
943 chosen = NULL;
944 impliesNN = NULL;
945 }
946
947 /*
948 * Can we use this key as a starting boundary for this attr?
949 *
950 * If not, does it imply a NOT NULL constraint? (Because
951 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
952 * *any* inequality key works for that; we need not test.)
953 */
954 switch (cur->sk_strategy)
955 {
956 case BTLessStrategyNumber:
957 case BTLessEqualStrategyNumber:
958 if (chosen == NULL)
959 {
960 if (ScanDirectionIsBackward(dir))
961 chosen = cur;
962 else
963 impliesNN = cur;
964 }
965 break;
966 case BTEqualStrategyNumber:
967 /* override any non-equality choice */
968 chosen = cur;
969 break;
970 case BTGreaterEqualStrategyNumber:
971 case BTGreaterStrategyNumber:
972 if (chosen == NULL)
973 {
974 if (ScanDirectionIsForward(dir))
975 chosen = cur;
976 else
977 impliesNN = cur;
978 }
979 break;
980 }
981 }
982 }
983
984 /*
985 * If we found no usable boundary keys, we have to start from one end of
986 * the tree. Walk down that edge to the first or last key, and scan from
987 * there.
988 */
989 if (keysCount == 0)
990 {
991 bool match;
992
993 match = _bt_endpoint(scan, dir);
994
995 if (!match)
996 {
997 /* No match, so mark (parallel) scan finished */
998 _bt_parallel_done(scan);
999 }
1000
1001 return match;
1002 }
1003
1004 /*
1005 * We want to start the scan somewhere within the index. Set up an
1006 * insertion scankey we can use to search for the boundary point we
1007 * identified above. The insertion scankey is built using the keys
1008 * identified by startKeys[]. (Remaining insertion scankey fields are
1009 * initialized after initial-positioning strategy is finalized.)
1010 */
1011 Assert(keysCount <= INDEX_MAX_KEYS);
1012 for (i = 0; i < keysCount; i++)
1013 {
1014 ScanKey cur = startKeys[i];
1015
1016 Assert(cur->sk_attno == i + 1);
1017
1018 if (cur->sk_flags & SK_ROW_HEADER)
1019 {
1020 /*
1021 * Row comparison header: look to the first row member instead.
1022 *
1023 * The member scankeys are already in insertion format (ie, they
1024 * have sk_func = 3-way-comparison function), but we have to watch
1025 * out for nulls, which _bt_preprocess_keys didn't check. A null
1026 * in the first row member makes the condition unmatchable, just
1027 * like qual_ok = false.
1028 */
1029 ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1030
1031 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1032 if (subkey->sk_flags & SK_ISNULL)
1033 {
1034 _bt_parallel_done(scan);
1035 return false;
1036 }
1037 memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1038
1039 /*
1040 * If the row comparison is the last positioning key we accepted,
1041 * try to add additional keys from the lower-order row members.
1042 * (If we accepted independent conditions on additional index
1043 * columns, we use those instead --- doesn't seem worth trying to
1044 * determine which is more restrictive.) Note that this is OK
1045 * even if the row comparison is of ">" or "<" type, because the
1046 * condition applied to all but the last row member is effectively
1047 * ">=" or "<=", and so the extra keys don't break the positioning
1048 * scheme. But, by the same token, if we aren't able to use all
1049 * the row members, then the part of the row comparison that we
1050 * did use has to be treated as just a ">=" or "<=" condition, and
1051 * so we'd better adjust strat_total accordingly.
1052 */
1053 if (i == keysCount - 1)
1054 {
1055 bool used_all_subkeys = false;
1056
1057 Assert(!(subkey->sk_flags & SK_ROW_END));
1058 for (;;)
1059 {
1060 subkey++;
1061 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1062 if (subkey->sk_attno != keysCount + 1)
1063 break; /* out-of-sequence, can't use it */
1064 if (subkey->sk_strategy != cur->sk_strategy)
1065 break; /* wrong direction, can't use it */
1066 if (subkey->sk_flags & SK_ISNULL)
1067 break; /* can't use null keys */
1068 Assert(keysCount < INDEX_MAX_KEYS);
1069 memcpy(inskey.scankeys + keysCount, subkey,
1070 sizeof(ScanKeyData));
1071 keysCount++;
1072 if (subkey->sk_flags & SK_ROW_END)
1073 {
1074 used_all_subkeys = true;
1075 break;
1076 }
1077 }
1078 if (!used_all_subkeys)
1079 {
1080 switch (strat_total)
1081 {
1082 case BTLessStrategyNumber:
1083 strat_total = BTLessEqualStrategyNumber;
1084 break;
1085 case BTGreaterStrategyNumber:
1086 strat_total = BTGreaterEqualStrategyNumber;
1087 break;
1088 }
1089 }
1090 break; /* done with outer loop */
1091 }
1092 }
1093 else
1094 {
1095 /*
1096 * Ordinary comparison key. Transform the search-style scan key
1097 * to an insertion scan key by replacing the sk_func with the
1098 * appropriate btree comparison function.
1099 *
1100 * If scankey operator is not a cross-type comparison, we can use
1101 * the cached comparison function; otherwise gotta look it up in
1102 * the catalogs. (That can't lead to infinite recursion, since no
1103 * indexscan initiated by syscache lookup will use cross-data-type
1104 * operators.)
1105 *
1106 * We support the convention that sk_subtype == InvalidOid means
1107 * the opclass input type; this is a hack to simplify life for
1108 * ScanKeyInit().
1109 */
1110 if (cur->sk_subtype == rel->rd_opcintype[i] ||
1111 cur->sk_subtype == InvalidOid)
1112 {
1113 FmgrInfo *procinfo;
1114
1115 procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1116 ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1117 cur->sk_flags,
1118 cur->sk_attno,
1119 InvalidStrategy,
1120 cur->sk_subtype,
1121 cur->sk_collation,
1122 procinfo,
1123 cur->sk_argument);
1124 }
1125 else
1126 {
1127 RegProcedure cmp_proc;
1128
1129 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1130 rel->rd_opcintype[i],
1131 cur->sk_subtype,
1132 BTORDER_PROC);
1133 if (!RegProcedureIsValid(cmp_proc))
1134 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1135 BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1136 cur->sk_attno, RelationGetRelationName(rel));
1137 ScanKeyEntryInitialize(inskey.scankeys + i,
1138 cur->sk_flags,
1139 cur->sk_attno,
1140 InvalidStrategy,
1141 cur->sk_subtype,
1142 cur->sk_collation,
1143 cmp_proc,
1144 cur->sk_argument);
1145 }
1146 }
1147 }
1148
1149 /*----------
1150 * Examine the selected initial-positioning strategy to determine exactly
1151 * where we need to start the scan, and set flag variables to control the
1152 * code below.
1153 *
1154 * If nextkey = false, _bt_search and _bt_binsrch will locate the first
1155 * item >= scan key. If nextkey = true, they will locate the first
1156 * item > scan key.
1157 *
1158 * If goback = true, we will then step back one item, while if
1159 * goback = false, we will start the scan on the located item.
1160 *----------
1161 */
1162 switch (strat_total)
1163 {
1164 case BTLessStrategyNumber:
1165
1166 /*
1167 * Find first item >= scankey, then back up one to arrive at last
1168 * item < scankey. (Note: this positioning strategy is only used
1169 * for a backward scan, so that is always the correct starting
1170 * position.)
1171 */
1172 nextkey = false;
1173 goback = true;
1174 break;
1175
1176 case BTLessEqualStrategyNumber:
1177
1178 /*
1179 * Find first item > scankey, then back up one to arrive at last
1180 * item <= scankey. (Note: this positioning strategy is only used
1181 * for a backward scan, so that is always the correct starting
1182 * position.)
1183 */
1184 nextkey = true;
1185 goback = true;
1186 break;
1187
1188 case BTEqualStrategyNumber:
1189
1190 /*
1191 * If a backward scan was specified, need to start with last equal
1192 * item not first one.
1193 */
1194 if (ScanDirectionIsBackward(dir))
1195 {
1196 /*
1197 * This is the same as the <= strategy. We will check at the
1198 * end whether the found item is actually =.
1199 */
1200 nextkey = true;
1201 goback = true;
1202 }
1203 else
1204 {
1205 /*
1206 * This is the same as the >= strategy. We will check at the
1207 * end whether the found item is actually =.
1208 */
1209 nextkey = false;
1210 goback = false;
1211 }
1212 break;
1213
1214 case BTGreaterEqualStrategyNumber:
1215
1216 /*
1217 * Find first item >= scankey. (This is only used for forward
1218 * scans.)
1219 */
1220 nextkey = false;
1221 goback = false;
1222 break;
1223
1224 case BTGreaterStrategyNumber:
1225
1226 /*
1227 * Find first item > scankey. (This is only used for forward
1228 * scans.)
1229 */
1230 nextkey = true;
1231 goback = false;
1232 break;
1233
1234 default:
1235 /* can't get here, but keep compiler quiet */
1236 elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1237 return false;
1238 }
1239
1240 /* Initialize remaining insertion scan key fields */
1241 inskey.heapkeyspace = _bt_heapkeyspace(rel);
1242 inskey.anynullkeys = false; /* unused */
1243 inskey.nextkey = nextkey;
1244 inskey.pivotsearch = false;
1245 inskey.scantid = NULL;
1246 inskey.keysz = keysCount;
1247
1248 /*
1249 * Use the manufactured insertion scan key to descend the tree and
1250 * position ourselves on the target leaf page.
1251 */
1252 stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
1253
1254 /* don't need to keep the stack around... */
1255 _bt_freestack(stack);
1256
1257 if (!BufferIsValid(buf))
1258 {
1259 /*
1260 * We only get here if the index is completely empty. Lock relation
1261 * because nothing finer to lock exists.
1262 */
1263 PredicateLockRelation(rel, scan->xs_snapshot);
1264
1265 /*
1266 * mark parallel scan as done, so that all the workers can finish
1267 * their scan
1268 */
1269 _bt_parallel_done(scan);
1270 BTScanPosInvalidate(so->currPos);
1271
1272 return false;
1273 }
1274 else
1275 PredicateLockPage(rel, BufferGetBlockNumber(buf),
1276 scan->xs_snapshot);
1277
1278 _bt_initialize_more_data(so, dir);
1279
1280 /* position to the precise item on the page */
1281 offnum = _bt_binsrch(rel, &inskey, buf);
1282
1283 /*
1284 * If nextkey = false, we are positioned at the first item >= scan key, or
1285 * possibly at the end of a page on which all the existing items are less
1286 * than the scan key and we know that everything on later pages is greater
1287 * than or equal to scan key.
1288 *
1289 * If nextkey = true, we are positioned at the first item > scan key, or
1290 * possibly at the end of a page on which all the existing items are less
1291 * than or equal to the scan key and we know that everything on later
1292 * pages is greater than scan key.
1293 *
1294 * The actually desired starting point is either this item or the prior
1295 * one, or in the end-of-page case it's the first item on the next page or
1296 * the last item on this page. Adjust the starting offset if needed. (If
1297 * this results in an offset before the first item or after the last one,
1298 * _bt_readpage will report no items found, and then we'll step to the
1299 * next page as needed.)
1300 */
1301 if (goback)
1302 offnum = OffsetNumberPrev(offnum);
1303
1304 /* remember which buffer we have pinned, if any */
1305 Assert(!BTScanPosIsValid(so->currPos));
1306 so->currPos.buf = buf;
1307
1308 /*
1309 * Now load data from the first page of the scan.
1310 */
1311 if (!_bt_readpage(scan, dir, offnum))
1312 {
1313 /*
1314 * There's no actually-matching data on this page. Try to advance to
1315 * the next page. Return false if there's no matching data at all.
1316 */
1317 LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1318 if (!_bt_steppage(scan, dir))
1319 return false;
1320 }
1321 else
1322 {
1323 /* Drop the lock, and maybe the pin, on the current page */
1324 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1325 }
1326
1327 readcomplete:
1328 /* OK, itemIndex says what to return */
1329 currItem = &so->currPos.items[so->currPos.itemIndex];
1330 scan->xs_heaptid = currItem->heapTid;
1331 if (scan->xs_want_itup)
1332 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1333
1334 return true;
1335 }
1336
1337 /*
1338 * _bt_next() -- Get the next item in a scan.
1339 *
1340 * On entry, so->currPos describes the current page, which may be pinned
1341 * but is not locked, and so->currPos.itemIndex identifies which item was
1342 * previously returned.
1343 *
1344 * On successful exit, scan->xs_ctup.t_self is set to the TID of the
1345 * next heap tuple, and if requested, scan->xs_itup points to a copy of
1346 * the index tuple. so->currPos is updated as needed.
1347 *
1348 * On failure exit (no more tuples), we release pin and set
1349 * so->currPos.buf to InvalidBuffer.
1350 */
1351 bool
_bt_next(IndexScanDesc scan,ScanDirection dir)1352 _bt_next(IndexScanDesc scan, ScanDirection dir)
1353 {
1354 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1355 BTScanPosItem *currItem;
1356
1357 /*
1358 * Advance to next tuple on current page; or if there's no more, try to
1359 * step to the next page with data.
1360 */
1361 if (ScanDirectionIsForward(dir))
1362 {
1363 if (++so->currPos.itemIndex > so->currPos.lastItem)
1364 {
1365 if (!_bt_steppage(scan, dir))
1366 return false;
1367 }
1368 }
1369 else
1370 {
1371 if (--so->currPos.itemIndex < so->currPos.firstItem)
1372 {
1373 if (!_bt_steppage(scan, dir))
1374 return false;
1375 }
1376 }
1377
1378 /* OK, itemIndex says what to return */
1379 currItem = &so->currPos.items[so->currPos.itemIndex];
1380 scan->xs_heaptid = currItem->heapTid;
1381 if (scan->xs_want_itup)
1382 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1383
1384 return true;
1385 }
1386
/*
 * _bt_readpage() -- Load data from current index page into so->currPos
 *
 * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
 * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;
 * they are updated as appropriate.  All other fields of so->currPos are
 * initialized from scratch here.
 *
 * We scan the current page starting at offnum and moving in the indicated
 * direction.  All items matching the scan keys are loaded into currPos.items.
 * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
 * that there can be no more matching tuples in the current scan direction.
 *
 * In the case of a parallel scan, caller must have called _bt_parallel_seize
 * prior to calling this function; this function will invoke
 * _bt_parallel_release before returning.
 *
 * Returns true if any matching items found on the page, false if none.
 */
static bool
_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber minoff;
	OffsetNumber maxoff;
	int			itemIndex;
	bool		continuescan;
	int			indnatts;

	/*
	 * We must have the buffer pinned and locked, but the usual macro can't be
	 * used here; this function is what makes it good for currPos.
	 */
	Assert(BufferIsValid(so->currPos.buf));

	page = BufferGetPage(so->currPos.buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/*
	 * Allow the next page to be processed by a parallel worker.  This is
	 * done before we read the page's items, so other workers can proceed
	 * while we are still working on this page.  For a forward scan the next
	 * page is our right sibling; for a backward scan it is this same page
	 * (the next worker walks left from here).
	 */
	if (scan->parallel_scan)
	{
		if (ScanDirectionIsForward(dir))
			_bt_parallel_release(scan, opaque->btpo_next);
		else
			_bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
	}

	continuescan = true;		/* default assumption */
	indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
	minoff = P_FIRSTDATAKEY(opaque);
	maxoff = PageGetMaxOffsetNumber(page);

	/*
	 * We note the buffer's block number so that we can release the pin later.
	 * This allows us to re-read the buffer if it is needed again for hinting.
	 */
	so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);

	/*
	 * We save the LSN of the page as we read it, so that we know whether it
	 * safe to apply LP_DEAD hints to the page later.  This allows us to drop
	 * the pin for MVCC scans, which allows vacuum to avoid blocking.
	 */
	so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);

	/*
	 * we must save the page's right-link while scanning it; this tells us
	 * where to step right to after we're done with these items.  There is no
	 * corresponding need for the left-link, since splits always go right.
	 */
	so->currPos.nextPage = opaque->btpo_next;

	/* initialize tuple workspace to empty */
	so->currPos.nextTupleOffset = 0;

	/*
	 * Now that the current page has been made consistent, the macro should be
	 * good.
	 */
	Assert(BTScanPosIsPinned(so->currPos));

	if (ScanDirectionIsForward(dir))
	{
		/* load items[] in ascending order */
		itemIndex = 0;

		/* never start before the page's first data item */
		offnum = Max(offnum, minoff);

		while (offnum <= maxoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	itup;

			/*
			 * If the scan specifies not to return killed tuples, then we
			 * treat a killed tuple as not passing the qual
			 */
			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
			{
				offnum = OffsetNumberNext(offnum);
				continue;
			}

			itup = (IndexTuple) PageGetItem(page, iid);

			if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
			{
				/* tuple passes all scan key conditions, so remember it */
				_bt_saveitem(so, itemIndex, offnum, itup);
				itemIndex++;
			}
			/* When !continuescan, there can't be any more matches, so stop */
			if (!continuescan)
				break;

			offnum = OffsetNumberNext(offnum);
		}

		/*
		 * We don't need to visit page to the right when the high key
		 * indicates that no more matches will be found there.
		 *
		 * Checking the high key like this works out more often than you might
		 * think.  Leaf page splits pick a split point between the two most
		 * dissimilar tuples (this is weighed against the need to evenly share
		 * free space).  Leaf pages with high key attribute values that can
		 * only appear on non-pivot tuples on the right sibling page are
		 * common.
		 */
		if (continuescan && !P_RIGHTMOST(opaque))
		{
			ItemId		iid = PageGetItemId(page, P_HIKEY);
			IndexTuple	itup = (IndexTuple) PageGetItem(page, iid);
			int			truncatt;

			/* high key may be truncated, so only check attrs it still has */
			truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
		}

		if (!continuescan)
			so->currPos.moreRight = false;

		Assert(itemIndex <= MaxIndexTuplesPerPage);
		so->currPos.firstItem = 0;
		so->currPos.lastItem = itemIndex - 1;
		so->currPos.itemIndex = 0;
	}
	else
	{
		/* load items[] in descending order */
		itemIndex = MaxIndexTuplesPerPage;

		/* never start past the page's last data item */
		offnum = Min(offnum, maxoff);

		while (offnum >= minoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	itup;
			bool		tuple_alive;
			bool		passes_quals;

			/*
			 * If the scan specifies not to return killed tuples, then we
			 * treat a killed tuple as not passing the qual.  Most of the
			 * time, it's a win to not bother examining the tuple's index
			 * keys, but just skip to the next tuple (previous, actually,
			 * since we're scanning backwards).  However, if this is the first
			 * tuple on the page, we do check the index keys, to prevent
			 * uselessly advancing to the page to the left.  This is similar
			 * to the high key optimization used by forward scans.
			 */
			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
			{
				Assert(offnum >= P_FIRSTDATAKEY(opaque));
				if (offnum > P_FIRSTDATAKEY(opaque))
				{
					offnum = OffsetNumberPrev(offnum);
					continue;
				}

				tuple_alive = false;
			}
			else
				tuple_alive = true;

			itup = (IndexTuple) PageGetItem(page, iid);

			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
										 &continuescan);
			if (passes_quals && tuple_alive)
			{
				/* tuple passes all scan key conditions, so remember it */
				itemIndex--;
				_bt_saveitem(so, itemIndex, offnum, itup);
			}
			if (!continuescan)
			{
				/* there can't be any more matches, so stop */
				so->currPos.moreLeft = false;
				break;
			}

			offnum = OffsetNumberPrev(offnum);
		}

		/*
		 * Backward scans fill items[] from the top down, so firstItem is
		 * wherever itemIndex stopped; the cursor starts at the array's end.
		 */
		Assert(itemIndex >= 0);
		so->currPos.firstItem = itemIndex;
		so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
		so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
	}

	return (so->currPos.firstItem <= so->currPos.lastItem);
}
1602
1603 /* Save an index item into so->currPos.items[itemIndex] */
1604 static void
_bt_saveitem(BTScanOpaque so,int itemIndex,OffsetNumber offnum,IndexTuple itup)1605 _bt_saveitem(BTScanOpaque so, int itemIndex,
1606 OffsetNumber offnum, IndexTuple itup)
1607 {
1608 BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1609
1610 currItem->heapTid = itup->t_tid;
1611 currItem->indexOffset = offnum;
1612 if (so->currTuples)
1613 {
1614 Size itupsz = IndexTupleSize(itup);
1615
1616 currItem->tupleOffset = so->currPos.nextTupleOffset;
1617 memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1618 so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1619 }
1620 }
1621
/*
 * _bt_steppage() -- Step to next page containing valid data for scan
 *
 * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
 * if pinned, we'll drop the pin before moving to next page.  The buffer is
 * not locked on entry.
 *
 * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
 * read lock, on that page.  If we do not hold the pin, we set so->currPos.buf
 * to InvalidBuffer.  We return true to indicate success.
 */
static bool
_bt_steppage(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BlockNumber blkno = InvalidBlockNumber;
	bool		status = true;

	Assert(BTScanPosIsValid(so->currPos));

	/* Before leaving current page, deal with any killed items */
	if (so->numKilled > 0)
		_bt_killitems(scan);

	/*
	 * Before we modify currPos, make a copy of the page data if there was a
	 * mark position that needs it.
	 */
	if (so->markItemIndex >= 0)
	{
		/* bump pin on current buffer for assignment to mark buffer */
		if (BTScanPosIsPinned(so->currPos))
			IncrBufferRefCount(so->currPos.buf);
		/* copy currPos including only the items[] entries actually in use */
		memcpy(&so->markPos, &so->currPos,
			   offsetof(BTScanPosData, items[1]) +
			   so->currPos.lastItem * sizeof(BTScanPosItem));
		if (so->markTuples)
			memcpy(so->markTuples, so->currTuples,
				   so->currPos.nextTupleOffset);
		so->markPos.itemIndex = so->markItemIndex;
		so->markItemIndex = -1;
	}

	if (ScanDirectionIsForward(dir))
	{
		/* Walk right to the next page with data */
		if (scan->parallel_scan != NULL)
		{
			/*
			 * Seize the scan to get the next block number; if the scan has
			 * ended already, bail out.
			 */
			status = _bt_parallel_seize(scan, &blkno);
			if (!status)
			{
				/* release the previous buffer, if pinned */
				BTScanPosUnpinIfPinned(so->currPos);
				BTScanPosInvalidate(so->currPos);
				return false;
			}
		}
		else
		{
			/* Not parallel, so use the previously-saved nextPage link. */
			blkno = so->currPos.nextPage;
		}

		/* Remember we left a page with data */
		so->currPos.moreLeft = true;

		/* release the previous buffer, if pinned */
		BTScanPosUnpinIfPinned(so->currPos);
	}
	else
	{
		/* Remember we left a page with data */
		so->currPos.moreRight = true;

		if (scan->parallel_scan != NULL)
		{
			/*
			 * Seize the scan to get the current block number; if the scan has
			 * ended already, bail out.
			 */
			status = _bt_parallel_seize(scan, &blkno);
			/* drop our pin before checking status, in either case */
			BTScanPosUnpinIfPinned(so->currPos);
			if (!status)
			{
				BTScanPosInvalidate(so->currPos);
				return false;
			}
		}
		else
		{
			/* Not parallel, so just use our own notion of the current page */
			blkno = so->currPos.currPage;
		}
	}

	/* read the located page; this reports failure if the scan is over */
	if (!_bt_readnextpage(scan, blkno, dir))
		return false;

	/* Drop the lock, and maybe the pin, on the current page */
	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);

	return true;
}
1729
/*
 * _bt_readnextpage() -- Read next page containing valid data for scan
 *
 * On success exit, so->currPos is updated to contain data from the next
 * interesting page.  Caller is responsible to release lock and pin on
 * buffer on success.  We return true to indicate success.
 *
 * If there are no more matching records in the given direction, we drop all
 * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
 */
static bool
_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Relation	rel;
	Page		page;
	BTPageOpaque opaque;
	bool		status = true;

	rel = scan->indexRelation;

	if (ScanDirectionIsForward(dir))
	{
		/* Walk right until we find a non-ignorable page with matches */
		for (;;)
		{
			/*
			 * if we're at end of scan, give up and mark parallel scan as
			 * done, so that all the workers can finish their scan
			 */
			if (blkno == P_NONE || !so->currPos.moreRight)
			{
				_bt_parallel_done(scan);
				BTScanPosInvalidate(so->currPos);
				return false;
			}
			/* check for interrupts while we're not holding any buffer lock */
			CHECK_FOR_INTERRUPTS();
			/* step right one page */
			so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
			page = BufferGetPage(so->currPos.buf);
			TestForOldSnapshot(scan->xs_snapshot, rel, page);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
			/* check for deleted page */
			if (!P_IGNORE(opaque))
			{
				PredicateLockPage(rel, blkno, scan->xs_snapshot);
				/* see if there are any matches on this page */
				/* note that this will clear moreRight if we can stop */
				if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
					break;
			}
			else if (scan->parallel_scan != NULL)
			{
				/* allow next page be processed by parallel worker */
				_bt_parallel_release(scan, opaque->btpo_next);
			}

			/* nope, keep going */
			if (scan->parallel_scan != NULL)
			{
				/* release buffer before seizing, to avoid deadlock risk */
				_bt_relbuf(rel, so->currPos.buf);
				status = _bt_parallel_seize(scan, &blkno);
				if (!status)
				{
					BTScanPosInvalidate(so->currPos);
					return false;
				}
			}
			else
			{
				/* read the right-link while we still hold the buffer */
				blkno = opaque->btpo_next;
				_bt_relbuf(rel, so->currPos.buf);
			}
		}
	}
	else
	{
		/*
		 * Should only happen in parallel cases, when some other backend
		 * advanced the scan.
		 */
		if (so->currPos.currPage != blkno)
		{
			BTScanPosUnpinIfPinned(so->currPos);
			so->currPos.currPage = blkno;
		}

		/*
		 * Walk left to the next page with data.  This is much more complex
		 * than the walk-right case because of the possibility that the page
		 * to our left splits while we are in flight to it, plus the
		 * possibility that the page we were on gets deleted after we leave
		 * it.  See nbtree/README for details.
		 *
		 * It might be possible to rearrange this code to have less overhead
		 * in pinning and locking, but that would require capturing the left
		 * pointer when the page is initially read, and using it here, along
		 * with big changes to _bt_walk_left() and the code below.  It is not
		 * clear whether this would be a win, since if the page immediately to
		 * the left splits after we read this page and before we step left, we
		 * would need to visit more pages than with the current code.
		 *
		 * Note that if we change the code so that we drop the pin for a scan
		 * which uses a non-MVCC snapshot, we will need to modify the code for
		 * walking left, to allow for the possibility that a referenced page
		 * has been deleted.  As long as the buffer is pinned or the snapshot
		 * is MVCC the page cannot move past the half-dead state to fully
		 * deleted.
		 */
		if (BTScanPosIsPinned(so->currPos))
			LockBuffer(so->currPos.buf, BT_READ);
		else
			so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);

		for (;;)
		{
			/* Done if we know there are no matching keys to the left */
			if (!so->currPos.moreLeft)
			{
				_bt_relbuf(rel, so->currPos.buf);
				_bt_parallel_done(scan);
				BTScanPosInvalidate(so->currPos);
				return false;
			}

			/* Step to next physical page */
			so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
											scan->xs_snapshot);

			/* if we're physically at end of index, return failure */
			if (so->currPos.buf == InvalidBuffer)
			{
				_bt_parallel_done(scan);
				BTScanPosInvalidate(so->currPos);
				return false;
			}

			/*
			 * Okay, we managed to move left to a non-deleted page. Done if
			 * it's not half-dead and contains matching tuples. Else loop back
			 * and do it all again.
			 */
			page = BufferGetPage(so->currPos.buf);
			TestForOldSnapshot(scan->xs_snapshot, rel, page);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
			if (!P_IGNORE(opaque))
			{
				PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
				/* see if there are any matches on this page */
				/* note that this will clear moreLeft if we can stop */
				if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
					break;
			}
			else if (scan->parallel_scan != NULL)
			{
				/* allow next page be processed by parallel worker */
				_bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
			}

			/*
			 * For parallel scans, get the last page scanned as it is quite
			 * possible that by the time we try to seize the scan, some other
			 * worker has already advanced the scan to a different page.  We
			 * must continue based on the latest page scanned by any worker.
			 */
			if (scan->parallel_scan != NULL)
			{
				_bt_relbuf(rel, so->currPos.buf);
				status = _bt_parallel_seize(scan, &blkno);
				if (!status)
				{
					BTScanPosInvalidate(so->currPos);
					return false;
				}
				so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
			}
		}
	}

	return true;
}
1911
1912 /*
1913 * _bt_parallel_readpage() -- Read current page containing valid data for scan
1914 *
1915 * On success, release lock and maybe pin on buffer. We return true to
1916 * indicate success.
1917 */
1918 static bool
_bt_parallel_readpage(IndexScanDesc scan,BlockNumber blkno,ScanDirection dir)1919 _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
1920 {
1921 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1922
1923 _bt_initialize_more_data(so, dir);
1924
1925 if (!_bt_readnextpage(scan, blkno, dir))
1926 return false;
1927
1928 /* Drop the lock, and maybe the pin, on the current page */
1929 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1930
1931 return true;
1932 }
1933
/*
 *	_bt_walk_left() -- step left one page, if possible
 *
 * The given buffer must be pinned and read-locked.  This will be dropped
 * before stepping left.  On return, we have pin and read lock on the
 * returned page, instead.
 *
 * Returns InvalidBuffer if there is no page to the left (no lock is held
 * in that case).
 *
 * When working on a non-leaf level, it is possible for the returned page
 * to be half-dead; the caller should check that condition and step left
 * again if it's important.
 */
static Buffer
_bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
{
	Page		page;
	BTPageOpaque opaque;

	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/*
	 * Each iteration of the outer loop makes one attempt to step to the left
	 * sibling and verify we arrived at the right place; concurrent page
	 * splits/deletions can force us to retry with an updated starting page.
	 */
	for (;;)
	{
		BlockNumber obknum;		/* block we are stepping left from */
		BlockNumber lblkno;		/* its left-link as we saw it */
		BlockNumber blkno;		/* current position while walking right */
		int			tries;

		/* if we're at end of tree, release buf and return failure */
		if (P_LEFTMOST(opaque))
		{
			_bt_relbuf(rel, buf);
			break;
		}
		/* remember original page we are stepping left from */
		obknum = BufferGetBlockNumber(buf);
		/* step left */
		blkno = lblkno = opaque->btpo_prev;
		_bt_relbuf(rel, buf);
		/* check for interrupts while we're not holding any buffer lock */
		CHECK_FOR_INTERRUPTS();
		buf = _bt_getbuf(rel, blkno, BT_READ);
		page = BufferGetPage(buf);
		TestForOldSnapshot(snapshot, rel, page);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		/*
		 * If this isn't the page we want, walk right till we find what we
		 * want --- but go no more than four hops (an arbitrary limit). If we
		 * don't find the correct page by then, the most likely bet is that
		 * the original page got deleted and isn't in the sibling chain at all
		 * anymore, not that its left sibling got split more than four times.
		 *
		 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
		 * because half-dead pages are still in the sibling chain.  Caller
		 * must reject half-dead pages if wanted.
		 */
		tries = 0;
		for (;;)
		{
			/* a live page whose right-link points back at us is the one */
			if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
			{
				/* Found desired page, return it */
				return buf;
			}
			if (P_RIGHTMOST(opaque) || ++tries > 4)
				break;
			blkno = opaque->btpo_next;
			buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
			page = BufferGetPage(buf);
			TestForOldSnapshot(snapshot, rel, page);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		}

		/* Return to the original page to see what's up */
		buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
		page = BufferGetPage(buf);
		TestForOldSnapshot(snapshot, rel, page);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (P_ISDELETED(opaque))
		{
			/*
			 * It was deleted.  Move right to first nondeleted page (there
			 * must be one); that is the page that has acquired the deleted
			 * one's keyspace, so stepping left from it will take us where we
			 * want to be.
			 */
			for (;;)
			{
				if (P_RIGHTMOST(opaque))
					elog(ERROR, "fell off the end of index \"%s\"",
						 RelationGetRelationName(rel));
				blkno = opaque->btpo_next;
				buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
				page = BufferGetPage(buf);
				TestForOldSnapshot(snapshot, rel, page);
				opaque = (BTPageOpaque) PageGetSpecialPointer(page);
				if (!P_ISDELETED(opaque))
					break;
			}

			/*
			 * Now return to top of loop, resetting obknum to point to this
			 * nondeleted page, and try again.
			 */
		}
		else
		{
			/*
			 * It wasn't deleted; the explanation had better be that the page
			 * to the left got split or deleted.  Without this check, we'd go
			 * into an infinite loop if there's anything wrong.
			 */
			if (opaque->btpo_prev == lblkno)
				elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
					 obknum, RelationGetRelationName(rel));
			/* Okay to try again with new lblkno value */
		}
	}

	return InvalidBuffer;
}
2058
/*
 *	_bt_get_endpoint() -- Find the first or last page on a given tree level
 *
 * If the index is empty, we will return InvalidBuffer; any other failure
 * condition causes ereport().  We will not return a dead page.
 *
 * The returned buffer is pinned and read-locked.
 */
Buffer
_bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
				 Snapshot snapshot)
{
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber offnum;
	BlockNumber blkno;
	IndexTuple	itup;

	/*
	 * If we are looking for a leaf page, okay to descend from fast root;
	 * otherwise better descend from true root.  (There is no point in being
	 * smarter about intermediate levels.)
	 */
	if (level == 0)
		buf = _bt_getroot(rel, BT_READ);
	else
		buf = _bt_gettrueroot(rel);

	/* empty index --- tell caller there is nothing to lock */
	if (!BufferIsValid(buf))
		return InvalidBuffer;

	page = BufferGetPage(buf);
	TestForOldSnapshot(snapshot, rel, page);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/* descend one level per iteration until we reach the requested level */
	for (;;)
	{
		/*
		 * If we landed on a deleted page, step right to find a live page
		 * (there must be one).  Also, if we want the rightmost page, step
		 * right if needed to get to it (this could happen if the page split
		 * since we obtained a pointer to it).
		 */
		while (P_IGNORE(opaque) ||
			   (rightmost && !P_RIGHTMOST(opaque)))
		{
			blkno = opaque->btpo_next;
			if (blkno == P_NONE)
				elog(ERROR, "fell off the end of index \"%s\"",
					 RelationGetRelationName(rel));
			buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
			page = BufferGetPage(buf);
			TestForOldSnapshot(snapshot, rel, page);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		}

		/* Done? */
		if (opaque->btpo.level == level)
			break;
		if (opaque->btpo.level < level)
			elog(ERROR, "btree level %u not found in index \"%s\"",
				 level, RelationGetRelationName(rel));

		/* Descend to leftmost or rightmost child page */
		if (rightmost)
			offnum = PageGetMaxOffsetNumber(page);
		else
			offnum = P_FIRSTDATAKEY(opaque);

		/* fetch the downlink of the chosen pivot tuple and follow it */
		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
		blkno = BTreeInnerTupleGetDownLink(itup);

		buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
		page = BufferGetPage(buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	}

	return buf;
}
2139
/*
 *	_bt_endpoint() -- Find the first or last page in the index, and scan
 * from there to the first key satisfying all the quals.
 *
 * This is used by _bt_first() to set up a scan when we've determined
 * that the scan must start at the beginning or end of the index (for
 * a forward or backward scan respectively).  Exit conditions are the
 * same as for _bt_first().
 */
static bool
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber start;
	BTScanPosItem *currItem;

	/*
	 * Scan down to the leftmost or rightmost leaf page.  This is a simplified
	 * version of _bt_search().  We don't maintain a stack since we know we
	 * won't need it.
	 */
	buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);

	if (!BufferIsValid(buf))
	{
		/*
		 * Empty index.  Lock the whole relation, as nothing finer to lock
		 * exists.
		 */
		PredicateLockRelation(rel, scan->xs_snapshot);
		BTScanPosInvalidate(so->currPos);
		return false;
	}

	PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	Assert(P_ISLEAF(opaque));

	/* choose the starting item offset according to scan direction */
	if (ScanDirectionIsForward(dir))
	{
		/* There could be dead pages to the left, so not this: */
		/* Assert(P_LEFTMOST(opaque)); */

		start = P_FIRSTDATAKEY(opaque);
	}
	else if (ScanDirectionIsBackward(dir))
	{
		Assert(P_RIGHTMOST(opaque));

		start = PageGetMaxOffsetNumber(page);
	}
	else
	{
		elog(ERROR, "invalid scan direction: %d", (int) dir);
		start = 0;				/* keep compiler quiet */
	}

	/* remember which buffer we have pinned */
	so->currPos.buf = buf;

	_bt_initialize_more_data(so, dir);

	/*
	 * Now load data from the first page of the scan.
	 */
	if (!_bt_readpage(scan, dir, start))
	{
		/*
		 * There's no actually-matching data on this page.  Try to advance to
		 * the next page.  Return false if there's no matching data at all.
		 */
		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
		if (!_bt_steppage(scan, dir))
			return false;
	}
	else
	{
		/* Drop the lock, and maybe the pin, on the current page */
		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
	}

	/* OK, itemIndex says what to return */
	currItem = &so->currPos.items[so->currPos.itemIndex];
	scan->xs_heaptid = currItem->heapTid;
	if (scan->xs_want_itup)
		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

	return true;
}
2234
2235 /*
2236 * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
2237 * for scan direction
2238 */
2239 static inline void
_bt_initialize_more_data(BTScanOpaque so,ScanDirection dir)2240 _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
2241 {
2242 /* initialize moreLeft/moreRight appropriately for scan direction */
2243 if (ScanDirectionIsForward(dir))
2244 {
2245 so->currPos.moreLeft = false;
2246 so->currPos.moreRight = true;
2247 }
2248 else
2249 {
2250 so->currPos.moreLeft = true;
2251 so->currPos.moreRight = false;
2252 }
2253 so->numKilled = 0; /* just paranoia */
2254 so->markItemIndex = -1; /* ditto */
2255 }
2256