1 /*-------------------------------------------------------------------------
2 *
3 * nbtsearch.c
4 * Search code for postgres btrees.
 *
6 *
7 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * IDENTIFICATION
11 * src/backend/access/nbtree/nbtsearch.c
12 *
13 *-------------------------------------------------------------------------
14 */
15
16 #include "postgres.h"
17
18 #include "access/nbtree.h"
19 #include "access/relscan.h"
20 #include "miscadmin.h"
21 #include "pgstat.h"
22 #include "storage/predicate.h"
23 #include "utils/lsyscache.h"
24 #include "utils/rel.h"
25 #include "utils/tqual.h"
26
27
28 static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
29 OffsetNumber offnum);
static void _bt_saveitem(BTScanOpaque so, int itemIndex,
31 OffsetNumber offnum, IndexTuple itup);
32 static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
33 static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
34 static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
			 ScanDirection dir);
36 static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
37 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
38 static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
39 static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
40
41
/*
43 * _bt_drop_lock_and_maybe_pin()
44 *
45 * Unlock the buffer; and if it is safe to release the pin, do that, too. It
46 * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
47 * This will prevent vacuum from stalling in a blocked state trying to read a
48 * page when a cursor is sitting on it -- at least in many important cases.
49 *
50 * Set the buffer to invalid if the pin is released, since the buffer may be
51 * re-used. If we need to go back to this block (for example, to apply
52 * LP_DEAD hints) we must get a fresh reference to the buffer. Hopefully it
53 * will remain in shared memory for as long as it takes to scan the index
54 * buffer page.
55 */
56 static void
57 _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
58 {
59 LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
60
61 if (IsMVCCSnapshot(scan->xs_snapshot) &&
62 RelationNeedsWAL(scan->indexRelation) &&
63 !scan->xs_want_itup)
64 {
65 ReleaseBuffer(sp->buf);
66 sp->buf = InvalidBuffer;
67 }
68 }
69
70
71 /*
72 * _bt_search() -- Search the tree for a particular scankey,
73 * or more precisely for the first leaf page it could be on.
74 *
75 * The passed scankey must be an insertion-type scankey (see nbtree/README),
76 * but it can omit the rightmost column(s) of the index.
77 *
78 * When nextkey is false (the usual case), we are looking for the first
79 * item >= scankey. When nextkey is true, we are looking for the first
80 * item strictly greater than scankey.
 *
82 * Return value is a stack of parent-page pointers. *bufP is set to the
83 * address of the leaf-page buffer, which is read-locked and pinned.
84 * No locks are held on the parent pages, however!
85 *
86 * If the snapshot parameter is not NULL, "old snapshot" checking will take
87 * place during the descent through the tree. This is not needed when
88 * positioning for an insert or delete, so NULL is used for those cases.
89 *
90 * NOTE that the returned buffer is read-locked regardless of the access
91 * parameter. However, access = BT_WRITE will allow an empty root page
92 * to be created and returned. When access = BT_READ, an empty index
93 * will result in *bufP being set to InvalidBuffer. Also, in BT_WRITE mode,
94 * any incomplete splits encountered during the search will be finished.
95 */
BTStack
_bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
		   Buffer *bufP, int access, Snapshot snapshot)
{
	BTStack		stack_in = NULL;

	/* Get the root page to start with */
	*bufP = _bt_getroot(rel, access);

	/* If index is empty and access = BT_READ, no root page is created. */
	if (!BufferIsValid(*bufP))
		return (BTStack) NULL;

	/* Loop iterates once per level descended in the tree */
	for (;;)
	{
		Page		page;
		BTPageOpaque opaque;
		OffsetNumber offnum;
		ItemId		itemid;
		IndexTuple	itup;
		BlockNumber blkno;		/* downlink: child block to descend to */
		BlockNumber par_blkno;	/* block number of the current (parent) page */
		BTStack		new_stack;

		/*
		 * Race -- the page we just grabbed may have split since we read its
		 * pointer in the parent (or metapage).  If it has, we may need to
		 * move right to its new sibling.  Do that.
		 *
		 * In write-mode, allow _bt_moveright to finish any incomplete splits
		 * along the way.  Strictly speaking, we'd only need to finish an
		 * incomplete split on the leaf page we're about to insert to, not on
		 * any of the upper levels (they are taken care of in _bt_getstackbuf,
		 * if the leaf page is split and we insert to the parent page).  But
		 * this is a good opportunity to finish splits of internal pages too.
		 *
		 * Note that we descend holding only BT_READ locks at every level;
		 * per the header comment, the returned leaf buffer is read-locked
		 * even when access = BT_WRITE.
		 */
		*bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey,
							  (access == BT_WRITE), stack_in,
							  BT_READ, snapshot);

		/* if this is a leaf page, we're done */
		page = BufferGetPage(*bufP);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (P_ISLEAF(opaque))
			break;

		/*
		 * Find the appropriate item on the internal page, and get the child
		 * page that it points to.
		 */
		offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
		itemid = PageGetItemId(page, offnum);
		itup = (IndexTuple) PageGetItem(page, itemid);
		blkno = BTreeInnerTupleGetDownLink(itup);
		par_blkno = BufferGetBlockNumber(*bufP);

		/*
		 * We need to save the location of the index entry we chose in the
		 * parent page on a stack.  In case we split the tree, we'll use the
		 * stack to work back up to the parent page.  We also save the actual
		 * downlink (block) to uniquely identify the index entry, in case it
		 * moves right while we're working lower in the tree.  See the paper
		 * by Lehman and Yao for how this is detected and handled.  (We use the
		 * child link to disambiguate duplicate keys in the index -- Lehman
		 * and Yao disallow duplicate keys.)
		 */
		new_stack = (BTStack) palloc(sizeof(BTStackData));
		new_stack->bts_blkno = par_blkno;
		new_stack->bts_offset = offnum;
		new_stack->bts_btentry = blkno;
		new_stack->bts_parent = stack_in;

		/* drop the read lock on the parent page, acquire one on the child */
		*bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);

		/* okay, all set to move down a level */
		stack_in = new_stack;
	}

	/* *bufP is now the target leaf page, pinned and read-locked */
	return stack_in;
}
178
179 /*
180 * _bt_moveright() -- move right in the btree if necessary.
181 *
182 * When we follow a pointer to reach a page, it is possible that
183 * the page has changed in the meanwhile. If this happens, we're
184 * guaranteed that the page has "split right" -- that is, that any
185 * data that appeared on the page originally is either on the page
186 * or strictly to the right of it.
187 *
188 * This routine decides whether or not we need to move right in the
189 * tree by examining the high key entry on the page. If that entry
190 * is strictly less than the scankey, or <= the scankey in the nextkey=true
191 * case, then we followed the wrong link and we need to move right.
192 *
193 * The passed scankey must be an insertion-type scankey (see nbtree/README),
194 * but it can omit the rightmost column(s) of the index.
195 *
196 * When nextkey is false (the usual case), we are looking for the first
197 * item >= scankey. When nextkey is true, we are looking for the first
198 * item strictly greater than scankey.
199 *
200 * If forupdate is true, we will attempt to finish any incomplete splits
201 * that we encounter. This is required when locking a target page for an
202 * insertion, because we don't allow inserting on a page before the split
203 * is completed. 'stack' is only used if forupdate is true.
204 *
205 * On entry, we have the buffer pinned and a lock of the type specified by
206 * 'access'. If we move right, we release the buffer and lock and acquire
207 * the same on the right sibling. Return value is the buffer we stop at.
208 *
209 * If the snapshot parameter is not NULL, "old snapshot" checking will take
210 * place during the descent through the tree. This is not needed when
211 * positioning for an insert or delete, so NULL is used for those cases.
212 */
Buffer
_bt_moveright(Relation rel,
			  Buffer buf,
			  int keysz,
			  ScanKey scankey,
			  bool nextkey,
			  bool forupdate,
			  BTStack stack,
			  int access,
			  Snapshot snapshot)
{
	Page		page;
	BTPageOpaque opaque;
	int32		cmpval;

	/*
	 * When nextkey = false (normal case): if the scan key that brought us to
	 * this page is > the high key stored on the page, then the page has split
	 * and we need to move right.  (If the scan key is equal to the high key,
	 * we might or might not need to move right; have to scan the page first
	 * anyway.)
	 *
	 * When nextkey = true: move right if the scan key is >= page's high key.
	 *
	 * The page could even have split more than once, so scan as far as
	 * needed.
	 *
	 * We also have to move right if we followed a link that brought us to a
	 * dead page.
	 */
	cmpval = nextkey ? 0 : 1;

	for (;;)
	{
		page = BufferGetPage(buf);
		TestForOldSnapshot(snapshot, rel, page);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		/* a rightmost page has no high key; nowhere further to go */
		if (P_RIGHTMOST(opaque))
			break;

		/*
		 * Finish any incomplete splits we encounter along the way.
		 */
		if (forupdate && P_INCOMPLETE_SPLIT(opaque))
		{
			BlockNumber blkno = BufferGetBlockNumber(buf);

			/* upgrade our lock if necessary */
			if (access == BT_READ)
			{
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
				LockBuffer(buf, BT_WRITE);
			}

			/*
			 * Re-check the flag: someone else may have finished the split
			 * during the instant we held no lock above.  Either way, both
			 * branches relinquish our reference to buf (presumably
			 * _bt_finish_split releases it, matching the explicit
			 * _bt_relbuf in the other branch -- note we re-fetch below).
			 */
			if (P_INCOMPLETE_SPLIT(opaque))
				_bt_finish_split(rel, buf, stack);
			else
				_bt_relbuf(rel, buf);

			/* re-acquire the lock in the right mode, and re-check */
			buf = _bt_getbuf(rel, blkno, access);
			continue;
		}

		if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval)
		{
			/* step right one page */
			buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
			continue;
		}
		else
			break;
	}

	/* only a rightmost page can leave the loop dead; that means corruption */
	if (P_IGNORE(opaque))
		elog(ERROR, "fell off the end of index \"%s\"",
			 RelationGetRelationName(rel));

	return buf;
}
294
295 /*
296 * _bt_binsrch() -- Do a binary search for a key on a particular page.
297 *
298 * The passed scankey must be an insertion-type scankey (see nbtree/README),
299 * but it can omit the rightmost column(s) of the index.
300 *
301 * When nextkey is false (the usual case), we are looking for the first
302 * item >= scankey. When nextkey is true, we are looking for the first
303 * item strictly greater than scankey.
304 *
305 * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
306 * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
307 * particular, this means it is possible to return a value 1 greater than the
308 * number of keys on the page, if the scankey is > all keys on the page.)
309 *
310 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
311 * of the last key < given scankey, or last key <= given scankey if nextkey
312 * is true. (Since _bt_compare treats the first data key of such a page as
313 * minus infinity, there will be at least one key < scankey, so the result
314 * always points at one of the keys on the page.) This key indicates the
315 * right place to descend to be sure we find all leaf keys >= given scankey
316 * (or leaf keys > given scankey when nextkey is true).
317 *
318 * This procedure is not responsible for walking right, it just examines
319 * the given page. _bt_binsrch() has no lock or refcount side effects
320 * on the buffer.
321 */
322 OffsetNumber
323 _bt_binsrch(Relation rel,
324 Buffer buf,
325 int keysz,
326 ScanKey scankey,
327 bool nextkey)
328 {
329 Page page;
330 BTPageOpaque opaque;
331 OffsetNumber low,
332 high;
333 int32 result,
334 cmpval;
335
336 page = BufferGetPage(buf);
337 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
338
339 low = P_FIRSTDATAKEY(opaque);
340 high = PageGetMaxOffsetNumber(page);
341
342 /*
343 * If there are no keys on the page, return the first available slot. Note
344 * this covers two cases: the page is really empty (no keys), or it
345 * contains only a high key. The latter case is possible after vacuuming.
346 * This can never happen on an internal page, however, since they are
347 * never empty (an internal page must have children).
348 */
349 if (high < low)
350 return low;
351
352 /*
353 * Binary search to find the first key on the page >= scan key, or first
354 * key > scankey when nextkey is true.
355 *
356 * For nextkey=false (cmpval=1), the loop invariant is: all slots before
357 * 'low' are < scan key, all slots at or after 'high' are >= scan key.
358 *
359 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
360 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
361 *
362 * We can fall out when high == low.
363 */
364 high++; /* establish the loop invariant for high */
365
366 cmpval = nextkey ? 0 : 1; /* select comparison value */
367
368 while (high > low)
369 {
370 OffsetNumber mid = low + ((high - low) / 2);
371
372 /* We have low <= mid < high, so mid points at a real slot */
373
374 result = _bt_compare(rel, keysz, scankey, page, mid);
375
376 if (result >= cmpval)
377 low = mid + 1;
378 else
379 high = mid;
380 }
381
382 /*
383 * At this point we have high == low, but be careful: they could point
384 * past the last slot on the page.
385 *
386 * On a leaf page, we always return the first key >= scan key (resp. >
387 * scan key), which could be the last slot + 1.
388 */
389 if (P_ISLEAF(opaque))
390 return low;
391
392 /*
393 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
394 * There must be one if _bt_compare() is playing by the rules.
395 */
396 Assert(low > P_FIRSTDATAKEY(opaque));
397
398 return OffsetNumberPrev(low);
399 }
400
401 /*----------
402 * _bt_compare() -- Compare scankey to a particular tuple on the page.
403 *
404 * The passed scankey must be an insertion-type scankey (see nbtree/README),
405 * but it can omit the rightmost column(s) of the index.
406 *
407 * keysz: number of key conditions to be checked (might be less than the
408 * number of index columns!)
409 * page/offnum: location of btree item to be compared to.
410 *
411 * This routine returns:
412 * <0 if scankey < tuple at offnum;
413 * 0 if scankey == tuple at offnum;
414 * >0 if scankey > tuple at offnum.
415 * NULLs in the keys are treated as sortable values. Therefore
416 * "equality" does not necessarily mean that the item should be
417 * returned to the caller as a matching key!
418 *
419 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
420 * "minus infinity": this routine will always claim it is less than the
421 * scankey. The actual key value stored (if any, which there probably isn't)
422 * does not matter. This convention allows us to implement the Lehman and
423 * Yao convention that the first down-link pointer is before the first key.
424 * See backend/access/nbtree/README for details.
425 *----------
426 */
427 int32
428 _bt_compare(Relation rel,
429 int keysz,
430 ScanKey scankey,
431 Page page,
432 OffsetNumber offnum)
433 {
434 TupleDesc itupdesc = RelationGetDescr(rel);
435 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
436 IndexTuple itup;
437 int i;
438
439 Assert(_bt_check_natts(rel, page, offnum));
440
441 /*
442 * Force result ">" if target item is first data item on an internal page
443 * --- see NOTE above.
444 */
445 if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
446 return 1;
447
448 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
449
450 /*
451 * The scan key is set up with the attribute number associated with each
452 * term in the key. It is important that, if the index is multi-key, the
453 * scan contain the first k key attributes, and that they be in order. If
454 * you think about how multi-key ordering works, you'll understand why
455 * this is.
456 *
457 * We don't test for violation of this condition here, however. The
458 * initial setup for the index scan had better have gotten it right (see
459 * _bt_first).
460 */
461
462 for (i = 1; i <= keysz; i++)
463 {
464 Datum datum;
465 bool isNull;
466 int32 result;
467
468 datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
469
470 /* see comments about NULLs handling in btbuild */
471 if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
472 {
473 if (isNull)
474 result = 0; /* NULL "=" NULL */
475 else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
476 result = -1; /* NULL "<" NOT_NULL */
477 else
478 result = 1; /* NULL ">" NOT_NULL */
479 }
480 else if (isNull) /* key is NOT_NULL and item is NULL */
481 {
482 if (scankey->sk_flags & SK_BT_NULLS_FIRST)
483 result = 1; /* NOT_NULL ">" NULL */
484 else
485 result = -1; /* NOT_NULL "<" NULL */
486 }
487 else
488 {
489 /*
490 * The sk_func needs to be passed the index value as left arg and
491 * the sk_argument as right arg (they might be of different
492 * types). Since it is convenient for callers to think of
493 * _bt_compare as comparing the scankey to the index item, we have
494 * to flip the sign of the comparison result. (Unless it's a DESC
495 * column, in which case we *don't* flip the sign.)
496 */
497 result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
498 scankey->sk_collation,
499 datum,
500 scankey->sk_argument));
501
502 if (!(scankey->sk_flags & SK_BT_DESC))
503 INVERT_COMPARE_RESULT(result);
504 }
505
506 /* if the keys are unequal, return the difference */
507 if (result != 0)
508 return result;
509
510 scankey++;
511 }
512
513 /* if we get here, the keys are equal */
514 return 0;
515 }
516
517 /*
518 * _bt_first() -- Find the first item in a scan.
519 *
520 * We need to be clever about the direction of scan, the search
521 * conditions, and the tree ordering. We find the first item (or,
522 * if backwards scan, the last item) in the tree that satisfies the
523 * qualifications in the scan key. On success exit, the page containing
524 * the current index tuple is pinned but not locked, and data about
525 * the matching tuple(s) on the page has been loaded into so->currPos.
526 * scan->xs_ctup.t_self is set to the heap TID of the current tuple,
527 * and if requested, scan->xs_itup points to a copy of the index tuple.
528 *
529 * If there are no matching items in the index, we return false, with no
530 * pins or locks held.
531 *
532 * Note that scan->keyData[], and the so->keyData[] scankey built from it,
533 * are both search-type scankeys (see nbtree/README for more about this).
534 * Within this routine, we build a temporary insertion-type scankey to use
535 * in locating the scan start position.
536 */
537 bool
538 _bt_first(IndexScanDesc scan, ScanDirection dir)
539 {
540 Relation rel = scan->indexRelation;
541 BTScanOpaque so = (BTScanOpaque) scan->opaque;
542 Buffer buf;
543 BTStack stack;
544 OffsetNumber offnum;
545 StrategyNumber strat;
546 bool nextkey;
547 bool goback;
548 ScanKey startKeys[INDEX_MAX_KEYS];
549 ScanKeyData scankeys[INDEX_MAX_KEYS];
550 ScanKeyData notnullkeys[INDEX_MAX_KEYS];
551 int keysCount = 0;
552 int i;
553 bool status = true;
554 StrategyNumber strat_total;
555 BTScanPosItem *currItem;
556 BlockNumber blkno;
557
558 Assert(!BTScanPosIsValid(so->currPos));
559
560 pgstat_count_index_scan(rel);
561
562 /*
563 * Examine the scan keys and eliminate any redundant keys; also mark the
564 * keys that must be matched to continue the scan.
565 */
566 _bt_preprocess_keys(scan);
567
568 /*
569 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
570 * never be satisfied (eg, x == 1 AND x > 2).
571 */
572 if (!so->qual_ok)
573 {
574 /* Notify any other workers that we're done with this scan key. */
575 _bt_parallel_done(scan);
576 return false;
577 }
578
579 /*
580 * For parallel scans, get the starting page from shared state. If the
581 * scan has not started, proceed to find out first leaf page in the usual
582 * way while keeping other participating processes waiting. If the scan
583 * has already begun, use the page number from the shared structure.
584 */
585 if (scan->parallel_scan != NULL)
586 {
587 status = _bt_parallel_seize(scan, &blkno);
588 if (!status)
589 return false;
590 else if (blkno == P_NONE)
591 {
592 _bt_parallel_done(scan);
593 return false;
594 }
595 else if (blkno != InvalidBlockNumber)
596 {
597 if (!_bt_parallel_readpage(scan, blkno, dir))
598 return false;
599 goto readcomplete;
600 }
601 }
602
603 /*----------
604 * Examine the scan keys to discover where we need to start the scan.
605 *
606 * We want to identify the keys that can be used as starting boundaries;
607 * these are =, >, or >= keys for a forward scan or =, <, <= keys for
608 * a backwards scan. We can use keys for multiple attributes so long as
609 * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
610 * a > or < boundary or find an attribute with no boundary (which can be
611 * thought of as the same as "> -infinity"), we can't use keys for any
612 * attributes to its right, because it would break our simplistic notion
613 * of what initial positioning strategy to use.
614 *
615 * When the scan keys include cross-type operators, _bt_preprocess_keys
616 * may not be able to eliminate redundant keys; in such cases we will
617 * arbitrarily pick a usable one for each attribute. This is correct
618 * but possibly not optimal behavior. (For example, with keys like
619 * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
620 * x=5 would be more efficient.) Since the situation only arises given
621 * a poorly-worded query plus an incomplete opfamily, live with it.
622 *
623 * When both equality and inequality keys appear for a single attribute
624 * (again, only possible when cross-type operators appear), we *must*
625 * select one of the equality keys for the starting point, because
626 * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
627 * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
628 * start at x=4, we will fail and stop before reaching x=10. If multiple
629 * equality quals survive preprocessing, however, it doesn't matter which
630 * one we use --- by definition, they are either redundant or
631 * contradictory.
632 *
633 * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
634 * If the index stores nulls at the end of the index we'll be starting
635 * from, and we have no boundary key for the column (which means the key
636 * we deduced NOT NULL from is an inequality key that constrains the other
637 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
638 * use as a boundary key. If we didn't do this, we might find ourselves
639 * traversing a lot of null entries at the start of the scan.
640 *
641 * In this loop, row-comparison keys are treated the same as keys on their
642 * first (leftmost) columns. We'll add on lower-order columns of the row
643 * comparison below, if possible.
644 *
645 * The selected scan keys (at most one per index column) are remembered by
646 * storing their addresses into the local startKeys[] array.
647 *----------
648 */
649 strat_total = BTEqualStrategyNumber;
650 if (so->numberOfKeys > 0)
651 {
652 AttrNumber curattr;
653 ScanKey chosen;
654 ScanKey impliesNN;
655 ScanKey cur;
656
657 /*
658 * chosen is the so-far-chosen key for the current attribute, if any.
659 * We don't cast the decision in stone until we reach keys for the
660 * next attribute.
661 */
662 curattr = 1;
663 chosen = NULL;
664 /* Also remember any scankey that implies a NOT NULL constraint */
665 impliesNN = NULL;
666
667 /*
668 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
669 * pass to handle after-last-key processing. Actual exit from the
670 * loop is at one of the "break" statements below.
671 */
672 for (cur = so->keyData, i = 0;; cur++, i++)
673 {
674 if (i >= so->numberOfKeys || cur->sk_attno != curattr)
675 {
676 /*
677 * Done looking at keys for curattr. If we didn't find a
678 * usable boundary key, see if we can deduce a NOT NULL key.
679 */
680 if (chosen == NULL && impliesNN != NULL &&
681 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
682 ScanDirectionIsForward(dir) :
683 ScanDirectionIsBackward(dir)))
684 {
685 /* Yes, so build the key in notnullkeys[keysCount] */
686 chosen = ¬nullkeys[keysCount];
687 ScanKeyEntryInitialize(chosen,
688 (SK_SEARCHNOTNULL | SK_ISNULL |
689 (impliesNN->sk_flags &
690 (SK_BT_DESC | SK_BT_NULLS_FIRST))),
691 curattr,
692 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
693 BTGreaterStrategyNumber :
694 BTLessStrategyNumber),
695 InvalidOid,
696 InvalidOid,
697 InvalidOid,
698 (Datum) 0);
699 }
700
701 /*
702 * If we still didn't find a usable boundary key, quit; else
703 * save the boundary key pointer in startKeys.
704 */
705 if (chosen == NULL)
706 break;
707 startKeys[keysCount++] = chosen;
708
709 /*
710 * Adjust strat_total, and quit if we have stored a > or <
711 * key.
712 */
713 strat = chosen->sk_strategy;
714 if (strat != BTEqualStrategyNumber)
715 {
716 strat_total = strat;
717 if (strat == BTGreaterStrategyNumber ||
718 strat == BTLessStrategyNumber)
719 break;
720 }
721
722 /*
723 * Done if that was the last attribute, or if next key is not
724 * in sequence (implying no boundary key is available for the
725 * next attribute).
726 */
727 if (i >= so->numberOfKeys ||
728 cur->sk_attno != curattr + 1)
729 break;
730
731 /*
732 * Reset for next attr.
733 */
734 curattr = cur->sk_attno;
735 chosen = NULL;
736 impliesNN = NULL;
737 }
738
739 /*
740 * Can we use this key as a starting boundary for this attr?
741 *
742 * If not, does it imply a NOT NULL constraint? (Because
743 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
744 * *any* inequality key works for that; we need not test.)
745 */
746 switch (cur->sk_strategy)
747 {
748 case BTLessStrategyNumber:
749 case BTLessEqualStrategyNumber:
750 if (chosen == NULL)
751 {
752 if (ScanDirectionIsBackward(dir))
753 chosen = cur;
754 else
755 impliesNN = cur;
756 }
757 break;
758 case BTEqualStrategyNumber:
759 /* override any non-equality choice */
760 chosen = cur;
761 break;
762 case BTGreaterEqualStrategyNumber:
763 case BTGreaterStrategyNumber:
764 if (chosen == NULL)
765 {
766 if (ScanDirectionIsForward(dir))
767 chosen = cur;
768 else
769 impliesNN = cur;
770 }
771 break;
772 }
773 }
774 }
775
776 /*
777 * If we found no usable boundary keys, we have to start from one end of
778 * the tree. Walk down that edge to the first or last key, and scan from
779 * there.
780 */
781 if (keysCount == 0)
782 {
783 bool match;
784
785 match = _bt_endpoint(scan, dir);
786
787 if (!match)
788 {
789 /* No match, so mark (parallel) scan finished */
790 _bt_parallel_done(scan);
791 }
792
793 return match;
794 }
795
796 /*
797 * We want to start the scan somewhere within the index. Set up an
798 * insertion scankey we can use to search for the boundary point we
799 * identified above. The insertion scankey is built in the local
800 * scankeys[] array, using the keys identified by startKeys[].
801 */
802 Assert(keysCount <= INDEX_MAX_KEYS);
803 for (i = 0; i < keysCount; i++)
804 {
805 ScanKey cur = startKeys[i];
806
807 Assert(cur->sk_attno == i + 1);
808
809 if (cur->sk_flags & SK_ROW_HEADER)
810 {
811 /*
812 * Row comparison header: look to the first row member instead.
813 *
814 * The member scankeys are already in insertion format (ie, they
815 * have sk_func = 3-way-comparison function), but we have to watch
816 * out for nulls, which _bt_preprocess_keys didn't check. A null
817 * in the first row member makes the condition unmatchable, just
818 * like qual_ok = false.
819 */
820 ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
821
822 Assert(subkey->sk_flags & SK_ROW_MEMBER);
823 if (subkey->sk_flags & SK_ISNULL)
824 {
825 _bt_parallel_done(scan);
826 return false;
827 }
828 memcpy(scankeys + i, subkey, sizeof(ScanKeyData));
829
830 /*
831 * If the row comparison is the last positioning key we accepted,
832 * try to add additional keys from the lower-order row members.
833 * (If we accepted independent conditions on additional index
834 * columns, we use those instead --- doesn't seem worth trying to
835 * determine which is more restrictive.) Note that this is OK
836 * even if the row comparison is of ">" or "<" type, because the
837 * condition applied to all but the last row member is effectively
838 * ">=" or "<=", and so the extra keys don't break the positioning
839 * scheme. But, by the same token, if we aren't able to use all
840 * the row members, then the part of the row comparison that we
841 * did use has to be treated as just a ">=" or "<=" condition, and
842 * so we'd better adjust strat_total accordingly.
843 */
844 if (i == keysCount - 1)
845 {
846 bool used_all_subkeys = false;
847
848 Assert(!(subkey->sk_flags & SK_ROW_END));
849 for (;;)
850 {
851 subkey++;
852 Assert(subkey->sk_flags & SK_ROW_MEMBER);
853 if (subkey->sk_attno != keysCount + 1)
854 break; /* out-of-sequence, can't use it */
855 if (subkey->sk_strategy != cur->sk_strategy)
856 break; /* wrong direction, can't use it */
857 if (subkey->sk_flags & SK_ISNULL)
858 break; /* can't use null keys */
859 Assert(keysCount < INDEX_MAX_KEYS);
860 memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
861 keysCount++;
862 if (subkey->sk_flags & SK_ROW_END)
863 {
864 used_all_subkeys = true;
865 break;
866 }
867 }
868 if (!used_all_subkeys)
869 {
870 switch (strat_total)
871 {
872 case BTLessStrategyNumber:
873 strat_total = BTLessEqualStrategyNumber;
874 break;
875 case BTGreaterStrategyNumber:
876 strat_total = BTGreaterEqualStrategyNumber;
877 break;
878 }
879 }
880 break; /* done with outer loop */
881 }
882 }
883 else
884 {
885 /*
886 * Ordinary comparison key. Transform the search-style scan key
887 * to an insertion scan key by replacing the sk_func with the
888 * appropriate btree comparison function.
889 *
890 * If scankey operator is not a cross-type comparison, we can use
891 * the cached comparison function; otherwise gotta look it up in
892 * the catalogs. (That can't lead to infinite recursion, since no
893 * indexscan initiated by syscache lookup will use cross-data-type
894 * operators.)
895 *
896 * We support the convention that sk_subtype == InvalidOid means
897 * the opclass input type; this is a hack to simplify life for
898 * ScanKeyInit().
899 */
900 if (cur->sk_subtype == rel->rd_opcintype[i] ||
901 cur->sk_subtype == InvalidOid)
902 {
903 FmgrInfo *procinfo;
904
905 procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
906 ScanKeyEntryInitializeWithInfo(scankeys + i,
907 cur->sk_flags,
908 cur->sk_attno,
909 InvalidStrategy,
910 cur->sk_subtype,
911 cur->sk_collation,
912 procinfo,
913 cur->sk_argument);
914 }
915 else
916 {
917 RegProcedure cmp_proc;
918
919 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
920 rel->rd_opcintype[i],
921 cur->sk_subtype,
922 BTORDER_PROC);
923 if (!RegProcedureIsValid(cmp_proc))
924 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
925 BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
926 cur->sk_attno, RelationGetRelationName(rel));
927 ScanKeyEntryInitialize(scankeys + i,
928 cur->sk_flags,
929 cur->sk_attno,
930 InvalidStrategy,
931 cur->sk_subtype,
932 cur->sk_collation,
933 cmp_proc,
934 cur->sk_argument);
935 }
936 }
937 }
938
939 /*----------
940 * Examine the selected initial-positioning strategy to determine exactly
941 * where we need to start the scan, and set flag variables to control the
942 * code below.
943 *
944 * If nextkey = false, _bt_search and _bt_binsrch will locate the first
945 * item >= scan key. If nextkey = true, they will locate the first
946 * item > scan key.
947 *
948 * If goback = true, we will then step back one item, while if
949 * goback = false, we will start the scan on the located item.
950 *----------
951 */
952 switch (strat_total)
953 {
954 case BTLessStrategyNumber:
955
956 /*
957 * Find first item >= scankey, then back up one to arrive at last
958 * item < scankey. (Note: this positioning strategy is only used
959 * for a backward scan, so that is always the correct starting
960 * position.)
961 */
962 nextkey = false;
963 goback = true;
964 break;
965
966 case BTLessEqualStrategyNumber:
967
968 /*
969 * Find first item > scankey, then back up one to arrive at last
970 * item <= scankey. (Note: this positioning strategy is only used
971 * for a backward scan, so that is always the correct starting
972 * position.)
973 */
974 nextkey = true;
975 goback = true;
976 break;
977
978 case BTEqualStrategyNumber:
979
980 /*
981 * If a backward scan was specified, need to start with last equal
982 * item not first one.
983 */
984 if (ScanDirectionIsBackward(dir))
985 {
986 /*
987 * This is the same as the <= strategy. We will check at the
988 * end whether the found item is actually =.
989 */
990 nextkey = true;
991 goback = true;
992 }
993 else
994 {
995 /*
996 * This is the same as the >= strategy. We will check at the
997 * end whether the found item is actually =.
998 */
999 nextkey = false;
1000 goback = false;
1001 }
1002 break;
1003
1004 case BTGreaterEqualStrategyNumber:
1005
1006 /*
1007 * Find first item >= scankey. (This is only used for forward
1008 * scans.)
1009 */
1010 nextkey = false;
1011 goback = false;
1012 break;
1013
1014 case BTGreaterStrategyNumber:
1015
1016 /*
1017 * Find first item > scankey. (This is only used for forward
1018 * scans.)
1019 */
1020 nextkey = true;
1021 goback = false;
1022 break;
1023
1024 default:
1025 /* can't get here, but keep compiler quiet */
1026 elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1027 return false;
1028 }
1029
1030 /*
1031 * Use the manufactured insertion scan key to descend the tree and
1032 * position ourselves on the target leaf page.
1033 */
1034 stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ,
1035 scan->xs_snapshot);
1036
1037 /* don't need to keep the stack around... */
1038 _bt_freestack(stack);
1039
1040 if (!BufferIsValid(buf))
1041 {
1042 /*
1043 * We only get here if the index is completely empty. Lock relation
1044 * because nothing finer to lock exists.
1045 */
1046 PredicateLockRelation(rel, scan->xs_snapshot);
1047
1048 /*
1049 * mark parallel scan as done, so that all the workers can finish
1050 * their scan
1051 */
1052 _bt_parallel_done(scan);
1053 BTScanPosInvalidate(so->currPos);
1054
1055 return false;
1056 }
1057 else
1058 PredicateLockPage(rel, BufferGetBlockNumber(buf),
1059 scan->xs_snapshot);
1060
1061 _bt_initialize_more_data(so, dir);
1062
1063 /* position to the precise item on the page */
1064 offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
1065
1066 /*
1067 * If nextkey = false, we are positioned at the first item >= scan key, or
1068 * possibly at the end of a page on which all the existing items are less
1069 * than the scan key and we know that everything on later pages is greater
1070 * than or equal to scan key.
1071 *
1072 * If nextkey = true, we are positioned at the first item > scan key, or
1073 * possibly at the end of a page on which all the existing items are less
1074 * than or equal to the scan key and we know that everything on later
1075 * pages is greater than scan key.
1076 *
1077 * The actually desired starting point is either this item or the prior
1078 * one, or in the end-of-page case it's the first item on the next page or
1079 * the last item on this page. Adjust the starting offset if needed. (If
1080 * this results in an offset before the first item or after the last one,
1081 * _bt_readpage will report no items found, and then we'll step to the
1082 * next page as needed.)
1083 */
1084 if (goback)
1085 offnum = OffsetNumberPrev(offnum);
1086
1087 /* remember which buffer we have pinned, if any */
1088 Assert(!BTScanPosIsValid(so->currPos));
1089 so->currPos.buf = buf;
1090
1091 /*
1092 * Now load data from the first page of the scan.
1093 */
1094 if (!_bt_readpage(scan, dir, offnum))
1095 {
1096 /*
1097 * There's no actually-matching data on this page. Try to advance to
1098 * the next page. Return false if there's no matching data at all.
1099 */
1100 LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1101 if (!_bt_steppage(scan, dir))
1102 return false;
1103 }
1104 else
1105 {
1106 /* Drop the lock, and maybe the pin, on the current page */
1107 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1108 }
1109
1110 readcomplete:
1111 /* OK, itemIndex says what to return */
1112 currItem = &so->currPos.items[so->currPos.itemIndex];
1113 scan->xs_ctup.t_self = currItem->heapTid;
1114 if (scan->xs_want_itup)
1115 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1116
1117 return true;
1118 }
1119
1120 /*
1121 * _bt_next() -- Get the next item in a scan.
1122 *
1123 * On entry, so->currPos describes the current page, which may be pinned
1124 * but is not locked, and so->currPos.itemIndex identifies which item was
1125 * previously returned.
1126 *
1127 * On successful exit, scan->xs_ctup.t_self is set to the TID of the
1128 * next heap tuple, and if requested, scan->xs_itup points to a copy of
1129 * the index tuple. so->currPos is updated as needed.
1130 *
1131 * On failure exit (no more tuples), we release pin and set
1132 * so->currPos.buf to InvalidBuffer.
1133 */
1134 bool
1135 _bt_next(IndexScanDesc scan, ScanDirection dir)
1136 {
1137 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1138 BTScanPosItem *currItem;
1139
1140 /*
1141 * Advance to next tuple on current page; or if there's no more, try to
1142 * step to the next page with data.
1143 */
1144 if (ScanDirectionIsForward(dir))
1145 {
1146 if (++so->currPos.itemIndex > so->currPos.lastItem)
1147 {
1148 if (!_bt_steppage(scan, dir))
1149 return false;
1150 }
1151 }
1152 else
1153 {
1154 if (--so->currPos.itemIndex < so->currPos.firstItem)
1155 {
1156 if (!_bt_steppage(scan, dir))
1157 return false;
1158 }
1159 }
1160
1161 /* OK, itemIndex says what to return */
1162 currItem = &so->currPos.items[so->currPos.itemIndex];
1163 scan->xs_ctup.t_self = currItem->heapTid;
1164 if (scan->xs_want_itup)
1165 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1166
1167 return true;
1168 }
1169
1170 /*
1171 * _bt_readpage() -- Load data from current index page into so->currPos
1172 *
1173 * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
1174 * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
1175 * they are updated as appropriate. All other fields of so->currPos are
1176 * initialized from scratch here.
1177 *
1178 * We scan the current page starting at offnum and moving in the indicated
1179 * direction. All items matching the scan keys are loaded into currPos.items.
1180 * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
1181 * that there can be no more matching tuples in the current scan direction.
1182 *
1183 * In the case of a parallel scan, caller must have called _bt_parallel_seize
1184 * prior to calling this function; this function will invoke
1185 * _bt_parallel_release before returning.
1186 *
1187 * Returns true if any matching items found on the page, false if none.
1188 */
1189 static bool
1190 _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
1191 {
1192 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1193 Page page;
1194 BTPageOpaque opaque;
1195 OffsetNumber minoff;
1196 OffsetNumber maxoff;
1197 int itemIndex;
1198 IndexTuple itup;
1199 bool continuescan;
1200
1201 /*
1202 * We must have the buffer pinned and locked, but the usual macro can't be
1203 * used here; this function is what makes it good for currPos.
1204 */
1205 Assert(BufferIsValid(so->currPos.buf));
1206
1207 page = BufferGetPage(so->currPos.buf);
1208 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1209
1210 /* allow next page be processed by parallel worker */
1211 if (scan->parallel_scan)
1212 {
1213 if (ScanDirectionIsForward(dir))
1214 _bt_parallel_release(scan, opaque->btpo_next);
1215 else
1216 _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
1217 }
1218
1219 minoff = P_FIRSTDATAKEY(opaque);
1220 maxoff = PageGetMaxOffsetNumber(page);
1221
1222 /*
1223 * We note the buffer's block number so that we can release the pin later.
1224 * This allows us to re-read the buffer if it is needed again for hinting.
1225 */
1226 so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
1227
1228 /*
1229 * We save the LSN of the page as we read it, so that we know whether it
1230 * safe to apply LP_DEAD hints to the page later. This allows us to drop
1231 * the pin for MVCC scans, which allows vacuum to avoid blocking.
1232 */
1233 so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
1234
1235 /*
1236 * we must save the page's right-link while scanning it; this tells us
1237 * where to step right to after we're done with these items. There is no
1238 * corresponding need for the left-link, since splits always go right.
1239 */
1240 so->currPos.nextPage = opaque->btpo_next;
1241
1242 /* initialize tuple workspace to empty */
1243 so->currPos.nextTupleOffset = 0;
1244
1245 /*
1246 * Now that the current page has been made consistent, the macro should be
1247 * good.
1248 */
1249 Assert(BTScanPosIsPinned(so->currPos));
1250
1251 if (ScanDirectionIsForward(dir))
1252 {
1253 /* load items[] in ascending order */
1254 itemIndex = 0;
1255
1256 offnum = Max(offnum, minoff);
1257
1258 while (offnum <= maxoff)
1259 {
1260 itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
1261 if (itup != NULL)
1262 {
1263 /* tuple passes all scan key conditions, so remember it */
1264 _bt_saveitem(so, itemIndex, offnum, itup);
1265 itemIndex++;
1266 }
1267 if (!continuescan)
1268 {
1269 /* there can't be any more matches, so stop */
1270 so->currPos.moreRight = false;
1271 break;
1272 }
1273
1274 offnum = OffsetNumberNext(offnum);
1275 }
1276
1277 Assert(itemIndex <= MaxIndexTuplesPerPage);
1278 so->currPos.firstItem = 0;
1279 so->currPos.lastItem = itemIndex - 1;
1280 so->currPos.itemIndex = 0;
1281 }
1282 else
1283 {
1284 /* load items[] in descending order */
1285 itemIndex = MaxIndexTuplesPerPage;
1286
1287 offnum = Min(offnum, maxoff);
1288
1289 while (offnum >= minoff)
1290 {
1291 itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
1292 if (itup != NULL)
1293 {
1294 /* tuple passes all scan key conditions, so remember it */
1295 itemIndex--;
1296 _bt_saveitem(so, itemIndex, offnum, itup);
1297 }
1298 if (!continuescan)
1299 {
1300 /* there can't be any more matches, so stop */
1301 so->currPos.moreLeft = false;
1302 break;
1303 }
1304
1305 offnum = OffsetNumberPrev(offnum);
1306 }
1307
1308 Assert(itemIndex >= 0);
1309 so->currPos.firstItem = itemIndex;
1310 so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
1311 so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
1312 }
1313
1314 return (so->currPos.firstItem <= so->currPos.lastItem);
1315 }
1316
1317 /* Save an index item into so->currPos.items[itemIndex] */
1318 static void
1319 _bt_saveitem(BTScanOpaque so, int itemIndex,
1320 OffsetNumber offnum, IndexTuple itup)
1321 {
1322 BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1323
1324 currItem->heapTid = itup->t_tid;
1325 currItem->indexOffset = offnum;
1326 if (so->currTuples)
1327 {
1328 Size itupsz = IndexTupleSize(itup);
1329
1330 currItem->tupleOffset = so->currPos.nextTupleOffset;
1331 memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1332 so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1333 }
1334 }
1335
1336 /*
1337 * _bt_steppage() -- Step to next page containing valid data for scan
1338 *
1339 * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
1340 * if pinned, we'll drop the pin before moving to next page. The buffer is
1341 * not locked on entry.
1342 *
1343 * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
1344 * read lock, on that page. If we do not hold the pin, we set so->currPos.buf
1345 * to InvalidBuffer. We return true to indicate success.
1346 */
1347 static bool
1348 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
1349 {
1350 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1351 BlockNumber blkno = InvalidBlockNumber;
1352 bool status = true;
1353
1354 Assert(BTScanPosIsValid(so->currPos));
1355
1356 /* Before leaving current page, deal with any killed items */
1357 if (so->numKilled > 0)
1358 _bt_killitems(scan);
1359
1360 /*
1361 * Before we modify currPos, make a copy of the page data if there was a
1362 * mark position that needs it.
1363 */
1364 if (so->markItemIndex >= 0)
1365 {
1366 /* bump pin on current buffer for assignment to mark buffer */
1367 if (BTScanPosIsPinned(so->currPos))
1368 IncrBufferRefCount(so->currPos.buf);
1369 memcpy(&so->markPos, &so->currPos,
1370 offsetof(BTScanPosData, items[1]) +
1371 so->currPos.lastItem * sizeof(BTScanPosItem));
1372 if (so->markTuples)
1373 memcpy(so->markTuples, so->currTuples,
1374 so->currPos.nextTupleOffset);
1375 so->markPos.itemIndex = so->markItemIndex;
1376 so->markItemIndex = -1;
1377 }
1378
1379 if (ScanDirectionIsForward(dir))
1380 {
1381 /* Walk right to the next page with data */
1382 if (scan->parallel_scan != NULL)
1383 {
1384 /*
1385 * Seize the scan to get the next block number; if the scan has
1386 * ended already, bail out.
1387 */
1388 status = _bt_parallel_seize(scan, &blkno);
1389 if (!status)
1390 {
1391 /* release the previous buffer, if pinned */
1392 BTScanPosUnpinIfPinned(so->currPos);
1393 BTScanPosInvalidate(so->currPos);
1394 return false;
1395 }
1396 }
1397 else
1398 {
1399 /* Not parallel, so use the previously-saved nextPage link. */
1400 blkno = so->currPos.nextPage;
1401 }
1402
1403 /* Remember we left a page with data */
1404 so->currPos.moreLeft = true;
1405
1406 /* release the previous buffer, if pinned */
1407 BTScanPosUnpinIfPinned(so->currPos);
1408 }
1409 else
1410 {
1411 /* Remember we left a page with data */
1412 so->currPos.moreRight = true;
1413
1414 if (scan->parallel_scan != NULL)
1415 {
1416 /*
1417 * Seize the scan to get the current block number; if the scan has
1418 * ended already, bail out.
1419 */
1420 status = _bt_parallel_seize(scan, &blkno);
1421 BTScanPosUnpinIfPinned(so->currPos);
1422 if (!status)
1423 {
1424 BTScanPosInvalidate(so->currPos);
1425 return false;
1426 }
1427 }
1428 else
1429 {
1430 /* Not parallel, so just use our own notion of the current page */
1431 blkno = so->currPos.currPage;
1432 }
1433 }
1434
1435 if (!_bt_readnextpage(scan, blkno, dir))
1436 return false;
1437
1438 /* Drop the lock, and maybe the pin, on the current page */
1439 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1440
1441 return true;
1442 }
1443
1444 /*
1445 * _bt_readnextpage() -- Read next page containing valid data for scan
1446 *
1447 * On success exit, so->currPos is updated to contain data from the next
1448 * interesting page. Caller is responsible to release lock and pin on
1449 * buffer on success. We return true to indicate success.
1450 *
1451 * If there are no more matching records in the given direction, we drop all
1452 * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
1453 */
1454 static bool
1455 _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
1456 {
1457 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1458 Relation rel;
1459 Page page;
1460 BTPageOpaque opaque;
1461 bool status = true;
1462
1463 rel = scan->indexRelation;
1464
1465 if (ScanDirectionIsForward(dir))
1466 {
1467 for (;;)
1468 {
1469 /*
1470 * if we're at end of scan, give up and mark parallel scan as
1471 * done, so that all the workers can finish their scan
1472 */
1473 if (blkno == P_NONE || !so->currPos.moreRight)
1474 {
1475 _bt_parallel_done(scan);
1476 BTScanPosInvalidate(so->currPos);
1477 return false;
1478 }
1479 /* check for interrupts while we're not holding any buffer lock */
1480 CHECK_FOR_INTERRUPTS();
1481 /* step right one page */
1482 so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1483 page = BufferGetPage(so->currPos.buf);
1484 TestForOldSnapshot(scan->xs_snapshot, rel, page);
1485 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1486 /* check for deleted page */
1487 if (!P_IGNORE(opaque))
1488 {
1489 PredicateLockPage(rel, blkno, scan->xs_snapshot);
1490 /* see if there are any matches on this page */
1491 /* note that this will clear moreRight if we can stop */
1492 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
1493 break;
1494 }
1495 else if (scan->parallel_scan != NULL)
1496 {
1497 /* allow next page be processed by parallel worker */
1498 _bt_parallel_release(scan, opaque->btpo_next);
1499 }
1500
1501 /* nope, keep going */
1502 if (scan->parallel_scan != NULL)
1503 {
1504 _bt_relbuf(rel, so->currPos.buf);
1505 status = _bt_parallel_seize(scan, &blkno);
1506 if (!status)
1507 {
1508 BTScanPosInvalidate(so->currPos);
1509 return false;
1510 }
1511 }
1512 else
1513 {
1514 blkno = opaque->btpo_next;
1515 _bt_relbuf(rel, so->currPos.buf);
1516 }
1517 }
1518 }
1519 else
1520 {
1521 /*
1522 * Should only happen in parallel cases, when some other backend
1523 * advanced the scan.
1524 */
1525 if (so->currPos.currPage != blkno)
1526 {
1527 BTScanPosUnpinIfPinned(so->currPos);
1528 so->currPos.currPage = blkno;
1529 }
1530
1531 /*
1532 * Walk left to the next page with data. This is much more complex
1533 * than the walk-right case because of the possibility that the page
1534 * to our left splits while we are in flight to it, plus the
1535 * possibility that the page we were on gets deleted after we leave
1536 * it. See nbtree/README for details.
1537 *
1538 * It might be possible to rearrange this code to have less overhead
1539 * in pinning and locking, but that would require capturing the left
1540 * pointer when the page is initially read, and using it here, along
1541 * with big changes to _bt_walk_left() and the code below. It is not
1542 * clear whether this would be a win, since if the page immediately to
1543 * the left splits after we read this page and before we step left, we
1544 * would need to visit more pages than with the current code.
1545 *
1546 * Note that if we change the code so that we drop the pin for a scan
1547 * which uses a non-MVCC snapshot, we will need to modify the code for
1548 * walking left, to allow for the possibility that a referenced page
1549 * has been deleted. As long as the buffer is pinned or the snapshot
1550 * is MVCC the page cannot move past the half-dead state to fully
1551 * deleted.
1552 */
1553 if (BTScanPosIsPinned(so->currPos))
1554 LockBuffer(so->currPos.buf, BT_READ);
1555 else
1556 so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
1557
1558 for (;;)
1559 {
1560 /* Done if we know there are no matching keys to the left */
1561 if (!so->currPos.moreLeft)
1562 {
1563 _bt_relbuf(rel, so->currPos.buf);
1564 _bt_parallel_done(scan);
1565 BTScanPosInvalidate(so->currPos);
1566 return false;
1567 }
1568
1569 /* Step to next physical page */
1570 so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
1571 scan->xs_snapshot);
1572
1573 /* if we're physically at end of index, return failure */
1574 if (so->currPos.buf == InvalidBuffer)
1575 {
1576 _bt_parallel_done(scan);
1577 BTScanPosInvalidate(so->currPos);
1578 return false;
1579 }
1580
1581 /*
1582 * Okay, we managed to move left to a non-deleted page. Done if
1583 * it's not half-dead and contains matching tuples. Else loop back
1584 * and do it all again.
1585 */
1586 page = BufferGetPage(so->currPos.buf);
1587 TestForOldSnapshot(scan->xs_snapshot, rel, page);
1588 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1589 if (!P_IGNORE(opaque))
1590 {
1591 PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
1592 /* see if there are any matches on this page */
1593 /* note that this will clear moreLeft if we can stop */
1594 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
1595 break;
1596 }
1597 else if (scan->parallel_scan != NULL)
1598 {
1599 /* allow next page be processed by parallel worker */
1600 _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
1601 }
1602
1603 /*
1604 * For parallel scans, get the last page scanned as it is quite
1605 * possible that by the time we try to seize the scan, some other
1606 * worker has already advanced the scan to a different page. We
1607 * must continue based on the latest page scanned by any worker.
1608 */
1609 if (scan->parallel_scan != NULL)
1610 {
1611 _bt_relbuf(rel, so->currPos.buf);
1612 status = _bt_parallel_seize(scan, &blkno);
1613 if (!status)
1614 {
1615 BTScanPosInvalidate(so->currPos);
1616 return false;
1617 }
1618 so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1619 }
1620 }
1621 }
1622
1623 return true;
1624 }
1625
1626 /*
1627 * _bt_parallel_readpage() -- Read current page containing valid data for scan
1628 *
1629 * On success, release lock and maybe pin on buffer. We return true to
1630 * indicate success.
1631 */
1632 static bool
1633 _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
1634 {
1635 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1636
1637 _bt_initialize_more_data(so, dir);
1638
1639 if (!_bt_readnextpage(scan, blkno, dir))
1640 return false;
1641
1642 /* Drop the lock, and maybe the pin, on the current page */
1643 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1644
1645 return true;
1646 }
1647
1648 /*
1649 * _bt_walk_left() -- step left one page, if possible
1650 *
1651 * The given buffer must be pinned and read-locked. This will be dropped
1652 * before stepping left. On return, we have pin and read lock on the
1653 * returned page, instead.
1654 *
1655 * Returns InvalidBuffer if there is no page to the left (no lock is held
1656 * in that case).
1657 *
1658 * When working on a non-leaf level, it is possible for the returned page
1659 * to be half-dead; the caller should check that condition and step left
1660 * again if it's important.
1661 */
1662 static Buffer
1663 _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
1664 {
1665 Page page;
1666 BTPageOpaque opaque;
1667
1668 page = BufferGetPage(buf);
1669 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1670
1671 for (;;)
1672 {
1673 BlockNumber obknum;
1674 BlockNumber lblkno;
1675 BlockNumber blkno;
1676 int tries;
1677
1678 /* if we're at end of tree, release buf and return failure */
1679 if (P_LEFTMOST(opaque))
1680 {
1681 _bt_relbuf(rel, buf);
1682 break;
1683 }
1684 /* remember original page we are stepping left from */
1685 obknum = BufferGetBlockNumber(buf);
1686 /* step left */
1687 blkno = lblkno = opaque->btpo_prev;
1688 _bt_relbuf(rel, buf);
1689 /* check for interrupts while we're not holding any buffer lock */
1690 CHECK_FOR_INTERRUPTS();
1691 buf = _bt_getbuf(rel, blkno, BT_READ);
1692 page = BufferGetPage(buf);
1693 TestForOldSnapshot(snapshot, rel, page);
1694 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1695
1696 /*
1697 * If this isn't the page we want, walk right till we find what we
1698 * want --- but go no more than four hops (an arbitrary limit). If we
1699 * don't find the correct page by then, the most likely bet is that
1700 * the original page got deleted and isn't in the sibling chain at all
1701 * anymore, not that its left sibling got split more than four times.
1702 *
1703 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
1704 * because half-dead pages are still in the sibling chain. Caller
1705 * must reject half-dead pages if wanted.
1706 */
1707 tries = 0;
1708 for (;;)
1709 {
1710 if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
1711 {
1712 /* Found desired page, return it */
1713 return buf;
1714 }
1715 if (P_RIGHTMOST(opaque) || ++tries > 4)
1716 break;
1717 blkno = opaque->btpo_next;
1718 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1719 page = BufferGetPage(buf);
1720 TestForOldSnapshot(snapshot, rel, page);
1721 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1722 }
1723
1724 /* Return to the original page to see what's up */
1725 buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
1726 page = BufferGetPage(buf);
1727 TestForOldSnapshot(snapshot, rel, page);
1728 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1729 if (P_ISDELETED(opaque))
1730 {
1731 /*
1732 * It was deleted. Move right to first nondeleted page (there
1733 * must be one); that is the page that has acquired the deleted
1734 * one's keyspace, so stepping left from it will take us where we
1735 * want to be.
1736 */
1737 for (;;)
1738 {
1739 if (P_RIGHTMOST(opaque))
1740 elog(ERROR, "fell off the end of index \"%s\"",
1741 RelationGetRelationName(rel));
1742 blkno = opaque->btpo_next;
1743 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1744 page = BufferGetPage(buf);
1745 TestForOldSnapshot(snapshot, rel, page);
1746 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1747 if (!P_ISDELETED(opaque))
1748 break;
1749 }
1750
1751 /*
1752 * Now return to top of loop, resetting obknum to point to this
1753 * nondeleted page, and try again.
1754 */
1755 }
1756 else
1757 {
1758 /*
1759 * It wasn't deleted; the explanation had better be that the page
1760 * to the left got split or deleted. Without this check, we'd go
1761 * into an infinite loop if there's anything wrong.
1762 */
1763 if (opaque->btpo_prev == lblkno)
1764 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
1765 obknum, RelationGetRelationName(rel));
1766 /* Okay to try again with new lblkno value */
1767 }
1768 }
1769
1770 return InvalidBuffer;
1771 }
1772
1773 /*
1774 * _bt_get_endpoint() -- Find the first or last page on a given tree level
1775 *
1776 * If the index is empty, we will return InvalidBuffer; any other failure
1777 * condition causes ereport(). We will not return a dead page.
1778 *
1779 * The returned buffer is pinned and read-locked.
1780 */
1781 Buffer
1782 _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
1783 Snapshot snapshot)
1784 {
1785 Buffer buf;
1786 Page page;
1787 BTPageOpaque opaque;
1788 OffsetNumber offnum;
1789 BlockNumber blkno;
1790 IndexTuple itup;
1791
1792 /*
1793 * If we are looking for a leaf page, okay to descend from fast root;
1794 * otherwise better descend from true root. (There is no point in being
1795 * smarter about intermediate levels.)
1796 */
1797 if (level == 0)
1798 buf = _bt_getroot(rel, BT_READ);
1799 else
1800 buf = _bt_gettrueroot(rel);
1801
1802 if (!BufferIsValid(buf))
1803 return InvalidBuffer;
1804
1805 page = BufferGetPage(buf);
1806 TestForOldSnapshot(snapshot, rel, page);
1807 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1808
1809 for (;;)
1810 {
1811 /*
1812 * If we landed on a deleted page, step right to find a live page
1813 * (there must be one). Also, if we want the rightmost page, step
1814 * right if needed to get to it (this could happen if the page split
1815 * since we obtained a pointer to it).
1816 */
1817 while (P_IGNORE(opaque) ||
1818 (rightmost && !P_RIGHTMOST(opaque)))
1819 {
1820 blkno = opaque->btpo_next;
1821 if (blkno == P_NONE)
1822 elog(ERROR, "fell off the end of index \"%s\"",
1823 RelationGetRelationName(rel));
1824 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1825 page = BufferGetPage(buf);
1826 TestForOldSnapshot(snapshot, rel, page);
1827 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1828 }
1829
1830 /* Done? */
1831 if (opaque->btpo.level == level)
1832 break;
1833 if (opaque->btpo.level < level)
1834 elog(ERROR, "btree level %u not found in index \"%s\"",
1835 level, RelationGetRelationName(rel));
1836
1837 /* Descend to leftmost or rightmost child page */
1838 if (rightmost)
1839 offnum = PageGetMaxOffsetNumber(page);
1840 else
1841 offnum = P_FIRSTDATAKEY(opaque);
1842
1843 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
1844 blkno = BTreeInnerTupleGetDownLink(itup);
1845
1846 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1847 page = BufferGetPage(buf);
1848 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1849 }
1850
1851 return buf;
1852 }
1853
1854 /*
1855 * _bt_endpoint() -- Find the first or last page in the index, and scan
1856 * from there to the first key satisfying all the quals.
1857 *
1858 * This is used by _bt_first() to set up a scan when we've determined
1859 * that the scan must start at the beginning or end of the index (for
1860 * a forward or backward scan respectively). Exit conditions are the
1861 * same as for _bt_first().
1862 */
1863 static bool
1864 _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
1865 {
1866 Relation rel = scan->indexRelation;
1867 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1868 Buffer buf;
1869 Page page;
1870 BTPageOpaque opaque;
1871 OffsetNumber start;
1872 BTScanPosItem *currItem;
1873
1874 /*
1875 * Scan down to the leftmost or rightmost leaf page. This is a simplified
1876 * version of _bt_search(). We don't maintain a stack since we know we
1877 * won't need it.
1878 */
1879 buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);
1880
1881 if (!BufferIsValid(buf))
1882 {
1883 /*
1884 * Empty index. Lock the whole relation, as nothing finer to lock
1885 * exists.
1886 */
1887 PredicateLockRelation(rel, scan->xs_snapshot);
1888 BTScanPosInvalidate(so->currPos);
1889 return false;
1890 }
1891
1892 PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
1893 page = BufferGetPage(buf);
1894 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1895 Assert(P_ISLEAF(opaque));
1896
1897 if (ScanDirectionIsForward(dir))
1898 {
1899 /* There could be dead pages to the left, so not this: */
1900 /* Assert(P_LEFTMOST(opaque)); */
1901
1902 start = P_FIRSTDATAKEY(opaque);
1903 }
1904 else if (ScanDirectionIsBackward(dir))
1905 {
1906 Assert(P_RIGHTMOST(opaque));
1907
1908 start = PageGetMaxOffsetNumber(page);
1909 }
1910 else
1911 {
1912 elog(ERROR, "invalid scan direction: %d", (int) dir);
1913 start = 0; /* keep compiler quiet */
1914 }
1915
1916 /* remember which buffer we have pinned */
1917 so->currPos.buf = buf;
1918
1919 _bt_initialize_more_data(so, dir);
1920
1921 /*
1922 * Now load data from the first page of the scan.
1923 */
1924 if (!_bt_readpage(scan, dir, start))
1925 {
1926 /*
1927 * There's no actually-matching data on this page. Try to advance to
1928 * the next page. Return false if there's no matching data at all.
1929 */
1930 LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1931 if (!_bt_steppage(scan, dir))
1932 return false;
1933 }
1934 else
1935 {
1936 /* Drop the lock, and maybe the pin, on the current page */
1937 _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1938 }
1939
1940 /* OK, itemIndex says what to return */
1941 currItem = &so->currPos.items[so->currPos.itemIndex];
1942 scan->xs_ctup.t_self = currItem->heapTid;
1943 if (scan->xs_want_itup)
1944 scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1945
1946 return true;
1947 }
1948
1949 /*
1950 * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
1951 * for scan direction
1952 */
1953 static inline void
1954 _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
1955 {
1956 /* initialize moreLeft/moreRight appropriately for scan direction */
1957 if (ScanDirectionIsForward(dir))
1958 {
1959 so->currPos.moreLeft = false;
1960 so->currPos.moreRight = true;
1961 }
1962 else
1963 {
1964 so->currPos.moreLeft = true;
1965 so->currPos.moreRight = false;
1966 }
1967 so->numKilled = 0; /* just paranoia */
1968 so->markItemIndex = -1; /* ditto */
1969 }
1970