/*-------------------------------------------------------------------------
 *
 * nbtsearch.c
 *	  Search code for postgres btrees.
 *
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtsearch.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/nbtree.h"
#include "access/relscan.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"


static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
static int	_bt_binsrch_posting(BTScanInsert key, Page page,
								OffsetNumber offnum);
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
						 OffsetNumber offnum);
static void _bt_saveitem(BTScanOpaque so, int itemIndex,
						 OffsetNumber offnum, IndexTuple itup);
static int	_bt_setuppostingitems(BTScanOpaque so, int itemIndex,
								  OffsetNumber offnum, ItemPointer heapTid,
								  IndexTuple itup);
static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
									   OffsetNumber offnum,
									   ItemPointer heapTid, int tupleOffset);
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
							 ScanDirection dir);
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
								  ScanDirection dir);
static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);

/*
 *	_bt_drop_lock_and_maybe_pin()
 *
 * Unlock the buffer; and if it is safe to release the pin, do that, too.  It
 * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
 * This will prevent vacuum from stalling in a blocked state trying to read a
 * page when a cursor is sitting on it -- at least in many important cases.
 *
 * Set the buffer to invalid if the pin is released, since the buffer may be
 * re-used.  If we need to go back to this block (for example, to apply
 * LP_DEAD hints) we must get a fresh reference to the buffer.  Hopefully it
 * will remain in shared memory for as long as it takes to scan the index
 * buffer page.
 */
static void
_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
{
	_bt_unlockbuf(scan->indexRelation, sp->buf);

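	/*
	 * Note that the pin is deliberately kept when the scan is index-only
	 * (xs_want_itup): dropping it could allow concurrent VACUUM to remove
	 * heap TIDs and set visibility-map bits that the scan still needs to
	 * rely on.
	 */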
	if (IsMVCCSnapshot(scan->xs_snapshot) &&
		RelationNeedsWAL(scan->indexRelation) &&
		!scan->xs_want_itup)
	{
		ReleaseBuffer(sp->buf);
		sp->buf = InvalidBuffer;
	}
}

/*
 *	_bt_search() -- Search the tree for a particular scankey,
 *		or more precisely for the first leaf page it could be on.
 *
 * The passed scankey is an insertion-type scankey (see nbtree/README),
 * but it can omit the rightmost column(s) of the index.
 *
 * Return value is a stack of parent-page pointers (i.e. there is no entry for
 * the leaf level/page).  *bufP is set to the address of the leaf-page buffer,
 * which is locked and pinned.  No locks are held on the parent pages,
 * however!
 *
 * If the snapshot parameter is not NULL, "old snapshot" checking will take
 * place during the descent through the tree.  This is not needed when
 * positioning for an insert or delete, so NULL is used for those cases.
 *
 * The returned buffer is locked according to the access parameter.
 * Additionally, access = BT_WRITE will allow an empty root page to be created
 * and returned.  When access = BT_READ, an empty index will result in *bufP
 * being set to InvalidBuffer.  Also, in BT_WRITE mode, any incomplete splits
 * encountered during the search will be finished.
 */
BTStack
_bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
		   Snapshot snapshot)
{
	BTStack		stack_in = NULL;
	int			page_access = BT_READ;

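	/*
	 * Internal pages are always descended with a read lock, even for
	 * BT_WRITE callers; page_access is upgraded to BT_WRITE only for the
	 * leaf level (see the level-1 check in the loop below).
	 */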
	/* Get the root page to start with */
	*bufP = _bt_getroot(rel, access);

	/* If index is empty and access = BT_READ, no root page is created. */
	if (!BufferIsValid(*bufP))
		return (BTStack) NULL;

	/* Loop iterates once per level descended in the tree */
	for (;;)
	{
		Page		page;
		BTPageOpaque opaque;
		OffsetNumber offnum;
		ItemId		itemid;
		IndexTuple	itup;
		BlockNumber child;
		BTStack		new_stack;

		/*
		 * Race -- the page we just grabbed may have split since we read its
		 * downlink in its parent page (or the metapage).  If it has, we may
		 * need to move right to its new sibling.  Do that.
		 *
		 * In write-mode, allow _bt_moveright to finish any incomplete splits
		 * along the way.  Strictly speaking, we'd only need to finish an
		 * incomplete split on the leaf page we're about to insert to, not on
		 * any of the upper levels (internal pages with incomplete splits are
		 * also taken care of in _bt_getstackbuf).  But this is a good
		 * opportunity to finish splits of internal pages too.
		 */
		*bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in,
							  page_access, snapshot);

		/* if this is a leaf page, we're done */
		page = BufferGetPage(*bufP);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (P_ISLEAF(opaque))
			break;

		/*
		 * Find the appropriate pivot tuple on this page.  Its downlink points
		 * to the child page that we're about to descend to.
		 */
		offnum = _bt_binsrch(rel, key, *bufP);
		itemid = PageGetItemId(page, offnum);
		itup = (IndexTuple) PageGetItem(page, itemid);
		Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
		child = BTreeTupleGetDownLink(itup);

		/*
		 * We need to save the location of the pivot tuple we chose in a new
		 * stack entry for this page/level.  If caller ends up splitting a
		 * page one level down, it usually ends up inserting a new pivot
		 * tuple/downlink immediately after the location recorded here.
		 */
		new_stack = (BTStack) palloc(sizeof(BTStackData));
		new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
		new_stack->bts_offset = offnum;
		new_stack->bts_parent = stack_in;

		/*
		 * Page level 1 is the lowest non-leaf level, just above the leaves.
		 * So if we're on level 1 and were asked to lock the leaf page in
		 * write mode, lock the next page in write mode, because it must be
		 * a leaf.
		 */
		if (opaque->btpo_level == 1 && access == BT_WRITE)
			page_access = BT_WRITE;

		/* drop the read lock on the page, then acquire one on its child */
		*bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);

		/* okay, all set to move down a level */
		stack_in = new_stack;
	}

	/*
	 * If we were asked to lock the leaf in write mode, but didn't manage to,
	 * then relock it.  This should only happen when the root page is a leaf
	 * page (and the only page in the index other than the metapage).
	 */
	if (access == BT_WRITE && page_access == BT_READ)
	{
		/* trade in our read lock for a write lock */
		_bt_unlockbuf(rel, *bufP);
		_bt_lockbuf(rel, *bufP, BT_WRITE);

		/*
		 * Race -- the leaf page may have split after we dropped the read lock
		 * but before we acquired a write lock.  If it has, we may need to
		 * move right to its new sibling.  Do that.
		 */
		*bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
							  snapshot);
	}

	return stack_in;
}

/*
 *	_bt_moveright() -- move right in the btree if necessary.
 *
 * When we follow a pointer to reach a page, it is possible that
 * the page has changed in the meanwhile.  If this happens, we're
 * guaranteed that the page has "split right" -- that is, that any
 * data that appeared on the page originally is either on the page
 * or strictly to the right of it.
 *
 * This routine decides whether or not we need to move right in the
 * tree by examining the high key entry on the page.  If that entry is
 * strictly less than the scankey, or <= the scankey in the
 * key.nextkey=true case, then we followed the wrong link and we need
 * to move right.
 *
 * The passed insertion-type scankey can omit the rightmost column(s) of the
 * index (see nbtree/README).
 *
 * When key.nextkey is false (the usual case), we are looking for the first
 * item >= key.  When key.nextkey is true, we are looking for the first item
 * strictly greater than key.
 *
 * If forupdate is true, we will attempt to finish any incomplete splits
 * that we encounter.  This is required when locking a target page for an
 * insertion, because we don't allow inserting on a page before the split
 * is completed.  'stack' is only used if forupdate is true.
 *
 * On entry, we have the buffer pinned and a lock of the type specified by
 * 'access'.  If we move right, we release the buffer and lock and acquire
 * the same on the right sibling.  Return value is the buffer we stop at.
 *
 * If the snapshot parameter is not NULL, "old snapshot" checking will take
 * place during the descent through the tree.  This is not needed when
 * positioning for an insert or delete, so NULL is used for those cases.
 */
Buffer
_bt_moveright(Relation rel,
			  BTScanInsert key,
			  Buffer buf,
			  bool forupdate,
			  BTStack stack,
			  int access,
			  Snapshot snapshot)
{
	Page		page;
	BTPageOpaque opaque;
	int32		cmpval;

	/*
	 * When nextkey = false (normal case): if the scan key that brought us to
	 * this page is > the high key stored on the page, then the page has split
	 * and we need to move right.  (pg_upgrade'd !heapkeyspace indexes could
	 * have some duplicates to the right as well as the left, but that's
	 * something that's only ever dealt with on the leaf level, after
	 * _bt_search has found an initial leaf page.)
	 *
	 * When nextkey = true: move right if the scan key is >= page's high key.
	 * (Note that key.scantid cannot be set in this case.)
	 *
	 * The page could even have split more than once, so scan as far as
	 * needed.
	 *
	 * We also have to move right if we followed a link that brought us to a
	 * dead page.
	 */
	cmpval = key->nextkey ? 0 : 1;
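	/* i.e. with cmpval = 1 we step right when scankey > high key; with 0, when >= */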

	for (;;)
	{
		page = BufferGetPage(buf);
		TestForOldSnapshot(snapshot, rel, page);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		if (P_RIGHTMOST(opaque))
			break;

		/*
		 * Finish any incomplete splits we encounter along the way.
		 */
		if (forupdate && P_INCOMPLETE_SPLIT(opaque))
		{
			BlockNumber blkno = BufferGetBlockNumber(buf);

			/* upgrade our lock if necessary */
			if (access == BT_READ)
			{
				_bt_unlockbuf(rel, buf);
				_bt_lockbuf(rel, buf, BT_WRITE);
			}

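			/*
			 * Re-check the flag: someone else may have finished the split
			 * while we were waiting for the write lock.  Either way we no
			 * longer hold the buffer afterwards (finishing the split
			 * releases it), so it must be re-acquired below.
			 */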
			if (P_INCOMPLETE_SPLIT(opaque))
				_bt_finish_split(rel, buf, stack);
			else
				_bt_relbuf(rel, buf);

			/* re-acquire the lock in the right mode, and re-check */
			buf = _bt_getbuf(rel, blkno, access);
			continue;
		}

		if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
		{
			/* step right one page */
			buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
			continue;
		}
		else
			break;
	}

	if (P_IGNORE(opaque))
		elog(ERROR, "fell off the end of index \"%s\"",
			 RelationGetRelationName(rel));

	return buf;
}

/*
 *	_bt_binsrch() -- Do a binary search for a key on a particular page.
 *
 * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
 * key >= given scankey, or > scankey if nextkey is true.  (NOTE: in
 * particular, this means it is possible to return a value 1 greater than the
 * number of keys on the page, if the scankey is > all keys on the page.)
 *
 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
 * of the last key < given scankey, or last key <= given scankey if nextkey
 * is true.  (Since _bt_compare treats the first data key of such a page as
 * minus infinity, there will be at least one key < scankey, so the result
 * always points at one of the keys on the page.)  This key indicates the
 * right place to descend to be sure we find all leaf keys >= given scankey
 * (or leaf keys > given scankey when nextkey is true).
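 *
 * For example, with nextkey=false, searching a leaf page holding keys
 * (2, 4, 4, 6) for 4 returns the offset of the first 4; searching for 5
 * returns the offset of 6; and searching for 7 returns one more than the
 * offset of the last key.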
 *
 * This procedure is not responsible for walking right, it just examines
 * the given page.  _bt_binsrch() has no lock or refcount side effects
 * on the buffer.
 */
static OffsetNumber
_bt_binsrch(Relation rel,
			BTScanInsert key,
			Buffer buf)
{
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber low,
				high;
	int32		result,
				cmpval;

	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/* Requesting nextkey semantics while using scantid seems nonsensical */
	Assert(!key->nextkey || key->scantid == NULL);
	/* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
	Assert(!P_ISLEAF(opaque) || key->scantid == NULL);

	low = P_FIRSTDATAKEY(opaque);
	high = PageGetMaxOffsetNumber(page);

	/*
	 * If there are no keys on the page, return the first available slot. Note
	 * this covers two cases: the page is really empty (no keys), or it
	 * contains only a high key.  The latter case is possible after vacuuming.
	 * This can never happen on an internal page, however, since they are
	 * never empty (an internal page must have children).
	 */
	if (unlikely(high < low))
		return low;

	/*
	 * Binary search to find the first key on the page >= scan key, or first
	 * key > scankey when nextkey is true.
	 *
	 * For nextkey=false (cmpval=1), the loop invariant is: all slots before
	 * 'low' are < scan key, all slots at or after 'high' are >= scan key.
	 *
	 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
	 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
	 *
	 * We can fall out when high == low.
	 */
	high++;						/* establish the loop invariant for high */

	cmpval = key->nextkey ? 0 : 1;	/* select comparison value */

	while (high > low)
	{
		OffsetNumber mid = low + ((high - low) / 2);

		/* We have low <= mid < high, so mid points at a real slot */

		result = _bt_compare(rel, key, page, mid);

		if (result >= cmpval)
			low = mid + 1;
		else
			high = mid;
	}

	/*
	 * At this point we have high == low, but be careful: they could point
	 * past the last slot on the page.
	 *
	 * On a leaf page, we always return the first key >= scan key (resp. >
	 * scan key), which could be the last slot + 1.
	 */
	if (P_ISLEAF(opaque))
		return low;

	/*
	 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
	 * There must be one if _bt_compare() is playing by the rules.
	 */
	Assert(low > P_FIRSTDATAKEY(opaque));

	return OffsetNumberPrev(low);
}

/*
 *	_bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
 *
 * Like _bt_binsrch(), but with support for caching the binary search
 * bounds.  Only used during insertion, and only on the leaf page that it
 * looks like the caller will insert its tuple on.  The exclusive-locked
 * and pinned leaf page is contained within insertstate.
 *
 * Caches the bounds fields in insertstate so that a subsequent call can
 * reuse the low and strict high bounds of the original binary search.
 * Callers that use these fields directly must be prepared for the case
 * where low and/or stricthigh are not on the same page (one or both exceed
 * maxoff for the page).  The case where there are no items on the page
 * (high < low) makes bounds invalid.
 *
 * Caller is responsible for invalidating bounds when it modifies the page
 * before calling here a second time, and for dealing with posting list
 * tuple matches (callers can use insertstate's postingoff field to
 * determine which existing heap TID will need to be replaced by a posting
 * list split).
 */
OffsetNumber
_bt_binsrch_insert(Relation rel, BTInsertState insertstate)
{
	BTScanInsert key = insertstate->itup_key;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber low,
				high,
				stricthigh;
	int32		result,
				cmpval;

	page = BufferGetPage(insertstate->buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	Assert(P_ISLEAF(opaque));
	Assert(!key->nextkey);
	Assert(insertstate->postingoff == 0);

	if (!insertstate->bounds_valid)
	{
		/* Start new binary search */
		low = P_FIRSTDATAKEY(opaque);
		high = PageGetMaxOffsetNumber(page);
	}
	else
	{
		/* Restore result of previous binary search against same page */
		low = insertstate->low;
		high = insertstate->stricthigh;
	}

	/* If there are no keys on the page, return the first available slot */
	if (unlikely(high < low))
	{
		/* Caller can't reuse bounds */
		insertstate->low = InvalidOffsetNumber;
		insertstate->stricthigh = InvalidOffsetNumber;
		insertstate->bounds_valid = false;
		return low;
	}

	/*
	 * Binary search to find the first key on the page >= scan key. (nextkey
	 * is always false when inserting).
	 *
	 * The loop invariant is: all slots before 'low' are < scan key, all slots
	 * at or after 'high' are >= scan key.  'stricthigh' is > scan key, and is
	 * maintained to save additional search effort for caller.
	 *
	 * We can fall out when high == low.
	 */
	if (!insertstate->bounds_valid)
		high++;					/* establish the loop invariant for high */
	stricthigh = high;			/* high initially strictly higher */

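	/*
	 * When reusing cached bounds, high is already one past the last
	 * candidate slot (the invariant was established by the previous binary
	 * search), so it must not be incremented a second time.
	 */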
	cmpval = 1;					/* !nextkey comparison value */

	while (high > low)
	{
		OffsetNumber mid = low + ((high - low) / 2);

		/* We have low <= mid < high, so mid points at a real slot */

		result = _bt_compare(rel, key, page, mid);

		if (result >= cmpval)
			low = mid + 1;
		else
		{
			high = mid;
			if (result != 0)
				stricthigh = high;
		}

		/*
		 * If tuple at offset located by binary search is a posting list whose
		 * TID range overlaps with caller's scantid, perform posting list
		 * binary search to set postingoff for caller.  Caller must split the
		 * posting list when postingoff is set.  This should happen
		 * infrequently.
		 */
		if (unlikely(result == 0 && key->scantid != NULL))
		{
			/*
			 * postingoff should never be set more than once per leaf page
			 * binary search.  That would mean that there are duplicate table
			 * TIDs in the index, which is never okay.  Check for that here.
			 */
			if (insertstate->postingoff != 0)
				ereport(ERROR,
						(errcode(ERRCODE_INDEX_CORRUPTED),
						 errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
										 ItemPointerGetBlockNumber(key->scantid),
										 ItemPointerGetOffsetNumber(key->scantid),
										 low, stricthigh,
										 BufferGetBlockNumber(insertstate->buf),
										 RelationGetRelationName(rel))));

			insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
		}
	}

	/*
	 * On a leaf page, a binary search always returns the first key >= scan
	 * key (at least in !nextkey case), which could be the last slot + 1. This
	 * is also the lower bound of cached search.
	 *
	 * stricthigh may also be the last slot + 1, which prevents caller from
	 * using bounds directly, but is still useful to us if we're called a
	 * second time with cached bounds (cached low will be < stricthigh when
	 * that happens).
	 */
	insertstate->low = low;
	insertstate->stricthigh = stricthigh;
	insertstate->bounds_valid = true;

	return low;
}

/*----------
 *	_bt_binsrch_posting() -- posting list binary search.
 *
 * Helper routine for _bt_binsrch_insert().
 *
 * Returns offset into posting list where caller's scantid belongs.
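 * Returns 0 if the tuple at offnum turns out not to be a posting list
 * tuple, and -1 (a sentinel) if the posting list tuple's LP_DEAD bit is
 * set; see comments below.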
 *----------
 */
static int
_bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
{
	IndexTuple	itup;
	ItemId		itemid;
	int			low,
				high,
				mid,
				res;

	/*
	 * If this isn't a posting tuple, then the index must be corrupt (if it is
	 * an ordinary non-pivot tuple then there must be an existing tuple with a
	 * heap TID that equals inserter's new heap TID/scantid).  Defensively
	 * check that tuple is a posting list tuple whose posting list range
	 * includes caller's scantid.
	 *
	 * (This is also needed because contrib/amcheck's rootdescend option needs
	 * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
	 */
	itemid = PageGetItemId(page, offnum);
	itup = (IndexTuple) PageGetItem(page, itemid);
	if (!BTreeTupleIsPosting(itup))
		return 0;

	Assert(key->heapkeyspace && key->allequalimage);

	/*
	 * In the event that posting list tuple has LP_DEAD bit set, indicate this
	 * to _bt_binsrch_insert() caller by returning -1, a sentinel value.  A
	 * second call to _bt_binsrch_insert() can take place when its caller has
	 * removed the dead item.
	 */
	if (ItemIdIsDead(itemid))
		return -1;

	/* "high" is past end of posting list for loop invariant */
	low = 0;
	high = BTreeTupleGetNPosting(itup);
	Assert(high >= 2);

	while (high > low)
	{
		mid = low + ((high - low) / 2);
		res = ItemPointerCompare(key->scantid,
								 BTreeTupleGetPostingN(itup, mid));

		if (res > 0)
			low = mid + 1;
		else if (res < 0)
			high = mid;
		else
			return mid;
	}

	/* Exact match not found */
	return low;
}

632 
633 /*----------
634  *	_bt_compare() -- Compare insertion-type scankey to tuple on a page.
635  *
636  *	page/offnum: location of btree item to be compared to.
637  *
638  *		This routine returns:
639  *			<0 if scankey < tuple at offnum;
640  *			 0 if scankey == tuple at offnum;
641  *			>0 if scankey > tuple at offnum.
642  *
643  * NULLs in the keys are treated as sortable values.  Therefore
644  * "equality" does not necessarily mean that the item should be returned
645  * to the caller as a matching key.  Similarly, an insertion scankey
646  * with its scantid set is treated as equal to a posting tuple whose TID
647  * range overlaps with their scantid.  There generally won't be a
648  * matching TID in the posting tuple, which caller must handle
649  * themselves (e.g., by splitting the posting list tuple).
650  *
651  * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
652  * "minus infinity": this routine will always claim it is less than the
653  * scankey.  The actual key value stored is explicitly truncated to 0
654  * attributes (explicitly minus infinity) with version 3+ indexes, but
655  * that isn't relied upon.  This allows us to implement the Lehman and
656  * Yao convention that the first down-link pointer is before the first
657  * key.  See backend/access/nbtree/README for details.
658  *----------
659  */
int32
_bt_compare(Relation rel,
			BTScanInsert key,
			Page page,
			OffsetNumber offnum)
{
	TupleDesc	itupdesc = RelationGetDescr(rel);
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	IndexTuple	itup;
	ItemPointer heapTid;
	ScanKey		scankey;
	int			ncmpkey;
	int			ntupatts;
	int32		result;

	Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
	Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
	Assert(key->heapkeyspace || key->scantid == NULL);

	/*
	 * Force result ">" if target item is first data item on an internal page
	 * --- see NOTE above.
	 */
	if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
		return 1;

	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
	ntupatts = BTreeTupleGetNAtts(itup, rel);

	/*
	 * The scan key is set up with the attribute number associated with each
	 * term in the key.  It is important that, if the index is multi-key, the
	 * scan contain the first k key attributes, and that they be in order.  If
	 * you think about how multi-key ordering works, you'll understand why
	 * this is.
	 *
	 * We don't test for violation of this condition here, however.  The
	 * initial setup for the index scan had better have gotten it right (see
	 * _bt_first).
	 */

	ncmpkey = Min(ntupatts, key->keysz);
	Assert(key->heapkeyspace || ncmpkey == key->keysz);
	Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
	scankey = key->scankeys;
	for (int i = 1; i <= ncmpkey; i++)
	{
		Datum		datum;
		bool		isNull;

		datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);

		if (scankey->sk_flags & SK_ISNULL)	/* key is NULL */
		{
			if (isNull)
				result = 0;		/* NULL "=" NULL */
			else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
				result = -1;	/* NULL "<" NOT_NULL */
			else
				result = 1;		/* NULL ">" NOT_NULL */
		}
		else if (isNull)		/* key is NOT_NULL and item is NULL */
		{
			if (scankey->sk_flags & SK_BT_NULLS_FIRST)
				result = 1;		/* NOT_NULL ">" NULL */
			else
				result = -1;	/* NOT_NULL "<" NULL */
		}
		else
		{
			/*
			 * The sk_func needs to be passed the index value as left arg and
			 * the sk_argument as right arg (they might be of different
			 * types).  Since it is convenient for callers to think of
			 * _bt_compare as comparing the scankey to the index item, we have
			 * to flip the sign of the comparison result.  (Unless it's a DESC
			 * column, in which case we *don't* flip the sign.)
			 */
			result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
													 scankey->sk_collation,
													 datum,
													 scankey->sk_argument));

			if (!(scankey->sk_flags & SK_BT_DESC))
				INVERT_COMPARE_RESULT(result);
		}

		/* if the keys are unequal, return the difference */
		if (result != 0)
			return result;

		scankey++;
	}

	/*
	 * All non-truncated attributes (other than heap TID) were found to be
	 * equal.  Treat truncated attributes as minus infinity when scankey has a
	 * key attribute value that would otherwise be compared directly.
	 *
	 * Note: it doesn't matter if ntupatts includes non-key attributes;
	 * scankey won't, so explicitly excluding non-key attributes isn't
	 * necessary.
	 */
	if (key->keysz > ntupatts)
		return 1;

	/*
	 * Use the heap TID attribute and scantid to try to break the tie.  The
	 * rules are the same as any other key attribute -- only the
	 * representation differs.
	 */
	heapTid = BTreeTupleGetHeapTID(itup);
	if (key->scantid == NULL)
	{
		/*
		 * Most searches have a scankey that is considered greater than a
		 * truncated pivot tuple if and when the scankey has equal values for
		 * attributes up to and including the least significant untruncated
		 * attribute in tuple.
		 *
		 * For example, if an index has the minimum two attributes (single
		 * user key attribute, plus heap TID attribute), and a page's high key
		 * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
		 * will not descend to the page to the left.  The search will descend
		 * right instead.  The truncated attribute in pivot tuple means that
		 * all non-pivot tuples on the page to the left are strictly < 'foo',
		 * so it isn't necessary to descend left.  In other words, search
		 * doesn't have to descend left because it isn't interested in a match
		 * that has a heap TID value of -inf.
		 *
		 * However, some searches (pivotsearch searches) actually require that
		 * we descend left when this happens.  -inf is treated as a possible
		 * match for omitted scankey attribute(s).  This is needed by page
		 * deletion, which must re-find leaf pages that are targets for
		 * deletion using their high keys.
		 *
		 * Note: the heap TID part of the test ensures that scankey is being
		 * compared to a pivot tuple with one or more truncated key
		 * attributes.
		 *
		 * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
		 * left here, since they have no heap TID attribute (and cannot have
		 * any -inf key values in any case, since truncation can only remove
		 * non-key attributes).  !heapkeyspace searches must always be
		 * prepared to deal with matches on both sides of the pivot once the
		 * leaf level is reached.
		 */
		if (key->heapkeyspace && !key->pivotsearch &&
			key->keysz == ntupatts && heapTid == NULL)
			return 1;

		/* All provided scankey arguments found to be equal */
		return 0;
	}

	/*
	 * Treat truncated heap TID as minus infinity, since scankey has a key
	 * attribute value (scantid) that would otherwise be compared directly
	 */
	Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
	if (heapTid == NULL)
		return 1;

	/*
	 * Scankey must be treated as equal to a posting list tuple if its scantid
	 * value falls within the range of the posting list.  In all other cases
	 * there can only be a single heap TID value, which is compared directly
	 * with scantid.
	 */
	Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
	result = ItemPointerCompare(key->scantid, heapTid);
	if (result <= 0 || !BTreeTupleIsPosting(itup))
		return result;
	else
	{
		result = ItemPointerCompare(key->scantid,
									BTreeTupleGetMaxHeapTID(itup));
		if (result > 0)
			return 1;
	}

	return 0;
}

/*
 *	_bt_first() -- Find the first item in a scan.
 *
 *		We need to be clever about the direction of scan, the search
 *		conditions, and the tree ordering.  We find the first item (or,
 *		if backwards scan, the last item) in the tree that satisfies the
 *		qualifications in the scan key.  On success exit, the page containing
 *		the current index tuple is pinned but not locked, and data about
 *		the matching tuple(s) on the page has been loaded into so->currPos.
 *		scan->xs_heaptid is set to the heap TID of the current tuple,
 *		and if requested, scan->xs_itup points to a copy of the index tuple.
 *
 * If there are no matching items in the index, we return false, with no
 * pins or locks held.
 *
 * Note that scan->keyData[], and the so->keyData[] scankey built from it,
 * are both search-type scankeys (see nbtree/README for more about this).
 * Within this routine, we build a temporary insertion-type scankey to use
 * in locating the scan start position.
 */
bool
_bt_first(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Buffer		buf;
	BTStack		stack;
	OffsetNumber offnum;
	StrategyNumber strat;
	bool		nextkey;
	bool		goback;
	BTScanInsertData inskey;
	ScanKey		startKeys[INDEX_MAX_KEYS];
	ScanKeyData notnullkeys[INDEX_MAX_KEYS];
	int			keysCount = 0;
	int			i;
	bool		status;
	StrategyNumber strat_total;
	BTScanPosItem *currItem;
	BlockNumber blkno;

	Assert(!BTScanPosIsValid(so->currPos));

	pgstat_count_index_scan(rel);

	/*
	 * Examine the scan keys and eliminate any redundant keys; also mark the
	 * keys that must be matched to continue the scan.
	 */
	_bt_preprocess_keys(scan);

	/*
	 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
	 * never be satisfied (eg, x == 1 AND x > 2).
	 */
	if (!so->qual_ok)
	{
		/* Notify any other workers that we're done with this scan key. */
		_bt_parallel_done(scan);
		return false;
	}

	/*
	 * For parallel scans, get the starting page from shared state. If the
	 * scan has not started, proceed to find the first leaf page in the usual
	 * way while keeping other participating processes waiting.  If the scan
	 * has already begun, use the page number from the shared structure.
	 */
	if (scan->parallel_scan != NULL)
	{
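		/*
		 * blkno is P_NONE if some worker already finished the scan,
		 * InvalidBlockNumber if the scan has not started yet (we establish
		 * the start point below), and otherwise a page that another worker
		 * has already positioned us on.
		 */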
		status = _bt_parallel_seize(scan, &blkno);
		if (!status)
			return false;
		else if (blkno == P_NONE)
		{
			_bt_parallel_done(scan);
			return false;
		}
		else if (blkno != InvalidBlockNumber)
		{
			if (!_bt_parallel_readpage(scan, blkno, dir))
				return false;
			goto readcomplete;
		}
	}

	/*----------
	 * Examine the scan keys to discover where we need to start the scan.
	 *
	 * We want to identify the keys that can be used as starting boundaries;
	 * these are =, >, or >= keys for a forward scan or =, <, <= keys for
	 * a backwards scan.  We can use keys for multiple attributes so long as
	 * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
	 * a > or < boundary or find an attribute with no boundary (which can be
	 * thought of as the same as "> -infinity"), we can't use keys for any
	 * attributes to its right, because it would break our simplistic notion
	 * of what initial positioning strategy to use.
	 *
	 * When the scan keys include cross-type operators, _bt_preprocess_keys
	 * may not be able to eliminate redundant keys; in such cases we will
	 * arbitrarily pick a usable one for each attribute.  This is correct
	 * but possibly not optimal behavior.  (For example, with keys like
	 * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
	 * x=5 would be more efficient.)  Since the situation only arises given
	 * a poorly-worded query plus an incomplete opfamily, live with it.
	 *
	 * When both equality and inequality keys appear for a single attribute
	 * (again, only possible when cross-type operators appear), we *must*
	 * select one of the equality keys for the starting point, because
	 * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
	 * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
	 * start at x=4, we will fail and stop before reaching x=10.  If multiple
	 * equality quals survive preprocessing, however, it doesn't matter which
	 * one we use --- by definition, they are either redundant or
	 * contradictory.
	 *
	 * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
	 * If the index stores nulls at the end of the index we'll be starting
	 * from, and we have no boundary key for the column (which means the key
	 * we deduced NOT NULL from is an inequality key that constrains the other
	 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
	 * use as a boundary key.  If we didn't do this, we might find ourselves
	 * traversing a lot of null entries at the start of the scan.
	 *
	 * In this loop, row-comparison keys are treated the same as keys on their
	 * first (leftmost) columns.  We'll add on lower-order columns of the row
	 * comparison below, if possible.
	 *
	 * The selected scan keys (at most one per index column) are remembered by
	 * storing their addresses into the local startKeys[] array.
	 *----------
	 */
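	/*
	 * For example, on an index over (x, y), a forward scan with quals
	 * "x > 5 AND y = 10" can use only "x > 5" as a starting boundary: once
	 * a ">" key is accepted for x, the y key can no longer participate in
	 * initial positioning (it still filters tuples during the scan).  With
	 * "x = 5 AND y > 10", both keys can be used.
	 */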
	strat_total = BTEqualStrategyNumber;
	if (so->numberOfKeys > 0)
	{
		AttrNumber	curattr;
		ScanKey		chosen;
		ScanKey		impliesNN;
		ScanKey		cur;

		/*
		 * chosen is the so-far-chosen key for the current attribute, if any.
		 * We don't cast the decision in stone until we reach keys for the
		 * next attribute.
		 */
		curattr = 1;
		chosen = NULL;
		/* Also remember any scankey that implies a NOT NULL constraint */
		impliesNN = NULL;

		/*
		 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
		 * pass to handle after-last-key processing.  Actual exit from the
		 * loop is at one of the "break" statements below.
		 */
		for (cur = so->keyData, i = 0;; cur++, i++)
		{
			if (i >= so->numberOfKeys || cur->sk_attno != curattr)
			{
				/*
				 * Done looking at keys for curattr.  If we didn't find a
				 * usable boundary key, see if we can deduce a NOT NULL key.
				 */
				if (chosen == NULL && impliesNN != NULL &&
					((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
					 ScanDirectionIsForward(dir) :
					 ScanDirectionIsBackward(dir)))
				{
					/* Yes, so build the key in notnullkeys[keysCount] */
					chosen = &notnullkeys[keysCount];
					ScanKeyEntryInitialize(chosen,
										   (SK_SEARCHNOTNULL | SK_ISNULL |
											(impliesNN->sk_flags &
											 (SK_BT_DESC | SK_BT_NULLS_FIRST))),
										   curattr,
										   ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
											BTGreaterStrategyNumber :
											BTLessStrategyNumber),
										   InvalidOid,
										   InvalidOid,
										   InvalidOid,
										   (Datum) 0);
				}

				/*
				 * If we still didn't find a usable boundary key, quit; else
				 * save the boundary key pointer in startKeys.
				 */
				if (chosen == NULL)
					break;
				startKeys[keysCount++] = chosen;

				/*
				 * Adjust strat_total, and quit if we have stored a > or <
				 * key.
				 */
				strat = chosen->sk_strategy;
				if (strat != BTEqualStrategyNumber)
				{
					strat_total = strat;
					if (strat == BTGreaterStrategyNumber ||
						strat == BTLessStrategyNumber)
						break;
				}

				/*
				 * Done if that was the last attribute, or if next key is not
				 * in sequence (implying no boundary key is available for the
				 * next attribute).
				 */
				if (i >= so->numberOfKeys ||
					cur->sk_attno != curattr + 1)
					break;

				/*
				 * Reset for next attr.
				 */
				curattr = cur->sk_attno;
				chosen = NULL;
				impliesNN = NULL;
			}

			/*
			 * Can we use this key as a starting boundary for this attr?
			 *
			 * If not, does it imply a NOT NULL constraint?  (Because
			 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
			 * *any* inequality key works for that; we need not test.)
			 */
			switch (cur->sk_strategy)
			{
				case BTLessStrategyNumber:
				case BTLessEqualStrategyNumber:
					if (chosen == NULL)
					{
						if (ScanDirectionIsBackward(dir))
							chosen = cur;
						else
							impliesNN = cur;
					}
					break;
				case BTEqualStrategyNumber:
					/* override any non-equality choice */
					chosen = cur;
					break;
				case BTGreaterEqualStrategyNumber:
				case BTGreaterStrategyNumber:
					if (chosen == NULL)
					{
						if (ScanDirectionIsForward(dir))
							chosen = cur;
						else
							impliesNN = cur;
					}
					break;
			}
		}
	}

	/*
	 * If we found no usable boundary keys, we have to start from one end of
	 * the tree.  Walk down that edge to the first or last key, and scan from
	 * there.
	 */
	if (keysCount == 0)
	{
		bool		match;

		match = _bt_endpoint(scan, dir);

		if (!match)
		{
			/* No match, so mark (parallel) scan finished */
			_bt_parallel_done(scan);
		}

		return match;
	}

	/*
	 * We want to start the scan somewhere within the index.  Set up an
	 * insertion scankey we can use to search for the boundary point we
	 * identified above.  The insertion scankey is built using the keys
	 * identified by startKeys[].  (Remaining insertion scankey fields are
	 * initialized after initial-positioning strategy is finalized.)
	 */
	Assert(keysCount <= INDEX_MAX_KEYS);
	for (i = 0; i < keysCount; i++)
	{
		ScanKey		cur = startKeys[i];

		Assert(cur->sk_attno == i + 1);

		if (cur->sk_flags & SK_ROW_HEADER)
		{
			/*
			 * Row comparison header: look to the first row member instead.
			 *
			 * The member scankeys are already in insertion format (ie, they
			 * have sk_func = 3-way-comparison function), but we have to watch
			 * out for nulls, which _bt_preprocess_keys didn't check. A null
			 * in the first row member makes the condition unmatchable, just
			 * like qual_ok = false.
			 */
			ScanKey		subkey = (ScanKey) DatumGetPointer(cur->sk_argument);

			Assert(subkey->sk_flags & SK_ROW_MEMBER);
			if (subkey->sk_flags & SK_ISNULL)
			{
				_bt_parallel_done(scan);
				return false;
			}
			memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));

			/*
			 * If the row comparison is the last positioning key we accepted,
			 * try to add additional keys from the lower-order row members.
			 * (If we accepted independent conditions on additional index
			 * columns, we use those instead --- doesn't seem worth trying to
			 * determine which is more restrictive.)  Note that this is OK
			 * even if the row comparison is of ">" or "<" type, because the
			 * condition applied to all but the last row member is effectively
			 * ">=" or "<=", and so the extra keys don't break the positioning
			 * scheme.  But, by the same token, if we aren't able to use all
			 * the row members, then the part of the row comparison that we
			 * did use has to be treated as just a ">=" or "<=" condition, and
			 * so we'd better adjust strat_total accordingly.
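			 *
			 * For example, if the row comparison is (x, y) > (4, 10) and only
			 * the x member can be used, initial positioning must treat the
			 * condition as x >= 4, since rows with x = 4 and y > 10 also
			 * satisfy the qual.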
			 */
			if (i == keysCount - 1)
			{
				bool		used_all_subkeys = false;

				Assert(!(subkey->sk_flags & SK_ROW_END));
				for (;;)
				{
					subkey++;
					Assert(subkey->sk_flags & SK_ROW_MEMBER);
					if (subkey->sk_attno != keysCount + 1)
						break;	/* out-of-sequence, can't use it */
					if (subkey->sk_strategy != cur->sk_strategy)
						break;	/* wrong direction, can't use it */
					if (subkey->sk_flags & SK_ISNULL)
						break;	/* can't use null keys */
					Assert(keysCount < INDEX_MAX_KEYS);
					memcpy(inskey.scankeys + keysCount, subkey,
						   sizeof(ScanKeyData));
					keysCount++;
					if (subkey->sk_flags & SK_ROW_END)
					{
						used_all_subkeys = true;
						break;
					}
				}
				if (!used_all_subkeys)
				{
					switch (strat_total)
					{
						case BTLessStrategyNumber:
							strat_total = BTLessEqualStrategyNumber;
							break;
						case BTGreaterStrategyNumber:
							strat_total = BTGreaterEqualStrategyNumber;
							break;
					}
				}
				break;			/* done with outer loop */
			}
		}
		else
		{
			/*
			 * Ordinary comparison key.  Transform the search-style scan key
			 * to an insertion scan key by replacing the sk_func with the
			 * appropriate btree comparison function.
			 *
			 * If scankey operator is not a cross-type comparison, we can use
			 * the cached comparison function; otherwise gotta look it up in
			 * the catalogs.  (That can't lead to infinite recursion, since no
			 * indexscan initiated by syscache lookup will use cross-data-type
			 * operators.)
			 *
			 * We support the convention that sk_subtype == InvalidOid means
			 * the opclass input type; this is a hack to simplify life for
			 * ScanKeyInit().
			 */
			if (cur->sk_subtype == rel->rd_opcintype[i] ||
				cur->sk_subtype == InvalidOid)
			{
				FmgrInfo   *procinfo;

				procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
				ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
											   cur->sk_flags,
											   cur->sk_attno,
											   InvalidStrategy,
											   cur->sk_subtype,
											   cur->sk_collation,
											   procinfo,
											   cur->sk_argument);
			}
			else
			{
				RegProcedure cmp_proc;

				cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
											 rel->rd_opcintype[i],
											 cur->sk_subtype,
											 BTORDER_PROC);
				if (!RegProcedureIsValid(cmp_proc))
					elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
						 BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
						 cur->sk_attno, RelationGetRelationName(rel));
				ScanKeyEntryInitialize(inskey.scankeys + i,
									   cur->sk_flags,
									   cur->sk_attno,
									   InvalidStrategy,
									   cur->sk_subtype,
									   cur->sk_collation,
									   cmp_proc,
									   cur->sk_argument);
			}
		}
	}

	/*----------
	 * Examine the selected initial-positioning strategy to determine exactly
	 * where we need to start the scan, and set flag variables to control the
	 * code below.
	 *
	 * If nextkey = false, _bt_search and _bt_binsrch will locate the first
	 * item >= scan key.  If nextkey = true, they will locate the first
	 * item > scan key.
	 *
	 * If goback = true, we will then step back one item, while if
	 * goback = false, we will start the scan on the located item.
	 *----------
	 */
	switch (strat_total)
	{
		case BTLessStrategyNumber:

			/*
			 * Find first item >= scankey, then back up one to arrive at last
			 * item < scankey.  (Note: this positioning strategy is only used
			 * for a backward scan, so that is always the correct starting
			 * position.)
			 */
			nextkey = false;
			goback = true;
			break;

		case BTLessEqualStrategyNumber:

			/*
			 * Find first item > scankey, then back up one to arrive at last
			 * item <= scankey.  (Note: this positioning strategy is only used
			 * for a backward scan, so that is always the correct starting
			 * position.)
			 */
			nextkey = true;
			goback = true;
			break;

		case BTEqualStrategyNumber:

			/*
			 * If a backward scan was specified, need to start with last equal
			 * item not first one.
			 */
			if (ScanDirectionIsBackward(dir))
			{
				/*
				 * This is the same as the <= strategy.  We will check at the
				 * end whether the found item is actually =.
				 */
				nextkey = true;
				goback = true;
			}
			else
			{
				/*
				 * This is the same as the >= strategy.  We will check at the
				 * end whether the found item is actually =.
				 */
				nextkey = false;
				goback = false;
			}
			break;

		case BTGreaterEqualStrategyNumber:

			/*
			 * Find first item >= scankey.  (This is only used for forward
			 * scans.)
			 */
			nextkey = false;
			goback = false;
			break;

		case BTGreaterStrategyNumber:

			/*
			 * Find first item > scankey.  (This is only used for forward
			 * scans.)
			 */
			nextkey = true;
			goback = false;
			break;

		default:
			/* can't get here, but keep compiler quiet */
			elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
			return false;
	}

	/* Initialize remaining insertion scan key fields */
	_bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
	inskey.anynullkeys = false; /* unused */
	inskey.nextkey = nextkey;
	inskey.pivotsearch = false;
	inskey.scantid = NULL;
	inskey.keysz = keysCount;

	/*
	 * Use the manufactured insertion scan key to descend the tree and
	 * position ourselves on the target leaf page.
	 */
	stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);

	/* don't need to keep the stack around... */
	_bt_freestack(stack);

	if (!BufferIsValid(buf))
	{
		/*
		 * We only get here if the index is completely empty. Lock relation
		 * because nothing finer to lock exists.
		 */
		PredicateLockRelation(rel, scan->xs_snapshot);

		/*
		 * mark parallel scan as done, so that all the workers can finish
		 * their scan
		 */
		_bt_parallel_done(scan);
		BTScanPosInvalidate(so->currPos);

		return false;
	}
	else
		PredicateLockPage(rel, BufferGetBlockNumber(buf),
						  scan->xs_snapshot);

	_bt_initialize_more_data(so, dir);

	/* position to the precise item on the page */
	offnum = _bt_binsrch(rel, &inskey, buf);

	/*
	 * If nextkey = false, we are positioned at the first item >= scan key, or
	 * possibly at the end of a page on which all the existing items are less
	 * than the scan key and we know that everything on later pages is greater
	 * than or equal to scan key.
	 *
	 * If nextkey = true, we are positioned at the first item > scan key, or
	 * possibly at the end of a page on which all the existing items are less
	 * than or equal to the scan key and we know that everything on later
	 * pages is greater than scan key.
	 *
	 * The actually desired starting point is either this item or the prior
	 * one, or in the end-of-page case it's the first item on the next page or
	 * the last item on this page.  Adjust the starting offset if needed. (If
	 * this results in an offset before the first item or after the last one,
	 * _bt_readpage will report no items found, and then we'll step to the
	 * next page as needed.)
	 */
	if (goback)
		offnum = OffsetNumberPrev(offnum);

	/* remember which buffer we have pinned, if any */
	Assert(!BTScanPosIsValid(so->currPos));
	so->currPos.buf = buf;

	/*
	 * Now load data from the first page of the scan.
	 */
	if (!_bt_readpage(scan, dir, offnum))
	{
		/*
		 * There's no actually-matching data on this page.  Try to advance to
		 * the next page.  Return false if there's no matching data at all.
		 */
		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
		if (!_bt_steppage(scan, dir))
			return false;
	}
	else
	{
		/* Drop the lock, and maybe the pin, on the current page */
		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
	}

readcomplete:
	/* OK, itemIndex says what to return */
	currItem = &so->currPos.items[so->currPos.itemIndex];
	scan->xs_heaptid = currItem->heapTid;
	if (scan->xs_want_itup)
		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

	return true;
}

/*
 *	_bt_next() -- Get the next item in a scan.
 *
 *		On entry, so->currPos describes the current page, which may be pinned
 *		but is not locked, and so->currPos.itemIndex identifies which item was
 *		previously returned.
 *
 *		On successful exit, scan->xs_heaptid is set to the TID of the
 *		next heap tuple, and if requested, scan->xs_itup points to a copy of
 *		the index tuple.  so->currPos is updated as needed.
 *
 *		On failure exit (no more tuples), we release pin and set
 *		so->currPos.buf to InvalidBuffer.
 */
bool
_bt_next(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BTScanPosItem *currItem;

	/*
	 * Advance to next tuple on current page; or if there's no more, try to
	 * step to the next page with data.
	 */
	if (ScanDirectionIsForward(dir))
	{
		if (++so->currPos.itemIndex > so->currPos.lastItem)
		{
			if (!_bt_steppage(scan, dir))
				return false;
		}
	}
	else
	{
		if (--so->currPos.itemIndex < so->currPos.firstItem)
		{
			if (!_bt_steppage(scan, dir))
				return false;
		}
	}

	/* OK, itemIndex says what to return */
	currItem = &so->currPos.items[so->currPos.itemIndex];
	scan->xs_heaptid = currItem->heapTid;
	if (scan->xs_want_itup)
		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

	return true;
}

/*
 *	_bt_readpage() -- Load data from current index page into so->currPos
 *
 * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
 * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;
 * they are updated as appropriate.  All other fields of so->currPos are
 * initialized from scratch here.
 *
 * We scan the current page starting at offnum and moving in the indicated
 * direction.  All items matching the scan keys are loaded into currPos.items.
 * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
 * that there can be no more matching tuples in the current scan direction.
 *
 * In the case of a parallel scan, caller must have called _bt_parallel_seize
 * prior to calling this function; this function will invoke
 * _bt_parallel_release before returning.
 *
 * Returns true if any matching items found on the page, false if none.
 */
1525 static bool
_bt_readpage(IndexScanDesc scan,ScanDirection dir,OffsetNumber offnum)1526 _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
1527 {
1528 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
1529 	Page		page;
1530 	BTPageOpaque opaque;
1531 	OffsetNumber minoff;
1532 	OffsetNumber maxoff;
1533 	int			itemIndex;
1534 	bool		continuescan;
1535 	int			indnatts;
1536 
1537 	/*
1538 	 * We must have the buffer pinned and locked, but the usual macro can't be
1539 	 * used here; this function is what makes it good for currPos.
1540 	 */
1541 	Assert(BufferIsValid(so->currPos.buf));
1542 
1543 	page = BufferGetPage(so->currPos.buf);
1544 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1545 
1546 	/* allow next page be processed by parallel worker */
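	/*
	 * Note the asymmetry: for a forward scan the next page any worker should
	 * read is our right sibling, but for a backward scan the next worker must
	 * start over from this same page in order to follow its left-link safely.
	 */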
	if (scan->parallel_scan)
	{
		if (ScanDirectionIsForward(dir))
			_bt_parallel_release(scan, opaque->btpo_next);
		else
			_bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
	}

	continuescan = true;		/* default assumption */
	indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
	minoff = P_FIRSTDATAKEY(opaque);
	maxoff = PageGetMaxOffsetNumber(page);

	/*
	 * We note the buffer's block number so that we can release the pin later.
	 * This allows us to re-read the buffer if it is needed again for hinting.
	 */
	so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);

	/*
	 * We save the LSN of the page as we read it, so that we know whether it
	 * is safe to apply LP_DEAD hints to the page later.  This allows us to
	 * drop the pin for MVCC scans, which allows vacuum to avoid blocking.
	 */
	so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);

	/*
	 * We must save the page's right-link while scanning it; this tells us
	 * where to step right to after we're done with these items.  There is no
	 * corresponding need for the left-link, since splits always go right.
	 */
	so->currPos.nextPage = opaque->btpo_next;

	/* initialize tuple workspace to empty */
	so->currPos.nextTupleOffset = 0;

	/*
	 * Now that the current page has been made consistent, the macro should be
	 * good.
	 */
	Assert(BTScanPosIsPinned(so->currPos));

	if (ScanDirectionIsForward(dir))
	{
		/* load items[] in ascending order */
		itemIndex = 0;

		offnum = Max(offnum, minoff);

		while (offnum <= maxoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	itup;

			/*
			 * If the scan specifies not to return killed tuples, then we
			 * treat a killed tuple as not passing the qual
			 */
			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
			{
				offnum = OffsetNumberNext(offnum);
				continue;
			}

			itup = (IndexTuple) PageGetItem(page, iid);

			if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
			{
				/* tuple passes all scan key conditions */
				if (!BTreeTupleIsPosting(itup))
				{
					/* Remember it */
					_bt_saveitem(so, itemIndex, offnum, itup);
					itemIndex++;
				}
				else
				{
					int			tupleOffset;

					/*
					 * Set up state to return posting list, and remember first
					 * TID
					 */
					tupleOffset =
						_bt_setuppostingitems(so, itemIndex, offnum,
											  BTreeTupleGetPostingN(itup, 0),
											  itup);
					itemIndex++;
					/* Remember additional TIDs */
					for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
					{
						_bt_savepostingitem(so, itemIndex, offnum,
											BTreeTupleGetPostingN(itup, i),
											tupleOffset);
						itemIndex++;
					}
				}
			}
			/* When !continuescan, there can't be any more matches, so stop */
			if (!continuescan)
				break;

			offnum = OffsetNumberNext(offnum);
		}

		/*
		 * We don't need to visit the page to the right when the high key
		 * indicates that no more matches will be found there.
		 *
		 * Checking the high key like this works out more often than you might
		 * think.  Leaf page splits pick a split point between the two most
		 * dissimilar tuples (this is weighed against the need to evenly share
		 * free space).  Leaf pages with high key attribute values that can
		 * only appear on non-pivot tuples on the right sibling page are
		 * common.
		 */
		if (continuescan && !P_RIGHTMOST(opaque))
		{
			ItemId		iid = PageGetItemId(page, P_HIKEY);
			IndexTuple	itup = (IndexTuple) PageGetItem(page, iid);
			int			truncatt;

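			/*
			 * The high key may be truncated, so only test the key attributes
			 * it actually contains.
			 */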
			truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
		}

		if (!continuescan)
			so->currPos.moreRight = false;

		Assert(itemIndex <= MaxTIDsPerBTreePage);
		so->currPos.firstItem = 0;
		so->currPos.lastItem = itemIndex - 1;
		so->currPos.itemIndex = 0;
	}
	else
	{
		/* load items[] in descending order */
		itemIndex = MaxTIDsPerBTreePage;

		offnum = Min(offnum, maxoff);

		while (offnum >= minoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	itup;
			bool		tuple_alive;
			bool		passes_quals;

			/*
			 * If the scan specifies not to return killed tuples, then we
			 * treat a killed tuple as not passing the qual.  Most of the
			 * time, it's a win to not bother examining the tuple's index
			 * keys, but just skip to the next tuple (previous, actually,
			 * since we're scanning backwards).  However, if this is the first
			 * tuple on the page, we do check the index keys, to prevent
			 * uselessly advancing to the page to the left.  This is similar
			 * to the high key optimization used by forward scans.
			 */
			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
			{
				Assert(offnum >= P_FIRSTDATAKEY(opaque));
				if (offnum > P_FIRSTDATAKEY(opaque))
				{
					offnum = OffsetNumberPrev(offnum);
					continue;
				}

				tuple_alive = false;
			}
			else
				tuple_alive = true;

			itup = (IndexTuple) PageGetItem(page, iid);

			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
										 &continuescan);
			if (passes_quals && tuple_alive)
			{
				/* tuple passes all scan key conditions */
				if (!BTreeTupleIsPosting(itup))
				{
					/* Remember it */
					itemIndex--;
					_bt_saveitem(so, itemIndex, offnum, itup);
				}
				else
				{
					int			tupleOffset;

					/*
					 * Set up state to return posting list, and remember first
					 * TID.
					 *
					 * Note that we deliberately save/return items from
					 * posting lists in ascending heap TID order for backwards
					 * scans.  This allows _bt_killitems() to make a
					 * consistent assumption about the order of items
					 * associated with the same posting list tuple.
					 */
					itemIndex--;
					tupleOffset =
						_bt_setuppostingitems(so, itemIndex, offnum,
											  BTreeTupleGetPostingN(itup, 0),
											  itup);
					/* Remember additional TIDs */
					for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
					{
						itemIndex--;
						_bt_savepostingitem(so, itemIndex, offnum,
											BTreeTupleGetPostingN(itup, i),
											tupleOffset);
					}
				}
			}
			if (!continuescan)
			{
				/* there can't be any more matches, so stop */
				so->currPos.moreLeft = false;
				break;
			}

			offnum = OffsetNumberPrev(offnum);
		}

		Assert(itemIndex >= 0);
		so->currPos.firstItem = itemIndex;
		so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
		so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
	}

	return (so->currPos.firstItem <= so->currPos.lastItem);
}

/* Save an index item into so->currPos.items[itemIndex] */
static void
_bt_saveitem(BTScanOpaque so, int itemIndex,
			 OffsetNumber offnum, IndexTuple itup)
{
	BTScanPosItem *currItem = &so->currPos.items[itemIndex];

	Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));

	currItem->heapTid = itup->t_tid;
	currItem->indexOffset = offnum;
	if (so->currTuples)
	{
		Size		itupsz = IndexTupleSize(itup);

		currItem->tupleOffset = so->currPos.nextTupleOffset;
		memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
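		/* keep offsets MAXALIGN'd so each saved tuple starts aligned */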
		so->currPos.nextTupleOffset += MAXALIGN(itupsz);
	}
}

/*
 * Setup state to save TIDs/items from a single posting list tuple.
 *
 * Saves an index item into so->currPos.items[itemIndex] for the TID that is
 * returned to the scan first.  Second and subsequent TIDs from the posting
 * list should be saved by calling _bt_savepostingitem().
 *
 * Returns an offset into tuple storage space where the base tuple is stored,
 * in case it is needed.
 */
static int
_bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
					  ItemPointer heapTid, IndexTuple itup)
{
	BTScanPosItem *currItem = &so->currPos.items[itemIndex];

	Assert(BTreeTupleIsPosting(itup));

	currItem->heapTid = *heapTid;
	currItem->indexOffset = offnum;
	if (so->currTuples)
	{
		/* Save base IndexTuple (truncate posting list) */
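		/*
		 * The posting list starts at BTreeTupleGetPostingOffset(), so that
		 * offset is also the size of the tuple with its posting list removed.
		 */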
		IndexTuple	base;
		Size		itupsz = BTreeTupleGetPostingOffset(itup);

		itupsz = MAXALIGN(itupsz);
		currItem->tupleOffset = so->currPos.nextTupleOffset;
		base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
		memcpy(base, itup, itupsz);
		/* Defensively reduce work area index tuple header size */
		base->t_info &= ~INDEX_SIZE_MASK;
		base->t_info |= itupsz;
		so->currPos.nextTupleOffset += itupsz;

		return currItem->tupleOffset;
	}

	return 0;
}

/*
 * Save an index item into so->currPos.items[itemIndex] for the current
 * posting tuple.
 *
 * Assumes that _bt_setuppostingitems() has already been called for the
 * current posting list tuple.  Caller passes its return value as tupleOffset.
 */
static inline void
_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
					ItemPointer heapTid, int tupleOffset)
{
	BTScanPosItem *currItem = &so->currPos.items[itemIndex];

	currItem->heapTid = *heapTid;
	currItem->indexOffset = offnum;

	/*
	 * Have index-only scans return the same base IndexTuple for every TID
	 * that originates from the same posting list
	 */
	if (so->currTuples)
		currItem->tupleOffset = tupleOffset;
}

/*
 *	_bt_steppage() -- Step to next page containing valid data for scan
 *
 * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
 * if pinned, we'll drop the pin before moving to the next page.
 *
 * On success with a scan using a non-MVCC snapshot we hold a pin, but not a
 * read lock, on the new page.  If we do not hold the pin, we set
 * so->currPos.buf to InvalidBuffer.  We return true to indicate success.
 */
static bool
_bt_steppage(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BlockNumber blkno = InvalidBlockNumber;
	bool		status;

	Assert(BTScanPosIsValid(so->currPos));

	/* Before leaving current page, deal with any killed items */
	if (so->numKilled > 0)
		_bt_killitems(scan);

	/*
	 * Before we modify currPos, make a copy of the page data if there was a
	 * mark position that needs it.
	 */
	if (so->markItemIndex >= 0)
	{
		/* bump pin on current buffer for assignment to mark buffer */
		if (BTScanPosIsPinned(so->currPos))
			IncrBufferRefCount(so->currPos.buf);
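		/* copy the header fields plus items[0..lastItem], but no more */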
		memcpy(&so->markPos, &so->currPos,
			   offsetof(BTScanPosData, items[1]) +
			   so->currPos.lastItem * sizeof(BTScanPosItem));
		if (so->markTuples)
			memcpy(so->markTuples, so->currTuples,
				   so->currPos.nextTupleOffset);
		so->markPos.itemIndex = so->markItemIndex;
		so->markItemIndex = -1;
	}

	if (ScanDirectionIsForward(dir))
	{
		/* Walk right to the next page with data */
		if (scan->parallel_scan != NULL)
		{
			/*
			 * Seize the scan to get the next block number; if the scan has
			 * ended already, bail out.
			 */
			status = _bt_parallel_seize(scan, &blkno);
			if (!status)
			{
				/* release the previous buffer, if pinned */
				BTScanPosUnpinIfPinned(so->currPos);
				BTScanPosInvalidate(so->currPos);
				return false;
			}
		}
		else
		{
			/* Not parallel, so use the previously-saved nextPage link. */
			blkno = so->currPos.nextPage;
		}

		/* Remember we left a page with data */
		so->currPos.moreLeft = true;

		/* release the previous buffer, if pinned */
		BTScanPosUnpinIfPinned(so->currPos);
	}
	else
	{
		/* Remember we left a page with data */
		so->currPos.moreRight = true;

		if (scan->parallel_scan != NULL)
		{
			/*
			 * Seize the scan to get the current block number; if the scan has
			 * ended already, bail out.
			 */
			status = _bt_parallel_seize(scan, &blkno);
			BTScanPosUnpinIfPinned(so->currPos);
			if (!status)
			{
				BTScanPosInvalidate(so->currPos);
				return false;
			}
		}
		else
		{
			/* Not parallel, so just use our own notion of the current page */
			blkno = so->currPos.currPage;
		}
	}

	if (!_bt_readnextpage(scan, blkno, dir))
		return false;

	/* Drop the lock, and maybe the pin, on the current page */
	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);

	return true;
}

/*
 *	_bt_readnextpage() -- Read next page containing valid data for scan
 *
 * On success exit, so->currPos is updated to contain data from the next
 * interesting page.  Caller is responsible for releasing the lock and pin on
 * the buffer on success.  We return true to indicate success.
 *
 * If there are no more matching records in the given direction, we drop all
 * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
 */
static bool
_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Relation	rel;
	Page		page;
	BTPageOpaque opaque;
	bool		status;

	rel = scan->indexRelation;

	if (ScanDirectionIsForward(dir))
	{
		for (;;)
		{
			/*
			 * if we're at end of scan, give up and mark parallel scan as
			 * done, so that all the workers can finish their scan
			 */
			if (blkno == P_NONE || !so->currPos.moreRight)
			{
				_bt_parallel_done(scan);
				BTScanPosInvalidate(so->currPos);
				return false;
			}
			/* check for interrupts while we're not holding any buffer lock */
			CHECK_FOR_INTERRUPTS();
			/* step right one page */
			so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
			page = BufferGetPage(so->currPos.buf);
			TestForOldSnapshot(scan->xs_snapshot, rel, page);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
			/* check for deleted page */
			if (!P_IGNORE(opaque))
			{
				PredicateLockPage(rel, blkno, scan->xs_snapshot);
				/* see if there are any matches on this page */
				/* note that this will clear moreRight if we can stop */
				if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
					break;
			}
			else if (scan->parallel_scan != NULL)
			{
				/* allow next page to be processed by parallel worker */
				_bt_parallel_release(scan, opaque->btpo_next);
			}

			/* nope, keep going */
			if (scan->parallel_scan != NULL)
			{
				_bt_relbuf(rel, so->currPos.buf);
				status = _bt_parallel_seize(scan, &blkno);
				if (!status)
				{
					BTScanPosInvalidate(so->currPos);
					return false;
				}
			}
			else
			{
				blkno = opaque->btpo_next;
				_bt_relbuf(rel, so->currPos.buf);
			}
		}
	}
	else
	{
		/*
		 * Should only happen in parallel cases, when some other backend
		 * advanced the scan.
		 */
		if (so->currPos.currPage != blkno)
		{
			BTScanPosUnpinIfPinned(so->currPos);
			so->currPos.currPage = blkno;
		}

		/*
		 * Walk left to the next page with data.  This is much more complex
		 * than the walk-right case because of the possibility that the page
		 * to our left splits while we are in flight to it, plus the
		 * possibility that the page we were on gets deleted after we leave
		 * it.  See nbtree/README for details.
		 *
		 * It might be possible to rearrange this code to have less overhead
		 * in pinning and locking, but that would require capturing the left
		 * pointer when the page is initially read, and using it here, along
		 * with big changes to _bt_walk_left() and the code below.  It is not
		 * clear whether this would be a win, since if the page immediately to
		 * the left splits after we read this page and before we step left, we
		 * would need to visit more pages than with the current code.
		 *
		 * Note that if we change the code so that we drop the pin for a scan
		 * which uses a non-MVCC snapshot, we will need to modify the code for
		 * walking left, to allow for the possibility that a referenced page
		 * has been deleted.  As long as the buffer is pinned or the snapshot
		 * is MVCC the page cannot move past the half-dead state to fully
		 * deleted.
		 */
		if (BTScanPosIsPinned(so->currPos))
			_bt_lockbuf(rel, so->currPos.buf, BT_READ);
		else
			so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);

		for (;;)
		{
			/* Done if we know there are no matching keys to the left */
			if (!so->currPos.moreLeft)
			{
				_bt_relbuf(rel, so->currPos.buf);
				_bt_parallel_done(scan);
				BTScanPosInvalidate(so->currPos);
				return false;
			}

			/* Step to next physical page */
			so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
											scan->xs_snapshot);

			/* if we're physically at end of index, return failure */
			if (so->currPos.buf == InvalidBuffer)
			{
				_bt_parallel_done(scan);
				BTScanPosInvalidate(so->currPos);
				return false;
			}

			/*
			 * Okay, we managed to move left to a non-deleted page.  Done if
			 * it's not half-dead and contains matching tuples.  Else loop
			 * back and do it all again.
			 */
			page = BufferGetPage(so->currPos.buf);
			TestForOldSnapshot(scan->xs_snapshot, rel, page);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
			if (!P_IGNORE(opaque))
			{
				PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
				/* see if there are any matches on this page */
				/* note that this will clear moreLeft if we can stop */
				if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
					break;
			}
			else if (scan->parallel_scan != NULL)
			{
				/* allow next page to be processed by parallel worker */
				_bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
			}

			/*
			 * For parallel scans, get the last page scanned as it is quite
			 * possible that by the time we try to seize the scan, some other
			 * worker has already advanced the scan to a different page.  We
			 * must continue based on the latest page scanned by any worker.
			 */
			if (scan->parallel_scan != NULL)
			{
				_bt_relbuf(rel, so->currPos.buf);
				status = _bt_parallel_seize(scan, &blkno);
				if (!status)
				{
					BTScanPosInvalidate(so->currPos);
					return false;
				}
				so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
			}
		}
	}

	return true;
}

/*
 *	_bt_parallel_readpage() -- Read current page containing valid data for scan
 *
 * On success, we release the lock and maybe the pin on the buffer.  We
 * return true to indicate success.
 */
static bool
_bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	_bt_initialize_more_data(so, dir);

	if (!_bt_readnextpage(scan, blkno, dir))
		return false;

	/* Drop the lock, and maybe the pin, on the current page */
	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);

	return true;
}

/*
 * _bt_walk_left() -- step left one page, if possible
 *
 * The given buffer must be pinned and read-locked.  This will be dropped
 * before stepping left.  On return, we have pin and read lock on the
 * returned page, instead.
 *
 * Returns InvalidBuffer if there is no page to the left (no lock is held
 * in that case).
 *
 * When working on a non-leaf level, it is possible for the returned page
 * to be half-dead; the caller should check that condition and step left
 * again if it's important.
 */
static Buffer
_bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
{
	Page		page;
	BTPageOpaque opaque;

	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	for (;;)
	{
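		/*
		 * Loop invariant: at the top of the loop, buf is pinned and
		 * read-locked, and page/opaque describe its contents.
		 */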
		BlockNumber obknum;
		BlockNumber lblkno;
		BlockNumber blkno;
		int			tries;

		/* if we're at end of tree, release buf and return failure */
		if (P_LEFTMOST(opaque))
		{
			_bt_relbuf(rel, buf);
			break;
		}
		/* remember original page we are stepping left from */
		obknum = BufferGetBlockNumber(buf);
		/* step left */
		blkno = lblkno = opaque->btpo_prev;
		_bt_relbuf(rel, buf);
		/* check for interrupts while we're not holding any buffer lock */
		CHECK_FOR_INTERRUPTS();
		buf = _bt_getbuf(rel, blkno, BT_READ);
		page = BufferGetPage(buf);
		TestForOldSnapshot(snapshot, rel, page);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		/*
		 * If this isn't the page we want, walk right till we find what we
		 * want --- but go no more than four hops (an arbitrary limit).  If we
		 * don't find the correct page by then, the most likely bet is that
		 * the original page got deleted and isn't in the sibling chain at all
		 * anymore, not that its left sibling got split more than four times.
		 *
		 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
		 * because half-dead pages are still in the sibling chain.  Caller
		 * must reject half-dead pages if wanted.
		 */
		tries = 0;
		for (;;)
		{
			if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
			{
				/* Found desired page, return it */
				return buf;
			}
			if (P_RIGHTMOST(opaque) || ++tries > 4)
				break;
			blkno = opaque->btpo_next;
			buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
			page = BufferGetPage(buf);
			TestForOldSnapshot(snapshot, rel, page);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		}

		/* Return to the original page to see what's up */
		buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
		page = BufferGetPage(buf);
		TestForOldSnapshot(snapshot, rel, page);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (P_ISDELETED(opaque))
		{
			/*
			 * It was deleted.  Move right to first nondeleted page (there
			 * must be one); that is the page that has acquired the deleted
			 * one's keyspace, so stepping left from it will take us where we
			 * want to be.
			 */
			for (;;)
			{
				if (P_RIGHTMOST(opaque))
					elog(ERROR, "fell off the end of index \"%s\"",
						 RelationGetRelationName(rel));
				blkno = opaque->btpo_next;
				buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
				page = BufferGetPage(buf);
				TestForOldSnapshot(snapshot, rel, page);
				opaque = (BTPageOpaque) PageGetSpecialPointer(page);
				if (!P_ISDELETED(opaque))
					break;
			}

			/*
			 * Now return to top of loop, resetting obknum to point to this
			 * nondeleted page, and try again.
			 */
		}
		else
		{
			/*
			 * It wasn't deleted; the explanation had better be that the page
			 * to the left got split or deleted.  Without this check, we'd go
			 * into an infinite loop if there's anything wrong.
			 */
			if (opaque->btpo_prev == lblkno)
				elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
					 obknum, RelationGetRelationName(rel));
			/* Okay to try again with new lblkno value */
		}
	}

	return InvalidBuffer;
}

/*
 * _bt_get_endpoint() -- Find the first or last page on a given tree level
 *
 * If the index is empty, we will return InvalidBuffer; any other failure
 * condition causes ereport().  We will not return a dead page.
 *
 * The returned buffer is pinned and read-locked.
 */
Buffer
_bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
				 Snapshot snapshot)
{
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber offnum;
	BlockNumber blkno;
	IndexTuple	itup;

	/*
	 * If we are looking for a leaf page, okay to descend from fast root;
	 * otherwise better descend from true root.  (There is no point in being
	 * smarter about intermediate levels.)
	 */
	if (level == 0)
		buf = _bt_getroot(rel, BT_READ);
	else
		buf = _bt_gettrueroot(rel);

	if (!BufferIsValid(buf))
		return InvalidBuffer;

	page = BufferGetPage(buf);
	TestForOldSnapshot(snapshot, rel, page);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	for (;;)
	{
		/*
		 * If we landed on a deleted page, step right to find a live page
		 * (there must be one).  Also, if we want the rightmost page, step
		 * right if needed to get to it (this could happen if the page split
		 * since we obtained a pointer to it).
		 */
		while (P_IGNORE(opaque) ||
			   (rightmost && !P_RIGHTMOST(opaque)))
		{
			blkno = opaque->btpo_next;
			if (blkno == P_NONE)
				elog(ERROR, "fell off the end of index \"%s\"",
					 RelationGetRelationName(rel));
			buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
			page = BufferGetPage(buf);
			TestForOldSnapshot(snapshot, rel, page);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		}

		/* Done? */
		if (opaque->btpo_level == level)
			break;
		if (opaque->btpo_level < level)
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg_internal("btree level %u not found in index \"%s\"",
									 level, RelationGetRelationName(rel))));

		/* Descend to leftmost or rightmost child page */
		if (rightmost)
			offnum = PageGetMaxOffsetNumber(page);
		else
			offnum = P_FIRSTDATAKEY(opaque);

		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
		blkno = BTreeTupleGetDownLink(itup);

		buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
		page = BufferGetPage(buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	}

	return buf;
}

/*
 *	_bt_endpoint() -- Find the first or last page in the index, and scan
 * from there to the first key satisfying all the quals.
 *
 * This is used by _bt_first() to set up a scan when we've determined
 * that the scan must start at the beginning or end of the index (for
 * a forward or backward scan respectively).  Exit conditions are the
 * same as for _bt_first().
 */
static bool
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber start;
	BTScanPosItem *currItem;

	/*
	 * Scan down to the leftmost or rightmost leaf page.  This is a simplified
	 * version of _bt_search().  We don't maintain a stack since we know we
	 * won't need it.
	 */
	buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);

	if (!BufferIsValid(buf))
	{
		/*
		 * Empty index.  Lock the whole relation, as nothing finer to lock
		 * exists.
		 */
		PredicateLockRelation(rel, scan->xs_snapshot);
		BTScanPosInvalidate(so->currPos);
		return false;
	}

	PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	Assert(P_ISLEAF(opaque));

	if (ScanDirectionIsForward(dir))
	{
		/* There could be dead pages to the left, so not this: */
		/* Assert(P_LEFTMOST(opaque)); */

		start = P_FIRSTDATAKEY(opaque);
	}
	else if (ScanDirectionIsBackward(dir))
	{
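		/*
		 * _bt_get_endpoint stepped right past any concurrent splits, so this
		 * must be the true rightmost page.
		 */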
		Assert(P_RIGHTMOST(opaque));

		start = PageGetMaxOffsetNumber(page);
	}
	else
	{
		elog(ERROR, "invalid scan direction: %d", (int) dir);
		start = 0;				/* keep compiler quiet */
	}

	/* remember which buffer we have pinned */
	so->currPos.buf = buf;

	_bt_initialize_more_data(so, dir);

	/*
	 * Now load data from the first page of the scan.
	 */
	if (!_bt_readpage(scan, dir, start))
	{
		/*
		 * There's no actually-matching data on this page.  Try to advance to
		 * the next page.  Return false if there's no matching data at all.
		 */
		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
		if (!_bt_steppage(scan, dir))
			return false;
	}
	else
	{
		/* Drop the lock, and maybe the pin, on the current page */
		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
	}

	/* OK, itemIndex says what to return */
	currItem = &so->currPos.items[so->currPos.itemIndex];
	scan->xs_heaptid = currItem->heapTid;
	if (scan->xs_want_itup)
		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

	return true;
}

/*
 * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
 * for scan direction
 */
static inline void
_bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
{
	/* initialize moreLeft/moreRight appropriately for scan direction */
	if (ScanDirectionIsForward(dir))
	{
		so->currPos.moreLeft = false;
		so->currPos.moreRight = true;
	}
	else
	{
		so->currPos.moreLeft = true;
		so->currPos.moreRight = false;
	}
	so->numKilled = 0;			/* just paranoia */
	so->markItemIndex = -1;		/* ditto */
}
