1 /*-------------------------------------------------------------------------
2  *
3  * nbtsearch.c
4  *	  Search code for postgres btrees.
5  *
6  *
7  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  *	  src/backend/access/nbtree/nbtsearch.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #include "postgres.h"
17 
18 #include "access/nbtree.h"
19 #include "access/relscan.h"
20 #include "miscadmin.h"
21 #include "pgstat.h"
22 #include "storage/predicate.h"
23 #include "utils/lsyscache.h"
24 #include "utils/rel.h"
25 
26 
27 static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
28 static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
29 static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
30 						 OffsetNumber offnum);
31 static void _bt_saveitem(BTScanOpaque so, int itemIndex,
32 						 OffsetNumber offnum, IndexTuple itup);
33 static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
34 static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
35 static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
36 								  ScanDirection dir);
37 static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
38 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
39 static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
40 
41 
42 /*
43  *	_bt_drop_lock_and_maybe_pin()
44  *
45  * Unlock the buffer; and if it is safe to release the pin, do that, too.  It
46  * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
47  * This will prevent vacuum from stalling in a blocked state trying to read a
48  * page when a cursor is sitting on it -- at least in many important cases.
49  *
50  * Set the buffer to invalid if the pin is released, since the buffer may be
51  * re-used.  If we need to go back to this block (for example, to apply
52  * LP_DEAD hints) we must get a fresh reference to the buffer.  Hopefully it
53  * will remain in shared memory for as long as it takes to scan the index
54  * buffer page.
55  */
56 static void
_bt_drop_lock_and_maybe_pin(IndexScanDesc scan,BTScanPos sp)57 _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
58 {
59 	LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
60 
61 	if (IsMVCCSnapshot(scan->xs_snapshot) &&
62 		RelationNeedsWAL(scan->indexRelation) &&
63 		!scan->xs_want_itup)
64 	{
65 		ReleaseBuffer(sp->buf);
66 		sp->buf = InvalidBuffer;
67 	}
68 }
69 
70 /*
71  *	_bt_search() -- Search the tree for a particular scankey,
72  *		or more precisely for the first leaf page it could be on.
73  *
74  * The passed scankey is an insertion-type scankey (see nbtree/README),
75  * but it can omit the rightmost column(s) of the index.
76  *
77  * Return value is a stack of parent-page pointers.  *bufP is set to the
78  * address of the leaf-page buffer, which is read-locked and pinned.
79  * No locks are held on the parent pages, however!
80  *
81  * If the snapshot parameter is not NULL, "old snapshot" checking will take
82  * place during the descent through the tree.  This is not needed when
83  * positioning for an insert or delete, so NULL is used for those cases.
84  *
85  * The returned buffer is locked according to access parameter.  Additionally,
86  * access = BT_WRITE will allow an empty root page to be created and returned.
87  * When access = BT_READ, an empty index will result in *bufP being set to
88  * InvalidBuffer.  Also, in BT_WRITE mode, any incomplete splits encountered
89  * during the search will be finished.
90  */
91 BTStack
_bt_search(Relation rel,BTScanInsert key,Buffer * bufP,int access,Snapshot snapshot)92 _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
93 		   Snapshot snapshot)
94 {
95 	BTStack		stack_in = NULL;
96 	int			page_access = BT_READ;
97 
98 	/* Get the root page to start with */
99 	*bufP = _bt_getroot(rel, access);
100 
101 	/* If index is empty and access = BT_READ, no root page is created. */
102 	if (!BufferIsValid(*bufP))
103 		return (BTStack) NULL;
104 
105 	/* Loop iterates once per level descended in the tree */
106 	for (;;)
107 	{
108 		Page		page;
109 		BTPageOpaque opaque;
110 		OffsetNumber offnum;
111 		ItemId		itemid;
112 		IndexTuple	itup;
113 		BlockNumber blkno;
114 		BlockNumber par_blkno;
115 		BTStack		new_stack;
116 
117 		/*
118 		 * Race -- the page we just grabbed may have split since we read its
119 		 * pointer in the parent (or metapage).  If it has, we may need to
120 		 * move right to its new sibling.  Do that.
121 		 *
122 		 * In write-mode, allow _bt_moveright to finish any incomplete splits
123 		 * along the way.  Strictly speaking, we'd only need to finish an
124 		 * incomplete split on the leaf page we're about to insert to, not on
125 		 * any of the upper levels (they are taken care of in _bt_getstackbuf,
126 		 * if the leaf page is split and we insert to the parent page).  But
127 		 * this is a good opportunity to finish splits of internal pages too.
128 		 */
129 		*bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in,
130 							  page_access, snapshot);
131 
132 		/* if this is a leaf page, we're done */
133 		page = BufferGetPage(*bufP);
134 		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
135 		if (P_ISLEAF(opaque))
136 			break;
137 
138 		/*
139 		 * Find the appropriate item on the internal page, and get the child
140 		 * page that it points to.
141 		 */
142 		offnum = _bt_binsrch(rel, key, *bufP);
143 		itemid = PageGetItemId(page, offnum);
144 		itup = (IndexTuple) PageGetItem(page, itemid);
145 		blkno = BTreeInnerTupleGetDownLink(itup);
146 		par_blkno = BufferGetBlockNumber(*bufP);
147 
148 		/*
149 		 * We need to save the location of the index entry we chose in the
150 		 * parent page on a stack. In case we split the tree, we'll use the
151 		 * stack to work back up to the parent page.  We also save the actual
152 		 * downlink (block) to uniquely identify the index entry, in case it
153 		 * moves right while we're working lower in the tree.  See the paper
154 		 * by Lehman and Yao for how this is detected and handled. (We use the
155 		 * child link during the second half of a page split -- if caller ends
156 		 * up splitting the child it usually ends up inserting a new pivot
157 		 * tuple for child's new right sibling immediately after the original
158 		 * bts_offset offset recorded here.  The downlink block will be needed
159 		 * to check if bts_offset remains the position of this same pivot
160 		 * tuple.)
161 		 */
162 		new_stack = (BTStack) palloc(sizeof(BTStackData));
163 		new_stack->bts_blkno = par_blkno;
164 		new_stack->bts_offset = offnum;
165 		new_stack->bts_btentry = blkno;
166 		new_stack->bts_parent = stack_in;
167 
168 		/*
169 		 * Page level 1 is lowest non-leaf page level prior to leaves.  So, if
170 		 * we're on the level 1 and asked to lock leaf page in write mode,
171 		 * then lock next page in write mode, because it must be a leaf.
172 		 */
173 		if (opaque->btpo.level == 1 && access == BT_WRITE)
174 			page_access = BT_WRITE;
175 
176 		/* drop the read lock on the parent page, acquire one on the child */
177 		*bufP = _bt_relandgetbuf(rel, *bufP, blkno, page_access);
178 
179 		/* okay, all set to move down a level */
180 		stack_in = new_stack;
181 	}
182 
183 	/*
184 	 * If we're asked to lock leaf in write mode, but didn't manage to, then
185 	 * relock.  This should only happen when the root page is a leaf page (and
186 	 * the only page in the index other than the metapage).
187 	 */
188 	if (access == BT_WRITE && page_access == BT_READ)
189 	{
190 		/* trade in our read lock for a write lock */
191 		LockBuffer(*bufP, BUFFER_LOCK_UNLOCK);
192 		LockBuffer(*bufP, BT_WRITE);
193 
194 		/*
195 		 * If the page was split between the time that we surrendered our read
196 		 * lock and acquired our write lock, then this page may no longer be
197 		 * the right place for the key we want to insert.  In this case, we
198 		 * need to move right in the tree.  See Lehman and Yao for an
199 		 * excruciatingly precise description.
200 		 */
201 		*bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
202 							  snapshot);
203 	}
204 
205 	return stack_in;
206 }
207 
208 /*
209  *	_bt_moveright() -- move right in the btree if necessary.
210  *
211  * When we follow a pointer to reach a page, it is possible that
212  * the page has changed in the meanwhile.  If this happens, we're
213  * guaranteed that the page has "split right" -- that is, that any
214  * data that appeared on the page originally is either on the page
215  * or strictly to the right of it.
216  *
217  * This routine decides whether or not we need to move right in the
218  * tree by examining the high key entry on the page.  If that entry is
219  * strictly less than the scankey, or <= the scankey in the
220  * key.nextkey=true case, then we followed the wrong link and we need
221  * to move right.
222  *
223  * The passed insertion-type scankey can omit the rightmost column(s) of the
224  * index. (see nbtree/README)
225  *
226  * When key.nextkey is false (the usual case), we are looking for the first
227  * item >= key.  When key.nextkey is true, we are looking for the first item
228  * strictly greater than key.
229  *
230  * If forupdate is true, we will attempt to finish any incomplete splits
231  * that we encounter.  This is required when locking a target page for an
232  * insertion, because we don't allow inserting on a page before the split
233  * is completed.  'stack' is only used if forupdate is true.
234  *
235  * On entry, we have the buffer pinned and a lock of the type specified by
236  * 'access'.  If we move right, we release the buffer and lock and acquire
237  * the same on the right sibling.  Return value is the buffer we stop at.
238  *
239  * If the snapshot parameter is not NULL, "old snapshot" checking will take
240  * place during the descent through the tree.  This is not needed when
241  * positioning for an insert or delete, so NULL is used for those cases.
242  */
243 Buffer
_bt_moveright(Relation rel,BTScanInsert key,Buffer buf,bool forupdate,BTStack stack,int access,Snapshot snapshot)244 _bt_moveright(Relation rel,
245 			  BTScanInsert key,
246 			  Buffer buf,
247 			  bool forupdate,
248 			  BTStack stack,
249 			  int access,
250 			  Snapshot snapshot)
251 {
252 	Page		page;
253 	BTPageOpaque opaque;
254 	int32		cmpval;
255 
256 	/*
257 	 * When nextkey = false (normal case): if the scan key that brought us to
258 	 * this page is > the high key stored on the page, then the page has split
259 	 * and we need to move right.  (pg_upgrade'd !heapkeyspace indexes could
260 	 * have some duplicates to the right as well as the left, but that's
261 	 * something that's only ever dealt with on the leaf level, after
262 	 * _bt_search has found an initial leaf page.)
263 	 *
264 	 * When nextkey = true: move right if the scan key is >= page's high key.
265 	 * (Note that key.scantid cannot be set in this case.)
266 	 *
267 	 * The page could even have split more than once, so scan as far as
268 	 * needed.
269 	 *
270 	 * We also have to move right if we followed a link that brought us to a
271 	 * dead page.
272 	 */
273 	cmpval = key->nextkey ? 0 : 1;
274 
275 	for (;;)
276 	{
277 		page = BufferGetPage(buf);
278 		TestForOldSnapshot(snapshot, rel, page);
279 		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
280 
281 		if (P_RIGHTMOST(opaque))
282 			break;
283 
284 		/*
285 		 * Finish any incomplete splits we encounter along the way.
286 		 */
287 		if (forupdate && P_INCOMPLETE_SPLIT(opaque))
288 		{
289 			BlockNumber blkno = BufferGetBlockNumber(buf);
290 
291 			/* upgrade our lock if necessary */
292 			if (access == BT_READ)
293 			{
294 				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
295 				LockBuffer(buf, BT_WRITE);
296 			}
297 
298 			if (P_INCOMPLETE_SPLIT(opaque))
299 				_bt_finish_split(rel, buf, stack);
300 			else
301 				_bt_relbuf(rel, buf);
302 
303 			/* re-acquire the lock in the right mode, and re-check */
304 			buf = _bt_getbuf(rel, blkno, access);
305 			continue;
306 		}
307 
308 		if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
309 		{
310 			/* step right one page */
311 			buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
312 			continue;
313 		}
314 		else
315 			break;
316 	}
317 
318 	if (P_IGNORE(opaque))
319 		elog(ERROR, "fell off the end of index \"%s\"",
320 			 RelationGetRelationName(rel));
321 
322 	return buf;
323 }
324 
325 /*
326  *	_bt_binsrch() -- Do a binary search for a key on a particular page.
327  *
328  * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
329  * key >= given scankey, or > scankey if nextkey is true.  (NOTE: in
330  * particular, this means it is possible to return a value 1 greater than the
331  * number of keys on the page, if the scankey is > all keys on the page.)
332  *
333  * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
334  * of the last key < given scankey, or last key <= given scankey if nextkey
335  * is true.  (Since _bt_compare treats the first data key of such a page as
336  * minus infinity, there will be at least one key < scankey, so the result
337  * always points at one of the keys on the page.)  This key indicates the
338  * right place to descend to be sure we find all leaf keys >= given scankey
339  * (or leaf keys > given scankey when nextkey is true).
340  *
341  * This procedure is not responsible for walking right, it just examines
342  * the given page.  _bt_binsrch() has no lock or refcount side effects
343  * on the buffer.
344  */
345 static OffsetNumber
_bt_binsrch(Relation rel,BTScanInsert key,Buffer buf)346 _bt_binsrch(Relation rel,
347 			BTScanInsert key,
348 			Buffer buf)
349 {
350 	Page		page;
351 	BTPageOpaque opaque;
352 	OffsetNumber low,
353 				high;
354 	int32		result,
355 				cmpval;
356 
357 	/* Requesting nextkey semantics while using scantid seems nonsensical */
358 	Assert(!key->nextkey || key->scantid == NULL);
359 
360 	page = BufferGetPage(buf);
361 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
362 
363 	low = P_FIRSTDATAKEY(opaque);
364 	high = PageGetMaxOffsetNumber(page);
365 
366 	/*
367 	 * If there are no keys on the page, return the first available slot. Note
368 	 * this covers two cases: the page is really empty (no keys), or it
369 	 * contains only a high key.  The latter case is possible after vacuuming.
370 	 * This can never happen on an internal page, however, since they are
371 	 * never empty (an internal page must have children).
372 	 */
373 	if (unlikely(high < low))
374 		return low;
375 
376 	/*
377 	 * Binary search to find the first key on the page >= scan key, or first
378 	 * key > scankey when nextkey is true.
379 	 *
380 	 * For nextkey=false (cmpval=1), the loop invariant is: all slots before
381 	 * 'low' are < scan key, all slots at or after 'high' are >= scan key.
382 	 *
383 	 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
384 	 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
385 	 *
386 	 * We can fall out when high == low.
387 	 */
388 	high++;						/* establish the loop invariant for high */
389 
390 	cmpval = key->nextkey ? 0 : 1;	/* select comparison value */
391 
392 	while (high > low)
393 	{
394 		OffsetNumber mid = low + ((high - low) / 2);
395 
396 		/* We have low <= mid < high, so mid points at a real slot */
397 
398 		result = _bt_compare(rel, key, page, mid);
399 
400 		if (result >= cmpval)
401 			low = mid + 1;
402 		else
403 			high = mid;
404 	}
405 
406 	/*
407 	 * At this point we have high == low, but be careful: they could point
408 	 * past the last slot on the page.
409 	 *
410 	 * On a leaf page, we always return the first key >= scan key (resp. >
411 	 * scan key), which could be the last slot + 1.
412 	 */
413 	if (P_ISLEAF(opaque))
414 		return low;
415 
416 	/*
417 	 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
418 	 * There must be one if _bt_compare() is playing by the rules.
419 	 */
420 	Assert(low > P_FIRSTDATAKEY(opaque));
421 
422 	return OffsetNumberPrev(low);
423 }
424 
425 /*
426  *
427  *	_bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
428  *
429  * Like _bt_binsrch(), but with support for caching the binary search
430  * bounds.  Only used during insertion, and only on the leaf page that it
431  * looks like caller will insert tuple on.  Exclusive-locked and pinned
432  * leaf page is contained within insertstate.
433  *
434  * Caches the bounds fields in insertstate so that a subsequent call can
435  * reuse the low and strict high bounds of original binary search.  Callers
436  * that use these fields directly must be prepared for the case where low
437  * and/or stricthigh are not on the same page (one or both exceed maxoff
438  * for the page).  The case where there are no items on the page (high <
439  * low) makes bounds invalid.
440  *
441  * Caller is responsible for invalidating bounds when it modifies the page
442  * before calling here a second time.
443  */
444 OffsetNumber
_bt_binsrch_insert(Relation rel,BTInsertState insertstate)445 _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
446 {
447 	BTScanInsert key = insertstate->itup_key;
448 	Page		page;
449 	BTPageOpaque opaque;
450 	OffsetNumber low,
451 				high,
452 				stricthigh;
453 	int32		result,
454 				cmpval;
455 
456 	page = BufferGetPage(insertstate->buf);
457 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
458 
459 	Assert(P_ISLEAF(opaque));
460 	Assert(!key->nextkey);
461 
462 	if (!insertstate->bounds_valid)
463 	{
464 		/* Start new binary search */
465 		low = P_FIRSTDATAKEY(opaque);
466 		high = PageGetMaxOffsetNumber(page);
467 	}
468 	else
469 	{
470 		/* Restore result of previous binary search against same page */
471 		low = insertstate->low;
472 		high = insertstate->stricthigh;
473 	}
474 
475 	/* If there are no keys on the page, return the first available slot */
476 	if (unlikely(high < low))
477 	{
478 		/* Caller can't reuse bounds */
479 		insertstate->low = InvalidOffsetNumber;
480 		insertstate->stricthigh = InvalidOffsetNumber;
481 		insertstate->bounds_valid = false;
482 		return low;
483 	}
484 
485 	/*
486 	 * Binary search to find the first key on the page >= scan key. (nextkey
487 	 * is always false when inserting).
488 	 *
489 	 * The loop invariant is: all slots before 'low' are < scan key, all slots
490 	 * at or after 'high' are >= scan key.  'stricthigh' is > scan key, and is
491 	 * maintained to save additional search effort for caller.
492 	 *
493 	 * We can fall out when high == low.
494 	 */
495 	if (!insertstate->bounds_valid)
496 		high++;					/* establish the loop invariant for high */
497 	stricthigh = high;			/* high initially strictly higher */
498 
499 	cmpval = 1;					/* !nextkey comparison value */
500 
501 	while (high > low)
502 	{
503 		OffsetNumber mid = low + ((high - low) / 2);
504 
505 		/* We have low <= mid < high, so mid points at a real slot */
506 
507 		result = _bt_compare(rel, key, page, mid);
508 
509 		if (result >= cmpval)
510 			low = mid + 1;
511 		else
512 		{
513 			high = mid;
514 			if (result != 0)
515 				stricthigh = high;
516 		}
517 	}
518 
519 	/*
520 	 * On a leaf page, a binary search always returns the first key >= scan
521 	 * key (at least in !nextkey case), which could be the last slot + 1. This
522 	 * is also the lower bound of cached search.
523 	 *
524 	 * stricthigh may also be the last slot + 1, which prevents caller from
525 	 * using bounds directly, but is still useful to us if we're called a
526 	 * second time with cached bounds (cached low will be < stricthigh when
527 	 * that happens).
528 	 */
529 	insertstate->low = low;
530 	insertstate->stricthigh = stricthigh;
531 	insertstate->bounds_valid = true;
532 
533 	return low;
534 }
535 
536 /*----------
537  *	_bt_compare() -- Compare insertion-type scankey to tuple on a page.
538  *
539  *	page/offnum: location of btree item to be compared to.
540  *
541  *		This routine returns:
542  *			<0 if scankey < tuple at offnum;
543  *			 0 if scankey == tuple at offnum;
544  *			>0 if scankey > tuple at offnum.
545  *		NULLs in the keys are treated as sortable values.  Therefore
546  *		"equality" does not necessarily mean that the item should be
547  *		returned to the caller as a matching key!
548  *
549  * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
550  * "minus infinity": this routine will always claim it is less than the
551  * scankey.  The actual key value stored is explicitly truncated to 0
552  * attributes (explicitly minus infinity) with version 3+ indexes, but
553  * that isn't relied upon.  This allows us to implement the Lehman and
554  * Yao convention that the first down-link pointer is before the first
555  * key.  See backend/access/nbtree/README for details.
556  *----------
557  */
558 int32
_bt_compare(Relation rel,BTScanInsert key,Page page,OffsetNumber offnum)559 _bt_compare(Relation rel,
560 			BTScanInsert key,
561 			Page page,
562 			OffsetNumber offnum)
563 {
564 	TupleDesc	itupdesc = RelationGetDescr(rel);
565 	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
566 	IndexTuple	itup;
567 	ItemPointer heapTid;
568 	ScanKey		scankey;
569 	int			ncmpkey;
570 	int			ntupatts;
571 
572 	Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
573 	Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
574 	Assert(key->heapkeyspace || key->scantid == NULL);
575 
576 	/*
577 	 * Force result ">" if target item is first data item on an internal page
578 	 * --- see NOTE above.
579 	 */
580 	if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
581 		return 1;
582 
583 	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
584 	ntupatts = BTreeTupleGetNAtts(itup, rel);
585 
586 	/*
587 	 * The scan key is set up with the attribute number associated with each
588 	 * term in the key.  It is important that, if the index is multi-key, the
589 	 * scan contain the first k key attributes, and that they be in order.  If
590 	 * you think about how multi-key ordering works, you'll understand why
591 	 * this is.
592 	 *
593 	 * We don't test for violation of this condition here, however.  The
594 	 * initial setup for the index scan had better have gotten it right (see
595 	 * _bt_first).
596 	 */
597 
598 	ncmpkey = Min(ntupatts, key->keysz);
599 	Assert(key->heapkeyspace || ncmpkey == key->keysz);
600 	scankey = key->scankeys;
601 	for (int i = 1; i <= ncmpkey; i++)
602 	{
603 		Datum		datum;
604 		bool		isNull;
605 		int32		result;
606 
607 		datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
608 
609 		/* see comments about NULLs handling in btbuild */
610 		if (scankey->sk_flags & SK_ISNULL)	/* key is NULL */
611 		{
612 			if (isNull)
613 				result = 0;		/* NULL "=" NULL */
614 			else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
615 				result = -1;	/* NULL "<" NOT_NULL */
616 			else
617 				result = 1;		/* NULL ">" NOT_NULL */
618 		}
619 		else if (isNull)		/* key is NOT_NULL and item is NULL */
620 		{
621 			if (scankey->sk_flags & SK_BT_NULLS_FIRST)
622 				result = 1;		/* NOT_NULL ">" NULL */
623 			else
624 				result = -1;	/* NOT_NULL "<" NULL */
625 		}
626 		else
627 		{
628 			/*
629 			 * The sk_func needs to be passed the index value as left arg and
630 			 * the sk_argument as right arg (they might be of different
631 			 * types).  Since it is convenient for callers to think of
632 			 * _bt_compare as comparing the scankey to the index item, we have
633 			 * to flip the sign of the comparison result.  (Unless it's a DESC
634 			 * column, in which case we *don't* flip the sign.)
635 			 */
636 			result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
637 													 scankey->sk_collation,
638 													 datum,
639 													 scankey->sk_argument));
640 
641 			if (!(scankey->sk_flags & SK_BT_DESC))
642 				INVERT_COMPARE_RESULT(result);
643 		}
644 
645 		/* if the keys are unequal, return the difference */
646 		if (result != 0)
647 			return result;
648 
649 		scankey++;
650 	}
651 
652 	/*
653 	 * All non-truncated attributes (other than heap TID) were found to be
654 	 * equal.  Treat truncated attributes as minus infinity when scankey has a
655 	 * key attribute value that would otherwise be compared directly.
656 	 *
657 	 * Note: it doesn't matter if ntupatts includes non-key attributes;
658 	 * scankey won't, so explicitly excluding non-key attributes isn't
659 	 * necessary.
660 	 */
661 	if (key->keysz > ntupatts)
662 		return 1;
663 
664 	/*
665 	 * Use the heap TID attribute and scantid to try to break the tie.  The
666 	 * rules are the same as any other key attribute -- only the
667 	 * representation differs.
668 	 */
669 	heapTid = BTreeTupleGetHeapTID(itup);
670 	if (key->scantid == NULL)
671 	{
672 		/*
673 		 * Most searches have a scankey that is considered greater than a
674 		 * truncated pivot tuple if and when the scankey has equal values for
675 		 * attributes up to and including the least significant untruncated
676 		 * attribute in tuple.
677 		 *
678 		 * For example, if an index has the minimum two attributes (single
679 		 * user key attribute, plus heap TID attribute), and a page's high key
680 		 * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
681 		 * will not descend to the page to the left.  The search will descend
682 		 * right instead.  The truncated attribute in pivot tuple means that
683 		 * all non-pivot tuples on the page to the left are strictly < 'foo',
684 		 * so it isn't necessary to descend left.  In other words, search
685 		 * doesn't have to descend left because it isn't interested in a match
686 		 * that has a heap TID value of -inf.
687 		 *
688 		 * However, some searches (pivotsearch searches) actually require that
689 		 * we descend left when this happens.  -inf is treated as a possible
690 		 * match for omitted scankey attribute(s).  This is needed by page
691 		 * deletion, which must re-find leaf pages that are targets for
692 		 * deletion using their high keys.
693 		 *
694 		 * Note: the heap TID part of the test ensures that scankey is being
695 		 * compared to a pivot tuple with one or more truncated key
696 		 * attributes.
697 		 *
698 		 * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
699 		 * left here, since they have no heap TID attribute (and cannot have
700 		 * any -inf key values in any case, since truncation can only remove
701 		 * non-key attributes).  !heapkeyspace searches must always be
702 		 * prepared to deal with matches on both sides of the pivot once the
703 		 * leaf level is reached.
704 		 */
705 		if (key->heapkeyspace && !key->pivotsearch &&
706 			key->keysz == ntupatts && heapTid == NULL)
707 			return 1;
708 
709 		/* All provided scankey arguments found to be equal */
710 		return 0;
711 	}
712 
713 	/*
714 	 * Treat truncated heap TID as minus infinity, since scankey has a key
715 	 * attribute value (scantid) that would otherwise be compared directly
716 	 */
717 	Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
718 	if (heapTid == NULL)
719 		return 1;
720 
721 	Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
722 	return ItemPointerCompare(key->scantid, heapTid);
723 }
724 
725 /*
726  *	_bt_first() -- Find the first item in a scan.
727  *
728  *		We need to be clever about the direction of scan, the search
729  *		conditions, and the tree ordering.  We find the first item (or,
730  *		if backwards scan, the last item) in the tree that satisfies the
731  *		qualifications in the scan key.  On success exit, the page containing
732  *		the current index tuple is pinned but not locked, and data about
733  *		the matching tuple(s) on the page has been loaded into so->currPos.
734  *		scan->xs_ctup.t_self is set to the heap TID of the current tuple,
735  *		and if requested, scan->xs_itup points to a copy of the index tuple.
736  *
737  * If there are no matching items in the index, we return false, with no
738  * pins or locks held.
739  *
740  * Note that scan->keyData[], and the so->keyData[] scankey built from it,
741  * are both search-type scankeys (see nbtree/README for more about this).
742  * Within this routine, we build a temporary insertion-type scankey to use
743  * in locating the scan start position.
744  */
745 bool
_bt_first(IndexScanDesc scan,ScanDirection dir)746 _bt_first(IndexScanDesc scan, ScanDirection dir)
747 {
748 	Relation	rel = scan->indexRelation;
749 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
750 	Buffer		buf;
751 	BTStack		stack;
752 	OffsetNumber offnum;
753 	StrategyNumber strat;
754 	bool		nextkey;
755 	bool		goback;
756 	BTScanInsertData inskey;
757 	ScanKey		startKeys[INDEX_MAX_KEYS];
758 	ScanKeyData notnullkeys[INDEX_MAX_KEYS];
759 	int			keysCount = 0;
760 	int			i;
761 	bool		status = true;
762 	StrategyNumber strat_total;
763 	BTScanPosItem *currItem;
764 	BlockNumber blkno;
765 
766 	Assert(!BTScanPosIsValid(so->currPos));
767 
768 	pgstat_count_index_scan(rel);
769 
770 	/*
771 	 * Examine the scan keys and eliminate any redundant keys; also mark the
772 	 * keys that must be matched to continue the scan.
773 	 */
774 	_bt_preprocess_keys(scan);
775 
776 	/*
777 	 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
778 	 * never be satisfied (eg, x == 1 AND x > 2).
779 	 */
780 	if (!so->qual_ok)
781 	{
782 		/* Notify any other workers that we're done with this scan key. */
783 		_bt_parallel_done(scan);
784 		return false;
785 	}
786 
787 	/*
788 	 * For parallel scans, get the starting page from shared state. If the
789 	 * scan has not started, proceed to find out first leaf page in the usual
790 	 * way while keeping other participating processes waiting.  If the scan
791 	 * has already begun, use the page number from the shared structure.
792 	 */
793 	if (scan->parallel_scan != NULL)
794 	{
795 		status = _bt_parallel_seize(scan, &blkno);
796 		if (!status)
797 			return false;
798 		else if (blkno == P_NONE)
799 		{
800 			_bt_parallel_done(scan);
801 			return false;
802 		}
803 		else if (blkno != InvalidBlockNumber)
804 		{
805 			if (!_bt_parallel_readpage(scan, blkno, dir))
806 				return false;
807 			goto readcomplete;
808 		}
809 	}
810 
811 	/*----------
812 	 * Examine the scan keys to discover where we need to start the scan.
813 	 *
814 	 * We want to identify the keys that can be used as starting boundaries;
815 	 * these are =, >, or >= keys for a forward scan or =, <, <= keys for
816 	 * a backwards scan.  We can use keys for multiple attributes so long as
817 	 * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
818 	 * a > or < boundary or find an attribute with no boundary (which can be
819 	 * thought of as the same as "> -infinity"), we can't use keys for any
820 	 * attributes to its right, because it would break our simplistic notion
821 	 * of what initial positioning strategy to use.
822 	 *
823 	 * When the scan keys include cross-type operators, _bt_preprocess_keys
824 	 * may not be able to eliminate redundant keys; in such cases we will
825 	 * arbitrarily pick a usable one for each attribute.  This is correct
826 	 * but possibly not optimal behavior.  (For example, with keys like
827 	 * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
828 	 * x=5 would be more efficient.)  Since the situation only arises given
829 	 * a poorly-worded query plus an incomplete opfamily, live with it.
830 	 *
831 	 * When both equality and inequality keys appear for a single attribute
832 	 * (again, only possible when cross-type operators appear), we *must*
833 	 * select one of the equality keys for the starting point, because
834 	 * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
835 	 * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
836 	 * start at x=4, we will fail and stop before reaching x=10.  If multiple
837 	 * equality quals survive preprocessing, however, it doesn't matter which
838 	 * one we use --- by definition, they are either redundant or
839 	 * contradictory.
840 	 *
841 	 * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
842 	 * If the index stores nulls at the end of the index we'll be starting
843 	 * from, and we have no boundary key for the column (which means the key
844 	 * we deduced NOT NULL from is an inequality key that constrains the other
845 	 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
846 	 * use as a boundary key.  If we didn't do this, we might find ourselves
847 	 * traversing a lot of null entries at the start of the scan.
848 	 *
849 	 * In this loop, row-comparison keys are treated the same as keys on their
850 	 * first (leftmost) columns.  We'll add on lower-order columns of the row
851 	 * comparison below, if possible.
852 	 *
853 	 * The selected scan keys (at most one per index column) are remembered by
854 	 * storing their addresses into the local startKeys[] array.
855 	 *----------
856 	 */
857 	strat_total = BTEqualStrategyNumber;
858 	if (so->numberOfKeys > 0)
859 	{
860 		AttrNumber	curattr;
861 		ScanKey		chosen;
862 		ScanKey		impliesNN;
863 		ScanKey		cur;
864 
865 		/*
866 		 * chosen is the so-far-chosen key for the current attribute, if any.
867 		 * We don't cast the decision in stone until we reach keys for the
868 		 * next attribute.
869 		 */
870 		curattr = 1;
871 		chosen = NULL;
872 		/* Also remember any scankey that implies a NOT NULL constraint */
873 		impliesNN = NULL;
874 
875 		/*
876 		 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
877 		 * pass to handle after-last-key processing.  Actual exit from the
878 		 * loop is at one of the "break" statements below.
879 		 */
880 		for (cur = so->keyData, i = 0;; cur++, i++)
881 		{
882 			if (i >= so->numberOfKeys || cur->sk_attno != curattr)
883 			{
884 				/*
885 				 * Done looking at keys for curattr.  If we didn't find a
886 				 * usable boundary key, see if we can deduce a NOT NULL key.
887 				 */
888 				if (chosen == NULL && impliesNN != NULL &&
889 					((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
890 					 ScanDirectionIsForward(dir) :
891 					 ScanDirectionIsBackward(dir)))
892 				{
893 					/* Yes, so build the key in notnullkeys[keysCount] */
894 					chosen = &notnullkeys[keysCount];
895 					ScanKeyEntryInitialize(chosen,
896 										   (SK_SEARCHNOTNULL | SK_ISNULL |
897 											(impliesNN->sk_flags &
898 											 (SK_BT_DESC | SK_BT_NULLS_FIRST))),
899 										   curattr,
900 										   ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
901 											BTGreaterStrategyNumber :
902 											BTLessStrategyNumber),
903 										   InvalidOid,
904 										   InvalidOid,
905 										   InvalidOid,
906 										   (Datum) 0);
907 				}
908 
909 				/*
910 				 * If we still didn't find a usable boundary key, quit; else
911 				 * save the boundary key pointer in startKeys.
912 				 */
913 				if (chosen == NULL)
914 					break;
915 				startKeys[keysCount++] = chosen;
916 
917 				/*
918 				 * Adjust strat_total, and quit if we have stored a > or <
919 				 * key.
920 				 */
921 				strat = chosen->sk_strategy;
922 				if (strat != BTEqualStrategyNumber)
923 				{
924 					strat_total = strat;
925 					if (strat == BTGreaterStrategyNumber ||
926 						strat == BTLessStrategyNumber)
927 						break;
928 				}
929 
930 				/*
931 				 * Done if that was the last attribute, or if next key is not
932 				 * in sequence (implying no boundary key is available for the
933 				 * next attribute).
934 				 */
935 				if (i >= so->numberOfKeys ||
936 					cur->sk_attno != curattr + 1)
937 					break;
938 
939 				/*
940 				 * Reset for next attr.
941 				 */
942 				curattr = cur->sk_attno;
943 				chosen = NULL;
944 				impliesNN = NULL;
945 			}
946 
947 			/*
948 			 * Can we use this key as a starting boundary for this attr?
949 			 *
950 			 * If not, does it imply a NOT NULL constraint?  (Because
951 			 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
952 			 * *any* inequality key works for that; we need not test.)
953 			 */
954 			switch (cur->sk_strategy)
955 			{
956 				case BTLessStrategyNumber:
957 				case BTLessEqualStrategyNumber:
958 					if (chosen == NULL)
959 					{
960 						if (ScanDirectionIsBackward(dir))
961 							chosen = cur;
962 						else
963 							impliesNN = cur;
964 					}
965 					break;
966 				case BTEqualStrategyNumber:
967 					/* override any non-equality choice */
968 					chosen = cur;
969 					break;
970 				case BTGreaterEqualStrategyNumber:
971 				case BTGreaterStrategyNumber:
972 					if (chosen == NULL)
973 					{
974 						if (ScanDirectionIsForward(dir))
975 							chosen = cur;
976 						else
977 							impliesNN = cur;
978 					}
979 					break;
980 			}
981 		}
982 	}
983 
984 	/*
985 	 * If we found no usable boundary keys, we have to start from one end of
986 	 * the tree.  Walk down that edge to the first or last key, and scan from
987 	 * there.
988 	 */
989 	if (keysCount == 0)
990 	{
991 		bool		match;
992 
993 		match = _bt_endpoint(scan, dir);
994 
995 		if (!match)
996 		{
997 			/* No match, so mark (parallel) scan finished */
998 			_bt_parallel_done(scan);
999 		}
1000 
1001 		return match;
1002 	}
1003 
1004 	/*
1005 	 * We want to start the scan somewhere within the index.  Set up an
1006 	 * insertion scankey we can use to search for the boundary point we
1007 	 * identified above.  The insertion scankey is built using the keys
1008 	 * identified by startKeys[].  (Remaining insertion scankey fields are
1009 	 * initialized after initial-positioning strategy is finalized.)
1010 	 */
1011 	Assert(keysCount <= INDEX_MAX_KEYS);
1012 	for (i = 0; i < keysCount; i++)
1013 	{
1014 		ScanKey		cur = startKeys[i];
1015 
1016 		Assert(cur->sk_attno == i + 1);
1017 
1018 		if (cur->sk_flags & SK_ROW_HEADER)
1019 		{
1020 			/*
1021 			 * Row comparison header: look to the first row member instead.
1022 			 *
1023 			 * The member scankeys are already in insertion format (ie, they
1024 			 * have sk_func = 3-way-comparison function), but we have to watch
1025 			 * out for nulls, which _bt_preprocess_keys didn't check. A null
1026 			 * in the first row member makes the condition unmatchable, just
1027 			 * like qual_ok = false.
1028 			 */
1029 			ScanKey		subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1030 
1031 			Assert(subkey->sk_flags & SK_ROW_MEMBER);
1032 			if (subkey->sk_flags & SK_ISNULL)
1033 			{
1034 				_bt_parallel_done(scan);
1035 				return false;
1036 			}
1037 			memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1038 
1039 			/*
1040 			 * If the row comparison is the last positioning key we accepted,
1041 			 * try to add additional keys from the lower-order row members.
1042 			 * (If we accepted independent conditions on additional index
1043 			 * columns, we use those instead --- doesn't seem worth trying to
1044 			 * determine which is more restrictive.)  Note that this is OK
1045 			 * even if the row comparison is of ">" or "<" type, because the
1046 			 * condition applied to all but the last row member is effectively
1047 			 * ">=" or "<=", and so the extra keys don't break the positioning
1048 			 * scheme.  But, by the same token, if we aren't able to use all
1049 			 * the row members, then the part of the row comparison that we
1050 			 * did use has to be treated as just a ">=" or "<=" condition, and
1051 			 * so we'd better adjust strat_total accordingly.
1052 			 */
1053 			if (i == keysCount - 1)
1054 			{
1055 				bool		used_all_subkeys = false;
1056 
1057 				Assert(!(subkey->sk_flags & SK_ROW_END));
1058 				for (;;)
1059 				{
1060 					subkey++;
1061 					Assert(subkey->sk_flags & SK_ROW_MEMBER);
1062 					if (subkey->sk_attno != keysCount + 1)
1063 						break;	/* out-of-sequence, can't use it */
1064 					if (subkey->sk_strategy != cur->sk_strategy)
1065 						break;	/* wrong direction, can't use it */
1066 					if (subkey->sk_flags & SK_ISNULL)
1067 						break;	/* can't use null keys */
1068 					Assert(keysCount < INDEX_MAX_KEYS);
1069 					memcpy(inskey.scankeys + keysCount, subkey,
1070 						   sizeof(ScanKeyData));
1071 					keysCount++;
1072 					if (subkey->sk_flags & SK_ROW_END)
1073 					{
1074 						used_all_subkeys = true;
1075 						break;
1076 					}
1077 				}
1078 				if (!used_all_subkeys)
1079 				{
1080 					switch (strat_total)
1081 					{
1082 						case BTLessStrategyNumber:
1083 							strat_total = BTLessEqualStrategyNumber;
1084 							break;
1085 						case BTGreaterStrategyNumber:
1086 							strat_total = BTGreaterEqualStrategyNumber;
1087 							break;
1088 					}
1089 				}
1090 				break;			/* done with outer loop */
1091 			}
1092 		}
1093 		else
1094 		{
1095 			/*
1096 			 * Ordinary comparison key.  Transform the search-style scan key
1097 			 * to an insertion scan key by replacing the sk_func with the
1098 			 * appropriate btree comparison function.
1099 			 *
1100 			 * If scankey operator is not a cross-type comparison, we can use
1101 			 * the cached comparison function; otherwise gotta look it up in
1102 			 * the catalogs.  (That can't lead to infinite recursion, since no
1103 			 * indexscan initiated by syscache lookup will use cross-data-type
1104 			 * operators.)
1105 			 *
1106 			 * We support the convention that sk_subtype == InvalidOid means
1107 			 * the opclass input type; this is a hack to simplify life for
1108 			 * ScanKeyInit().
1109 			 */
1110 			if (cur->sk_subtype == rel->rd_opcintype[i] ||
1111 				cur->sk_subtype == InvalidOid)
1112 			{
1113 				FmgrInfo   *procinfo;
1114 
1115 				procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1116 				ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1117 											   cur->sk_flags,
1118 											   cur->sk_attno,
1119 											   InvalidStrategy,
1120 											   cur->sk_subtype,
1121 											   cur->sk_collation,
1122 											   procinfo,
1123 											   cur->sk_argument);
1124 			}
1125 			else
1126 			{
1127 				RegProcedure cmp_proc;
1128 
1129 				cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1130 											 rel->rd_opcintype[i],
1131 											 cur->sk_subtype,
1132 											 BTORDER_PROC);
1133 				if (!RegProcedureIsValid(cmp_proc))
1134 					elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1135 						 BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1136 						 cur->sk_attno, RelationGetRelationName(rel));
1137 				ScanKeyEntryInitialize(inskey.scankeys + i,
1138 									   cur->sk_flags,
1139 									   cur->sk_attno,
1140 									   InvalidStrategy,
1141 									   cur->sk_subtype,
1142 									   cur->sk_collation,
1143 									   cmp_proc,
1144 									   cur->sk_argument);
1145 			}
1146 		}
1147 	}
1148 
1149 	/*----------
1150 	 * Examine the selected initial-positioning strategy to determine exactly
1151 	 * where we need to start the scan, and set flag variables to control the
1152 	 * code below.
1153 	 *
1154 	 * If nextkey = false, _bt_search and _bt_binsrch will locate the first
1155 	 * item >= scan key.  If nextkey = true, they will locate the first
1156 	 * item > scan key.
1157 	 *
1158 	 * If goback = true, we will then step back one item, while if
1159 	 * goback = false, we will start the scan on the located item.
1160 	 *----------
1161 	 */
1162 	switch (strat_total)
1163 	{
1164 		case BTLessStrategyNumber:
1165 
1166 			/*
1167 			 * Find first item >= scankey, then back up one to arrive at last
1168 			 * item < scankey.  (Note: this positioning strategy is only used
1169 			 * for a backward scan, so that is always the correct starting
1170 			 * position.)
1171 			 */
1172 			nextkey = false;
1173 			goback = true;
1174 			break;
1175 
1176 		case BTLessEqualStrategyNumber:
1177 
1178 			/*
1179 			 * Find first item > scankey, then back up one to arrive at last
1180 			 * item <= scankey.  (Note: this positioning strategy is only used
1181 			 * for a backward scan, so that is always the correct starting
1182 			 * position.)
1183 			 */
1184 			nextkey = true;
1185 			goback = true;
1186 			break;
1187 
1188 		case BTEqualStrategyNumber:
1189 
1190 			/*
1191 			 * If a backward scan was specified, need to start with last equal
1192 			 * item not first one.
1193 			 */
1194 			if (ScanDirectionIsBackward(dir))
1195 			{
1196 				/*
1197 				 * This is the same as the <= strategy.  We will check at the
1198 				 * end whether the found item is actually =.
1199 				 */
1200 				nextkey = true;
1201 				goback = true;
1202 			}
1203 			else
1204 			{
1205 				/*
1206 				 * This is the same as the >= strategy.  We will check at the
1207 				 * end whether the found item is actually =.
1208 				 */
1209 				nextkey = false;
1210 				goback = false;
1211 			}
1212 			break;
1213 
1214 		case BTGreaterEqualStrategyNumber:
1215 
1216 			/*
1217 			 * Find first item >= scankey.  (This is only used for forward
1218 			 * scans.)
1219 			 */
1220 			nextkey = false;
1221 			goback = false;
1222 			break;
1223 
1224 		case BTGreaterStrategyNumber:
1225 
1226 			/*
1227 			 * Find first item > scankey.  (This is only used for forward
1228 			 * scans.)
1229 			 */
1230 			nextkey = true;
1231 			goback = false;
1232 			break;
1233 
1234 		default:
1235 			/* can't get here, but keep compiler quiet */
1236 			elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1237 			return false;
1238 	}
1239 
1240 	/* Initialize remaining insertion scan key fields */
1241 	inskey.heapkeyspace = _bt_heapkeyspace(rel);
1242 	inskey.anynullkeys = false; /* unused */
1243 	inskey.nextkey = nextkey;
1244 	inskey.pivotsearch = false;
1245 	inskey.scantid = NULL;
1246 	inskey.keysz = keysCount;
1247 
1248 	/*
1249 	 * Use the manufactured insertion scan key to descend the tree and
1250 	 * position ourselves on the target leaf page.
1251 	 */
1252 	stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
1253 
1254 	/* don't need to keep the stack around... */
1255 	_bt_freestack(stack);
1256 
1257 	if (!BufferIsValid(buf))
1258 	{
1259 		/*
1260 		 * We only get here if the index is completely empty. Lock relation
1261 		 * because nothing finer to lock exists.
1262 		 */
1263 		PredicateLockRelation(rel, scan->xs_snapshot);
1264 
1265 		/*
1266 		 * mark parallel scan as done, so that all the workers can finish
1267 		 * their scan
1268 		 */
1269 		_bt_parallel_done(scan);
1270 		BTScanPosInvalidate(so->currPos);
1271 
1272 		return false;
1273 	}
1274 	else
1275 		PredicateLockPage(rel, BufferGetBlockNumber(buf),
1276 						  scan->xs_snapshot);
1277 
1278 	_bt_initialize_more_data(so, dir);
1279 
1280 	/* position to the precise item on the page */
1281 	offnum = _bt_binsrch(rel, &inskey, buf);
1282 
1283 	/*
1284 	 * If nextkey = false, we are positioned at the first item >= scan key, or
1285 	 * possibly at the end of a page on which all the existing items are less
1286 	 * than the scan key and we know that everything on later pages is greater
1287 	 * than or equal to scan key.
1288 	 *
1289 	 * If nextkey = true, we are positioned at the first item > scan key, or
1290 	 * possibly at the end of a page on which all the existing items are less
1291 	 * than or equal to the scan key and we know that everything on later
1292 	 * pages is greater than scan key.
1293 	 *
1294 	 * The actually desired starting point is either this item or the prior
1295 	 * one, or in the end-of-page case it's the first item on the next page or
1296 	 * the last item on this page.  Adjust the starting offset if needed. (If
1297 	 * this results in an offset before the first item or after the last one,
1298 	 * _bt_readpage will report no items found, and then we'll step to the
1299 	 * next page as needed.)
1300 	 */
1301 	if (goback)
1302 		offnum = OffsetNumberPrev(offnum);
1303 
1304 	/* remember which buffer we have pinned, if any */
1305 	Assert(!BTScanPosIsValid(so->currPos));
1306 	so->currPos.buf = buf;
1307 
1308 	/*
1309 	 * Now load data from the first page of the scan.
1310 	 */
1311 	if (!_bt_readpage(scan, dir, offnum))
1312 	{
1313 		/*
1314 		 * There's no actually-matching data on this page.  Try to advance to
1315 		 * the next page.  Return false if there's no matching data at all.
1316 		 */
1317 		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1318 		if (!_bt_steppage(scan, dir))
1319 			return false;
1320 	}
1321 	else
1322 	{
1323 		/* Drop the lock, and maybe the pin, on the current page */
1324 		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1325 	}
1326 
1327 readcomplete:
1328 	/* OK, itemIndex says what to return */
1329 	currItem = &so->currPos.items[so->currPos.itemIndex];
1330 	scan->xs_heaptid = currItem->heapTid;
1331 	if (scan->xs_want_itup)
1332 		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1333 
1334 	return true;
1335 }
1336 
1337 /*
1338  *	_bt_next() -- Get the next item in a scan.
1339  *
1340  *		On entry, so->currPos describes the current page, which may be pinned
1341  *		but is not locked, and so->currPos.itemIndex identifies which item was
1342  *		previously returned.
1343  *
1344  *		On successful exit, scan->xs_ctup.t_self is set to the TID of the
1345  *		next heap tuple, and if requested, scan->xs_itup points to a copy of
1346  *		the index tuple.  so->currPos is updated as needed.
1347  *
1348  *		On failure exit (no more tuples), we release pin and set
1349  *		so->currPos.buf to InvalidBuffer.
1350  */
1351 bool
_bt_next(IndexScanDesc scan,ScanDirection dir)1352 _bt_next(IndexScanDesc scan, ScanDirection dir)
1353 {
1354 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
1355 	BTScanPosItem *currItem;
1356 
1357 	/*
1358 	 * Advance to next tuple on current page; or if there's no more, try to
1359 	 * step to the next page with data.
1360 	 */
1361 	if (ScanDirectionIsForward(dir))
1362 	{
1363 		if (++so->currPos.itemIndex > so->currPos.lastItem)
1364 		{
1365 			if (!_bt_steppage(scan, dir))
1366 				return false;
1367 		}
1368 	}
1369 	else
1370 	{
1371 		if (--so->currPos.itemIndex < so->currPos.firstItem)
1372 		{
1373 			if (!_bt_steppage(scan, dir))
1374 				return false;
1375 		}
1376 	}
1377 
1378 	/* OK, itemIndex says what to return */
1379 	currItem = &so->currPos.items[so->currPos.itemIndex];
1380 	scan->xs_heaptid = currItem->heapTid;
1381 	if (scan->xs_want_itup)
1382 		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1383 
1384 	return true;
1385 }
1386 
1387 /*
1388  *	_bt_readpage() -- Load data from current index page into so->currPos
1389  *
1390  * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
1391  * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;
1392  * they are updated as appropriate.  All other fields of so->currPos are
1393  * initialized from scratch here.
1394  *
1395  * We scan the current page starting at offnum and moving in the indicated
1396  * direction.  All items matching the scan keys are loaded into currPos.items.
1397  * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
1398  * that there can be no more matching tuples in the current scan direction.
1399  *
1400  * In the case of a parallel scan, caller must have called _bt_parallel_seize
1401  * prior to calling this function; this function will invoke
1402  * _bt_parallel_release before returning.
1403  *
1404  * Returns true if any matching items found on the page, false if none.
1405  */
1406 static bool
_bt_readpage(IndexScanDesc scan,ScanDirection dir,OffsetNumber offnum)1407 _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
1408 {
1409 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
1410 	Page		page;
1411 	BTPageOpaque opaque;
1412 	OffsetNumber minoff;
1413 	OffsetNumber maxoff;
1414 	int			itemIndex;
1415 	bool		continuescan;
1416 	int			indnatts;
1417 
1418 	/*
1419 	 * We must have the buffer pinned and locked, but the usual macro can't be
1420 	 * used here; this function is what makes it good for currPos.
1421 	 */
1422 	Assert(BufferIsValid(so->currPos.buf));
1423 
1424 	page = BufferGetPage(so->currPos.buf);
1425 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1426 
1427 	/* allow next page be processed by parallel worker */
1428 	if (scan->parallel_scan)
1429 	{
1430 		if (ScanDirectionIsForward(dir))
1431 			_bt_parallel_release(scan, opaque->btpo_next);
1432 		else
1433 			_bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
1434 	}
1435 
1436 	continuescan = true;		/* default assumption */
1437 	indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
1438 	minoff = P_FIRSTDATAKEY(opaque);
1439 	maxoff = PageGetMaxOffsetNumber(page);
1440 
1441 	/*
1442 	 * We note the buffer's block number so that we can release the pin later.
1443 	 * This allows us to re-read the buffer if it is needed again for hinting.
1444 	 */
1445 	so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
1446 
1447 	/*
1448 	 * We save the LSN of the page as we read it, so that we know whether it
1449 	 * safe to apply LP_DEAD hints to the page later.  This allows us to drop
1450 	 * the pin for MVCC scans, which allows vacuum to avoid blocking.
1451 	 */
1452 	so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
1453 
1454 	/*
1455 	 * we must save the page's right-link while scanning it; this tells us
1456 	 * where to step right to after we're done with these items.  There is no
1457 	 * corresponding need for the left-link, since splits always go right.
1458 	 */
1459 	so->currPos.nextPage = opaque->btpo_next;
1460 
1461 	/* initialize tuple workspace to empty */
1462 	so->currPos.nextTupleOffset = 0;
1463 
1464 	/*
1465 	 * Now that the current page has been made consistent, the macro should be
1466 	 * good.
1467 	 */
1468 	Assert(BTScanPosIsPinned(so->currPos));
1469 
1470 	if (ScanDirectionIsForward(dir))
1471 	{
1472 		/* load items[] in ascending order */
1473 		itemIndex = 0;
1474 
1475 		offnum = Max(offnum, minoff);
1476 
1477 		while (offnum <= maxoff)
1478 		{
1479 			ItemId		iid = PageGetItemId(page, offnum);
1480 			IndexTuple	itup;
1481 
1482 			/*
1483 			 * If the scan specifies not to return killed tuples, then we
1484 			 * treat a killed tuple as not passing the qual
1485 			 */
1486 			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1487 			{
1488 				offnum = OffsetNumberNext(offnum);
1489 				continue;
1490 			}
1491 
1492 			itup = (IndexTuple) PageGetItem(page, iid);
1493 
1494 			if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
1495 			{
1496 				/* tuple passes all scan key conditions, so remember it */
1497 				_bt_saveitem(so, itemIndex, offnum, itup);
1498 				itemIndex++;
1499 			}
1500 			/* When !continuescan, there can't be any more matches, so stop */
1501 			if (!continuescan)
1502 				break;
1503 
1504 			offnum = OffsetNumberNext(offnum);
1505 		}
1506 
1507 		/*
1508 		 * We don't need to visit page to the right when the high key
1509 		 * indicates that no more matches will be found there.
1510 		 *
1511 		 * Checking the high key like this works out more often than you might
1512 		 * think.  Leaf page splits pick a split point between the two most
1513 		 * dissimilar tuples (this is weighed against the need to evenly share
1514 		 * free space).  Leaf pages with high key attribute values that can
1515 		 * only appear on non-pivot tuples on the right sibling page are
1516 		 * common.
1517 		 */
1518 		if (continuescan && !P_RIGHTMOST(opaque))
1519 		{
1520 			ItemId		iid = PageGetItemId(page, P_HIKEY);
1521 			IndexTuple	itup = (IndexTuple) PageGetItem(page, iid);
1522 			int			truncatt;
1523 
1524 			truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
1525 			_bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
1526 		}
1527 
1528 		if (!continuescan)
1529 			so->currPos.moreRight = false;
1530 
1531 		Assert(itemIndex <= MaxIndexTuplesPerPage);
1532 		so->currPos.firstItem = 0;
1533 		so->currPos.lastItem = itemIndex - 1;
1534 		so->currPos.itemIndex = 0;
1535 	}
1536 	else
1537 	{
1538 		/* load items[] in descending order */
1539 		itemIndex = MaxIndexTuplesPerPage;
1540 
1541 		offnum = Min(offnum, maxoff);
1542 
1543 		while (offnum >= minoff)
1544 		{
1545 			ItemId		iid = PageGetItemId(page, offnum);
1546 			IndexTuple	itup;
1547 			bool		tuple_alive;
1548 			bool		passes_quals;
1549 
1550 			/*
1551 			 * If the scan specifies not to return killed tuples, then we
1552 			 * treat a killed tuple as not passing the qual.  Most of the
1553 			 * time, it's a win to not bother examining the tuple's index
1554 			 * keys, but just skip to the next tuple (previous, actually,
1555 			 * since we're scanning backwards).  However, if this is the first
1556 			 * tuple on the page, we do check the index keys, to prevent
1557 			 * uselessly advancing to the page to the left.  This is similar
1558 			 * to the high key optimization used by forward scans.
1559 			 */
1560 			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1561 			{
1562 				Assert(offnum >= P_FIRSTDATAKEY(opaque));
1563 				if (offnum > P_FIRSTDATAKEY(opaque))
1564 				{
1565 					offnum = OffsetNumberPrev(offnum);
1566 					continue;
1567 				}
1568 
1569 				tuple_alive = false;
1570 			}
1571 			else
1572 				tuple_alive = true;
1573 
1574 			itup = (IndexTuple) PageGetItem(page, iid);
1575 
1576 			passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
1577 										 &continuescan);
1578 			if (passes_quals && tuple_alive)
1579 			{
1580 				/* tuple passes all scan key conditions, so remember it */
1581 				itemIndex--;
1582 				_bt_saveitem(so, itemIndex, offnum, itup);
1583 			}
1584 			if (!continuescan)
1585 			{
1586 				/* there can't be any more matches, so stop */
1587 				so->currPos.moreLeft = false;
1588 				break;
1589 			}
1590 
1591 			offnum = OffsetNumberPrev(offnum);
1592 		}
1593 
1594 		Assert(itemIndex >= 0);
1595 		so->currPos.firstItem = itemIndex;
1596 		so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
1597 		so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
1598 	}
1599 
1600 	return (so->currPos.firstItem <= so->currPos.lastItem);
1601 }
1602 
1603 /* Save an index item into so->currPos.items[itemIndex] */
1604 static void
_bt_saveitem(BTScanOpaque so,int itemIndex,OffsetNumber offnum,IndexTuple itup)1605 _bt_saveitem(BTScanOpaque so, int itemIndex,
1606 			 OffsetNumber offnum, IndexTuple itup)
1607 {
1608 	BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1609 
1610 	currItem->heapTid = itup->t_tid;
1611 	currItem->indexOffset = offnum;
1612 	if (so->currTuples)
1613 	{
1614 		Size		itupsz = IndexTupleSize(itup);
1615 
1616 		currItem->tupleOffset = so->currPos.nextTupleOffset;
1617 		memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1618 		so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1619 	}
1620 }
1621 
1622 /*
1623  *	_bt_steppage() -- Step to next page containing valid data for scan
1624  *
1625  * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
1626  * if pinned, we'll drop the pin before moving to next page.  The buffer is
1627  * not locked on entry.
1628  *
1629  * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
1630  * read lock, on that page.  If we do not hold the pin, we set so->currPos.buf
1631  * to InvalidBuffer.  We return true to indicate success.
1632  */
1633 static bool
_bt_steppage(IndexScanDesc scan,ScanDirection dir)1634 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
1635 {
1636 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
1637 	BlockNumber blkno = InvalidBlockNumber;
1638 	bool		status = true;
1639 
1640 	Assert(BTScanPosIsValid(so->currPos));
1641 
1642 	/* Before leaving current page, deal with any killed items */
1643 	if (so->numKilled > 0)
1644 		_bt_killitems(scan);
1645 
1646 	/*
1647 	 * Before we modify currPos, make a copy of the page data if there was a
1648 	 * mark position that needs it.
1649 	 */
1650 	if (so->markItemIndex >= 0)
1651 	{
1652 		/* bump pin on current buffer for assignment to mark buffer */
1653 		if (BTScanPosIsPinned(so->currPos))
1654 			IncrBufferRefCount(so->currPos.buf);
1655 		memcpy(&so->markPos, &so->currPos,
1656 			   offsetof(BTScanPosData, items[1]) +
1657 			   so->currPos.lastItem * sizeof(BTScanPosItem));
1658 		if (so->markTuples)
1659 			memcpy(so->markTuples, so->currTuples,
1660 				   so->currPos.nextTupleOffset);
1661 		so->markPos.itemIndex = so->markItemIndex;
1662 		so->markItemIndex = -1;
1663 	}
1664 
1665 	if (ScanDirectionIsForward(dir))
1666 	{
1667 		/* Walk right to the next page with data */
1668 		if (scan->parallel_scan != NULL)
1669 		{
1670 			/*
1671 			 * Seize the scan to get the next block number; if the scan has
1672 			 * ended already, bail out.
1673 			 */
1674 			status = _bt_parallel_seize(scan, &blkno);
1675 			if (!status)
1676 			{
1677 				/* release the previous buffer, if pinned */
1678 				BTScanPosUnpinIfPinned(so->currPos);
1679 				BTScanPosInvalidate(so->currPos);
1680 				return false;
1681 			}
1682 		}
1683 		else
1684 		{
1685 			/* Not parallel, so use the previously-saved nextPage link. */
1686 			blkno = so->currPos.nextPage;
1687 		}
1688 
1689 		/* Remember we left a page with data */
1690 		so->currPos.moreLeft = true;
1691 
1692 		/* release the previous buffer, if pinned */
1693 		BTScanPosUnpinIfPinned(so->currPos);
1694 	}
1695 	else
1696 	{
1697 		/* Remember we left a page with data */
1698 		so->currPos.moreRight = true;
1699 
1700 		if (scan->parallel_scan != NULL)
1701 		{
1702 			/*
1703 			 * Seize the scan to get the current block number; if the scan has
1704 			 * ended already, bail out.
1705 			 */
1706 			status = _bt_parallel_seize(scan, &blkno);
1707 			BTScanPosUnpinIfPinned(so->currPos);
1708 			if (!status)
1709 			{
1710 				BTScanPosInvalidate(so->currPos);
1711 				return false;
1712 			}
1713 		}
1714 		else
1715 		{
1716 			/* Not parallel, so just use our own notion of the current page */
1717 			blkno = so->currPos.currPage;
1718 		}
1719 	}
1720 
1721 	if (!_bt_readnextpage(scan, blkno, dir))
1722 		return false;
1723 
1724 	/* Drop the lock, and maybe the pin, on the current page */
1725 	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1726 
1727 	return true;
1728 }
1729 
/*
 *	_bt_readnextpage() -- Read next page containing valid data for scan
 *
 * On success exit, so->currPos is updated to contain data from the next
 * interesting page.  Caller is responsible to release lock and pin on
 * buffer on success.  We return true to indicate success.
 *
 * If there are no more matching records in the given direction, we drop all
 * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
 *
 * "blkno" is the page to read next; for a backward parallel scan it may
 * differ from so->currPos.currPage, in which case we reposition first.
 */
static bool
_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Relation	rel;
	Page		page;
	BTPageOpaque opaque;
	bool		status = true;

	rel = scan->indexRelation;

	if (ScanDirectionIsForward(dir))
	{
		for (;;)
		{
			/*
			 * if we're at end of scan, give up and mark parallel scan as
			 * done, so that all the workers can finish their scan
			 */
			if (blkno == P_NONE || !so->currPos.moreRight)
			{
				_bt_parallel_done(scan);
				BTScanPosInvalidate(so->currPos);
				return false;
			}
			/* check for interrupts while we're not holding any buffer lock */
			CHECK_FOR_INTERRUPTS();
			/* step right one page */
			so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
			page = BufferGetPage(so->currPos.buf);
			TestForOldSnapshot(scan->xs_snapshot, rel, page);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
			/* check for deleted page */
			if (!P_IGNORE(opaque))
			{
				PredicateLockPage(rel, blkno, scan->xs_snapshot);
				/* see if there are any matches on this page */
				/* note that this will clear moreRight if we can stop */
				if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
					break;
			}
			else if (scan->parallel_scan != NULL)
			{
				/* allow next page be processed by parallel worker */
				_bt_parallel_release(scan, opaque->btpo_next);
			}

			/* nope, keep going */
			if (scan->parallel_scan != NULL)
			{
				/*
				 * Seize the scan to get the next block to read; some other
				 * worker may have advanced the scan beyond our own notion of
				 * the current page.
				 */
				_bt_relbuf(rel, so->currPos.buf);
				status = _bt_parallel_seize(scan, &blkno);
				if (!status)
				{
					BTScanPosInvalidate(so->currPos);
					return false;
				}
			}
			else
			{
				/* capture the right-link before releasing the buffer */
				blkno = opaque->btpo_next;
				_bt_relbuf(rel, so->currPos.buf);
			}
		}
	}
	else
	{
		/*
		 * Should only happen in parallel cases, when some other backend
		 * advanced the scan.
		 */
		if (so->currPos.currPage != blkno)
		{
			BTScanPosUnpinIfPinned(so->currPos);
			so->currPos.currPage = blkno;
		}

		/*
		 * Walk left to the next page with data.  This is much more complex
		 * than the walk-right case because of the possibility that the page
		 * to our left splits while we are in flight to it, plus the
		 * possibility that the page we were on gets deleted after we leave
		 * it.  See nbtree/README for details.
		 *
		 * It might be possible to rearrange this code to have less overhead
		 * in pinning and locking, but that would require capturing the left
		 * pointer when the page is initially read, and using it here, along
		 * with big changes to _bt_walk_left() and the code below.  It is not
		 * clear whether this would be a win, since if the page immediately to
		 * the left splits after we read this page and before we step left, we
		 * would need to visit more pages than with the current code.
		 *
		 * Note that if we change the code so that we drop the pin for a scan
		 * which uses a non-MVCC snapshot, we will need to modify the code for
		 * walking left, to allow for the possibility that a referenced page
		 * has been deleted.  As long as the buffer is pinned or the snapshot
		 * is MVCC the page cannot move past the half-dead state to fully
		 * deleted.
		 */
		/* relock the current page, re-pinning it first if the pin was lost */
		if (BTScanPosIsPinned(so->currPos))
			LockBuffer(so->currPos.buf, BT_READ);
		else
			so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);

		for (;;)
		{
			/* Done if we know there are no matching keys to the left */
			if (!so->currPos.moreLeft)
			{
				_bt_relbuf(rel, so->currPos.buf);
				_bt_parallel_done(scan);
				BTScanPosInvalidate(so->currPos);
				return false;
			}

			/* Step to next physical page */
			so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
											scan->xs_snapshot);

			/* if we're physically at end of index, return failure */
			if (so->currPos.buf == InvalidBuffer)
			{
				_bt_parallel_done(scan);
				BTScanPosInvalidate(so->currPos);
				return false;
			}

			/*
			 * Okay, we managed to move left to a non-deleted page. Done if
			 * it's not half-dead and contains matching tuples. Else loop back
			 * and do it all again.
			 */
			page = BufferGetPage(so->currPos.buf);
			TestForOldSnapshot(scan->xs_snapshot, rel, page);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
			if (!P_IGNORE(opaque))
			{
				PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
				/* see if there are any matches on this page */
				/* note that this will clear moreLeft if we can stop */
				if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
					break;
			}
			else if (scan->parallel_scan != NULL)
			{
				/* allow next page be processed by parallel worker */
				_bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
			}

			/*
			 * For parallel scans, get the last page scanned as it is quite
			 * possible that by the time we try to seize the scan, some other
			 * worker has already advanced the scan to a different page.  We
			 * must continue based on the latest page scanned by any worker.
			 */
			if (scan->parallel_scan != NULL)
			{
				_bt_relbuf(rel, so->currPos.buf);
				status = _bt_parallel_seize(scan, &blkno);
				if (!status)
				{
					BTScanPosInvalidate(so->currPos);
					return false;
				}
				so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
			}
		}
	}

	return true;
}
1911 
1912 /*
1913  *	_bt_parallel_readpage() -- Read current page containing valid data for scan
1914  *
1915  * On success, release lock and maybe pin on buffer.  We return true to
1916  * indicate success.
1917  */
1918 static bool
_bt_parallel_readpage(IndexScanDesc scan,BlockNumber blkno,ScanDirection dir)1919 _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
1920 {
1921 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
1922 
1923 	_bt_initialize_more_data(so, dir);
1924 
1925 	if (!_bt_readnextpage(scan, blkno, dir))
1926 		return false;
1927 
1928 	/* Drop the lock, and maybe the pin, on the current page */
1929 	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1930 
1931 	return true;
1932 }
1933 
/*
 * _bt_walk_left() -- step left one page, if possible
 *
 * The given buffer must be pinned and read-locked.  This will be dropped
 * before stepping left.  On return, we have pin and read lock on the
 * returned page, instead.
 *
 * Returns InvalidBuffer if there is no page to the left (no lock is held
 * in that case).
 *
 * When working on a non-leaf level, it is possible for the returned page
 * to be half-dead; the caller should check that condition and step left
 * again if it's important.
 */
static Buffer
_bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
{
	Page		page;
	BTPageOpaque opaque;

	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	for (;;)
	{
		BlockNumber obknum;		/* the page we are stepping left from */
		BlockNumber lblkno;		/* left-link we followed on this attempt */
		BlockNumber blkno;
		int			tries;

		/* if we're at end of tree, release buf and return failure */
		if (P_LEFTMOST(opaque))
		{
			_bt_relbuf(rel, buf);
			break;
		}
		/* remember original page we are stepping left from */
		obknum = BufferGetBlockNumber(buf);
		/* step left; capture the link before releasing the buffer */
		blkno = lblkno = opaque->btpo_prev;
		_bt_relbuf(rel, buf);
		/* check for interrupts while we're not holding any buffer lock */
		CHECK_FOR_INTERRUPTS();
		buf = _bt_getbuf(rel, blkno, BT_READ);
		page = BufferGetPage(buf);
		TestForOldSnapshot(snapshot, rel, page);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		/*
		 * If this isn't the page we want, walk right till we find what we
		 * want --- but go no more than four hops (an arbitrary limit). If we
		 * don't find the correct page by then, the most likely bet is that
		 * the original page got deleted and isn't in the sibling chain at all
		 * anymore, not that its left sibling got split more than four times.
		 *
		 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
		 * because half-dead pages are still in the sibling chain.  Caller
		 * must reject half-dead pages if wanted.
		 */
		tries = 0;
		for (;;)
		{
			/* "page we want" = the live page whose right-link is obknum */
			if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
			{
				/* Found desired page, return it */
				return buf;
			}
			if (P_RIGHTMOST(opaque) || ++tries > 4)
				break;
			blkno = opaque->btpo_next;
			buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
			page = BufferGetPage(buf);
			TestForOldSnapshot(snapshot, rel, page);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		}

		/* Return to the original page to see what's up */
		buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
		page = BufferGetPage(buf);
		TestForOldSnapshot(snapshot, rel, page);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (P_ISDELETED(opaque))
		{
			/*
			 * It was deleted.  Move right to first nondeleted page (there
			 * must be one); that is the page that has acquired the deleted
			 * one's keyspace, so stepping left from it will take us where we
			 * want to be.
			 */
			for (;;)
			{
				if (P_RIGHTMOST(opaque))
					elog(ERROR, "fell off the end of index \"%s\"",
						 RelationGetRelationName(rel));
				blkno = opaque->btpo_next;
				buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
				page = BufferGetPage(buf);
				TestForOldSnapshot(snapshot, rel, page);
				opaque = (BTPageOpaque) PageGetSpecialPointer(page);
				if (!P_ISDELETED(opaque))
					break;
			}

			/*
			 * Now return to top of loop, resetting obknum to point to this
			 * nondeleted page, and try again.
			 */
		}
		else
		{
			/*
			 * It wasn't deleted; the explanation had better be that the page
			 * to the left got split or deleted. Without this check, we'd go
			 * into an infinite loop if there's anything wrong.
			 */
			/* same left-link as before means we made no progress: give up */
			if (opaque->btpo_prev == lblkno)
				elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
					 obknum, RelationGetRelationName(rel));
			/* Okay to try again with new lblkno value */
		}
	}

	return InvalidBuffer;
}
2058 
/*
 * _bt_get_endpoint() -- Find the first or last page on a given tree level
 *
 * If the index is empty, we will return InvalidBuffer; any other failure
 * condition causes ereport().  We will not return a dead page.
 *
 * The returned buffer is pinned and read-locked.
 *
 * "level" 0 means the leaf level; "rightmost" selects the last page on the
 * level instead of the first.
 */
Buffer
_bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
				 Snapshot snapshot)
{
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber offnum;
	BlockNumber blkno;
	IndexTuple	itup;

	/*
	 * If we are looking for a leaf page, okay to descend from fast root;
	 * otherwise better descend from true root.  (There is no point in being
	 * smarter about intermediate levels.)
	 */
	if (level == 0)
		buf = _bt_getroot(rel, BT_READ);
	else
		buf = _bt_gettrueroot(rel);

	/* an invalid buffer here means the index is completely empty */
	if (!BufferIsValid(buf))
		return InvalidBuffer;

	page = BufferGetPage(buf);
	TestForOldSnapshot(snapshot, rel, page);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	for (;;)
	{
		/*
		 * If we landed on a deleted page, step right to find a live page
		 * (there must be one).  Also, if we want the rightmost page, step
		 * right if needed to get to it (this could happen if the page split
		 * since we obtained a pointer to it).
		 */
		while (P_IGNORE(opaque) ||
			   (rightmost && !P_RIGHTMOST(opaque)))
		{
			blkno = opaque->btpo_next;
			if (blkno == P_NONE)
				elog(ERROR, "fell off the end of index \"%s\"",
					 RelationGetRelationName(rel));
			buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
			page = BufferGetPage(buf);
			TestForOldSnapshot(snapshot, rel, page);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		}

		/* Done? */
		if (opaque->btpo.level == level)
			break;
		if (opaque->btpo.level < level)
			elog(ERROR, "btree level %u not found in index \"%s\"",
				 level, RelationGetRelationName(rel));

		/* Descend to leftmost or rightmost child page */
		if (rightmost)
			offnum = PageGetMaxOffsetNumber(page);
		else
			offnum = P_FIRSTDATAKEY(opaque);

		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
		blkno = BTreeInnerTupleGetDownLink(itup);

		buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
		page = BufferGetPage(buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	}

	return buf;
}
2139 
/*
 *	_bt_endpoint() -- Find the first or last page in the index, and scan
 * from there to the first key satisfying all the quals.
 *
 * This is used by _bt_first() to set up a scan when we've determined
 * that the scan must start at the beginning or end of the index (for
 * a forward or backward scan respectively).  Exit conditions are the
 * same as for _bt_first().
 */
static bool
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber start;
	BTScanPosItem *currItem;

	/*
	 * Scan down to the leftmost or rightmost leaf page.  This is a simplified
	 * version of _bt_search().  We don't maintain a stack since we know we
	 * won't need it.
	 */
	buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);

	if (!BufferIsValid(buf))
	{
		/*
		 * Empty index. Lock the whole relation, as nothing finer to lock
		 * exists.
		 */
		PredicateLockRelation(rel, scan->xs_snapshot);
		BTScanPosInvalidate(so->currPos);
		return false;
	}

	PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	Assert(P_ISLEAF(opaque));

	/* pick the starting offset on the page for the chosen direction */
	if (ScanDirectionIsForward(dir))
	{
		/* There could be dead pages to the left, so not this: */
		/* Assert(P_LEFTMOST(opaque)); */

		start = P_FIRSTDATAKEY(opaque);
	}
	else if (ScanDirectionIsBackward(dir))
	{
		Assert(P_RIGHTMOST(opaque));

		start = PageGetMaxOffsetNumber(page);
	}
	else
	{
		elog(ERROR, "invalid scan direction: %d", (int) dir);
		start = 0;				/* keep compiler quiet */
	}

	/* remember which buffer we have pinned */
	so->currPos.buf = buf;

	/* set moreLeft/moreRight and reset kill/mark state for the new scan */
	_bt_initialize_more_data(so, dir);

	/*
	 * Now load data from the first page of the scan.
	 */
	if (!_bt_readpage(scan, dir, start))
	{
		/*
		 * There's no actually-matching data on this page.  Try to advance to
		 * the next page.  Return false if there's no matching data at all.
		 */
		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
		if (!_bt_steppage(scan, dir))
			return false;
	}
	else
	{
		/* Drop the lock, and maybe the pin, on the current page */
		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
	}

	/* OK, itemIndex says what to return */
	currItem = &so->currPos.items[so->currPos.itemIndex];
	scan->xs_heaptid = currItem->heapTid;
	if (scan->xs_want_itup)
		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

	return true;
}
2234 
2235 /*
2236  * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
2237  * for scan direction
2238  */
2239 static inline void
_bt_initialize_more_data(BTScanOpaque so,ScanDirection dir)2240 _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
2241 {
2242 	/* initialize moreLeft/moreRight appropriately for scan direction */
2243 	if (ScanDirectionIsForward(dir))
2244 	{
2245 		so->currPos.moreLeft = false;
2246 		so->currPos.moreRight = true;
2247 	}
2248 	else
2249 	{
2250 		so->currPos.moreLeft = true;
2251 		so->currPos.moreRight = false;
2252 	}
2253 	so->numKilled = 0;			/* just paranoia */
2254 	so->markItemIndex = -1;		/* ditto */
2255 }
2256