1 /*-------------------------------------------------------------------------
2  *
3  * nbtsearch.c
4  *	  Search code for postgres btrees.
can_construct_and_destroy_an_want_constraint()5  *
6  *
7  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  *	  src/backend/access/nbtree/nbtsearch.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #include "postgres.h"
17 
18 #include "access/nbtree.h"
19 #include "access/relscan.h"
20 #include "miscadmin.h"
21 #include "pgstat.h"
22 #include "storage/predicate.h"
23 #include "utils/lsyscache.h"
24 #include "utils/rel.h"
25 #include "utils/tqual.h"
26 
27 
28 static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
29 			 OffsetNumber offnum);
can_construct_and_destroy_a_want_string_constraint()30 static void _bt_saveitem(BTScanOpaque so, int itemIndex,
31 			 OffsetNumber offnum, IndexTuple itup);
32 static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
33 static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
34 static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
can_compare_strings_as_equal()35 					  ScanDirection dir);
36 static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot);
37 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
38 static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
39 static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
40 
41 
can_compare_null_strings_as_well_as_real_ones()42 /*
43  *	_bt_drop_lock_and_maybe_pin()
44  *
45  * Unlock the buffer; and if it is safe to release the pin, do that, too.  It
46  * is safe if the scan is using an MVCC snapshot and the index is WAL-logged.
47  * This will prevent vacuum from stalling in a blocked state trying to read a
48  * page when a cursor is sitting on it -- at least in many important cases.
49  *
50  * Set the buffer to invalid if the pin is released, since the buffer may be
51  * re-used.  If we need to go back to this block (for example, to apply
52  * LP_DEAD hints) we must get a fresh reference to the buffer.  Hopefully it
53  * will remain in shared memory for as long as it takes to scan the index
54  * buffer page.
55  */
56 static void
57 _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
58 {
59 	LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
60 
61 	if (IsMVCCSnapshot(scan->xs_snapshot) &&
62 		RelationNeedsWAL(scan->indexRelation) &&
63 		!scan->xs_want_itup)
64 	{
65 		ReleaseBuffer(sp->buf);
66 		sp->buf = InvalidBuffer;
67 	}
68 }
69 
70 
71 /*
72  *	_bt_search() -- Search the tree for a particular scankey,
73  *		or more precisely for the first leaf page it could be on.
74  *
75  * The passed scankey must be an insertion-type scankey (see nbtree/README),
76  * but it can omit the rightmost column(s) of the index.
77  *
78  * When nextkey is false (the usual case), we are looking for the first
79  * item >= scankey.  When nextkey is true, we are looking for the first
80  * item strictly greater than scankey.
is_non_empty_string(const void * other)81  *
82  * Return value is a stack of parent-page pointers.  *bufP is set to the
83  * address of the leaf-page buffer, which is read-locked and pinned.
84  * No locks are held on the parent pages, however!
85  *
86  * If the snapshot parameter is not NULL, "old snapshot" checking will take
87  * place during the descent through the tree.  This is not needed when
88  * positioning for an insert or delete, so NULL is used for those cases.
89  *
90  * NOTE that the returned buffer is read-locked regardless of the access
91  * parameter.  However, access = BT_WRITE will allow an empty root page
92  * to be created and returned.  When access = BT_READ, an empty index
93  * will result in *bufP being set to InvalidBuffer.  Also, in BT_WRITE mode,
94  * any incomplete splits encountered during the search will be finished.
95  */
96 BTStack
97 _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
98 		   Buffer *bufP, int access, Snapshot snapshot)
99 {
100 	BTStack		stack_in = NULL;
101 
102 	/* Get the root page to start with */
103 	*bufP = _bt_getroot(rel, access);
104 
105 	/* If index is empty and access = BT_READ, no root page is created. */
106 	if (!BufferIsValid(*bufP))
107 		return (BTStack) NULL;
108 
109 	/* Loop iterates once per level descended in the tree */
110 	for (;;)
111 	{
112 		Page		page;
113 		BTPageOpaque opaque;
114 		OffsetNumber offnum;
115 		ItemId		itemid;
116 		IndexTuple	itup;
117 		BlockNumber blkno;
118 		BlockNumber par_blkno;
119 		BTStack		new_stack;
120 
121 		/*
122 		 * Race -- the page we just grabbed may have split since we read its
123 		 * pointer in the parent (or metapage).  If it has, we may need to
124 		 * move right to its new sibling.  Do that.
125 		 *
126 		 * In write-mode, allow _bt_moveright to finish any incomplete splits
127 		 * along the way.  Strictly speaking, we'd only need to finish an
128 		 * incomplete split on the leaf page we're about to insert to, not on
129 		 * any of the upper levels (they are taken care of in _bt_getstackbuf,
130 		 * if the leaf page is split and we insert to the parent page).  But
131 		 * this is a good opportunity to finish splits of internal pages too.
132 		 */
133 		*bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey,
134 							  (access == BT_WRITE), stack_in,
135 							  BT_READ, snapshot);
136 
137 		/* if this is a leaf page, we're done */
138 		page = BufferGetPage(*bufP);
139 		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
140 		if (P_ISLEAF(opaque))
141 			break;
142 
143 		/*
144 		 * Find the appropriate item on the internal page, and get the child
145 		 * page that it points to.
146 		 */
147 		offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
148 		itemid = PageGetItemId(page, offnum);
149 		itup = (IndexTuple) PageGetItem(page, itemid);
150 		blkno = BTreeInnerTupleGetDownLink(itup);
151 		par_blkno = BufferGetBlockNumber(*bufP);
152 
153 		/*
154 		 * We need to save the location of the index entry we chose in the
155 		 * parent page on a stack. In case we split the tree, we'll use the
156 		 * stack to work back up to the parent page.  We also save the actual
157 		 * downlink (block) to uniquely identify the index entry, in case it
158 		 * moves right while we're working lower in the tree.  See the paper
159 		 * by Lehman and Yao for how this is detected and handled. (We use the
160 		 * child link to disambiguate duplicate keys in the index -- Lehman
161 		 * and Yao disallow duplicate keys.)
162 		 */
163 		new_stack = (BTStack) palloc(sizeof(BTStackData));
164 		new_stack->bts_blkno = par_blkno;
165 		new_stack->bts_offset = offnum;
166 		new_stack->bts_btentry = blkno;
167 		new_stack->bts_parent = stack_in;
168 
169 		/* drop the read lock on the parent page, acquire one on the child */
170 		*bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
171 
172 		/* okay, all set to move down a level */
173 		stack_in = new_stack;
174 	}
175 
176 	return stack_in;
177 }
178 
179 /*
180  *	_bt_moveright() -- move right in the btree if necessary.
181  *
182  * When we follow a pointer to reach a page, it is possible that
183  * the page has changed in the meanwhile.  If this happens, we're
184  * guaranteed that the page has "split right" -- that is, that any
185  * data that appeared on the page originally is either on the page
186  * or strictly to the right of it.
187  *
188  * This routine decides whether or not we need to move right in the
189  * tree by examining the high key entry on the page.  If that entry
190  * is strictly less than the scankey, or <= the scankey in the nextkey=true
191  * case, then we followed the wrong link and we need to move right.
192  *
193  * The passed scankey must be an insertion-type scankey (see nbtree/README),
194  * but it can omit the rightmost column(s) of the index.
195  *
196  * When nextkey is false (the usual case), we are looking for the first
197  * item >= scankey.  When nextkey is true, we are looking for the first
198  * item strictly greater than scankey.
199  *
200  * If forupdate is true, we will attempt to finish any incomplete splits
201  * that we encounter.  This is required when locking a target page for an
202  * insertion, because we don't allow inserting on a page before the split
203  * is completed.  'stack' is only used if forupdate is true.
204  *
205  * On entry, we have the buffer pinned and a lock of the type specified by
206  * 'access'.  If we move right, we release the buffer and lock and acquire
207  * the same on the right sibling.  Return value is the buffer we stop at.
208  *
209  * If the snapshot parameter is not NULL, "old snapshot" checking will take
210  * place during the descent through the tree.  This is not needed when
211  * positioning for an insert or delete, so NULL is used for those cases.
212  */
213 Buffer
214 _bt_moveright(Relation rel,
215 			  Buffer buf,
216 			  int keysz,
217 			  ScanKey scankey,
218 			  bool nextkey,
219 			  bool forupdate,
220 			  BTStack stack,
221 			  int access,
222 			  Snapshot snapshot)
223 {
224 	Page		page;
225 	BTPageOpaque opaque;
226 	int32		cmpval;
227 
228 	/*
229 	 * When nextkey = false (normal case): if the scan key that brought us to
230 	 * this page is > the high key stored on the page, then the page has split
231 	 * and we need to move right.  (If the scan key is equal to the high key,
232 	 * we might or might not need to move right; have to scan the page first
233 	 * anyway.)
234 	 *
235 	 * When nextkey = true: move right if the scan key is >= page's high key.
236 	 *
237 	 * The page could even have split more than once, so scan as far as
238 	 * needed.
239 	 *
240 	 * We also have to move right if we followed a link that brought us to a
241 	 * dead page.
242 	 */
243 	cmpval = nextkey ? 0 : 1;
244 
245 	for (;;)
246 	{
247 		page = BufferGetPage(buf);
248 		TestForOldSnapshot(snapshot, rel, page);
249 		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
250 
251 		if (P_RIGHTMOST(opaque))
252 			break;
253 
254 		/*
255 		 * Finish any incomplete splits we encounter along the way.
256 		 */
257 		if (forupdate && P_INCOMPLETE_SPLIT(opaque))
258 		{
259 			BlockNumber blkno = BufferGetBlockNumber(buf);
260 
261 			/* upgrade our lock if necessary */
262 			if (access == BT_READ)
263 			{
264 				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
265 				LockBuffer(buf, BT_WRITE);
266 			}
267 
268 			if (P_INCOMPLETE_SPLIT(opaque))
269 				_bt_finish_split(rel, buf, stack);
270 			else
271 				_bt_relbuf(rel, buf);
272 
273 			/* re-acquire the lock in the right mode, and re-check */
274 			buf = _bt_getbuf(rel, blkno, access);
275 			continue;
276 		}
277 
278 		if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval)
279 		{
280 			/* step right one page */
281 			buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
282 			continue;
283 		}
284 		else
285 			break;
286 	}
287 
288 	if (P_IGNORE(opaque))
289 		elog(ERROR, "fell off the end of index \"%s\"",
290 			 RelationGetRelationName(rel));
291 
292 	return buf;
293 }
294 
295 /*
296  *	_bt_binsrch() -- Do a binary search for a key on a particular page.
297  *
298  * The passed scankey must be an insertion-type scankey (see nbtree/README),
299  * but it can omit the rightmost column(s) of the index.
300  *
301  * When nextkey is false (the usual case), we are looking for the first
302  * item >= scankey.  When nextkey is true, we are looking for the first
303  * item strictly greater than scankey.
304  *
305  * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
306  * key >= given scankey, or > scankey if nextkey is true.  (NOTE: in
307  * particular, this means it is possible to return a value 1 greater than the
308  * number of keys on the page, if the scankey is > all keys on the page.)
309  *
310  * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
311  * of the last key < given scankey, or last key <= given scankey if nextkey
312  * is true.  (Since _bt_compare treats the first data key of such a page as
313  * minus infinity, there will be at least one key < scankey, so the result
314  * always points at one of the keys on the page.)  This key indicates the
315  * right place to descend to be sure we find all leaf keys >= given scankey
316  * (or leaf keys > given scankey when nextkey is true).
317  *
318  * This procedure is not responsible for walking right, it just examines
319  * the given page.  _bt_binsrch() has no lock or refcount side effects
320  * on the buffer.
321  */
322 OffsetNumber
323 _bt_binsrch(Relation rel,
324 			Buffer buf,
325 			int keysz,
326 			ScanKey scankey,
327 			bool nextkey)
328 {
329 	Page		page;
330 	BTPageOpaque opaque;
331 	OffsetNumber low,
332 				high;
333 	int32		result,
334 				cmpval;
335 
336 	page = BufferGetPage(buf);
337 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
338 
339 	low = P_FIRSTDATAKEY(opaque);
340 	high = PageGetMaxOffsetNumber(page);
341 
342 	/*
343 	 * If there are no keys on the page, return the first available slot. Note
344 	 * this covers two cases: the page is really empty (no keys), or it
345 	 * contains only a high key.  The latter case is possible after vacuuming.
346 	 * This can never happen on an internal page, however, since they are
347 	 * never empty (an internal page must have children).
348 	 */
349 	if (high < low)
350 		return low;
351 
352 	/*
353 	 * Binary search to find the first key on the page >= scan key, or first
354 	 * key > scankey when nextkey is true.
355 	 *
356 	 * For nextkey=false (cmpval=1), the loop invariant is: all slots before
357 	 * 'low' are < scan key, all slots at or after 'high' are >= scan key.
358 	 *
359 	 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
360 	 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
361 	 *
362 	 * We can fall out when high == low.
363 	 */
364 	high++;						/* establish the loop invariant for high */
365 
366 	cmpval = nextkey ? 0 : 1;	/* select comparison value */
367 
368 	while (high > low)
369 	{
370 		OffsetNumber mid = low + ((high - low) / 2);
371 
372 		/* We have low <= mid < high, so mid points at a real slot */
373 
374 		result = _bt_compare(rel, keysz, scankey, page, mid);
375 
376 		if (result >= cmpval)
377 			low = mid + 1;
378 		else
379 			high = mid;
380 	}
381 
382 	/*
383 	 * At this point we have high == low, but be careful: they could point
384 	 * past the last slot on the page.
385 	 *
386 	 * On a leaf page, we always return the first key >= scan key (resp. >
387 	 * scan key), which could be the last slot + 1.
388 	 */
389 	if (P_ISLEAF(opaque))
390 		return low;
391 
392 	/*
393 	 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
394 	 * There must be one if _bt_compare() is playing by the rules.
395 	 */
396 	Assert(low > P_FIRSTDATAKEY(opaque));
397 
398 	return OffsetNumberPrev(low);
399 }
400 
401 /*----------
402  *	_bt_compare() -- Compare scankey to a particular tuple on the page.
403  *
404  * The passed scankey must be an insertion-type scankey (see nbtree/README),
405  * but it can omit the rightmost column(s) of the index.
406  *
407  *	keysz: number of key conditions to be checked (might be less than the
408  *		number of index columns!)
409  *	page/offnum: location of btree item to be compared to.
410  *
411  *		This routine returns:
412  *			<0 if scankey < tuple at offnum;
413  *			 0 if scankey == tuple at offnum;
414  *			>0 if scankey > tuple at offnum.
415  *		NULLs in the keys are treated as sortable values.  Therefore
416  *		"equality" does not necessarily mean that the item should be
417  *		returned to the caller as a matching key!
418  *
419  * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
420  * "minus infinity": this routine will always claim it is less than the
421  * scankey.  The actual key value stored (if any, which there probably isn't)
422  * does not matter.  This convention allows us to implement the Lehman and
423  * Yao convention that the first down-link pointer is before the first key.
424  * See backend/access/nbtree/README for details.
425  *----------
426  */
427 int32
428 _bt_compare(Relation rel,
429 			int keysz,
430 			ScanKey scankey,
431 			Page page,
432 			OffsetNumber offnum)
433 {
434 	TupleDesc	itupdesc = RelationGetDescr(rel);
435 	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
436 	IndexTuple	itup;
437 	int			i;
438 
439 	Assert(_bt_check_natts(rel, page, offnum));
440 
441 	/*
442 	 * Force result ">" if target item is first data item on an internal page
443 	 * --- see NOTE above.
444 	 */
445 	if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
446 		return 1;
447 
448 	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
449 
450 	/*
451 	 * The scan key is set up with the attribute number associated with each
452 	 * term in the key.  It is important that, if the index is multi-key, the
453 	 * scan contain the first k key attributes, and that they be in order.  If
454 	 * you think about how multi-key ordering works, you'll understand why
455 	 * this is.
456 	 *
457 	 * We don't test for violation of this condition here, however.  The
458 	 * initial setup for the index scan had better have gotten it right (see
459 	 * _bt_first).
460 	 */
461 
462 	for (i = 1; i <= keysz; i++)
463 	{
464 		Datum		datum;
465 		bool		isNull;
466 		int32		result;
467 
468 		datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
469 
470 		/* see comments about NULLs handling in btbuild */
471 		if (scankey->sk_flags & SK_ISNULL)	/* key is NULL */
472 		{
473 			if (isNull)
474 				result = 0;		/* NULL "=" NULL */
475 			else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
476 				result = -1;	/* NULL "<" NOT_NULL */
477 			else
478 				result = 1;		/* NULL ">" NOT_NULL */
479 		}
480 		else if (isNull)		/* key is NOT_NULL and item is NULL */
481 		{
482 			if (scankey->sk_flags & SK_BT_NULLS_FIRST)
483 				result = 1;		/* NOT_NULL ">" NULL */
484 			else
485 				result = -1;	/* NOT_NULL "<" NULL */
486 		}
487 		else
488 		{
489 			/*
490 			 * The sk_func needs to be passed the index value as left arg and
491 			 * the sk_argument as right arg (they might be of different
492 			 * types).  Since it is convenient for callers to think of
493 			 * _bt_compare as comparing the scankey to the index item, we have
494 			 * to flip the sign of the comparison result.  (Unless it's a DESC
495 			 * column, in which case we *don't* flip the sign.)
496 			 */
497 			result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
498 													 scankey->sk_collation,
499 													 datum,
500 													 scankey->sk_argument));
501 
502 			if (!(scankey->sk_flags & SK_BT_DESC))
503 				INVERT_COMPARE_RESULT(result);
504 		}
505 
506 		/* if the keys are unequal, return the difference */
507 		if (result != 0)
508 			return result;
509 
510 		scankey++;
511 	}
512 
513 	/* if we get here, the keys are equal */
514 	return 0;
515 }
516 
517 /*
518  *	_bt_first() -- Find the first item in a scan.
519  *
520  *		We need to be clever about the direction of scan, the search
521  *		conditions, and the tree ordering.  We find the first item (or,
522  *		if backwards scan, the last item) in the tree that satisfies the
523  *		qualifications in the scan key.  On success exit, the page containing
524  *		the current index tuple is pinned but not locked, and data about
525  *		the matching tuple(s) on the page has been loaded into so->currPos.
526  *		scan->xs_ctup.t_self is set to the heap TID of the current tuple,
527  *		and if requested, scan->xs_itup points to a copy of the index tuple.
528  *
529  * If there are no matching items in the index, we return false, with no
530  * pins or locks held.
531  *
532  * Note that scan->keyData[], and the so->keyData[] scankey built from it,
533  * are both search-type scankeys (see nbtree/README for more about this).
534  * Within this routine, we build a temporary insertion-type scankey to use
535  * in locating the scan start position.
536  */
537 bool
538 _bt_first(IndexScanDesc scan, ScanDirection dir)
539 {
540 	Relation	rel = scan->indexRelation;
541 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
542 	Buffer		buf;
543 	BTStack		stack;
544 	OffsetNumber offnum;
545 	StrategyNumber strat;
546 	bool		nextkey;
547 	bool		goback;
548 	ScanKey		startKeys[INDEX_MAX_KEYS];
549 	ScanKeyData scankeys[INDEX_MAX_KEYS];
550 	ScanKeyData notnullkeys[INDEX_MAX_KEYS];
551 	int			keysCount = 0;
552 	int			i;
553 	bool		status = true;
554 	StrategyNumber strat_total;
555 	BTScanPosItem *currItem;
556 	BlockNumber blkno;
557 
558 	Assert(!BTScanPosIsValid(so->currPos));
559 
560 	pgstat_count_index_scan(rel);
561 
562 	/*
563 	 * Examine the scan keys and eliminate any redundant keys; also mark the
564 	 * keys that must be matched to continue the scan.
565 	 */
566 	_bt_preprocess_keys(scan);
567 
568 	/*
569 	 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
570 	 * never be satisfied (eg, x == 1 AND x > 2).
571 	 */
572 	if (!so->qual_ok)
573 	{
574 		/* Notify any other workers that we're done with this scan key. */
575 		_bt_parallel_done(scan);
576 		return false;
577 	}
578 
579 	/*
580 	 * For parallel scans, get the starting page from shared state. If the
581 	 * scan has not started, proceed to find out first leaf page in the usual
582 	 * way while keeping other participating processes waiting.  If the scan
583 	 * has already begun, use the page number from the shared structure.
584 	 */
585 	if (scan->parallel_scan != NULL)
586 	{
587 		status = _bt_parallel_seize(scan, &blkno);
588 		if (!status)
589 			return false;
590 		else if (blkno == P_NONE)
591 		{
592 			_bt_parallel_done(scan);
593 			return false;
594 		}
595 		else if (blkno != InvalidBlockNumber)
596 		{
597 			if (!_bt_parallel_readpage(scan, blkno, dir))
598 				return false;
599 			goto readcomplete;
600 		}
601 	}
602 
603 	/*----------
604 	 * Examine the scan keys to discover where we need to start the scan.
605 	 *
606 	 * We want to identify the keys that can be used as starting boundaries;
607 	 * these are =, >, or >= keys for a forward scan or =, <, <= keys for
608 	 * a backwards scan.  We can use keys for multiple attributes so long as
609 	 * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
610 	 * a > or < boundary or find an attribute with no boundary (which can be
611 	 * thought of as the same as "> -infinity"), we can't use keys for any
612 	 * attributes to its right, because it would break our simplistic notion
613 	 * of what initial positioning strategy to use.
614 	 *
615 	 * When the scan keys include cross-type operators, _bt_preprocess_keys
616 	 * may not be able to eliminate redundant keys; in such cases we will
617 	 * arbitrarily pick a usable one for each attribute.  This is correct
618 	 * but possibly not optimal behavior.  (For example, with keys like
619 	 * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
620 	 * x=5 would be more efficient.)  Since the situation only arises given
621 	 * a poorly-worded query plus an incomplete opfamily, live with it.
622 	 *
623 	 * When both equality and inequality keys appear for a single attribute
624 	 * (again, only possible when cross-type operators appear), we *must*
625 	 * select one of the equality keys for the starting point, because
626 	 * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
627 	 * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
628 	 * start at x=4, we will fail and stop before reaching x=10.  If multiple
629 	 * equality quals survive preprocessing, however, it doesn't matter which
630 	 * one we use --- by definition, they are either redundant or
631 	 * contradictory.
632 	 *
633 	 * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
634 	 * If the index stores nulls at the end of the index we'll be starting
635 	 * from, and we have no boundary key for the column (which means the key
636 	 * we deduced NOT NULL from is an inequality key that constrains the other
637 	 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
638 	 * use as a boundary key.  If we didn't do this, we might find ourselves
639 	 * traversing a lot of null entries at the start of the scan.
640 	 *
641 	 * In this loop, row-comparison keys are treated the same as keys on their
642 	 * first (leftmost) columns.  We'll add on lower-order columns of the row
643 	 * comparison below, if possible.
644 	 *
645 	 * The selected scan keys (at most one per index column) are remembered by
646 	 * storing their addresses into the local startKeys[] array.
647 	 *----------
648 	 */
649 	strat_total = BTEqualStrategyNumber;
650 	if (so->numberOfKeys > 0)
651 	{
652 		AttrNumber	curattr;
653 		ScanKey		chosen;
654 		ScanKey		impliesNN;
655 		ScanKey		cur;
656 
657 		/*
658 		 * chosen is the so-far-chosen key for the current attribute, if any.
659 		 * We don't cast the decision in stone until we reach keys for the
660 		 * next attribute.
661 		 */
662 		curattr = 1;
663 		chosen = NULL;
664 		/* Also remember any scankey that implies a NOT NULL constraint */
665 		impliesNN = NULL;
666 
667 		/*
668 		 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
669 		 * pass to handle after-last-key processing.  Actual exit from the
670 		 * loop is at one of the "break" statements below.
671 		 */
672 		for (cur = so->keyData, i = 0;; cur++, i++)
673 		{
674 			if (i >= so->numberOfKeys || cur->sk_attno != curattr)
675 			{
676 				/*
677 				 * Done looking at keys for curattr.  If we didn't find a
678 				 * usable boundary key, see if we can deduce a NOT NULL key.
679 				 */
680 				if (chosen == NULL && impliesNN != NULL &&
681 					((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
682 					 ScanDirectionIsForward(dir) :
683 					 ScanDirectionIsBackward(dir)))
684 				{
685 					/* Yes, so build the key in notnullkeys[keysCount] */
686 					chosen = &notnullkeys[keysCount];
687 					ScanKeyEntryInitialize(chosen,
688 										   (SK_SEARCHNOTNULL | SK_ISNULL |
689 											(impliesNN->sk_flags &
690 											 (SK_BT_DESC | SK_BT_NULLS_FIRST))),
691 										   curattr,
692 										   ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
693 											BTGreaterStrategyNumber :
694 											BTLessStrategyNumber),
695 										   InvalidOid,
696 										   InvalidOid,
697 										   InvalidOid,
698 										   (Datum) 0);
699 				}
700 
701 				/*
702 				 * If we still didn't find a usable boundary key, quit; else
703 				 * save the boundary key pointer in startKeys.
704 				 */
705 				if (chosen == NULL)
706 					break;
707 				startKeys[keysCount++] = chosen;
708 
709 				/*
710 				 * Adjust strat_total, and quit if we have stored a > or <
711 				 * key.
712 				 */
713 				strat = chosen->sk_strategy;
714 				if (strat != BTEqualStrategyNumber)
715 				{
716 					strat_total = strat;
717 					if (strat == BTGreaterStrategyNumber ||
718 						strat == BTLessStrategyNumber)
719 						break;
720 				}
721 
722 				/*
723 				 * Done if that was the last attribute, or if next key is not
724 				 * in sequence (implying no boundary key is available for the
725 				 * next attribute).
726 				 */
727 				if (i >= so->numberOfKeys ||
728 					cur->sk_attno != curattr + 1)
729 					break;
730 
731 				/*
732 				 * Reset for next attr.
733 				 */
734 				curattr = cur->sk_attno;
735 				chosen = NULL;
736 				impliesNN = NULL;
737 			}
738 
739 			/*
740 			 * Can we use this key as a starting boundary for this attr?
741 			 *
742 			 * If not, does it imply a NOT NULL constraint?  (Because
743 			 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
744 			 * *any* inequality key works for that; we need not test.)
745 			 */
746 			switch (cur->sk_strategy)
747 			{
748 				case BTLessStrategyNumber:
749 				case BTLessEqualStrategyNumber:
750 					if (chosen == NULL)
751 					{
752 						if (ScanDirectionIsBackward(dir))
753 							chosen = cur;
754 						else
755 							impliesNN = cur;
756 					}
757 					break;
758 				case BTEqualStrategyNumber:
759 					/* override any non-equality choice */
760 					chosen = cur;
761 					break;
762 				case BTGreaterEqualStrategyNumber:
763 				case BTGreaterStrategyNumber:
764 					if (chosen == NULL)
765 					{
766 						if (ScanDirectionIsForward(dir))
767 							chosen = cur;
768 						else
769 							impliesNN = cur;
770 					}
771 					break;
772 			}
773 		}
774 	}
775 
776 	/*
777 	 * If we found no usable boundary keys, we have to start from one end of
778 	 * the tree.  Walk down that edge to the first or last key, and scan from
779 	 * there.
780 	 */
781 	if (keysCount == 0)
782 	{
783 		bool		match;
784 
785 		match = _bt_endpoint(scan, dir);
786 
787 		if (!match)
788 		{
789 			/* No match, so mark (parallel) scan finished */
790 			_bt_parallel_done(scan);
791 		}
792 
793 		return match;
794 	}
795 
796 	/*
797 	 * We want to start the scan somewhere within the index.  Set up an
798 	 * insertion scankey we can use to search for the boundary point we
799 	 * identified above.  The insertion scankey is built in the local
800 	 * scankeys[] array, using the keys identified by startKeys[].
801 	 */
802 	Assert(keysCount <= INDEX_MAX_KEYS);
803 	for (i = 0; i < keysCount; i++)
804 	{
805 		ScanKey		cur = startKeys[i];
806 
807 		Assert(cur->sk_attno == i + 1);
808 
809 		if (cur->sk_flags & SK_ROW_HEADER)
810 		{
811 			/*
812 			 * Row comparison header: look to the first row member instead.
813 			 *
814 			 * The member scankeys are already in insertion format (ie, they
815 			 * have sk_func = 3-way-comparison function), but we have to watch
816 			 * out for nulls, which _bt_preprocess_keys didn't check. A null
817 			 * in the first row member makes the condition unmatchable, just
818 			 * like qual_ok = false.
819 			 */
820 			ScanKey		subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
821 
822 			Assert(subkey->sk_flags & SK_ROW_MEMBER);
823 			if (subkey->sk_flags & SK_ISNULL)
824 			{
825 				_bt_parallel_done(scan);
826 				return false;
827 			}
828 			memcpy(scankeys + i, subkey, sizeof(ScanKeyData));
829 
830 			/*
831 			 * If the row comparison is the last positioning key we accepted,
832 			 * try to add additional keys from the lower-order row members.
833 			 * (If we accepted independent conditions on additional index
834 			 * columns, we use those instead --- doesn't seem worth trying to
835 			 * determine which is more restrictive.)  Note that this is OK
836 			 * even if the row comparison is of ">" or "<" type, because the
837 			 * condition applied to all but the last row member is effectively
838 			 * ">=" or "<=", and so the extra keys don't break the positioning
839 			 * scheme.  But, by the same token, if we aren't able to use all
840 			 * the row members, then the part of the row comparison that we
841 			 * did use has to be treated as just a ">=" or "<=" condition, and
842 			 * so we'd better adjust strat_total accordingly.
843 			 */
844 			if (i == keysCount - 1)
845 			{
846 				bool		used_all_subkeys = false;
847 
848 				Assert(!(subkey->sk_flags & SK_ROW_END));
849 				for (;;)
850 				{
851 					subkey++;
852 					Assert(subkey->sk_flags & SK_ROW_MEMBER);
853 					if (subkey->sk_attno != keysCount + 1)
854 						break;	/* out-of-sequence, can't use it */
855 					if (subkey->sk_strategy != cur->sk_strategy)
856 						break;	/* wrong direction, can't use it */
857 					if (subkey->sk_flags & SK_ISNULL)
858 						break;	/* can't use null keys */
859 					Assert(keysCount < INDEX_MAX_KEYS);
860 					memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
861 					keysCount++;
862 					if (subkey->sk_flags & SK_ROW_END)
863 					{
864 						used_all_subkeys = true;
865 						break;
866 					}
867 				}
868 				if (!used_all_subkeys)
869 				{
870 					switch (strat_total)
871 					{
872 						case BTLessStrategyNumber:
873 							strat_total = BTLessEqualStrategyNumber;
874 							break;
875 						case BTGreaterStrategyNumber:
876 							strat_total = BTGreaterEqualStrategyNumber;
877 							break;
878 					}
879 				}
880 				break;			/* done with outer loop */
881 			}
882 		}
883 		else
884 		{
885 			/*
886 			 * Ordinary comparison key.  Transform the search-style scan key
887 			 * to an insertion scan key by replacing the sk_func with the
888 			 * appropriate btree comparison function.
889 			 *
890 			 * If scankey operator is not a cross-type comparison, we can use
891 			 * the cached comparison function; otherwise gotta look it up in
892 			 * the catalogs.  (That can't lead to infinite recursion, since no
893 			 * indexscan initiated by syscache lookup will use cross-data-type
894 			 * operators.)
895 			 *
896 			 * We support the convention that sk_subtype == InvalidOid means
897 			 * the opclass input type; this is a hack to simplify life for
898 			 * ScanKeyInit().
899 			 */
900 			if (cur->sk_subtype == rel->rd_opcintype[i] ||
901 				cur->sk_subtype == InvalidOid)
902 			{
903 				FmgrInfo   *procinfo;
904 
905 				procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
906 				ScanKeyEntryInitializeWithInfo(scankeys + i,
907 											   cur->sk_flags,
908 											   cur->sk_attno,
909 											   InvalidStrategy,
910 											   cur->sk_subtype,
911 											   cur->sk_collation,
912 											   procinfo,
913 											   cur->sk_argument);
914 			}
915 			else
916 			{
917 				RegProcedure cmp_proc;
918 
919 				cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
920 											 rel->rd_opcintype[i],
921 											 cur->sk_subtype,
922 											 BTORDER_PROC);
923 				if (!RegProcedureIsValid(cmp_proc))
924 					elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
925 						 BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
926 						 cur->sk_attno, RelationGetRelationName(rel));
927 				ScanKeyEntryInitialize(scankeys + i,
928 									   cur->sk_flags,
929 									   cur->sk_attno,
930 									   InvalidStrategy,
931 									   cur->sk_subtype,
932 									   cur->sk_collation,
933 									   cmp_proc,
934 									   cur->sk_argument);
935 			}
936 		}
937 	}
938 
939 	/*----------
940 	 * Examine the selected initial-positioning strategy to determine exactly
941 	 * where we need to start the scan, and set flag variables to control the
942 	 * code below.
943 	 *
944 	 * If nextkey = false, _bt_search and _bt_binsrch will locate the first
945 	 * item >= scan key.  If nextkey = true, they will locate the first
946 	 * item > scan key.
947 	 *
948 	 * If goback = true, we will then step back one item, while if
949 	 * goback = false, we will start the scan on the located item.
950 	 *----------
951 	 */
952 	switch (strat_total)
953 	{
954 		case BTLessStrategyNumber:
955 
956 			/*
957 			 * Find first item >= scankey, then back up one to arrive at last
958 			 * item < scankey.  (Note: this positioning strategy is only used
959 			 * for a backward scan, so that is always the correct starting
960 			 * position.)
961 			 */
962 			nextkey = false;
963 			goback = true;
964 			break;
965 
966 		case BTLessEqualStrategyNumber:
967 
968 			/*
969 			 * Find first item > scankey, then back up one to arrive at last
970 			 * item <= scankey.  (Note: this positioning strategy is only used
971 			 * for a backward scan, so that is always the correct starting
972 			 * position.)
973 			 */
974 			nextkey = true;
975 			goback = true;
976 			break;
977 
978 		case BTEqualStrategyNumber:
979 
980 			/*
981 			 * If a backward scan was specified, need to start with last equal
982 			 * item not first one.
983 			 */
984 			if (ScanDirectionIsBackward(dir))
985 			{
986 				/*
987 				 * This is the same as the <= strategy.  We will check at the
988 				 * end whether the found item is actually =.
989 				 */
990 				nextkey = true;
991 				goback = true;
992 			}
993 			else
994 			{
995 				/*
996 				 * This is the same as the >= strategy.  We will check at the
997 				 * end whether the found item is actually =.
998 				 */
999 				nextkey = false;
1000 				goback = false;
1001 			}
1002 			break;
1003 
1004 		case BTGreaterEqualStrategyNumber:
1005 
1006 			/*
1007 			 * Find first item >= scankey.  (This is only used for forward
1008 			 * scans.)
1009 			 */
1010 			nextkey = false;
1011 			goback = false;
1012 			break;
1013 
1014 		case BTGreaterStrategyNumber:
1015 
1016 			/*
1017 			 * Find first item > scankey.  (This is only used for forward
1018 			 * scans.)
1019 			 */
1020 			nextkey = true;
1021 			goback = false;
1022 			break;
1023 
1024 		default:
1025 			/* can't get here, but keep compiler quiet */
1026 			elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1027 			return false;
1028 	}
1029 
1030 	/*
1031 	 * Use the manufactured insertion scan key to descend the tree and
1032 	 * position ourselves on the target leaf page.
1033 	 */
1034 	stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ,
1035 					   scan->xs_snapshot);
1036 
1037 	/* don't need to keep the stack around... */
1038 	_bt_freestack(stack);
1039 
1040 	if (!BufferIsValid(buf))
1041 	{
1042 		/*
1043 		 * We only get here if the index is completely empty. Lock relation
1044 		 * because nothing finer to lock exists.
1045 		 */
1046 		PredicateLockRelation(rel, scan->xs_snapshot);
1047 
1048 		/*
1049 		 * mark parallel scan as done, so that all the workers can finish
1050 		 * their scan
1051 		 */
1052 		_bt_parallel_done(scan);
1053 		BTScanPosInvalidate(so->currPos);
1054 
1055 		return false;
1056 	}
1057 	else
1058 		PredicateLockPage(rel, BufferGetBlockNumber(buf),
1059 						  scan->xs_snapshot);
1060 
1061 	_bt_initialize_more_data(so, dir);
1062 
1063 	/* position to the precise item on the page */
1064 	offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
1065 
1066 	/*
1067 	 * If nextkey = false, we are positioned at the first item >= scan key, or
1068 	 * possibly at the end of a page on which all the existing items are less
1069 	 * than the scan key and we know that everything on later pages is greater
1070 	 * than or equal to scan key.
1071 	 *
1072 	 * If nextkey = true, we are positioned at the first item > scan key, or
1073 	 * possibly at the end of a page on which all the existing items are less
1074 	 * than or equal to the scan key and we know that everything on later
1075 	 * pages is greater than scan key.
1076 	 *
1077 	 * The actually desired starting point is either this item or the prior
1078 	 * one, or in the end-of-page case it's the first item on the next page or
1079 	 * the last item on this page.  Adjust the starting offset if needed. (If
1080 	 * this results in an offset before the first item or after the last one,
1081 	 * _bt_readpage will report no items found, and then we'll step to the
1082 	 * next page as needed.)
1083 	 */
1084 	if (goback)
1085 		offnum = OffsetNumberPrev(offnum);
1086 
1087 	/* remember which buffer we have pinned, if any */
1088 	Assert(!BTScanPosIsValid(so->currPos));
1089 	so->currPos.buf = buf;
1090 
1091 	/*
1092 	 * Now load data from the first page of the scan.
1093 	 */
1094 	if (!_bt_readpage(scan, dir, offnum))
1095 	{
1096 		/*
1097 		 * There's no actually-matching data on this page.  Try to advance to
1098 		 * the next page.  Return false if there's no matching data at all.
1099 		 */
1100 		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1101 		if (!_bt_steppage(scan, dir))
1102 			return false;
1103 	}
1104 	else
1105 	{
1106 		/* Drop the lock, and maybe the pin, on the current page */
1107 		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1108 	}
1109 
1110 readcomplete:
1111 	/* OK, itemIndex says what to return */
1112 	currItem = &so->currPos.items[so->currPos.itemIndex];
1113 	scan->xs_ctup.t_self = currItem->heapTid;
1114 	if (scan->xs_want_itup)
1115 		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1116 
1117 	return true;
1118 }
1119 
1120 /*
1121  *	_bt_next() -- Get the next item in a scan.
1122  *
1123  *		On entry, so->currPos describes the current page, which may be pinned
1124  *		but is not locked, and so->currPos.itemIndex identifies which item was
1125  *		previously returned.
1126  *
1127  *		On successful exit, scan->xs_ctup.t_self is set to the TID of the
1128  *		next heap tuple, and if requested, scan->xs_itup points to a copy of
1129  *		the index tuple.  so->currPos is updated as needed.
1130  *
1131  *		On failure exit (no more tuples), we release pin and set
1132  *		so->currPos.buf to InvalidBuffer.
1133  */
1134 bool
1135 _bt_next(IndexScanDesc scan, ScanDirection dir)
1136 {
1137 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
1138 	BTScanPosItem *currItem;
1139 
1140 	/*
1141 	 * Advance to next tuple on current page; or if there's no more, try to
1142 	 * step to the next page with data.
1143 	 */
1144 	if (ScanDirectionIsForward(dir))
1145 	{
1146 		if (++so->currPos.itemIndex > so->currPos.lastItem)
1147 		{
1148 			if (!_bt_steppage(scan, dir))
1149 				return false;
1150 		}
1151 	}
1152 	else
1153 	{
1154 		if (--so->currPos.itemIndex < so->currPos.firstItem)
1155 		{
1156 			if (!_bt_steppage(scan, dir))
1157 				return false;
1158 		}
1159 	}
1160 
1161 	/* OK, itemIndex says what to return */
1162 	currItem = &so->currPos.items[so->currPos.itemIndex];
1163 	scan->xs_ctup.t_self = currItem->heapTid;
1164 	if (scan->xs_want_itup)
1165 		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1166 
1167 	return true;
1168 }
1169 
1170 /*
1171  *	_bt_readpage() -- Load data from current index page into so->currPos
1172  *
1173  * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
1174  * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;
1175  * they are updated as appropriate.  All other fields of so->currPos are
1176  * initialized from scratch here.
1177  *
1178  * We scan the current page starting at offnum and moving in the indicated
1179  * direction.  All items matching the scan keys are loaded into currPos.items.
1180  * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
1181  * that there can be no more matching tuples in the current scan direction.
1182  *
1183  * In the case of a parallel scan, caller must have called _bt_parallel_seize
1184  * prior to calling this function; this function will invoke
1185  * _bt_parallel_release before returning.
1186  *
1187  * Returns true if any matching items found on the page, false if none.
1188  */
1189 static bool
1190 _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
1191 {
1192 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
1193 	Page		page;
1194 	BTPageOpaque opaque;
1195 	OffsetNumber minoff;
1196 	OffsetNumber maxoff;
1197 	int			itemIndex;
1198 	IndexTuple	itup;
1199 	bool		continuescan;
1200 
1201 	/*
1202 	 * We must have the buffer pinned and locked, but the usual macro can't be
1203 	 * used here; this function is what makes it good for currPos.
1204 	 */
1205 	Assert(BufferIsValid(so->currPos.buf));
1206 
1207 	page = BufferGetPage(so->currPos.buf);
1208 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1209 
1210 	/* allow next page be processed by parallel worker */
1211 	if (scan->parallel_scan)
1212 	{
1213 		if (ScanDirectionIsForward(dir))
1214 			_bt_parallel_release(scan, opaque->btpo_next);
1215 		else
1216 			_bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
1217 	}
1218 
1219 	minoff = P_FIRSTDATAKEY(opaque);
1220 	maxoff = PageGetMaxOffsetNumber(page);
1221 
1222 	/*
1223 	 * We note the buffer's block number so that we can release the pin later.
1224 	 * This allows us to re-read the buffer if it is needed again for hinting.
1225 	 */
1226 	so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
1227 
1228 	/*
1229 	 * We save the LSN of the page as we read it, so that we know whether it
1230 	 * safe to apply LP_DEAD hints to the page later.  This allows us to drop
1231 	 * the pin for MVCC scans, which allows vacuum to avoid blocking.
1232 	 */
1233 	so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
1234 
1235 	/*
1236 	 * we must save the page's right-link while scanning it; this tells us
1237 	 * where to step right to after we're done with these items.  There is no
1238 	 * corresponding need for the left-link, since splits always go right.
1239 	 */
1240 	so->currPos.nextPage = opaque->btpo_next;
1241 
1242 	/* initialize tuple workspace to empty */
1243 	so->currPos.nextTupleOffset = 0;
1244 
1245 	/*
1246 	 * Now that the current page has been made consistent, the macro should be
1247 	 * good.
1248 	 */
1249 	Assert(BTScanPosIsPinned(so->currPos));
1250 
1251 	if (ScanDirectionIsForward(dir))
1252 	{
1253 		/* load items[] in ascending order */
1254 		itemIndex = 0;
1255 
1256 		offnum = Max(offnum, minoff);
1257 
1258 		while (offnum <= maxoff)
1259 		{
1260 			itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
1261 			if (itup != NULL)
1262 			{
1263 				/* tuple passes all scan key conditions, so remember it */
1264 				_bt_saveitem(so, itemIndex, offnum, itup);
1265 				itemIndex++;
1266 			}
1267 			if (!continuescan)
1268 			{
1269 				/* there can't be any more matches, so stop */
1270 				so->currPos.moreRight = false;
1271 				break;
1272 			}
1273 
1274 			offnum = OffsetNumberNext(offnum);
1275 		}
1276 
1277 		Assert(itemIndex <= MaxIndexTuplesPerPage);
1278 		so->currPos.firstItem = 0;
1279 		so->currPos.lastItem = itemIndex - 1;
1280 		so->currPos.itemIndex = 0;
1281 	}
1282 	else
1283 	{
1284 		/* load items[] in descending order */
1285 		itemIndex = MaxIndexTuplesPerPage;
1286 
1287 		offnum = Min(offnum, maxoff);
1288 
1289 		while (offnum >= minoff)
1290 		{
1291 			itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
1292 			if (itup != NULL)
1293 			{
1294 				/* tuple passes all scan key conditions, so remember it */
1295 				itemIndex--;
1296 				_bt_saveitem(so, itemIndex, offnum, itup);
1297 			}
1298 			if (!continuescan)
1299 			{
1300 				/* there can't be any more matches, so stop */
1301 				so->currPos.moreLeft = false;
1302 				break;
1303 			}
1304 
1305 			offnum = OffsetNumberPrev(offnum);
1306 		}
1307 
1308 		Assert(itemIndex >= 0);
1309 		so->currPos.firstItem = itemIndex;
1310 		so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
1311 		so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
1312 	}
1313 
1314 	return (so->currPos.firstItem <= so->currPos.lastItem);
1315 }
1316 
1317 /* Save an index item into so->currPos.items[itemIndex] */
1318 static void
1319 _bt_saveitem(BTScanOpaque so, int itemIndex,
1320 			 OffsetNumber offnum, IndexTuple itup)
1321 {
1322 	BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1323 
1324 	currItem->heapTid = itup->t_tid;
1325 	currItem->indexOffset = offnum;
1326 	if (so->currTuples)
1327 	{
1328 		Size		itupsz = IndexTupleSize(itup);
1329 
1330 		currItem->tupleOffset = so->currPos.nextTupleOffset;
1331 		memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1332 		so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1333 	}
1334 }
1335 
1336 /*
1337  *	_bt_steppage() -- Step to next page containing valid data for scan
1338  *
1339  * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
1340  * if pinned, we'll drop the pin before moving to next page.  The buffer is
1341  * not locked on entry.
1342  *
1343  * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
1344  * read lock, on that page.  If we do not hold the pin, we set so->currPos.buf
1345  * to InvalidBuffer.  We return true to indicate success.
1346  */
1347 static bool
1348 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
1349 {
1350 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
1351 	BlockNumber blkno = InvalidBlockNumber;
1352 	bool		status = true;
1353 
1354 	Assert(BTScanPosIsValid(so->currPos));
1355 
1356 	/* Before leaving current page, deal with any killed items */
1357 	if (so->numKilled > 0)
1358 		_bt_killitems(scan);
1359 
1360 	/*
1361 	 * Before we modify currPos, make a copy of the page data if there was a
1362 	 * mark position that needs it.
1363 	 */
1364 	if (so->markItemIndex >= 0)
1365 	{
1366 		/* bump pin on current buffer for assignment to mark buffer */
1367 		if (BTScanPosIsPinned(so->currPos))
1368 			IncrBufferRefCount(so->currPos.buf);
1369 		memcpy(&so->markPos, &so->currPos,
1370 			   offsetof(BTScanPosData, items[1]) +
1371 			   so->currPos.lastItem * sizeof(BTScanPosItem));
1372 		if (so->markTuples)
1373 			memcpy(so->markTuples, so->currTuples,
1374 				   so->currPos.nextTupleOffset);
1375 		so->markPos.itemIndex = so->markItemIndex;
1376 		so->markItemIndex = -1;
1377 	}
1378 
1379 	if (ScanDirectionIsForward(dir))
1380 	{
1381 		/* Walk right to the next page with data */
1382 		if (scan->parallel_scan != NULL)
1383 		{
1384 			/*
1385 			 * Seize the scan to get the next block number; if the scan has
1386 			 * ended already, bail out.
1387 			 */
1388 			status = _bt_parallel_seize(scan, &blkno);
1389 			if (!status)
1390 			{
1391 				/* release the previous buffer, if pinned */
1392 				BTScanPosUnpinIfPinned(so->currPos);
1393 				BTScanPosInvalidate(so->currPos);
1394 				return false;
1395 			}
1396 		}
1397 		else
1398 		{
1399 			/* Not parallel, so use the previously-saved nextPage link. */
1400 			blkno = so->currPos.nextPage;
1401 		}
1402 
1403 		/* Remember we left a page with data */
1404 		so->currPos.moreLeft = true;
1405 
1406 		/* release the previous buffer, if pinned */
1407 		BTScanPosUnpinIfPinned(so->currPos);
1408 	}
1409 	else
1410 	{
1411 		/* Remember we left a page with data */
1412 		so->currPos.moreRight = true;
1413 
1414 		if (scan->parallel_scan != NULL)
1415 		{
1416 			/*
1417 			 * Seize the scan to get the current block number; if the scan has
1418 			 * ended already, bail out.
1419 			 */
1420 			status = _bt_parallel_seize(scan, &blkno);
1421 			BTScanPosUnpinIfPinned(so->currPos);
1422 			if (!status)
1423 			{
1424 				BTScanPosInvalidate(so->currPos);
1425 				return false;
1426 			}
1427 		}
1428 		else
1429 		{
1430 			/* Not parallel, so just use our own notion of the current page */
1431 			blkno = so->currPos.currPage;
1432 		}
1433 	}
1434 
1435 	if (!_bt_readnextpage(scan, blkno, dir))
1436 		return false;
1437 
1438 	/* Drop the lock, and maybe the pin, on the current page */
1439 	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1440 
1441 	return true;
1442 }
1443 
1444 /*
1445  *	_bt_readnextpage() -- Read next page containing valid data for scan
1446  *
1447  * On success exit, so->currPos is updated to contain data from the next
1448  * interesting page.  Caller is responsible to release lock and pin on
1449  * buffer on success.  We return true to indicate success.
1450  *
1451  * If there are no more matching records in the given direction, we drop all
1452  * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
1453  */
1454 static bool
1455 _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
1456 {
1457 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
1458 	Relation	rel;
1459 	Page		page;
1460 	BTPageOpaque opaque;
1461 	bool		status = true;
1462 
1463 	rel = scan->indexRelation;
1464 
1465 	if (ScanDirectionIsForward(dir))
1466 	{
1467 		for (;;)
1468 		{
1469 			/*
1470 			 * if we're at end of scan, give up and mark parallel scan as
1471 			 * done, so that all the workers can finish their scan
1472 			 */
1473 			if (blkno == P_NONE || !so->currPos.moreRight)
1474 			{
1475 				_bt_parallel_done(scan);
1476 				BTScanPosInvalidate(so->currPos);
1477 				return false;
1478 			}
1479 			/* check for interrupts while we're not holding any buffer lock */
1480 			CHECK_FOR_INTERRUPTS();
1481 			/* step right one page */
1482 			so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1483 			page = BufferGetPage(so->currPos.buf);
1484 			TestForOldSnapshot(scan->xs_snapshot, rel, page);
1485 			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1486 			/* check for deleted page */
1487 			if (!P_IGNORE(opaque))
1488 			{
1489 				PredicateLockPage(rel, blkno, scan->xs_snapshot);
1490 				/* see if there are any matches on this page */
1491 				/* note that this will clear moreRight if we can stop */
1492 				if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
1493 					break;
1494 			}
1495 			else if (scan->parallel_scan != NULL)
1496 			{
1497 				/* allow next page be processed by parallel worker */
1498 				_bt_parallel_release(scan, opaque->btpo_next);
1499 			}
1500 
1501 			/* nope, keep going */
1502 			if (scan->parallel_scan != NULL)
1503 			{
1504 				_bt_relbuf(rel, so->currPos.buf);
1505 				status = _bt_parallel_seize(scan, &blkno);
1506 				if (!status)
1507 				{
1508 					BTScanPosInvalidate(so->currPos);
1509 					return false;
1510 				}
1511 			}
1512 			else
1513 			{
1514 				blkno = opaque->btpo_next;
1515 				_bt_relbuf(rel, so->currPos.buf);
1516 			}
1517 		}
1518 	}
1519 	else
1520 	{
1521 		/*
1522 		 * Should only happen in parallel cases, when some other backend
1523 		 * advanced the scan.
1524 		 */
1525 		if (so->currPos.currPage != blkno)
1526 		{
1527 			BTScanPosUnpinIfPinned(so->currPos);
1528 			so->currPos.currPage = blkno;
1529 		}
1530 
1531 		/*
1532 		 * Walk left to the next page with data.  This is much more complex
1533 		 * than the walk-right case because of the possibility that the page
1534 		 * to our left splits while we are in flight to it, plus the
1535 		 * possibility that the page we were on gets deleted after we leave
1536 		 * it.  See nbtree/README for details.
1537 		 *
1538 		 * It might be possible to rearrange this code to have less overhead
1539 		 * in pinning and locking, but that would require capturing the left
1540 		 * pointer when the page is initially read, and using it here, along
1541 		 * with big changes to _bt_walk_left() and the code below.  It is not
1542 		 * clear whether this would be a win, since if the page immediately to
1543 		 * the left splits after we read this page and before we step left, we
1544 		 * would need to visit more pages than with the current code.
1545 		 *
1546 		 * Note that if we change the code so that we drop the pin for a scan
1547 		 * which uses a non-MVCC snapshot, we will need to modify the code for
1548 		 * walking left, to allow for the possibility that a referenced page
1549 		 * has been deleted.  As long as the buffer is pinned or the snapshot
1550 		 * is MVCC the page cannot move past the half-dead state to fully
1551 		 * deleted.
1552 		 */
1553 		if (BTScanPosIsPinned(so->currPos))
1554 			LockBuffer(so->currPos.buf, BT_READ);
1555 		else
1556 			so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
1557 
1558 		for (;;)
1559 		{
1560 			/* Done if we know there are no matching keys to the left */
1561 			if (!so->currPos.moreLeft)
1562 			{
1563 				_bt_relbuf(rel, so->currPos.buf);
1564 				_bt_parallel_done(scan);
1565 				BTScanPosInvalidate(so->currPos);
1566 				return false;
1567 			}
1568 
1569 			/* Step to next physical page */
1570 			so->currPos.buf = _bt_walk_left(rel, so->currPos.buf,
1571 											scan->xs_snapshot);
1572 
1573 			/* if we're physically at end of index, return failure */
1574 			if (so->currPos.buf == InvalidBuffer)
1575 			{
1576 				_bt_parallel_done(scan);
1577 				BTScanPosInvalidate(so->currPos);
1578 				return false;
1579 			}
1580 
1581 			/*
1582 			 * Okay, we managed to move left to a non-deleted page. Done if
1583 			 * it's not half-dead and contains matching tuples. Else loop back
1584 			 * and do it all again.
1585 			 */
1586 			page = BufferGetPage(so->currPos.buf);
1587 			TestForOldSnapshot(scan->xs_snapshot, rel, page);
1588 			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1589 			if (!P_IGNORE(opaque))
1590 			{
1591 				PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
1592 				/* see if there are any matches on this page */
1593 				/* note that this will clear moreLeft if we can stop */
1594 				if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
1595 					break;
1596 			}
1597 			else if (scan->parallel_scan != NULL)
1598 			{
1599 				/* allow next page be processed by parallel worker */
1600 				_bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
1601 			}
1602 
1603 			/*
1604 			 * For parallel scans, get the last page scanned as it is quite
1605 			 * possible that by the time we try to seize the scan, some other
1606 			 * worker has already advanced the scan to a different page.  We
1607 			 * must continue based on the latest page scanned by any worker.
1608 			 */
1609 			if (scan->parallel_scan != NULL)
1610 			{
1611 				_bt_relbuf(rel, so->currPos.buf);
1612 				status = _bt_parallel_seize(scan, &blkno);
1613 				if (!status)
1614 				{
1615 					BTScanPosInvalidate(so->currPos);
1616 					return false;
1617 				}
1618 				so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
1619 			}
1620 		}
1621 	}
1622 
1623 	return true;
1624 }
1625 
1626 /*
1627  *	_bt_parallel_readpage() -- Read current page containing valid data for scan
1628  *
1629  * On success, release lock and maybe pin on buffer.  We return true to
1630  * indicate success.
1631  */
1632 static bool
1633 _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
1634 {
1635 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
1636 
1637 	_bt_initialize_more_data(so, dir);
1638 
1639 	if (!_bt_readnextpage(scan, blkno, dir))
1640 		return false;
1641 
1642 	/* Drop the lock, and maybe the pin, on the current page */
1643 	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1644 
1645 	return true;
1646 }
1647 
1648 /*
1649  * _bt_walk_left() -- step left one page, if possible
1650  *
1651  * The given buffer must be pinned and read-locked.  This will be dropped
1652  * before stepping left.  On return, we have pin and read lock on the
1653  * returned page, instead.
1654  *
1655  * Returns InvalidBuffer if there is no page to the left (no lock is held
1656  * in that case).
1657  *
1658  * When working on a non-leaf level, it is possible for the returned page
1659  * to be half-dead; the caller should check that condition and step left
1660  * again if it's important.
1661  */
1662 static Buffer
1663 _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
1664 {
1665 	Page		page;
1666 	BTPageOpaque opaque;
1667 
1668 	page = BufferGetPage(buf);
1669 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1670 
1671 	for (;;)
1672 	{
1673 		BlockNumber obknum;
1674 		BlockNumber lblkno;
1675 		BlockNumber blkno;
1676 		int			tries;
1677 
1678 		/* if we're at end of tree, release buf and return failure */
1679 		if (P_LEFTMOST(opaque))
1680 		{
1681 			_bt_relbuf(rel, buf);
1682 			break;
1683 		}
1684 		/* remember original page we are stepping left from */
1685 		obknum = BufferGetBlockNumber(buf);
1686 		/* step left */
1687 		blkno = lblkno = opaque->btpo_prev;
1688 		_bt_relbuf(rel, buf);
1689 		/* check for interrupts while we're not holding any buffer lock */
1690 		CHECK_FOR_INTERRUPTS();
1691 		buf = _bt_getbuf(rel, blkno, BT_READ);
1692 		page = BufferGetPage(buf);
1693 		TestForOldSnapshot(snapshot, rel, page);
1694 		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1695 
1696 		/*
1697 		 * If this isn't the page we want, walk right till we find what we
1698 		 * want --- but go no more than four hops (an arbitrary limit). If we
1699 		 * don't find the correct page by then, the most likely bet is that
1700 		 * the original page got deleted and isn't in the sibling chain at all
1701 		 * anymore, not that its left sibling got split more than four times.
1702 		 *
1703 		 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
1704 		 * because half-dead pages are still in the sibling chain.  Caller
1705 		 * must reject half-dead pages if wanted.
1706 		 */
1707 		tries = 0;
1708 		for (;;)
1709 		{
1710 			if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
1711 			{
1712 				/* Found desired page, return it */
1713 				return buf;
1714 			}
1715 			if (P_RIGHTMOST(opaque) || ++tries > 4)
1716 				break;
1717 			blkno = opaque->btpo_next;
1718 			buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1719 			page = BufferGetPage(buf);
1720 			TestForOldSnapshot(snapshot, rel, page);
1721 			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1722 		}
1723 
1724 		/* Return to the original page to see what's up */
1725 		buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
1726 		page = BufferGetPage(buf);
1727 		TestForOldSnapshot(snapshot, rel, page);
1728 		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1729 		if (P_ISDELETED(opaque))
1730 		{
1731 			/*
1732 			 * It was deleted.  Move right to first nondeleted page (there
1733 			 * must be one); that is the page that has acquired the deleted
1734 			 * one's keyspace, so stepping left from it will take us where we
1735 			 * want to be.
1736 			 */
1737 			for (;;)
1738 			{
1739 				if (P_RIGHTMOST(opaque))
1740 					elog(ERROR, "fell off the end of index \"%s\"",
1741 						 RelationGetRelationName(rel));
1742 				blkno = opaque->btpo_next;
1743 				buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1744 				page = BufferGetPage(buf);
1745 				TestForOldSnapshot(snapshot, rel, page);
1746 				opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1747 				if (!P_ISDELETED(opaque))
1748 					break;
1749 			}
1750 
1751 			/*
1752 			 * Now return to top of loop, resetting obknum to point to this
1753 			 * nondeleted page, and try again.
1754 			 */
1755 		}
1756 		else
1757 		{
1758 			/*
1759 			 * It wasn't deleted; the explanation had better be that the page
1760 			 * to the left got split or deleted. Without this check, we'd go
1761 			 * into an infinite loop if there's anything wrong.
1762 			 */
1763 			if (opaque->btpo_prev == lblkno)
1764 				elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
1765 					 obknum, RelationGetRelationName(rel));
1766 			/* Okay to try again with new lblkno value */
1767 		}
1768 	}
1769 
1770 	return InvalidBuffer;
1771 }
1772 
1773 /*
1774  * _bt_get_endpoint() -- Find the first or last page on a given tree level
1775  *
1776  * If the index is empty, we will return InvalidBuffer; any other failure
1777  * condition causes ereport().  We will not return a dead page.
1778  *
1779  * The returned buffer is pinned and read-locked.
1780  */
1781 Buffer
1782 _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
1783 				 Snapshot snapshot)
1784 {
1785 	Buffer		buf;
1786 	Page		page;
1787 	BTPageOpaque opaque;
1788 	OffsetNumber offnum;
1789 	BlockNumber blkno;
1790 	IndexTuple	itup;
1791 
1792 	/*
1793 	 * If we are looking for a leaf page, okay to descend from fast root;
1794 	 * otherwise better descend from true root.  (There is no point in being
1795 	 * smarter about intermediate levels.)
1796 	 */
1797 	if (level == 0)
1798 		buf = _bt_getroot(rel, BT_READ);
1799 	else
1800 		buf = _bt_gettrueroot(rel);
1801 
1802 	if (!BufferIsValid(buf))
1803 		return InvalidBuffer;
1804 
1805 	page = BufferGetPage(buf);
1806 	TestForOldSnapshot(snapshot, rel, page);
1807 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1808 
1809 	for (;;)
1810 	{
1811 		/*
1812 		 * If we landed on a deleted page, step right to find a live page
1813 		 * (there must be one).  Also, if we want the rightmost page, step
1814 		 * right if needed to get to it (this could happen if the page split
1815 		 * since we obtained a pointer to it).
1816 		 */
1817 		while (P_IGNORE(opaque) ||
1818 			   (rightmost && !P_RIGHTMOST(opaque)))
1819 		{
1820 			blkno = opaque->btpo_next;
1821 			if (blkno == P_NONE)
1822 				elog(ERROR, "fell off the end of index \"%s\"",
1823 					 RelationGetRelationName(rel));
1824 			buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1825 			page = BufferGetPage(buf);
1826 			TestForOldSnapshot(snapshot, rel, page);
1827 			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1828 		}
1829 
1830 		/* Done? */
1831 		if (opaque->btpo.level == level)
1832 			break;
1833 		if (opaque->btpo.level < level)
1834 			elog(ERROR, "btree level %u not found in index \"%s\"",
1835 				 level, RelationGetRelationName(rel));
1836 
1837 		/* Descend to leftmost or rightmost child page */
1838 		if (rightmost)
1839 			offnum = PageGetMaxOffsetNumber(page);
1840 		else
1841 			offnum = P_FIRSTDATAKEY(opaque);
1842 
1843 		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
1844 		blkno = BTreeInnerTupleGetDownLink(itup);
1845 
1846 		buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1847 		page = BufferGetPage(buf);
1848 		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1849 	}
1850 
1851 	return buf;
1852 }
1853 
1854 /*
1855  *	_bt_endpoint() -- Find the first or last page in the index, and scan
1856  * from there to the first key satisfying all the quals.
1857  *
1858  * This is used by _bt_first() to set up a scan when we've determined
1859  * that the scan must start at the beginning or end of the index (for
1860  * a forward or backward scan respectively).  Exit conditions are the
1861  * same as for _bt_first().
1862  */
1863 static bool
1864 _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
1865 {
1866 	Relation	rel = scan->indexRelation;
1867 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
1868 	Buffer		buf;
1869 	Page		page;
1870 	BTPageOpaque opaque;
1871 	OffsetNumber start;
1872 	BTScanPosItem *currItem;
1873 
1874 	/*
1875 	 * Scan down to the leftmost or rightmost leaf page.  This is a simplified
1876 	 * version of _bt_search().  We don't maintain a stack since we know we
1877 	 * won't need it.
1878 	 */
1879 	buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);
1880 
1881 	if (!BufferIsValid(buf))
1882 	{
1883 		/*
1884 		 * Empty index. Lock the whole relation, as nothing finer to lock
1885 		 * exists.
1886 		 */
1887 		PredicateLockRelation(rel, scan->xs_snapshot);
1888 		BTScanPosInvalidate(so->currPos);
1889 		return false;
1890 	}
1891 
1892 	PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
1893 	page = BufferGetPage(buf);
1894 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1895 	Assert(P_ISLEAF(opaque));
1896 
1897 	if (ScanDirectionIsForward(dir))
1898 	{
1899 		/* There could be dead pages to the left, so not this: */
1900 		/* Assert(P_LEFTMOST(opaque)); */
1901 
1902 		start = P_FIRSTDATAKEY(opaque);
1903 	}
1904 	else if (ScanDirectionIsBackward(dir))
1905 	{
1906 		Assert(P_RIGHTMOST(opaque));
1907 
1908 		start = PageGetMaxOffsetNumber(page);
1909 	}
1910 	else
1911 	{
1912 		elog(ERROR, "invalid scan direction: %d", (int) dir);
1913 		start = 0;				/* keep compiler quiet */
1914 	}
1915 
1916 	/* remember which buffer we have pinned */
1917 	so->currPos.buf = buf;
1918 
1919 	_bt_initialize_more_data(so, dir);
1920 
1921 	/*
1922 	 * Now load data from the first page of the scan.
1923 	 */
1924 	if (!_bt_readpage(scan, dir, start))
1925 	{
1926 		/*
1927 		 * There's no actually-matching data on this page.  Try to advance to
1928 		 * the next page.  Return false if there's no matching data at all.
1929 		 */
1930 		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1931 		if (!_bt_steppage(scan, dir))
1932 			return false;
1933 	}
1934 	else
1935 	{
1936 		/* Drop the lock, and maybe the pin, on the current page */
1937 		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1938 	}
1939 
1940 	/* OK, itemIndex says what to return */
1941 	currItem = &so->currPos.items[so->currPos.itemIndex];
1942 	scan->xs_ctup.t_self = currItem->heapTid;
1943 	if (scan->xs_want_itup)
1944 		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1945 
1946 	return true;
1947 }
1948 
1949 /*
1950  * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
1951  * for scan direction
1952  */
1953 static inline void
1954 _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
1955 {
1956 	/* initialize moreLeft/moreRight appropriately for scan direction */
1957 	if (ScanDirectionIsForward(dir))
1958 	{
1959 		so->currPos.moreLeft = false;
1960 		so->currPos.moreRight = true;
1961 	}
1962 	else
1963 	{
1964 		so->currPos.moreLeft = true;
1965 		so->currPos.moreRight = false;
1966 	}
1967 	so->numKilled = 0;			/* just paranoia */
1968 	so->markItemIndex = -1;		/* ditto */
1969 }
1970