1 /*-------------------------------------------------------------------------
2  *
3  * nodeIndexonlyscan.c
4  *	  Routines to support index-only scans
5  *
6  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *	  src/backend/executor/nodeIndexonlyscan.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 /*
16  * INTERFACE ROUTINES
17  *		ExecIndexOnlyScan			scans an index
18  *		IndexOnlyNext				retrieve next tuple
19  *		ExecInitIndexOnlyScan		creates and initializes state info.
20  *		ExecReScanIndexOnlyScan		rescans the indexed relation.
21  *		ExecEndIndexOnlyScan		releases all storage.
22  *		ExecIndexOnlyMarkPos		marks scan position.
23  *		ExecIndexOnlyRestrPos		restores scan position.
24  */
25 #include "postgres.h"
26 
27 #include "access/relscan.h"
28 #include "access/visibilitymap.h"
29 #include "executor/execdebug.h"
30 #include "executor/nodeIndexonlyscan.h"
31 #include "executor/nodeIndexscan.h"
32 #include "storage/bufmgr.h"
33 #include "storage/predicate.h"
34 #include "utils/memutils.h"
35 #include "utils/rel.h"
36 
37 
38 static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node);
39 static void StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup,
40 				TupleDesc itupdesc);
41 
42 
43 /* ----------------------------------------------------------------
44  *		IndexOnlyNext
45  *
46  *		Retrieve a tuple from the IndexOnlyScan node's index.
47  * ----------------------------------------------------------------
48  */
49 static TupleTableSlot *
IndexOnlyNext(IndexOnlyScanState * node)50 IndexOnlyNext(IndexOnlyScanState *node)
51 {
52 	EState	   *estate;
53 	ExprContext *econtext;
54 	ScanDirection direction;
55 	IndexScanDesc scandesc;
56 	TupleTableSlot *slot;
57 	ItemPointer tid;
58 
59 	/*
60 	 * extract necessary information from index scan node
61 	 */
62 	estate = node->ss.ps.state;
63 	direction = estate->es_direction;
64 	/* flip direction if this is an overall backward scan */
65 	if (ScanDirectionIsBackward(((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir))
66 	{
67 		if (ScanDirectionIsForward(direction))
68 			direction = BackwardScanDirection;
69 		else if (ScanDirectionIsBackward(direction))
70 			direction = ForwardScanDirection;
71 	}
72 	scandesc = node->ioss_ScanDesc;
73 	econtext = node->ss.ps.ps_ExprContext;
74 	slot = node->ss.ss_ScanTupleSlot;
75 
76 	/*
77 	 * OK, now that we have what we need, fetch the next tuple.
78 	 */
79 	while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
80 	{
81 		HeapTuple	tuple = NULL;
82 
83 		/*
84 		 * We can skip the heap fetch if the TID references a heap page on
85 		 * which all tuples are known visible to everybody.  In any case,
86 		 * we'll use the index tuple not the heap tuple as the data source.
87 		 *
88 		 * Note on Memory Ordering Effects: visibilitymap_get_status does not
89 		 * lock the visibility map buffer, and therefore the result we read
90 		 * here could be slightly stale.  However, it can't be stale enough to
91 		 * matter.
92 		 *
93 		 * We need to detect clearing a VM bit due to an insert right away,
94 		 * because the tuple is present in the index page but not visible. The
95 		 * reading of the TID by this scan (using a shared lock on the index
96 		 * buffer) is serialized with the insert of the TID into the index
97 		 * (using an exclusive lock on the index buffer). Because the VM bit
98 		 * is cleared before updating the index, and locking/unlocking of the
99 		 * index page acts as a full memory barrier, we are sure to see the
100 		 * cleared bit if we see a recently-inserted TID.
101 		 *
102 		 * Deletes do not update the index page (only VACUUM will clear out
103 		 * the TID), so the clearing of the VM bit by a delete is not
104 		 * serialized with this test below, and we may see a value that is
105 		 * significantly stale. However, we don't care about the delete right
106 		 * away, because the tuple is still visible until the deleting
107 		 * transaction commits or the statement ends (if it's our
108 		 * transaction). In either case, the lock on the VM buffer will have
109 		 * been released (acting as a write barrier) after clearing the bit.
110 		 * And for us to have a snapshot that includes the deleting
111 		 * transaction (making the tuple invisible), we must have acquired
112 		 * ProcArrayLock after that time, acting as a read barrier.
113 		 *
114 		 * It's worth going through this complexity to avoid needing to lock
115 		 * the VM buffer, which could cause significant contention.
116 		 */
117 		if (!VM_ALL_VISIBLE(scandesc->heapRelation,
118 							ItemPointerGetBlockNumber(tid),
119 							&node->ioss_VMBuffer))
120 		{
121 			/*
122 			 * Rats, we have to visit the heap to check visibility.
123 			 */
124 			node->ioss_HeapFetches++;
125 			tuple = index_fetch_heap(scandesc);
126 			if (tuple == NULL)
127 				continue;		/* no visible tuple, try next index entry */
128 
129 			/*
130 			 * Only MVCC snapshots are supported here, so there should be no
131 			 * need to keep following the HOT chain once a visible entry has
132 			 * been found.  If we did want to allow that, we'd need to keep
133 			 * more state to remember not to call index_getnext_tid next time.
134 			 */
135 			if (scandesc->xs_continue_hot)
136 				elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
137 
138 			/*
139 			 * Note: at this point we are holding a pin on the heap page, as
140 			 * recorded in scandesc->xs_cbuf.  We could release that pin now,
141 			 * but it's not clear whether it's a win to do so.  The next index
142 			 * entry might require a visit to the same heap page.
143 			 */
144 		}
145 
146 		/*
147 		 * Fill the scan tuple slot with data from the index.
148 		 */
149 		StoreIndexTuple(slot, scandesc->xs_itup, scandesc->xs_itupdesc);
150 
151 		/*
152 		 * If the index was lossy, we have to recheck the index quals.
153 		 * (Currently, this can never happen, but we should support the case
154 		 * for possible future use, eg with GiST indexes.)
155 		 */
156 		if (scandesc->xs_recheck)
157 		{
158 			econtext->ecxt_scantuple = slot;
159 			ResetExprContext(econtext);
160 			if (!ExecQual(node->indexqual, econtext, false))
161 			{
162 				/* Fails recheck, so drop it and loop back for another */
163 				InstrCountFiltered2(node, 1);
164 				continue;
165 			}
166 		}
167 
168 		/*
169 		 * We don't currently support rechecking ORDER BY distances.  (In
170 		 * principle, if the index can support retrieval of the originally
171 		 * indexed value, it should be able to produce an exact distance
172 		 * calculation too.  So it's not clear that adding code here for
173 		 * recheck/re-sort would be worth the trouble.  But we should at least
174 		 * throw an error if someone tries it.)
175 		 */
176 		if (scandesc->numberOfOrderBys > 0 && scandesc->xs_recheckorderby)
177 			ereport(ERROR,
178 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
179 					 errmsg("lossy distance functions are not supported in index-only scans")));
180 
181 		/*
182 		 * If we didn't access the heap, then we'll need to take a predicate
183 		 * lock explicitly, as if we had.  For now we do that at page level.
184 		 */
185 		if (tuple == NULL)
186 			PredicateLockPage(scandesc->heapRelation,
187 							  ItemPointerGetBlockNumber(tid),
188 							  estate->es_snapshot);
189 
190 		return slot;
191 	}
192 
193 	/*
194 	 * if we get here it means the index scan failed so we are at the end of
195 	 * the scan..
196 	 */
197 	return ExecClearTuple(slot);
198 }
199 
200 /*
201  * StoreIndexTuple
202  *		Fill the slot with data from the index tuple.
203  *
204  * At some point this might be generally-useful functionality, but
205  * right now we don't need it elsewhere.
206  */
207 static void
StoreIndexTuple(TupleTableSlot * slot,IndexTuple itup,TupleDesc itupdesc)208 StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup, TupleDesc itupdesc)
209 {
210 	int			nindexatts = itupdesc->natts;
211 	Datum	   *values = slot->tts_values;
212 	bool	   *isnull = slot->tts_isnull;
213 	int			i;
214 
215 	/*
216 	 * Note: we must use the tupdesc supplied by the AM in index_getattr, not
217 	 * the slot's tupdesc, in case the latter has different datatypes (this
218 	 * happens for btree name_ops in particular).  They'd better have the same
219 	 * number of columns though, as well as being datatype-compatible which is
220 	 * something we can't so easily check.
221 	 */
222 	Assert(slot->tts_tupleDescriptor->natts == nindexatts);
223 
224 	ExecClearTuple(slot);
225 	for (i = 0; i < nindexatts; i++)
226 		values[i] = index_getattr(itup, i + 1, itupdesc, &isnull[i]);
227 	ExecStoreVirtualTuple(slot);
228 }
229 
230 /*
231  * IndexOnlyRecheck -- access method routine to recheck a tuple in EvalPlanQual
232  *
233  * This can't really happen, since an index can't supply CTID which would
234  * be necessary data for any potential EvalPlanQual target relation.  If it
235  * did happen, the EPQ code would pass us the wrong data, namely a heap
236  * tuple not an index tuple.  So throw an error.
237  */
238 static bool
IndexOnlyRecheck(IndexOnlyScanState * node,TupleTableSlot * slot)239 IndexOnlyRecheck(IndexOnlyScanState *node, TupleTableSlot *slot)
240 {
241 	elog(ERROR, "EvalPlanQual recheck is not supported in index-only scans");
242 	return false;				/* keep compiler quiet */
243 }
244 
245 /* ----------------------------------------------------------------
246  *		ExecIndexOnlyScan(node)
247  * ----------------------------------------------------------------
248  */
249 TupleTableSlot *
ExecIndexOnlyScan(IndexOnlyScanState * node)250 ExecIndexOnlyScan(IndexOnlyScanState *node)
251 {
252 	/*
253 	 * If we have runtime keys and they've not already been set up, do it now.
254 	 */
255 	if (node->ioss_NumRuntimeKeys != 0 && !node->ioss_RuntimeKeysReady)
256 		ExecReScan((PlanState *) node);
257 
258 	return ExecScan(&node->ss,
259 					(ExecScanAccessMtd) IndexOnlyNext,
260 					(ExecScanRecheckMtd) IndexOnlyRecheck);
261 }
262 
263 /* ----------------------------------------------------------------
264  *		ExecReScanIndexOnlyScan(node)
265  *
266  *		Recalculates the values of any scan keys whose value depends on
267  *		information known at runtime, then rescans the indexed relation.
268  *
269  *		Updating the scan key was formerly done separately in
270  *		ExecUpdateIndexScanKeys. Integrating it into ReScan makes
271  *		rescans of indices and relations/general streams more uniform.
272  * ----------------------------------------------------------------
273  */
274 void
ExecReScanIndexOnlyScan(IndexOnlyScanState * node)275 ExecReScanIndexOnlyScan(IndexOnlyScanState *node)
276 {
277 	/*
278 	 * If we are doing runtime key calculations (ie, any of the index key
279 	 * values weren't simple Consts), compute the new key values.  But first,
280 	 * reset the context so we don't leak memory as each outer tuple is
281 	 * scanned.  Note this assumes that we will recalculate *all* runtime keys
282 	 * on each call.
283 	 */
284 	if (node->ioss_NumRuntimeKeys != 0)
285 	{
286 		ExprContext *econtext = node->ioss_RuntimeContext;
287 
288 		ResetExprContext(econtext);
289 		ExecIndexEvalRuntimeKeys(econtext,
290 								 node->ioss_RuntimeKeys,
291 								 node->ioss_NumRuntimeKeys);
292 	}
293 	node->ioss_RuntimeKeysReady = true;
294 
295 	/* reset index scan */
296 	index_rescan(node->ioss_ScanDesc,
297 				 node->ioss_ScanKeys, node->ioss_NumScanKeys,
298 				 node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);
299 
300 	ExecScanReScan(&node->ss);
301 }
302 
303 
304 /* ----------------------------------------------------------------
305  *		ExecEndIndexOnlyScan
306  * ----------------------------------------------------------------
307  */
308 void
ExecEndIndexOnlyScan(IndexOnlyScanState * node)309 ExecEndIndexOnlyScan(IndexOnlyScanState *node)
310 {
311 	Relation	indexRelationDesc;
312 	IndexScanDesc indexScanDesc;
313 	Relation	relation;
314 
315 	/*
316 	 * extract information from the node
317 	 */
318 	indexRelationDesc = node->ioss_RelationDesc;
319 	indexScanDesc = node->ioss_ScanDesc;
320 	relation = node->ss.ss_currentRelation;
321 
322 	/* Release VM buffer pin, if any. */
323 	if (node->ioss_VMBuffer != InvalidBuffer)
324 	{
325 		ReleaseBuffer(node->ioss_VMBuffer);
326 		node->ioss_VMBuffer = InvalidBuffer;
327 	}
328 
329 	/*
330 	 * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext
331 	 */
332 #ifdef NOT_USED
333 	ExecFreeExprContext(&node->ss.ps);
334 	if (node->ioss_RuntimeContext)
335 		FreeExprContext(node->ioss_RuntimeContext, true);
336 #endif
337 
338 	/*
339 	 * clear out tuple table slots
340 	 */
341 	ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
342 	ExecClearTuple(node->ss.ss_ScanTupleSlot);
343 
344 	/*
345 	 * close the index relation (no-op if we didn't open it)
346 	 */
347 	if (indexScanDesc)
348 		index_endscan(indexScanDesc);
349 	if (indexRelationDesc)
350 		index_close(indexRelationDesc, NoLock);
351 
352 	/*
353 	 * close the heap relation.
354 	 */
355 	ExecCloseScanRelation(relation);
356 }
357 
358 /* ----------------------------------------------------------------
359  *		ExecIndexOnlyMarkPos
360  * ----------------------------------------------------------------
361  */
362 void
ExecIndexOnlyMarkPos(IndexOnlyScanState * node)363 ExecIndexOnlyMarkPos(IndexOnlyScanState *node)
364 {
365 	index_markpos(node->ioss_ScanDesc);
366 }
367 
368 /* ----------------------------------------------------------------
369  *		ExecIndexOnlyRestrPos
370  * ----------------------------------------------------------------
371  */
372 void
ExecIndexOnlyRestrPos(IndexOnlyScanState * node)373 ExecIndexOnlyRestrPos(IndexOnlyScanState *node)
374 {
375 	index_restrpos(node->ioss_ScanDesc);
376 }
377 
378 /* ----------------------------------------------------------------
379  *		ExecInitIndexOnlyScan
380  *
381  *		Initializes the index scan's state information, creates
382  *		scan keys, and opens the base and index relations.
383  *
384  *		Note: index scans have 2 sets of state information because
385  *			  we have to keep track of the base relation and the
386  *			  index relation.
387  * ----------------------------------------------------------------
388  */
389 IndexOnlyScanState *
ExecInitIndexOnlyScan(IndexOnlyScan * node,EState * estate,int eflags)390 ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
391 {
392 	IndexOnlyScanState *indexstate;
393 	Relation	currentRelation;
394 	bool		relistarget;
395 	TupleDesc	tupDesc;
396 
397 	/*
398 	 * create state structure
399 	 */
400 	indexstate = makeNode(IndexOnlyScanState);
401 	indexstate->ss.ps.plan = (Plan *) node;
402 	indexstate->ss.ps.state = estate;
403 	indexstate->ioss_HeapFetches = 0;
404 
405 	/*
406 	 * Miscellaneous initialization
407 	 *
408 	 * create expression context for node
409 	 */
410 	ExecAssignExprContext(estate, &indexstate->ss.ps);
411 
412 	indexstate->ss.ps.ps_TupFromTlist = false;
413 
414 	/*
415 	 * initialize child expressions
416 	 *
417 	 * Note: we don't initialize all of the indexorderby expression, only the
418 	 * sub-parts corresponding to runtime keys (see below).
419 	 */
420 	indexstate->ss.ps.targetlist = (List *)
421 		ExecInitExpr((Expr *) node->scan.plan.targetlist,
422 					 (PlanState *) indexstate);
423 	indexstate->ss.ps.qual = (List *)
424 		ExecInitExpr((Expr *) node->scan.plan.qual,
425 					 (PlanState *) indexstate);
426 	indexstate->indexqual = (List *)
427 		ExecInitExpr((Expr *) node->indexqual,
428 					 (PlanState *) indexstate);
429 
430 	/*
431 	 * tuple table initialization
432 	 */
433 	ExecInitResultTupleSlot(estate, &indexstate->ss.ps);
434 	ExecInitScanTupleSlot(estate, &indexstate->ss);
435 
436 	/*
437 	 * open the base relation and acquire appropriate lock on it.
438 	 */
439 	currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
440 
441 	indexstate->ss.ss_currentRelation = currentRelation;
442 	indexstate->ss.ss_currentScanDesc = NULL;	/* no heap scan here */
443 
444 	/*
445 	 * Build the scan tuple type using the indextlist generated by the
446 	 * planner.  We use this, rather than the index's physical tuple
447 	 * descriptor, because the latter contains storage column types not the
448 	 * types of the original datums.  (It's the AM's responsibility to return
449 	 * suitable data anyway.)
450 	 */
451 	tupDesc = ExecTypeFromTL(node->indextlist, false);
452 	ExecAssignScanType(&indexstate->ss, tupDesc);
453 
454 	/*
455 	 * Initialize result tuple type and projection info.  The node's
456 	 * targetlist will contain Vars with varno = INDEX_VAR, referencing the
457 	 * scan tuple.
458 	 */
459 	ExecAssignResultTypeFromTL(&indexstate->ss.ps);
460 	ExecAssignScanProjectionInfoWithVarno(&indexstate->ss, INDEX_VAR);
461 
462 	/*
463 	 * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop
464 	 * here.  This allows an index-advisor plugin to EXPLAIN a plan containing
465 	 * references to nonexistent indexes.
466 	 */
467 	if (eflags & EXEC_FLAG_EXPLAIN_ONLY)
468 		return indexstate;
469 
470 	/*
471 	 * Open the index relation.
472 	 *
473 	 * If the parent table is one of the target relations of the query, then
474 	 * InitPlan already opened and write-locked the index, so we can avoid
475 	 * taking another lock here.  Otherwise we need a normal reader's lock.
476 	 */
477 	relistarget = ExecRelationIsTargetRelation(estate, node->scan.scanrelid);
478 	indexstate->ioss_RelationDesc = index_open(node->indexid,
479 									 relistarget ? NoLock : AccessShareLock);
480 
481 	/*
482 	 * Initialize index-specific scan state
483 	 */
484 	indexstate->ioss_RuntimeKeysReady = false;
485 	indexstate->ioss_RuntimeKeys = NULL;
486 	indexstate->ioss_NumRuntimeKeys = 0;
487 
488 	/*
489 	 * build the index scan keys from the index qualification
490 	 */
491 	ExecIndexBuildScanKeys((PlanState *) indexstate,
492 						   indexstate->ioss_RelationDesc,
493 						   node->indexqual,
494 						   false,
495 						   &indexstate->ioss_ScanKeys,
496 						   &indexstate->ioss_NumScanKeys,
497 						   &indexstate->ioss_RuntimeKeys,
498 						   &indexstate->ioss_NumRuntimeKeys,
499 						   NULL,	/* no ArrayKeys */
500 						   NULL);
501 
502 	/*
503 	 * any ORDER BY exprs have to be turned into scankeys in the same way
504 	 */
505 	ExecIndexBuildScanKeys((PlanState *) indexstate,
506 						   indexstate->ioss_RelationDesc,
507 						   node->indexorderby,
508 						   true,
509 						   &indexstate->ioss_OrderByKeys,
510 						   &indexstate->ioss_NumOrderByKeys,
511 						   &indexstate->ioss_RuntimeKeys,
512 						   &indexstate->ioss_NumRuntimeKeys,
513 						   NULL,	/* no ArrayKeys */
514 						   NULL);
515 
516 	/*
517 	 * If we have runtime keys, we need an ExprContext to evaluate them. The
518 	 * node's standard context won't do because we want to reset that context
519 	 * for every tuple.  So, build another context just like the other one...
520 	 * -tgl 7/11/00
521 	 */
522 	if (indexstate->ioss_NumRuntimeKeys != 0)
523 	{
524 		ExprContext *stdecontext = indexstate->ss.ps.ps_ExprContext;
525 
526 		ExecAssignExprContext(estate, &indexstate->ss.ps);
527 		indexstate->ioss_RuntimeContext = indexstate->ss.ps.ps_ExprContext;
528 		indexstate->ss.ps.ps_ExprContext = stdecontext;
529 	}
530 	else
531 	{
532 		indexstate->ioss_RuntimeContext = NULL;
533 	}
534 
535 	/*
536 	 * Initialize scan descriptor.
537 	 */
538 	indexstate->ioss_ScanDesc = index_beginscan(currentRelation,
539 												indexstate->ioss_RelationDesc,
540 												estate->es_snapshot,
541 												indexstate->ioss_NumScanKeys,
542 											indexstate->ioss_NumOrderByKeys);
543 
544 	/* Set it up for index-only scan */
545 	indexstate->ioss_ScanDesc->xs_want_itup = true;
546 	indexstate->ioss_VMBuffer = InvalidBuffer;
547 
548 	/*
549 	 * If no run-time keys to calculate, go ahead and pass the scankeys to the
550 	 * index AM.
551 	 */
552 	if (indexstate->ioss_NumRuntimeKeys == 0)
553 		index_rescan(indexstate->ioss_ScanDesc,
554 					 indexstate->ioss_ScanKeys,
555 					 indexstate->ioss_NumScanKeys,
556 					 indexstate->ioss_OrderByKeys,
557 					 indexstate->ioss_NumOrderByKeys);
558 
559 	/*
560 	 * all done.
561 	 */
562 	return indexstate;
563 }
564