1 /*-------------------------------------------------------------------------
2 *
3 * nodeIndexonlyscan.c
4 * Routines to support index-only scans
5 *
6 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/executor/nodeIndexonlyscan.c
12 *
13 *-------------------------------------------------------------------------
14 */
15 /*
16 * INTERFACE ROUTINES
17 * ExecIndexOnlyScan scans an index
18 * IndexOnlyNext retrieve next tuple
19 * ExecInitIndexOnlyScan creates and initializes state info.
20 * ExecReScanIndexOnlyScan rescans the indexed relation.
21 * ExecEndIndexOnlyScan releases all storage.
22 * ExecIndexOnlyMarkPos marks scan position.
23 * ExecIndexOnlyRestrPos restores scan position.
24 */
25 #include "postgres.h"
26
27 #include "access/relscan.h"
28 #include "access/visibilitymap.h"
29 #include "executor/execdebug.h"
30 #include "executor/nodeIndexonlyscan.h"
31 #include "executor/nodeIndexscan.h"
32 #include "storage/bufmgr.h"
33 #include "storage/predicate.h"
34 #include "utils/memutils.h"
35 #include "utils/rel.h"
36
37
38 static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node);
39 static void StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup,
40 TupleDesc itupdesc);
41
42
43 /* ----------------------------------------------------------------
44 * IndexOnlyNext
45 *
46 * Retrieve a tuple from the IndexOnlyScan node's index.
47 * ----------------------------------------------------------------
48 */
49 static TupleTableSlot *
IndexOnlyNext(IndexOnlyScanState * node)50 IndexOnlyNext(IndexOnlyScanState *node)
51 {
52 EState *estate;
53 ExprContext *econtext;
54 ScanDirection direction;
55 IndexScanDesc scandesc;
56 TupleTableSlot *slot;
57 ItemPointer tid;
58
59 /*
60 * extract necessary information from index scan node
61 */
62 estate = node->ss.ps.state;
63 direction = estate->es_direction;
64 /* flip direction if this is an overall backward scan */
65 if (ScanDirectionIsBackward(((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir))
66 {
67 if (ScanDirectionIsForward(direction))
68 direction = BackwardScanDirection;
69 else if (ScanDirectionIsBackward(direction))
70 direction = ForwardScanDirection;
71 }
72 scandesc = node->ioss_ScanDesc;
73 econtext = node->ss.ps.ps_ExprContext;
74 slot = node->ss.ss_ScanTupleSlot;
75
76 /*
77 * OK, now that we have what we need, fetch the next tuple.
78 */
79 while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
80 {
81 HeapTuple tuple = NULL;
82
83 /*
84 * We can skip the heap fetch if the TID references a heap page on
85 * which all tuples are known visible to everybody. In any case,
86 * we'll use the index tuple not the heap tuple as the data source.
87 *
88 * Note on Memory Ordering Effects: visibilitymap_get_status does not
89 * lock the visibility map buffer, and therefore the result we read
90 * here could be slightly stale. However, it can't be stale enough to
91 * matter.
92 *
93 * We need to detect clearing a VM bit due to an insert right away,
94 * because the tuple is present in the index page but not visible. The
95 * reading of the TID by this scan (using a shared lock on the index
96 * buffer) is serialized with the insert of the TID into the index
97 * (using an exclusive lock on the index buffer). Because the VM bit
98 * is cleared before updating the index, and locking/unlocking of the
99 * index page acts as a full memory barrier, we are sure to see the
100 * cleared bit if we see a recently-inserted TID.
101 *
102 * Deletes do not update the index page (only VACUUM will clear out
103 * the TID), so the clearing of the VM bit by a delete is not
104 * serialized with this test below, and we may see a value that is
105 * significantly stale. However, we don't care about the delete right
106 * away, because the tuple is still visible until the deleting
107 * transaction commits or the statement ends (if it's our
108 * transaction). In either case, the lock on the VM buffer will have
109 * been released (acting as a write barrier) after clearing the bit.
110 * And for us to have a snapshot that includes the deleting
111 * transaction (making the tuple invisible), we must have acquired
112 * ProcArrayLock after that time, acting as a read barrier.
113 *
114 * It's worth going through this complexity to avoid needing to lock
115 * the VM buffer, which could cause significant contention.
116 */
117 if (!VM_ALL_VISIBLE(scandesc->heapRelation,
118 ItemPointerGetBlockNumber(tid),
119 &node->ioss_VMBuffer))
120 {
121 /*
122 * Rats, we have to visit the heap to check visibility.
123 */
124 node->ioss_HeapFetches++;
125 tuple = index_fetch_heap(scandesc);
126 if (tuple == NULL)
127 continue; /* no visible tuple, try next index entry */
128
129 /*
130 * Only MVCC snapshots are supported here, so there should be no
131 * need to keep following the HOT chain once a visible entry has
132 * been found. If we did want to allow that, we'd need to keep
133 * more state to remember not to call index_getnext_tid next time.
134 */
135 if (scandesc->xs_continue_hot)
136 elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
137
138 /*
139 * Note: at this point we are holding a pin on the heap page, as
140 * recorded in scandesc->xs_cbuf. We could release that pin now,
141 * but it's not clear whether it's a win to do so. The next index
142 * entry might require a visit to the same heap page.
143 */
144 }
145
146 /*
147 * Fill the scan tuple slot with data from the index.
148 */
149 StoreIndexTuple(slot, scandesc->xs_itup, scandesc->xs_itupdesc);
150
151 /*
152 * If the index was lossy, we have to recheck the index quals.
153 * (Currently, this can never happen, but we should support the case
154 * for possible future use, eg with GiST indexes.)
155 */
156 if (scandesc->xs_recheck)
157 {
158 econtext->ecxt_scantuple = slot;
159 ResetExprContext(econtext);
160 if (!ExecQual(node->indexqual, econtext, false))
161 {
162 /* Fails recheck, so drop it and loop back for another */
163 InstrCountFiltered2(node, 1);
164 continue;
165 }
166 }
167
168 /*
169 * We don't currently support rechecking ORDER BY distances. (In
170 * principle, if the index can support retrieval of the originally
171 * indexed value, it should be able to produce an exact distance
172 * calculation too. So it's not clear that adding code here for
173 * recheck/re-sort would be worth the trouble. But we should at least
174 * throw an error if someone tries it.)
175 */
176 if (scandesc->numberOfOrderBys > 0 && scandesc->xs_recheckorderby)
177 ereport(ERROR,
178 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
179 errmsg("lossy distance functions are not supported in index-only scans")));
180
181 /*
182 * If we didn't access the heap, then we'll need to take a predicate
183 * lock explicitly, as if we had. For now we do that at page level.
184 */
185 if (tuple == NULL)
186 PredicateLockPage(scandesc->heapRelation,
187 ItemPointerGetBlockNumber(tid),
188 estate->es_snapshot);
189
190 return slot;
191 }
192
193 /*
194 * if we get here it means the index scan failed so we are at the end of
195 * the scan..
196 */
197 return ExecClearTuple(slot);
198 }
199
200 /*
201 * StoreIndexTuple
202 * Fill the slot with data from the index tuple.
203 *
204 * At some point this might be generally-useful functionality, but
205 * right now we don't need it elsewhere.
206 */
207 static void
StoreIndexTuple(TupleTableSlot * slot,IndexTuple itup,TupleDesc itupdesc)208 StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup, TupleDesc itupdesc)
209 {
210 int nindexatts = itupdesc->natts;
211 Datum *values = slot->tts_values;
212 bool *isnull = slot->tts_isnull;
213 int i;
214
215 /*
216 * Note: we must use the tupdesc supplied by the AM in index_getattr, not
217 * the slot's tupdesc, in case the latter has different datatypes (this
218 * happens for btree name_ops in particular). They'd better have the same
219 * number of columns though, as well as being datatype-compatible which is
220 * something we can't so easily check.
221 */
222 Assert(slot->tts_tupleDescriptor->natts == nindexatts);
223
224 ExecClearTuple(slot);
225 for (i = 0; i < nindexatts; i++)
226 values[i] = index_getattr(itup, i + 1, itupdesc, &isnull[i]);
227 ExecStoreVirtualTuple(slot);
228 }
229
230 /*
231 * IndexOnlyRecheck -- access method routine to recheck a tuple in EvalPlanQual
232 *
233 * This can't really happen, since an index can't supply CTID which would
234 * be necessary data for any potential EvalPlanQual target relation. If it
235 * did happen, the EPQ code would pass us the wrong data, namely a heap
236 * tuple not an index tuple. So throw an error.
237 */
238 static bool
IndexOnlyRecheck(IndexOnlyScanState * node,TupleTableSlot * slot)239 IndexOnlyRecheck(IndexOnlyScanState *node, TupleTableSlot *slot)
240 {
241 elog(ERROR, "EvalPlanQual recheck is not supported in index-only scans");
242 return false; /* keep compiler quiet */
243 }
244
245 /* ----------------------------------------------------------------
246 * ExecIndexOnlyScan(node)
247 * ----------------------------------------------------------------
248 */
249 TupleTableSlot *
ExecIndexOnlyScan(IndexOnlyScanState * node)250 ExecIndexOnlyScan(IndexOnlyScanState *node)
251 {
252 /*
253 * If we have runtime keys and they've not already been set up, do it now.
254 */
255 if (node->ioss_NumRuntimeKeys != 0 && !node->ioss_RuntimeKeysReady)
256 ExecReScan((PlanState *) node);
257
258 return ExecScan(&node->ss,
259 (ExecScanAccessMtd) IndexOnlyNext,
260 (ExecScanRecheckMtd) IndexOnlyRecheck);
261 }
262
263 /* ----------------------------------------------------------------
264 * ExecReScanIndexOnlyScan(node)
265 *
266 * Recalculates the values of any scan keys whose value depends on
267 * information known at runtime, then rescans the indexed relation.
268 *
269 * Updating the scan key was formerly done separately in
270 * ExecUpdateIndexScanKeys. Integrating it into ReScan makes
271 * rescans of indices and relations/general streams more uniform.
272 * ----------------------------------------------------------------
273 */
274 void
ExecReScanIndexOnlyScan(IndexOnlyScanState * node)275 ExecReScanIndexOnlyScan(IndexOnlyScanState *node)
276 {
277 /*
278 * If we are doing runtime key calculations (ie, any of the index key
279 * values weren't simple Consts), compute the new key values. But first,
280 * reset the context so we don't leak memory as each outer tuple is
281 * scanned. Note this assumes that we will recalculate *all* runtime keys
282 * on each call.
283 */
284 if (node->ioss_NumRuntimeKeys != 0)
285 {
286 ExprContext *econtext = node->ioss_RuntimeContext;
287
288 ResetExprContext(econtext);
289 ExecIndexEvalRuntimeKeys(econtext,
290 node->ioss_RuntimeKeys,
291 node->ioss_NumRuntimeKeys);
292 }
293 node->ioss_RuntimeKeysReady = true;
294
295 /* reset index scan */
296 index_rescan(node->ioss_ScanDesc,
297 node->ioss_ScanKeys, node->ioss_NumScanKeys,
298 node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);
299
300 ExecScanReScan(&node->ss);
301 }
302
303
304 /* ----------------------------------------------------------------
305 * ExecEndIndexOnlyScan
306 * ----------------------------------------------------------------
307 */
308 void
ExecEndIndexOnlyScan(IndexOnlyScanState * node)309 ExecEndIndexOnlyScan(IndexOnlyScanState *node)
310 {
311 Relation indexRelationDesc;
312 IndexScanDesc indexScanDesc;
313 Relation relation;
314
315 /*
316 * extract information from the node
317 */
318 indexRelationDesc = node->ioss_RelationDesc;
319 indexScanDesc = node->ioss_ScanDesc;
320 relation = node->ss.ss_currentRelation;
321
322 /* Release VM buffer pin, if any. */
323 if (node->ioss_VMBuffer != InvalidBuffer)
324 {
325 ReleaseBuffer(node->ioss_VMBuffer);
326 node->ioss_VMBuffer = InvalidBuffer;
327 }
328
329 /*
330 * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext
331 */
332 #ifdef NOT_USED
333 ExecFreeExprContext(&node->ss.ps);
334 if (node->ioss_RuntimeContext)
335 FreeExprContext(node->ioss_RuntimeContext, true);
336 #endif
337
338 /*
339 * clear out tuple table slots
340 */
341 ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
342 ExecClearTuple(node->ss.ss_ScanTupleSlot);
343
344 /*
345 * close the index relation (no-op if we didn't open it)
346 */
347 if (indexScanDesc)
348 index_endscan(indexScanDesc);
349 if (indexRelationDesc)
350 index_close(indexRelationDesc, NoLock);
351
352 /*
353 * close the heap relation.
354 */
355 ExecCloseScanRelation(relation);
356 }
357
358 /* ----------------------------------------------------------------
359 * ExecIndexOnlyMarkPos
360 * ----------------------------------------------------------------
361 */
362 void
ExecIndexOnlyMarkPos(IndexOnlyScanState * node)363 ExecIndexOnlyMarkPos(IndexOnlyScanState *node)
364 {
365 index_markpos(node->ioss_ScanDesc);
366 }
367
368 /* ----------------------------------------------------------------
369 * ExecIndexOnlyRestrPos
370 * ----------------------------------------------------------------
371 */
372 void
ExecIndexOnlyRestrPos(IndexOnlyScanState * node)373 ExecIndexOnlyRestrPos(IndexOnlyScanState *node)
374 {
375 index_restrpos(node->ioss_ScanDesc);
376 }
377
378 /* ----------------------------------------------------------------
379 * ExecInitIndexOnlyScan
380 *
381 * Initializes the index scan's state information, creates
382 * scan keys, and opens the base and index relations.
383 *
384 * Note: index scans have 2 sets of state information because
385 * we have to keep track of the base relation and the
386 * index relation.
387 * ----------------------------------------------------------------
388 */
389 IndexOnlyScanState *
ExecInitIndexOnlyScan(IndexOnlyScan * node,EState * estate,int eflags)390 ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
391 {
392 IndexOnlyScanState *indexstate;
393 Relation currentRelation;
394 bool relistarget;
395 TupleDesc tupDesc;
396
397 /*
398 * create state structure
399 */
400 indexstate = makeNode(IndexOnlyScanState);
401 indexstate->ss.ps.plan = (Plan *) node;
402 indexstate->ss.ps.state = estate;
403 indexstate->ioss_HeapFetches = 0;
404
405 /*
406 * Miscellaneous initialization
407 *
408 * create expression context for node
409 */
410 ExecAssignExprContext(estate, &indexstate->ss.ps);
411
412 indexstate->ss.ps.ps_TupFromTlist = false;
413
414 /*
415 * initialize child expressions
416 *
417 * Note: we don't initialize all of the indexorderby expression, only the
418 * sub-parts corresponding to runtime keys (see below).
419 */
420 indexstate->ss.ps.targetlist = (List *)
421 ExecInitExpr((Expr *) node->scan.plan.targetlist,
422 (PlanState *) indexstate);
423 indexstate->ss.ps.qual = (List *)
424 ExecInitExpr((Expr *) node->scan.plan.qual,
425 (PlanState *) indexstate);
426 indexstate->indexqual = (List *)
427 ExecInitExpr((Expr *) node->indexqual,
428 (PlanState *) indexstate);
429
430 /*
431 * tuple table initialization
432 */
433 ExecInitResultTupleSlot(estate, &indexstate->ss.ps);
434 ExecInitScanTupleSlot(estate, &indexstate->ss);
435
436 /*
437 * open the base relation and acquire appropriate lock on it.
438 */
439 currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
440
441 indexstate->ss.ss_currentRelation = currentRelation;
442 indexstate->ss.ss_currentScanDesc = NULL; /* no heap scan here */
443
444 /*
445 * Build the scan tuple type using the indextlist generated by the
446 * planner. We use this, rather than the index's physical tuple
447 * descriptor, because the latter contains storage column types not the
448 * types of the original datums. (It's the AM's responsibility to return
449 * suitable data anyway.)
450 */
451 tupDesc = ExecTypeFromTL(node->indextlist, false);
452 ExecAssignScanType(&indexstate->ss, tupDesc);
453
454 /*
455 * Initialize result tuple type and projection info. The node's
456 * targetlist will contain Vars with varno = INDEX_VAR, referencing the
457 * scan tuple.
458 */
459 ExecAssignResultTypeFromTL(&indexstate->ss.ps);
460 ExecAssignScanProjectionInfoWithVarno(&indexstate->ss, INDEX_VAR);
461
462 /*
463 * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop
464 * here. This allows an index-advisor plugin to EXPLAIN a plan containing
465 * references to nonexistent indexes.
466 */
467 if (eflags & EXEC_FLAG_EXPLAIN_ONLY)
468 return indexstate;
469
470 /*
471 * Open the index relation.
472 *
473 * If the parent table is one of the target relations of the query, then
474 * InitPlan already opened and write-locked the index, so we can avoid
475 * taking another lock here. Otherwise we need a normal reader's lock.
476 */
477 relistarget = ExecRelationIsTargetRelation(estate, node->scan.scanrelid);
478 indexstate->ioss_RelationDesc = index_open(node->indexid,
479 relistarget ? NoLock : AccessShareLock);
480
481 /*
482 * Initialize index-specific scan state
483 */
484 indexstate->ioss_RuntimeKeysReady = false;
485 indexstate->ioss_RuntimeKeys = NULL;
486 indexstate->ioss_NumRuntimeKeys = 0;
487
488 /*
489 * build the index scan keys from the index qualification
490 */
491 ExecIndexBuildScanKeys((PlanState *) indexstate,
492 indexstate->ioss_RelationDesc,
493 node->indexqual,
494 false,
495 &indexstate->ioss_ScanKeys,
496 &indexstate->ioss_NumScanKeys,
497 &indexstate->ioss_RuntimeKeys,
498 &indexstate->ioss_NumRuntimeKeys,
499 NULL, /* no ArrayKeys */
500 NULL);
501
502 /*
503 * any ORDER BY exprs have to be turned into scankeys in the same way
504 */
505 ExecIndexBuildScanKeys((PlanState *) indexstate,
506 indexstate->ioss_RelationDesc,
507 node->indexorderby,
508 true,
509 &indexstate->ioss_OrderByKeys,
510 &indexstate->ioss_NumOrderByKeys,
511 &indexstate->ioss_RuntimeKeys,
512 &indexstate->ioss_NumRuntimeKeys,
513 NULL, /* no ArrayKeys */
514 NULL);
515
516 /*
517 * If we have runtime keys, we need an ExprContext to evaluate them. The
518 * node's standard context won't do because we want to reset that context
519 * for every tuple. So, build another context just like the other one...
520 * -tgl 7/11/00
521 */
522 if (indexstate->ioss_NumRuntimeKeys != 0)
523 {
524 ExprContext *stdecontext = indexstate->ss.ps.ps_ExprContext;
525
526 ExecAssignExprContext(estate, &indexstate->ss.ps);
527 indexstate->ioss_RuntimeContext = indexstate->ss.ps.ps_ExprContext;
528 indexstate->ss.ps.ps_ExprContext = stdecontext;
529 }
530 else
531 {
532 indexstate->ioss_RuntimeContext = NULL;
533 }
534
535 /*
536 * Initialize scan descriptor.
537 */
538 indexstate->ioss_ScanDesc = index_beginscan(currentRelation,
539 indexstate->ioss_RelationDesc,
540 estate->es_snapshot,
541 indexstate->ioss_NumScanKeys,
542 indexstate->ioss_NumOrderByKeys);
543
544 /* Set it up for index-only scan */
545 indexstate->ioss_ScanDesc->xs_want_itup = true;
546 indexstate->ioss_VMBuffer = InvalidBuffer;
547
548 /*
549 * If no run-time keys to calculate, go ahead and pass the scankeys to the
550 * index AM.
551 */
552 if (indexstate->ioss_NumRuntimeKeys == 0)
553 index_rescan(indexstate->ioss_ScanDesc,
554 indexstate->ioss_ScanKeys,
555 indexstate->ioss_NumScanKeys,
556 indexstate->ioss_OrderByKeys,
557 indexstate->ioss_NumOrderByKeys);
558
559 /*
560 * all done.
561 */
562 return indexstate;
563 }
564