/*-------------------------------------------------------------------------
 *
 * nodeBitmapHeapscan.c
 *	  Routines to support bitmapped scans of relations
 *
 * NOTE: it is critical that this plan type only be used with MVCC-compliant
 * snapshots (ie, regular snapshots, not SnapshotAny or one of the other
 * special snapshots).  The reason is that since index and heap scans are
 * decoupled, there can be no assurance that the index tuple prompting a
 * visit to a particular heap TID still exists when the visit is made.
 * Therefore the tuple might not exist anymore either (which is OK because
 * heap_fetch will cope) --- but worse, the tuple slot could have been
 * re-used for a newer tuple.  With an MVCC snapshot the newer tuple is
 * certain to fail the time qual and so it will not be mistakenly returned,
 * but with anything else we might return a tuple that doesn't meet the
 * required index qual conditions.
 *
 *
 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/executor/nodeBitmapHeapscan.c
 *
 *-------------------------------------------------------------------------
 */
/*
 * INTERFACE ROUTINES
 *		ExecBitmapHeapScan			scans a relation using bitmap info
 *		ExecBitmapHeapNext			workhorse for above
 *		ExecInitBitmapHeapScan		creates and initializes state info.
 *		ExecReScanBitmapHeapScan	prepares to rescan the plan.
 *		ExecEndBitmapHeapScan		releases all storage.
 */
36 #include "postgres.h"
37
38 #include "access/relscan.h"
39 #include "access/transam.h"
40 #include "executor/execdebug.h"
41 #include "executor/nodeBitmapHeapscan.h"
42 #include "pgstat.h"
43 #include "storage/bufmgr.h"
44 #include "storage/predicate.h"
45 #include "utils/memutils.h"
46 #include "utils/rel.h"
47 #include "utils/spccache.h"
48 #include "utils/snapmgr.h"
49 #include "utils/tqual.h"
50
51
52 static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
53 static void bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres);
54
55
/* ----------------------------------------------------------------
 *		BitmapHeapNext
 *
 *		Retrieve next tuple from the BitmapHeapScan node's currentRelation.
 *
 *		On first call, runs the underlying bitmap-index subplan to build the
 *		TIDBitmap, then iterates over it page by page.  Returns the scan
 *		tuple slot holding the next visible tuple, or an empty slot when the
 *		scan is exhausted.  Lossy bitmap pages force a recheck of the
 *		original quals on every tuple.
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
BitmapHeapNext(BitmapHeapScanState *node)
{
	ExprContext *econtext;
	HeapScanDesc scan;
	TIDBitmap  *tbm;
	TBMIterator *tbmiterator;
	TBMIterateResult *tbmres;

#ifdef USE_PREFETCH
	TBMIterator *prefetch_iterator;
#endif
	OffsetNumber targoffset;
	TupleTableSlot *slot;

	/*
	 * extract necessary information from index scan node
	 */
	econtext = node->ss.ps.ps_ExprContext;
	slot = node->ss.ss_ScanTupleSlot;
	scan = node->ss.ss_currentScanDesc;
	tbm = node->tbm;
	tbmiterator = node->tbmiterator;
	tbmres = node->tbmres;
#ifdef USE_PREFETCH
	prefetch_iterator = node->prefetch_iterator;
#endif

	/*
	 * If we haven't yet performed the underlying index scan, do it, and begin
	 * the iteration over the bitmap.
	 *
	 * For prefetching, we use *two* iterators, one for the pages we are
	 * actually scanning and another that runs ahead of the first for
	 * prefetching.  node->prefetch_pages tracks exactly how many pages ahead
	 * the prefetch iterator is.  Also, node->prefetch_target tracks the
	 * desired prefetch distance, which starts small and increases up to the
	 * node->prefetch_maximum.  This is to avoid doing a lot of prefetching in
	 * a scan that stops after a few tuples because of a LIMIT.
	 */
	if (tbm == NULL)
	{
		/* The outer subplan is the BitmapIndexScan / BitmapAnd / BitmapOr */
		tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));

		if (!tbm || !IsA(tbm, TIDBitmap))
			elog(ERROR, "unrecognized result from subplan");

		node->tbm = tbm;
		node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
		node->tbmres = tbmres = NULL;

#ifdef USE_PREFETCH
		if (node->prefetch_maximum > 0)
		{
			node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm);
			node->prefetch_pages = 0;
			/* -1: ramp-up hasn't started; first page fetched bumps it to 0 */
			node->prefetch_target = -1;
		}
#endif   /* USE_PREFETCH */
	}

	for (;;)
	{
		Page		dp;
		ItemId		lp;

		/*
		 * Get next page of results if needed
		 */
		if (tbmres == NULL)
		{
			node->tbmres = tbmres = tbm_iterate(tbmiterator);
			if (tbmres == NULL)
			{
				/* no more entries in the bitmap */
				break;
			}

#ifdef USE_PREFETCH
			if (node->prefetch_pages > 0)
			{
				/* The main iterator has closed the distance by one page */
				node->prefetch_pages--;
			}
			else if (prefetch_iterator)
			{
				/* Do not let the prefetch iterator get behind the main one */
				TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);

				if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno)
					elog(ERROR, "prefetch and main iterators are out of sync");
			}
#endif   /* USE_PREFETCH */

			/*
			 * Ignore any claimed entries past what we think is the end of the
			 * relation.  (This is probably not necessary given that we got at
			 * least AccessShareLock on the table before performing any of the
			 * indexscans, but let's be safe.)
			 */
			if (tbmres->blockno >= scan->rs_nblocks)
			{
				node->tbmres = tbmres = NULL;
				continue;
			}

			/*
			 * Fetch the current heap page and identify candidate tuples.
			 */
			bitgetpage(scan, tbmres);

			/* ntuples < 0 means the bitmap page was lossy (no exact offsets) */
			if (tbmres->ntuples >= 0)
				node->exact_pages++;
			else
				node->lossy_pages++;

			/*
			 * Set rs_cindex to first slot to examine
			 */
			scan->rs_cindex = 0;

#ifdef USE_PREFETCH

			/*
			 * Increase prefetch target if it's not yet at the max.  Note that
			 * we will increase it to zero after fetching the very first
			 * page/tuple, then to one after the second tuple is fetched, then
			 * it doubles as later pages are fetched.
			 */
			if (node->prefetch_target >= node->prefetch_maximum)
				 /* don't increase any further */ ;
			else if (node->prefetch_target >= node->prefetch_maximum / 2)
				node->prefetch_target = node->prefetch_maximum;
			else if (node->prefetch_target > 0)
				node->prefetch_target *= 2;
			else
				node->prefetch_target++;
#endif   /* USE_PREFETCH */
		}
		else
		{
			/*
			 * Continuing in previously obtained page; advance rs_cindex
			 */
			scan->rs_cindex++;

#ifdef USE_PREFETCH

			/*
			 * Try to prefetch at least a few pages even before we get to the
			 * second page if we don't stop reading after the first tuple.
			 */
			if (node->prefetch_target < node->prefetch_maximum)
				node->prefetch_target++;
#endif   /* USE_PREFETCH */
		}

		/*
		 * Out of range?  If so, nothing more to look at on this page
		 */
		if (scan->rs_cindex < 0 || scan->rs_cindex >= scan->rs_ntuples)
		{
			node->tbmres = tbmres = NULL;
			continue;
		}

#ifdef USE_PREFETCH

		/*
		 * We issue prefetch requests *after* fetching the current page to try
		 * to avoid having prefetching interfere with the main I/O. Also, this
		 * should happen only when we have determined there is still something
		 * to do on the current page, else we may uselessly prefetch the same
		 * page we are just about to request for real.
		 */
		if (prefetch_iterator)
		{
			while (node->prefetch_pages < node->prefetch_target)
			{
				TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);

				if (tbmpre == NULL)
				{
					/* No more pages to prefetch */
					tbm_end_iterate(prefetch_iterator);
					node->prefetch_iterator = prefetch_iterator = NULL;
					break;
				}
				node->prefetch_pages++;
				PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
			}
		}
#endif   /* USE_PREFETCH */

		/*
		 * Okay to fetch the tuple.  rs_vistuples[] was filled by bitgetpage()
		 * with the offsets of tuples already known visible under our snapshot.
		 */
		targoffset = scan->rs_vistuples[scan->rs_cindex];
		dp = (Page) BufferGetPage(scan->rs_cbuf);
		lp = PageGetItemId(dp, targoffset);
		Assert(ItemIdIsNormal(lp));

		scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
		scan->rs_ctup.t_len = ItemIdGetLength(lp);
		scan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
		ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset);

		pgstat_count_heap_fetch(scan->rs_rd);

		/*
		 * Set up the result slot to point to this tuple. Note that the slot
		 * acquires a pin on the buffer.
		 */
		ExecStoreTuple(&scan->rs_ctup,
					   slot,
					   scan->rs_cbuf,
					   false);

		/*
		 * If we are using lossy info, we have to recheck the qual conditions
		 * at every tuple.
		 */
		if (tbmres->recheck)
		{
			econtext->ecxt_scantuple = slot;
			ResetExprContext(econtext);

			if (!ExecQual(node->bitmapqualorig, econtext, false))
			{
				/* Fails recheck, so drop it and loop back for another */
				InstrCountFiltered2(node, 1);
				ExecClearTuple(slot);
				continue;
			}
		}

		/* OK to return this tuple */
		return slot;
	}

	/*
	 * if we get here it means we are at the end of the scan..
	 */
	return ExecClearTuple(slot);
}
307
/*
 * bitgetpage - subroutine for BitmapHeapNext()
 *
 * This routine reads and pins the specified page of the relation, then
 * builds an array indicating which tuples on the page are both potentially
 * interesting according to the bitmap, and visible according to the snapshot.
 *
 * On return, scan->rs_cbuf holds the pin on the target page,
 * scan->rs_vistuples[0..rs_ntuples-1] holds the visible offsets in
 * ascending order, and scan->rs_ntuples holds their count.
 */
static void
bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
{
	BlockNumber page = tbmres->blockno;
	Buffer		buffer;
	Snapshot	snapshot;
	int			ntup;

	/*
	 * Acquire pin on the target heap page, trading in any pin we held before.
	 */
	Assert(page < scan->rs_nblocks);

	scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
										 scan->rs_rd,
										 page);
	buffer = scan->rs_cbuf;
	snapshot = scan->rs_snapshot;

	ntup = 0;

	/*
	 * Prune and repair fragmentation for the whole page, if possible.
	 */
	heap_page_prune_opt(scan->rs_rd, buffer);

	/*
	 * We must hold share lock on the buffer content while examining tuple
	 * visibility.  Afterwards, however, the tuples we have found to be
	 * visible are guaranteed good as long as we hold the buffer pin.
	 */
	LockBuffer(buffer, BUFFER_LOCK_SHARE);

	/*
	 * We need two separate strategies for lossy and non-lossy cases.
	 */
	if (tbmres->ntuples >= 0)
	{
		/*
		 * Bitmap is non-lossy, so we just look through the offsets listed in
		 * tbmres; but we have to follow any HOT chain starting at each such
		 * offset.
		 */
		int			curslot;

		for (curslot = 0; curslot < tbmres->ntuples; curslot++)
		{
			OffsetNumber offnum = tbmres->offsets[curslot];
			ItemPointerData tid;
			HeapTupleData heapTuple;

			ItemPointerSet(&tid, page, offnum);
			/*
			 * heap_hot_search_buffer updates tid to the offset of the
			 * visible member of the HOT chain, if any.
			 */
			if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
									   &heapTuple, NULL, true))
				scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
		}
	}
	else
	{
		/*
		 * Bitmap is lossy, so we must examine each item pointer on the page.
		 * But we can ignore HOT chains, since we'll check each tuple anyway.
		 */
		Page		dp = (Page) BufferGetPage(buffer);
		OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
		OffsetNumber offnum;

		for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
		{
			ItemId		lp;
			HeapTupleData loctup;
			bool		valid;

			lp = PageGetItemId(dp, offnum);
			/* skip unused, dead, and redirect line pointers */
			if (!ItemIdIsNormal(lp))
				continue;
			loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
			loctup.t_len = ItemIdGetLength(lp);
			loctup.t_tableOid = scan->rs_rd->rd_id;
			ItemPointerSet(&loctup.t_self, page, offnum);
			valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
			if (valid)
			{
				scan->rs_vistuples[ntup++] = offnum;
				/* for serializable transactions: lock the tuple we'll return */
				PredicateLockTuple(scan->rs_rd, &loctup, snapshot);
			}
			CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
											buffer, snapshot);
		}
	}

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

	Assert(ntup <= MaxHeapTuplesPerPage);
	scan->rs_ntuples = ntup;
}
411
412 /*
413 * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
414 */
415 static bool
BitmapHeapRecheck(BitmapHeapScanState * node,TupleTableSlot * slot)416 BitmapHeapRecheck(BitmapHeapScanState *node, TupleTableSlot *slot)
417 {
418 ExprContext *econtext;
419
420 /*
421 * extract necessary information from index scan node
422 */
423 econtext = node->ss.ps.ps_ExprContext;
424
425 /* Does the tuple meet the original qual conditions? */
426 econtext->ecxt_scantuple = slot;
427
428 ResetExprContext(econtext);
429
430 return ExecQual(node->bitmapqualorig, econtext, false);
431 }
432
433 /* ----------------------------------------------------------------
434 * ExecBitmapHeapScan(node)
435 * ----------------------------------------------------------------
436 */
437 TupleTableSlot *
ExecBitmapHeapScan(BitmapHeapScanState * node)438 ExecBitmapHeapScan(BitmapHeapScanState *node)
439 {
440 return ExecScan(&node->ss,
441 (ExecScanAccessMtd) BitmapHeapNext,
442 (ExecScanRecheckMtd) BitmapHeapRecheck);
443 }
444
445 /* ----------------------------------------------------------------
446 * ExecReScanBitmapHeapScan(node)
447 * ----------------------------------------------------------------
448 */
449 void
ExecReScanBitmapHeapScan(BitmapHeapScanState * node)450 ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
451 {
452 PlanState *outerPlan = outerPlanState(node);
453
454 /* rescan to release any page pin */
455 heap_rescan(node->ss.ss_currentScanDesc, NULL);
456
457 if (node->tbmiterator)
458 tbm_end_iterate(node->tbmiterator);
459 if (node->prefetch_iterator)
460 tbm_end_iterate(node->prefetch_iterator);
461 if (node->tbm)
462 tbm_free(node->tbm);
463 node->tbm = NULL;
464 node->tbmiterator = NULL;
465 node->tbmres = NULL;
466 node->prefetch_iterator = NULL;
467
468 ExecScanReScan(&node->ss);
469
470 /*
471 * if chgParam of subnode is not null then plan will be re-scanned by
472 * first ExecProcNode.
473 */
474 if (outerPlan->chgParam == NULL)
475 ExecReScan(outerPlan);
476 }
477
478 /* ----------------------------------------------------------------
479 * ExecEndBitmapHeapScan
480 * ----------------------------------------------------------------
481 */
482 void
ExecEndBitmapHeapScan(BitmapHeapScanState * node)483 ExecEndBitmapHeapScan(BitmapHeapScanState *node)
484 {
485 Relation relation;
486 HeapScanDesc scanDesc;
487
488 /*
489 * extract information from the node
490 */
491 relation = node->ss.ss_currentRelation;
492 scanDesc = node->ss.ss_currentScanDesc;
493
494 /*
495 * Free the exprcontext
496 */
497 ExecFreeExprContext(&node->ss.ps);
498
499 /*
500 * clear out tuple table slots
501 */
502 ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
503 ExecClearTuple(node->ss.ss_ScanTupleSlot);
504
505 /*
506 * close down subplans
507 */
508 ExecEndNode(outerPlanState(node));
509
510 /*
511 * release bitmap if any
512 */
513 if (node->tbmiterator)
514 tbm_end_iterate(node->tbmiterator);
515 if (node->prefetch_iterator)
516 tbm_end_iterate(node->prefetch_iterator);
517 if (node->tbm)
518 tbm_free(node->tbm);
519
520 /*
521 * close heap scan
522 */
523 heap_endscan(scanDesc);
524
525 /*
526 * close the heap relation.
527 */
528 ExecCloseScanRelation(relation);
529 }
530
/* ----------------------------------------------------------------
 *		ExecInitBitmapHeapScan
 *
 *		Initializes the scan's state information.
 *
 *		Builds a BitmapHeapScanState for the given plan node: creates the
 *		expression context and tuple slots, opens the heap relation, sets
 *		up prefetch limits from the tablespace's IO concurrency, starts
 *		the heap scan descriptor, and finally initializes the child
 *		(bitmap index) subplan.
 * ----------------------------------------------------------------
 */
BitmapHeapScanState *
ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
{
	BitmapHeapScanState *scanstate;
	Relation	currentRelation;
	int			io_concurrency;

	/* check for unsupported flags */
	Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));

	/*
	 * Assert caller didn't ask for an unsafe snapshot --- see comments at
	 * head of file.
	 */
	Assert(IsMVCCSnapshot(estate->es_snapshot));

	/*
	 * create state structure
	 */
	scanstate = makeNode(BitmapHeapScanState);
	scanstate->ss.ps.plan = (Plan *) node;
	scanstate->ss.ps.state = estate;

	/* Bitmap and iterator state starts empty; built on first fetch. */
	scanstate->tbm = NULL;
	scanstate->tbmiterator = NULL;
	scanstate->tbmres = NULL;
	scanstate->exact_pages = 0;
	scanstate->lossy_pages = 0;
	scanstate->prefetch_iterator = NULL;
	scanstate->prefetch_pages = 0;
	scanstate->prefetch_target = 0;
	/* may be updated below */
	scanstate->prefetch_maximum = target_prefetch_pages;

	/*
	 * Miscellaneous initialization
	 *
	 * create expression context for node
	 */
	ExecAssignExprContext(estate, &scanstate->ss.ps);

	scanstate->ss.ps.ps_TupFromTlist = false;

	/*
	 * initialize child expressions
	 */
	scanstate->ss.ps.targetlist = (List *)
		ExecInitExpr((Expr *) node->scan.plan.targetlist,
					 (PlanState *) scanstate);
	scanstate->ss.ps.qual = (List *)
		ExecInitExpr((Expr *) node->scan.plan.qual,
					 (PlanState *) scanstate);
	/* bitmapqualorig is rechecked per-tuple for lossy pages and EPQ */
	scanstate->bitmapqualorig = (List *)
		ExecInitExpr((Expr *) node->bitmapqualorig,
					 (PlanState *) scanstate);

	/*
	 * tuple table initialization
	 */
	ExecInitResultTupleSlot(estate, &scanstate->ss.ps);
	ExecInitScanTupleSlot(estate, &scanstate->ss);

	/*
	 * open the base relation and acquire appropriate lock on it.
	 */
	currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);

	/*
	 * Determine the maximum for prefetch_target.  If the tablespace has a
	 * specific IO concurrency set, use that to compute the corresponding
	 * maximum value; otherwise, we already initialized to the value computed
	 * by the GUC machinery.
	 */
	io_concurrency =
		get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace);
	if (io_concurrency != effective_io_concurrency)
	{
		double		maximum;

		if (ComputeIoConcurrency(io_concurrency, &maximum))
			scanstate->prefetch_maximum = rint(maximum);
	}

	scanstate->ss.ss_currentRelation = currentRelation;

	/*
	 * Even though we aren't going to do a conventional seqscan, it is useful
	 * to create a HeapScanDesc --- most of the fields in it are usable.
	 */
	scanstate->ss.ss_currentScanDesc = heap_beginscan_bm(currentRelation,
														 estate->es_snapshot,
														 0,
														 NULL);

	/*
	 * get the scan type from the relation descriptor.
	 */
	ExecAssignScanType(&scanstate->ss, RelationGetDescr(currentRelation));

	/*
	 * Initialize result tuple type and projection info.
	 */
	ExecAssignResultTypeFromTL(&scanstate->ss.ps);
	ExecAssignScanProjectionInfo(&scanstate->ss);

	/*
	 * initialize child nodes
	 *
	 * We do this last because the child nodes will open indexscans on our
	 * relation's indexes, and we want to be sure we have acquired a lock on
	 * the relation first.
	 */
	outerPlanState(scanstate) = ExecInitNode(outerPlan(node), estate, eflags);

	/*
	 * all done.
	 */
	return scanstate;
}
656