1 /*-------------------------------------------------------------------------
2 *
3 * nodeBitmapHeapscan.c
4 * Routines to support bitmapped scans of relations
5 *
6 * NOTE: it is critical that this plan type only be used with MVCC-compliant
7 * snapshots (ie, regular snapshots, not SnapshotAny or one of the other
8 * special snapshots). The reason is that since index and heap scans are
9 * decoupled, there can be no assurance that the index tuple prompting a
10 * visit to a particular heap TID still exists when the visit is made.
11 * Therefore the tuple might not exist anymore either (which is OK because
12 * heap_fetch will cope) --- but worse, the tuple slot could have been
13 * re-used for a newer tuple. With an MVCC snapshot the newer tuple is
14 * certain to fail the time qual and so it will not be mistakenly returned,
15 * but with anything else we might return a tuple that doesn't meet the
16 * required index qual conditions.
17 *
18 *
19 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
20 * Portions Copyright (c) 1994, Regents of the University of California
21 *
22 *
23 * IDENTIFICATION
24 * src/backend/executor/nodeBitmapHeapscan.c
25 *
26 *-------------------------------------------------------------------------
27 */
28 /*
29 * INTERFACE ROUTINES
30 * ExecBitmapHeapScan scans a relation using bitmap info
31 * ExecBitmapHeapNext workhorse for above
32 * ExecInitBitmapHeapScan creates and initializes state info.
33 * ExecReScanBitmapHeapScan prepares to rescan the plan.
34 * ExecEndBitmapHeapScan releases all storage.
35 */
36 #include "postgres.h"
37
38 #include <math.h>
39
40 #include "access/relscan.h"
41 #include "access/transam.h"
42 #include "executor/execdebug.h"
43 #include "executor/nodeBitmapHeapscan.h"
44 #include "miscadmin.h"
45 #include "pgstat.h"
46 #include "storage/bufmgr.h"
47 #include "storage/predicate.h"
48 #include "utils/memutils.h"
49 #include "utils/rel.h"
50 #include "utils/spccache.h"
51 #include "utils/snapmgr.h"
52 #include "utils/tqual.h"
53
54
55 static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
56 static void bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres);
57 static inline void BitmapDoneInitializingSharedState(
58 ParallelBitmapHeapState *pstate);
59 static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
60 TBMIterateResult *tbmres);
61 static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node);
62 static inline void BitmapPrefetch(BitmapHeapScanState *node,
63 HeapScanDesc scan);
64 static bool BitmapShouldInitializeSharedState(
65 ParallelBitmapHeapState *pstate);
66
67
68 /* ----------------------------------------------------------------
69 * BitmapHeapNext
70 *
71 * Retrieve next tuple from the BitmapHeapScan node's currentRelation
72 * ----------------------------------------------------------------
73 */
static TupleTableSlot *
BitmapHeapNext(BitmapHeapScanState *node)
{
	ExprContext *econtext;
	HeapScanDesc scan;
	TIDBitmap  *tbm;
	TBMIterator *tbmiterator = NULL;
	TBMSharedIterator *shared_tbmiterator = NULL;
	TBMIterateResult *tbmres;
	OffsetNumber targoffset;
	TupleTableSlot *slot;
	ParallelBitmapHeapState *pstate = node->pstate;
	dsa_area   *dsa = node->ss.ps.state->es_query_dsa;

	/*
	 * extract necessary information from index scan node
	 */
	econtext = node->ss.ps.ps_ExprContext;
	slot = node->ss.ss_ScanTupleSlot;
	scan = node->ss.ss_currentScanDesc;
	tbm = node->tbm;
	/* in a parallel scan we iterate through the shared iterator instead */
	if (pstate == NULL)
		tbmiterator = node->tbmiterator;
	else
		shared_tbmiterator = node->shared_tbmiterator;
	tbmres = node->tbmres;

	/*
	 * If we haven't yet performed the underlying index scan, do it, and begin
	 * the iteration over the bitmap.
	 *
	 * For prefetching, we use *two* iterators, one for the pages we are
	 * actually scanning and another that runs ahead of the first for
	 * prefetching.  node->prefetch_pages tracks exactly how many pages ahead
	 * the prefetch iterator is.  Also, node->prefetch_target tracks the
	 * desired prefetch distance, which starts small and increases up to the
	 * node->prefetch_maximum.  This is to avoid doing a lot of prefetching in
	 * a scan that stops after a few tuples because of a LIMIT.
	 */
	if (!node->initialized)
	{
		if (!pstate)
		{
			tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));

			if (!tbm || !IsA(tbm, TIDBitmap))
				elog(ERROR, "unrecognized result from subplan");

			node->tbm = tbm;
			node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
			node->tbmres = tbmres = NULL;

#ifdef USE_PREFETCH
			if (node->prefetch_maximum > 0)
			{
				node->prefetch_iterator = tbm_begin_iterate(tbm);
				node->prefetch_pages = 0;
				/* -1 means "the first page is not prefetched" (see above) */
				node->prefetch_target = -1;
			}
#endif							/* USE_PREFETCH */
		}
		else
		{
			/*
			 * The leader will immediately come out of the function, but
			 * others will be blocked until leader populates the TBM and wakes
			 * them up.
			 */
			if (BitmapShouldInitializeSharedState(pstate))
			{
				tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
				if (!tbm || !IsA(tbm, TIDBitmap))
					elog(ERROR, "unrecognized result from subplan");

				node->tbm = tbm;

				/*
				 * Prepare to iterate over the TBM. This will return the
				 * dsa_pointer of the iterator state which will be used by
				 * multiple processes to iterate jointly.
				 */
				pstate->tbmiterator = tbm_prepare_shared_iterate(tbm);
#ifdef USE_PREFETCH
				if (node->prefetch_maximum > 0)
				{
					pstate->prefetch_iterator =
						tbm_prepare_shared_iterate(tbm);

					/*
					 * We don't need the mutex here as we haven't yet woke up
					 * others.
					 */
					pstate->prefetch_pages = 0;
					pstate->prefetch_target = -1;
				}
#endif

				/* We have initialized the shared state so wake up others. */
				BitmapDoneInitializingSharedState(pstate);
			}

			/* Allocate a private iterator and attach the shared state to it */
			node->shared_tbmiterator = shared_tbmiterator =
				tbm_attach_shared_iterate(dsa, pstate->tbmiterator);
			node->tbmres = tbmres = NULL;

#ifdef USE_PREFETCH
			if (node->prefetch_maximum > 0)
			{
				node->shared_prefetch_iterator =
					tbm_attach_shared_iterate(dsa, pstate->prefetch_iterator);
			}
#endif							/* USE_PREFETCH */
		}
		node->initialized = true;
	}

	/*
	 * Main loop: each iteration either advances to the next page of the
	 * bitmap or to the next candidate tuple on the current page, until we
	 * find a tuple that passes any required recheck, or the bitmap runs out.
	 */
	for (;;)
	{
		Page		dp;
		ItemId		lp;

		CHECK_FOR_INTERRUPTS();

		/*
		 * Get next page of results if needed
		 */
		if (tbmres == NULL)
		{
			if (!pstate)
				node->tbmres = tbmres = tbm_iterate(tbmiterator);
			else
				node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator);
			if (tbmres == NULL)
			{
				/* no more entries in the bitmap */
				break;
			}

			BitmapAdjustPrefetchIterator(node, tbmres);

			/*
			 * Ignore any claimed entries past what we think is the end of the
			 * relation.  (This is probably not necessary given that we got at
			 * least AccessShareLock on the table before performing any of the
			 * indexscans, but let's be safe.)
			 */
			if (tbmres->blockno >= scan->rs_nblocks)
			{
				node->tbmres = tbmres = NULL;
				continue;
			}

			/*
			 * Fetch the current heap page and identify candidate tuples.
			 */
			bitgetpage(scan, tbmres);

			/* ntuples < 0 means the bitmap was lossy for this page */
			if (tbmres->ntuples >= 0)
				node->exact_pages++;
			else
				node->lossy_pages++;

			/*
			 * Set rs_cindex to first slot to examine
			 */
			scan->rs_cindex = 0;

			/* Adjust the prefetch target */
			BitmapAdjustPrefetchTarget(node);
		}
		else
		{
			/*
			 * Continuing in previously obtained page; advance rs_cindex
			 */
			scan->rs_cindex++;

#ifdef USE_PREFETCH

			/*
			 * Try to prefetch at least a few pages even before we get to the
			 * second page if we don't stop reading after the first tuple.
			 */
			if (!pstate)
			{
				if (node->prefetch_target < node->prefetch_maximum)
					node->prefetch_target++;
			}
			else if (pstate->prefetch_target < node->prefetch_maximum)
			{
				/* take spinlock while updating shared state */
				SpinLockAcquire(&pstate->mutex);
				if (pstate->prefetch_target < node->prefetch_maximum)
					pstate->prefetch_target++;
				SpinLockRelease(&pstate->mutex);
			}
#endif							/* USE_PREFETCH */
		}

		/*
		 * Out of range?  If so, nothing more to look at on this page
		 */
		if (scan->rs_cindex < 0 || scan->rs_cindex >= scan->rs_ntuples)
		{
			node->tbmres = tbmres = NULL;
			continue;
		}

		/*
		 * We issue prefetch requests *after* fetching the current page to try
		 * to avoid having prefetching interfere with the main I/O. Also, this
		 * should happen only when we have determined there is still something
		 * to do on the current page, else we may uselessly prefetch the same
		 * page we are just about to request for real.
		 */
		BitmapPrefetch(node, scan);

		/*
		 * Okay to fetch the tuple
		 */
		targoffset = scan->rs_vistuples[scan->rs_cindex];
		dp = (Page) BufferGetPage(scan->rs_cbuf);
		lp = PageGetItemId(dp, targoffset);
		Assert(ItemIdIsNormal(lp));

		scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
		scan->rs_ctup.t_len = ItemIdGetLength(lp);
		scan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
		ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset);

		pgstat_count_heap_fetch(scan->rs_rd);

		/*
		 * Set up the result slot to point to this tuple. Note that the slot
		 * acquires a pin on the buffer.
		 */
		ExecStoreTuple(&scan->rs_ctup,
					   slot,
					   scan->rs_cbuf,
					   false);

		/*
		 * If we are using lossy info, we have to recheck the qual conditions
		 * at every tuple.
		 */
		if (tbmres->recheck)
		{
			econtext->ecxt_scantuple = slot;
			ResetExprContext(econtext);

			if (!ExecQual(node->bitmapqualorig, econtext))
			{
				/* Fails recheck, so drop it and loop back for another */
				InstrCountFiltered2(node, 1);
				ExecClearTuple(slot);
				continue;
			}
		}

		/* OK to return this tuple */
		return slot;
	}

	/*
	 * if we get here it means we are at the end of the scan..
	 */
	return ExecClearTuple(slot);
}
343
344 /*
345 * bitgetpage - subroutine for BitmapHeapNext()
346 *
347 * This routine reads and pins the specified page of the relation, then
348 * builds an array indicating which tuples on the page are both potentially
349 * interesting according to the bitmap, and visible according to the snapshot.
350 */
static void
bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
{
	BlockNumber page = tbmres->blockno;
	Buffer		buffer;
	Snapshot	snapshot;
	int			ntup;

	/*
	 * Acquire pin on the target heap page, trading in any pin we held before.
	 */
	Assert(page < scan->rs_nblocks);

	scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
										 scan->rs_rd,
										 page);
	buffer = scan->rs_cbuf;
	snapshot = scan->rs_snapshot;

	ntup = 0;

	/*
	 * Prune and repair fragmentation for the whole page, if possible.
	 */
	heap_page_prune_opt(scan->rs_rd, buffer);

	/*
	 * We must hold share lock on the buffer content while examining tuple
	 * visibility.  Afterwards, however, the tuples we have found to be
	 * visible are guaranteed good as long as we hold the buffer pin.
	 */
	LockBuffer(buffer, BUFFER_LOCK_SHARE);

	/*
	 * We need two separate strategies for lossy and non-lossy cases.
	 */
	if (tbmres->ntuples >= 0)
	{
		/*
		 * Bitmap is non-lossy, so we just look through the offsets listed in
		 * tbmres; but we have to follow any HOT chain starting at each such
		 * offset.
		 */
		int			curslot;

		for (curslot = 0; curslot < tbmres->ntuples; curslot++)
		{
			OffsetNumber offnum = tbmres->offsets[curslot];
			ItemPointerData tid;
			HeapTupleData heapTuple;

			ItemPointerSet(&tid, page, offnum);
			/*
			 * Note we record the offset from tid, not offnum: on success the
			 * tid has been advanced to the visible member of the HOT chain.
			 */
			if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
									   &heapTuple, NULL, true))
				scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
		}
	}
	else
	{
		/*
		 * Bitmap is lossy, so we must examine each item pointer on the page.
		 * But we can ignore HOT chains, since we'll check each tuple anyway.
		 */
		Page		dp = (Page) BufferGetPage(buffer);
		OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
		OffsetNumber offnum;

		for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
		{
			ItemId		lp;
			HeapTupleData loctup;
			bool		valid;

			lp = PageGetItemId(dp, offnum);
			/* skip line pointers that don't point at a normal tuple */
			if (!ItemIdIsNormal(lp))
				continue;
			loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
			loctup.t_len = ItemIdGetLength(lp);
			loctup.t_tableOid = scan->rs_rd->rd_id;
			ItemPointerSet(&loctup.t_self, page, offnum);
			valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
			if (valid)
			{
				scan->rs_vistuples[ntup++] = offnum;
				PredicateLockTuple(scan->rs_rd, &loctup, snapshot);
			}
			/* serializable-conflict check runs for invisible tuples too */
			CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
											buffer, snapshot);
		}
	}

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

	Assert(ntup <= MaxHeapTuplesPerPage);
	scan->rs_ntuples = ntup;
}
447
448 /*
449 * BitmapDoneInitializingSharedState - Shared state is initialized
450 *
451 * By this time the leader has already populated the TBM and initialized the
452 * shared state so wake up other processes.
453 */
static inline void
BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate)
{
	/* flip the state to BM_FINISHED under the mutex ... */
	SpinLockAcquire(&pstate->mutex);
	pstate->state = BM_FINISHED;
	SpinLockRelease(&pstate->mutex);
	/* ... then wake everyone sleeping in BitmapShouldInitializeSharedState */
	ConditionVariableBroadcast(&pstate->cv);
}
462
463 /*
464 * BitmapAdjustPrefetchIterator - Adjust the prefetch iterator
465 */
static inline void
BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
							 TBMIterateResult *tbmres)
{
#ifdef USE_PREFETCH
	ParallelBitmapHeapState *pstate = node->pstate;

	/* Non-parallel scan: counters and iterator are private to this node */
	if (pstate == NULL)
	{
		TBMIterator *prefetch_iterator = node->prefetch_iterator;

		if (node->prefetch_pages > 0)
		{
			/* The main iterator has closed the distance by one page */
			node->prefetch_pages--;
		}
		else if (prefetch_iterator)
		{
			/* Do not let the prefetch iterator get behind the main one */
			TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);

			if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno)
				elog(ERROR, "prefetch and main iterators are out of sync");
		}
		return;
	}

	/* Parallel scan: the counters live in shared memory */
	if (node->prefetch_maximum > 0)
	{
		TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;

		SpinLockAcquire(&pstate->mutex);
		if (pstate->prefetch_pages > 0)
		{
			pstate->prefetch_pages--;
			SpinLockRelease(&pstate->mutex);
		}
		else
		{
			/* Release the mutex before iterating */
			SpinLockRelease(&pstate->mutex);

			/*
			 * In case of shared mode, we can not ensure that the current
			 * blockno of the main iterator and that of the prefetch iterator
			 * are same.  It's possible that whatever blockno we are
			 * prefetching will be processed by another process.  Therefore,
			 * we don't validate the blockno here as we do in non-parallel
			 * case.
			 */
			if (prefetch_iterator)
				tbm_shared_iterate(prefetch_iterator);
		}
	}
#endif							/* USE_PREFETCH */
}
522
523 /*
524 * BitmapAdjustPrefetchTarget - Adjust the prefetch target
525 *
526 * Increase prefetch target if it's not yet at the max. Note that
527 * we will increase it to zero after fetching the very first
528 * page/tuple, then to one after the second tuple is fetched, then
529 * it doubles as later pages are fetched.
530 */
531 static inline void
BitmapAdjustPrefetchTarget(BitmapHeapScanState * node)532 BitmapAdjustPrefetchTarget(BitmapHeapScanState *node)
533 {
534 #ifdef USE_PREFETCH
535 ParallelBitmapHeapState *pstate = node->pstate;
536
537 if (pstate == NULL)
538 {
539 if (node->prefetch_target >= node->prefetch_maximum)
540 /* don't increase any further */ ;
541 else if (node->prefetch_target >= node->prefetch_maximum / 2)
542 node->prefetch_target = node->prefetch_maximum;
543 else if (node->prefetch_target > 0)
544 node->prefetch_target *= 2;
545 else
546 node->prefetch_target++;
547 return;
548 }
549
550 /* Do an unlocked check first to save spinlock acquisitions. */
551 if (pstate->prefetch_target < node->prefetch_maximum)
552 {
553 SpinLockAcquire(&pstate->mutex);
554 if (pstate->prefetch_target >= node->prefetch_maximum)
555 /* don't increase any further */ ;
556 else if (pstate->prefetch_target >= node->prefetch_maximum / 2)
557 pstate->prefetch_target = node->prefetch_maximum;
558 else if (pstate->prefetch_target > 0)
559 pstate->prefetch_target *= 2;
560 else
561 pstate->prefetch_target++;
562 SpinLockRelease(&pstate->mutex);
563 }
564 #endif /* USE_PREFETCH */
565 }
566
567 /*
568 * BitmapPrefetch - Prefetch, if prefetch_pages are behind prefetch_target
569 */
static inline void
BitmapPrefetch(BitmapHeapScanState *node, HeapScanDesc scan)
{
#ifdef USE_PREFETCH
	ParallelBitmapHeapState *pstate = node->pstate;

	/* Non-parallel scan: drive the private prefetch iterator */
	if (pstate == NULL)
	{
		TBMIterator *prefetch_iterator = node->prefetch_iterator;

		if (prefetch_iterator)
		{
			while (node->prefetch_pages < node->prefetch_target)
			{
				TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);

				if (tbmpre == NULL)
				{
					/* No more pages to prefetch */
					tbm_end_iterate(prefetch_iterator);
					node->prefetch_iterator = NULL;
					break;
				}
				node->prefetch_pages++;
				PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
			}
		}

		return;
	}

	/* Parallel scan: coordinate via the shared counters */
	if (pstate->prefetch_pages < pstate->prefetch_target)
	{
		TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;

		if (prefetch_iterator)
		{
			while (1)
			{
				TBMIterateResult *tbmpre;
				bool		do_prefetch = false;

				/*
				 * Recheck under the mutex. If some other process has already
				 * done enough prefetching then we need not to do anything.
				 */
				SpinLockAcquire(&pstate->mutex);
				if (pstate->prefetch_pages < pstate->prefetch_target)
				{
					pstate->prefetch_pages++;
					do_prefetch = true;
				}
				SpinLockRelease(&pstate->mutex);

				if (!do_prefetch)
					return;

				/*
				 * NOTE(review): prefetch_pages was already bumped above even
				 * if the iterator turns out to be exhausted here; that looks
				 * harmless since nothing more will be iterated, but confirm
				 * against tidbitmap's shared-iterator contract.
				 */
				tbmpre = tbm_shared_iterate(prefetch_iterator);
				if (tbmpre == NULL)
				{
					/* No more pages to prefetch */
					tbm_end_shared_iterate(prefetch_iterator);
					node->shared_prefetch_iterator = NULL;
					break;
				}

				PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
			}
		}
	}
#endif							/* USE_PREFETCH */
}
642
643 /*
644 * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
645 */
646 static bool
BitmapHeapRecheck(BitmapHeapScanState * node,TupleTableSlot * slot)647 BitmapHeapRecheck(BitmapHeapScanState *node, TupleTableSlot *slot)
648 {
649 ExprContext *econtext;
650
651 /*
652 * extract necessary information from index scan node
653 */
654 econtext = node->ss.ps.ps_ExprContext;
655
656 /* Does the tuple meet the original qual conditions? */
657 econtext->ecxt_scantuple = slot;
658
659 ResetExprContext(econtext);
660
661 return ExecQual(node->bitmapqualorig, econtext);
662 }
663
664 /* ----------------------------------------------------------------
665 * ExecBitmapHeapScan(node)
666 * ----------------------------------------------------------------
667 */
668 static TupleTableSlot *
ExecBitmapHeapScan(PlanState * pstate)669 ExecBitmapHeapScan(PlanState *pstate)
670 {
671 BitmapHeapScanState *node = castNode(BitmapHeapScanState, pstate);
672
673 return ExecScan(&node->ss,
674 (ExecScanAccessMtd) BitmapHeapNext,
675 (ExecScanRecheckMtd) BitmapHeapRecheck);
676 }
677
678 /* ----------------------------------------------------------------
679 * ExecReScanBitmapHeapScan(node)
680 * ----------------------------------------------------------------
681 */
void
ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
{
	PlanState  *outerPlan = outerPlanState(node);

	/* rescan to release any page pin */
	heap_rescan(node->ss.ss_currentScanDesc, NULL);

	/*
	 * Shut down all iterators, then free the bitmap itself.  NOTE(review):
	 * private iterators are ended before tbm_free — presumably they reference
	 * the bitmap's storage, so keep this ordering.
	 */
	if (node->tbmiterator)
		tbm_end_iterate(node->tbmiterator);
	if (node->prefetch_iterator)
		tbm_end_iterate(node->prefetch_iterator);
	if (node->shared_tbmiterator)
		tbm_end_shared_iterate(node->shared_tbmiterator);
	if (node->shared_prefetch_iterator)
		tbm_end_shared_iterate(node->shared_prefetch_iterator);
	if (node->tbm)
		tbm_free(node->tbm);
	/* clear state so BitmapHeapNext rebuilds the bitmap on next fetch */
	node->tbm = NULL;
	node->tbmiterator = NULL;
	node->tbmres = NULL;
	node->prefetch_iterator = NULL;
	node->initialized = false;
	node->shared_tbmiterator = NULL;
	node->shared_prefetch_iterator = NULL;

	ExecScanReScan(&node->ss);

	/*
	 * if chgParam of subnode is not null then plan will be re-scanned by
	 * first ExecProcNode.
	 */
	if (outerPlan->chgParam == NULL)
		ExecReScan(outerPlan);
}
717
718 /* ----------------------------------------------------------------
719 * ExecEndBitmapHeapScan
720 * ----------------------------------------------------------------
721 */
void
ExecEndBitmapHeapScan(BitmapHeapScanState *node)
{
	Relation	relation;
	HeapScanDesc scanDesc;

	/*
	 * extract information from the node
	 */
	relation = node->ss.ss_currentRelation;
	scanDesc = node->ss.ss_currentScanDesc;

	/*
	 * Free the exprcontext
	 */
	ExecFreeExprContext(&node->ss.ps);

	/*
	 * clear out tuple table slots
	 */
	ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
	ExecClearTuple(node->ss.ss_ScanTupleSlot);

	/*
	 * close down subplans
	 */
	ExecEndNode(outerPlanState(node));

	/*
	 * release bitmap if any.  NOTE(review): the private iterators are ended
	 * before tbm_free — presumably they reference the bitmap's storage, so
	 * preserve this ordering.
	 */
	if (node->tbmiterator)
		tbm_end_iterate(node->tbmiterator);
	if (node->prefetch_iterator)
		tbm_end_iterate(node->prefetch_iterator);
	if (node->tbm)
		tbm_free(node->tbm);
	if (node->shared_tbmiterator)
		tbm_end_shared_iterate(node->shared_tbmiterator);
	if (node->shared_prefetch_iterator)
		tbm_end_shared_iterate(node->shared_prefetch_iterator);

	/*
	 * close heap scan
	 */
	heap_endscan(scanDesc);

	/*
	 * close the heap relation.
	 */
	ExecCloseScanRelation(relation);
}
774
775 /* ----------------------------------------------------------------
776 * ExecInitBitmapHeapScan
777 *
778 * Initializes the scan's state information.
779 * ----------------------------------------------------------------
780 */
BitmapHeapScanState *
ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
{
	BitmapHeapScanState *scanstate;
	Relation	currentRelation;
	int			io_concurrency;

	/* check for unsupported flags */
	Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));

	/*
	 * Assert caller didn't ask for an unsafe snapshot --- see comments at
	 * head of file.
	 */
	Assert(IsMVCCSnapshot(estate->es_snapshot));

	/*
	 * create state structure
	 */
	scanstate = makeNode(BitmapHeapScanState);
	scanstate->ss.ps.plan = (Plan *) node;
	scanstate->ss.ps.state = estate;
	scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan;

	scanstate->tbm = NULL;
	scanstate->tbmiterator = NULL;
	scanstate->tbmres = NULL;
	scanstate->exact_pages = 0;
	scanstate->lossy_pages = 0;
	scanstate->prefetch_iterator = NULL;
	scanstate->prefetch_pages = 0;
	scanstate->prefetch_target = 0;
	/* may be updated below */
	scanstate->prefetch_maximum = target_prefetch_pages;
	/* pscan_len is computed later, in ExecBitmapHeapEstimate */
	scanstate->pscan_len = 0;
	scanstate->initialized = false;
	scanstate->shared_tbmiterator = NULL;
	/* remains NULL unless a parallel worker/leader attaches shared state */
	scanstate->pstate = NULL;

	/*
	 * Miscellaneous initialization
	 *
	 * create expression context for node
	 */
	ExecAssignExprContext(estate, &scanstate->ss.ps);

	/*
	 * initialize child expressions
	 */
	scanstate->ss.ps.qual =
		ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate);
	scanstate->bitmapqualorig =
		ExecInitQual(node->bitmapqualorig, (PlanState *) scanstate);

	/*
	 * tuple table initialization
	 */
	ExecInitResultTupleSlot(estate, &scanstate->ss.ps);
	ExecInitScanTupleSlot(estate, &scanstate->ss);

	/*
	 * open the base relation and acquire appropriate lock on it.
	 */
	currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);

	/*
	 * Determine the maximum for prefetch_target.  If the tablespace has a
	 * specific IO concurrency set, use that to compute the corresponding
	 * maximum value; otherwise, we already initialized to the value computed
	 * by the GUC machinery.
	 */
	io_concurrency =
		get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace);
	if (io_concurrency != effective_io_concurrency)
	{
		double		maximum;

		if (ComputeIoConcurrency(io_concurrency, &maximum))
			scanstate->prefetch_maximum = rint(maximum);
	}

	scanstate->ss.ss_currentRelation = currentRelation;

	/*
	 * Even though we aren't going to do a conventional seqscan, it is useful
	 * to create a HeapScanDesc --- most of the fields in it are usable.
	 */
	scanstate->ss.ss_currentScanDesc = heap_beginscan_bm(currentRelation,
														 estate->es_snapshot,
														 0,
														 NULL);

	/*
	 * get the scan type from the relation descriptor.
	 */
	ExecAssignScanType(&scanstate->ss, RelationGetDescr(currentRelation));

	/*
	 * Initialize result tuple type and projection info.
	 */
	ExecAssignResultTypeFromTL(&scanstate->ss.ps);
	ExecAssignScanProjectionInfo(&scanstate->ss);

	/*
	 * initialize child nodes
	 *
	 * We do this last because the child nodes will open indexscans on our
	 * relation's indexes, and we want to be sure we have acquired a lock on
	 * the relation first.
	 */
	outerPlanState(scanstate) = ExecInitNode(outerPlan(node), estate, eflags);

	/*
	 * all done.
	 */
	return scanstate;
}
898
899 /*----------------
900 * BitmapShouldInitializeSharedState
901 *
902 * The first process to come here and see the state to the BM_INITIAL
903 * will become the leader for the parallel bitmap scan and will be
904 * responsible for populating the TIDBitmap. The other processes will
905 * be blocked by the condition variable until the leader wakes them up.
906 * ---------------
907 */
static bool
BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate)
{
	SharedBitmapState state;

	while (1)
	{
		SpinLockAcquire(&pstate->mutex);
		state = pstate->state;
		/* first process to observe BM_INITIAL claims leadership */
		if (pstate->state == BM_INITIAL)
			pstate->state = BM_INPROGRESS;
		SpinLockRelease(&pstate->mutex);

		/* Exit if bitmap is done, or if we're the leader. */
		if (state != BM_INPROGRESS)
			break;

		/* Wait for the leader to wake us up. */
		ConditionVariableSleep(&pstate->cv, WAIT_EVENT_PARALLEL_BITMAP_SCAN);
	}

	/* detach from the condition variable whether or not we ever slept */
	ConditionVariableCancelSleep();

	/* true only for the process that saw BM_INITIAL, i.e. the leader */
	return (state == BM_INITIAL);
}
933
934 /* ----------------------------------------------------------------
935 * ExecBitmapHeapEstimate
936 *
937 * estimates the space required to serialize bitmap scan node.
938 * ----------------------------------------------------------------
939 */
940 void
ExecBitmapHeapEstimate(BitmapHeapScanState * node,ParallelContext * pcxt)941 ExecBitmapHeapEstimate(BitmapHeapScanState *node,
942 ParallelContext *pcxt)
943 {
944 EState *estate = node->ss.ps.state;
945
946 node->pscan_len = add_size(offsetof(ParallelBitmapHeapState,
947 phs_snapshot_data),
948 EstimateSnapshotSpace(estate->es_snapshot));
949
950 shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len);
951 shm_toc_estimate_keys(&pcxt->estimator, 1);
952 }
953
954 /* ----------------------------------------------------------------
955 * ExecBitmapHeapInitializeDSM
956 *
957 * Set up a parallel bitmap heap scan descriptor.
958 * ----------------------------------------------------------------
959 */
960 void
ExecBitmapHeapInitializeDSM(BitmapHeapScanState * node,ParallelContext * pcxt)961 ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node,
962 ParallelContext *pcxt)
963 {
964 ParallelBitmapHeapState *pstate;
965 EState *estate = node->ss.ps.state;
966 dsa_area *dsa = node->ss.ps.state->es_query_dsa;
967
968 /* If there's no DSA, there are no workers; initialize nothing. */
969 if (dsa == NULL)
970 return;
971
972 pstate = shm_toc_allocate(pcxt->toc, node->pscan_len);
973
974 pstate->tbmiterator = 0;
975 pstate->prefetch_iterator = 0;
976
977 /* Initialize the mutex */
978 SpinLockInit(&pstate->mutex);
979 pstate->prefetch_pages = 0;
980 pstate->prefetch_target = 0;
981 pstate->state = BM_INITIAL;
982
983 ConditionVariableInit(&pstate->cv);
984 SerializeSnapshot(estate->es_snapshot, pstate->phs_snapshot_data);
985
986 shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pstate);
987 node->pstate = pstate;
988 }
989
990 /* ----------------------------------------------------------------
991 * ExecBitmapHeapReInitializeDSM
992 *
993 * Reset shared state before beginning a fresh scan.
994 * ----------------------------------------------------------------
995 */
996 void
ExecBitmapHeapReInitializeDSM(BitmapHeapScanState * node,ParallelContext * pcxt)997 ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node,
998 ParallelContext *pcxt)
999 {
1000 ParallelBitmapHeapState *pstate = node->pstate;
1001 dsa_area *dsa = node->ss.ps.state->es_query_dsa;
1002
1003 /* If there's no DSA, there are no workers; do nothing. */
1004 if (dsa == NULL)
1005 return;
1006
1007 pstate->state = BM_INITIAL;
1008
1009 if (DsaPointerIsValid(pstate->tbmiterator))
1010 tbm_free_shared_area(dsa, pstate->tbmiterator);
1011
1012 if (DsaPointerIsValid(pstate->prefetch_iterator))
1013 tbm_free_shared_area(dsa, pstate->prefetch_iterator);
1014
1015 pstate->tbmiterator = InvalidDsaPointer;
1016 pstate->prefetch_iterator = InvalidDsaPointer;
1017 }
1018
1019 /* ----------------------------------------------------------------
1020 * ExecBitmapHeapInitializeWorker
1021 *
1022 * Copy relevant information from TOC into planstate.
1023 * ----------------------------------------------------------------
1024 */
1025 void
ExecBitmapHeapInitializeWorker(BitmapHeapScanState * node,shm_toc * toc)1026 ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node, shm_toc *toc)
1027 {
1028 ParallelBitmapHeapState *pstate;
1029 Snapshot snapshot;
1030
1031 Assert(node->ss.ps.state->es_query_dsa != NULL);
1032
1033 pstate = shm_toc_lookup(toc, node->ss.ps.plan->plan_node_id, false);
1034 node->pstate = pstate;
1035
1036 snapshot = RestoreSnapshot(pstate->phs_snapshot_data);
1037 heap_update_snapshot(node->ss.ss_currentScanDesc, snapshot);
1038 }
1039