/*-------------------------------------------------------------------------
 *
 * nodeBitmapHeapscan.c
 *	  Routines to support bitmapped scans of relations
 *
 * NOTE: it is critical that this plan type only be used with MVCC-compliant
 * snapshots (ie, regular snapshots, not SnapshotAny or one of the other
 * special snapshots).  The reason is that since index and heap scans are
 * decoupled, there can be no assurance that the index tuple prompting a
 * visit to a particular heap TID still exists when the visit is made.
 * Therefore the tuple might not exist anymore either (which is OK because
 * heap_fetch will cope) --- but worse, the tuple slot could have been
 * re-used for a newer tuple.  With an MVCC snapshot the newer tuple is
 * certain to fail the time qual and so it will not be mistakenly returned,
 * but with anything else we might return a tuple that doesn't meet the
 * required index qual conditions.
 *
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/executor/nodeBitmapHeapscan.c
 *
 *-------------------------------------------------------------------------
 */
/*
 * INTERFACE ROUTINES
 *		ExecBitmapHeapScan			scans a relation using bitmap info
 *		ExecBitmapHeapNext			workhorse for above
 *		ExecInitBitmapHeapScan		creates and initializes state info.
 *		ExecReScanBitmapHeapScan	prepares to rescan the plan.
 *		ExecEndBitmapHeapScan		releases all storage.
 */
#include "postgres.h"

#include <math.h>

#include "access/relscan.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/visibilitymap.h"
#include "executor/execdebug.h"
#include "executor/nodeBitmapHeapscan.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "storage/predicate.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"
#include "utils/spccache.h"

static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
static inline void BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate);
static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
												TBMIterateResult *tbmres);
static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node);
static inline void BitmapPrefetch(BitmapHeapScanState *node,
								  TableScanDesc scan);
static bool BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate);


/* ----------------------------------------------------------------
 *		BitmapHeapNext
 *
 *		Retrieve next tuple from the BitmapHeapScan node's currentRelation
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
BitmapHeapNext(BitmapHeapScanState *node)
{
	ExprContext *econtext;
	TableScanDesc scan;
	TIDBitmap  *tbm;
	TBMIterator *tbmiterator = NULL;
	TBMSharedIterator *shared_tbmiterator = NULL;
	TBMIterateResult *tbmres;
	TupleTableSlot *slot;
	ParallelBitmapHeapState *pstate = node->pstate;
	dsa_area   *dsa = node->ss.ps.state->es_query_dsa;

	/*
	 * extract necessary information from index scan node
	 */
	econtext = node->ss.ps.ps_ExprContext;
	slot = node->ss.ss_ScanTupleSlot;
	scan = node->ss.ss_currentScanDesc;
	tbm = node->tbm;
	if (pstate == NULL)
		tbmiterator = node->tbmiterator;
	else
		shared_tbmiterator = node->shared_tbmiterator;
	tbmres = node->tbmres;

	/*
	 * If we haven't yet performed the underlying index scan, do it, and begin
	 * the iteration over the bitmap.
	 *
	 * For prefetching, we use *two* iterators, one for the pages we are
	 * actually scanning and another that runs ahead of the first for
	 * prefetching.  node->prefetch_pages tracks exactly how many pages ahead
	 * the prefetch iterator is.  Also, node->prefetch_target tracks the
	 * desired prefetch distance, which starts small and increases up to the
	 * node->prefetch_maximum.  This is to avoid doing a lot of prefetching in
	 * a scan that stops after a few tuples because of a LIMIT.
	 */
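	/*
	 * For example, once prefetch_target has ramped up to 4, the prefetch
	 * iterator is kept up to four pages ahead of the main iterator, with
	 * prefetch_pages recording the current lead.
	 */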
	if (!node->initialized)
	{
		if (!pstate)
		{
			tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));

			if (!tbm || !IsA(tbm, TIDBitmap))
				elog(ERROR, "unrecognized result from subplan");

			node->tbm = tbm;
			node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
			node->tbmres = tbmres = NULL;

#ifdef USE_PREFETCH
			if (node->prefetch_maximum > 0)
			{
				node->prefetch_iterator = tbm_begin_iterate(tbm);
				node->prefetch_pages = 0;
				node->prefetch_target = -1;
			}
#endif							/* USE_PREFETCH */
		}
		else
		{
			/*
			 * The leader will immediately come out of the function, but
			 * others will be blocked until the leader populates the TBM and
			 * wakes them up.
			 */
			if (BitmapShouldInitializeSharedState(pstate))
			{
				tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
				if (!tbm || !IsA(tbm, TIDBitmap))
					elog(ERROR, "unrecognized result from subplan");

				node->tbm = tbm;

				/*
				 * Prepare to iterate over the TBM. This will return the
				 * dsa_pointer of the iterator state which will be used by
				 * multiple processes to iterate jointly.
				 */
				pstate->tbmiterator = tbm_prepare_shared_iterate(tbm);
#ifdef USE_PREFETCH
				if (node->prefetch_maximum > 0)
				{
					pstate->prefetch_iterator =
						tbm_prepare_shared_iterate(tbm);

					/*
					 * We don't need the mutex here as we haven't yet woken up
					 * the others.
					 */
					pstate->prefetch_pages = 0;
					pstate->prefetch_target = -1;
				}
#endif

				/* We have initialized the shared state so wake up others. */
				BitmapDoneInitializingSharedState(pstate);
			}

			/* Allocate a private iterator and attach the shared state to it */
			node->shared_tbmiterator = shared_tbmiterator =
				tbm_attach_shared_iterate(dsa, pstate->tbmiterator);
			node->tbmres = tbmres = NULL;

#ifdef USE_PREFETCH
			if (node->prefetch_maximum > 0)
			{
				node->shared_prefetch_iterator =
					tbm_attach_shared_iterate(dsa, pstate->prefetch_iterator);
			}
#endif							/* USE_PREFETCH */
		}
		node->initialized = true;
	}

	for (;;)
	{
		bool		skip_fetch;

		CHECK_FOR_INTERRUPTS();

		/*
		 * Get next page of results if needed
		 */
		if (tbmres == NULL)
		{
			if (!pstate)
				node->tbmres = tbmres = tbm_iterate(tbmiterator);
			else
				node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator);
			if (tbmres == NULL)
			{
				/* no more entries in the bitmap */
				break;
			}

			BitmapAdjustPrefetchIterator(node, tbmres);

			/*
			 * We can skip fetching the heap page if we don't need any fields
			 * from the heap, and the bitmap entries don't need rechecking,
			 * and all tuples on the page are visible to our transaction.
			 *
			 * XXX: It's a layering violation that we do these checks above
			 * tableam; they should probably be moved below it at some point.
			 */
			skip_fetch = (node->can_skip_fetch &&
						  !tbmres->recheck &&
						  VM_ALL_VISIBLE(node->ss.ss_currentRelation,
										 tbmres->blockno,
										 &node->vmbuffer));

			if (skip_fetch)
			{
				/* can't be lossy in the skip_fetch case */
				Assert(tbmres->ntuples >= 0);

				/*
				 * The number of tuples on this page is put into
				 * node->return_empty_tuples.
				 */
				node->return_empty_tuples = tbmres->ntuples;
			}
			else if (!table_scan_bitmap_next_block(scan, tbmres))
			{
				/* AM doesn't think this block is valid, skip */
				continue;
			}

			if (tbmres->ntuples >= 0)
				node->exact_pages++;
			else
				node->lossy_pages++;

			/* Adjust the prefetch target */
			BitmapAdjustPrefetchTarget(node);
		}
		else
		{
			/*
			 * Continuing in previously obtained page.
			 */

#ifdef USE_PREFETCH

			/*
			 * Try to prefetch at least a few pages even before we get to the
			 * second page if we don't stop reading after the first tuple.
			 */
			if (!pstate)
			{
				if (node->prefetch_target < node->prefetch_maximum)
					node->prefetch_target++;
			}
			else if (pstate->prefetch_target < node->prefetch_maximum)
			{
				/* take spinlock while updating shared state */
				SpinLockAcquire(&pstate->mutex);
				if (pstate->prefetch_target < node->prefetch_maximum)
					pstate->prefetch_target++;
				SpinLockRelease(&pstate->mutex);
			}
#endif							/* USE_PREFETCH */
		}

		/*
		 * We issue prefetch requests *after* fetching the current page to try
		 * to avoid having prefetching interfere with the main I/O. Also, this
		 * should happen only when we have determined there is still something
		 * to do on the current page, else we may uselessly prefetch the same
		 * page we are just about to request for real.
		 *
		 * XXX: It's a layering violation that we do these checks above
		 * tableam; they should probably be moved below it at some point.
		 */
		BitmapPrefetch(node, scan);

		if (node->return_empty_tuples > 0)
		{
			/*
			 * If we don't have to fetch the tuple, just return nulls.
			 */
			ExecStoreAllNullTuple(slot);

			if (--node->return_empty_tuples == 0)
			{
				/* no more tuples to return in the next round */
				node->tbmres = tbmres = NULL;
			}
		}
		else
		{
			/*
			 * Attempt to fetch tuple from AM.
			 */
			if (!table_scan_bitmap_next_tuple(scan, tbmres, slot))
			{
				/* nothing more to look at on this page */
				node->tbmres = tbmres = NULL;
				continue;
			}

			/*
			 * If we are using lossy info, we have to recheck the qual
			 * conditions at every tuple.
			 */
			if (tbmres->recheck)
			{
				econtext->ecxt_scantuple = slot;
				if (!ExecQualAndReset(node->bitmapqualorig, econtext))
				{
					/* Fails recheck, so drop it and loop back for another */
					InstrCountFiltered2(node, 1);
					ExecClearTuple(slot);
					continue;
				}
			}
		}

		/* OK to return this tuple */
		return slot;
	}

	/*
	 * if we get here it means we are at the end of the scan.
	 */
	return ExecClearTuple(slot);
}

/*
 *	BitmapDoneInitializingSharedState - Shared state is initialized
 *
 *	By this time the leader has already populated the TBM and initialized the
 *	shared state, so wake up the other processes.
 */
static inline void
BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate)
{
	SpinLockAcquire(&pstate->mutex);
	pstate->state = BM_FINISHED;
	SpinLockRelease(&pstate->mutex);
	ConditionVariableBroadcast(&pstate->cv);
}

/*
 *	BitmapAdjustPrefetchIterator - Adjust the prefetch iterator
 */
static inline void
BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
							 TBMIterateResult *tbmres)
{
#ifdef USE_PREFETCH
	ParallelBitmapHeapState *pstate = node->pstate;

	if (pstate == NULL)
	{
		TBMIterator *prefetch_iterator = node->prefetch_iterator;

		if (node->prefetch_pages > 0)
		{
			/* The main iterator has closed the distance by one page */
			node->prefetch_pages--;
		}
		else if (prefetch_iterator)
		{
			/* Do not let the prefetch iterator get behind the main one */
			TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);

			if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno)
				elog(ERROR, "prefetch and main iterators are out of sync");
		}
		return;
	}

	if (node->prefetch_maximum > 0)
	{
		TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;

		SpinLockAcquire(&pstate->mutex);
		if (pstate->prefetch_pages > 0)
		{
			pstate->prefetch_pages--;
			SpinLockRelease(&pstate->mutex);
		}
		else
		{
			/* Release the mutex before iterating */
			SpinLockRelease(&pstate->mutex);

			/*
			 * In shared mode, we cannot ensure that the current blockno of
			 * the main iterator and that of the prefetch iterator are the
			 * same.  It's possible that whatever blockno we are prefetching
			 * will be processed by another process.  Therefore, we don't
			 * validate the blockno here as we do in the non-parallel case.
			 */
			if (prefetch_iterator)
				tbm_shared_iterate(prefetch_iterator);
		}
	}
#endif							/* USE_PREFETCH */
}

/*
 * BitmapAdjustPrefetchTarget - Adjust the prefetch target
 *
 * Increase prefetch target if it's not yet at the max.  Note that
 * we will increase it to zero after fetching the very first
 * page/tuple, then to one after the second tuple is fetched, then
 * it doubles as later pages are fetched.
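 *
 * For example, with prefetch_maximum = 32 the target advances through
 * -1, 0, 1, 2, 4, 8, 16, 32 and then stays at 32.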
 */
static inline void
BitmapAdjustPrefetchTarget(BitmapHeapScanState *node)
{
#ifdef USE_PREFETCH
	ParallelBitmapHeapState *pstate = node->pstate;

	if (pstate == NULL)
	{
		if (node->prefetch_target >= node->prefetch_maximum)
			 /* don't increase any further */ ;
		else if (node->prefetch_target >= node->prefetch_maximum / 2)
			node->prefetch_target = node->prefetch_maximum;
		else if (node->prefetch_target > 0)
			node->prefetch_target *= 2;
		else
			node->prefetch_target++;
		return;
	}

	/* Do an unlocked check first to save spinlock acquisitions. */
	if (pstate->prefetch_target < node->prefetch_maximum)
	{
		SpinLockAcquire(&pstate->mutex);
		if (pstate->prefetch_target >= node->prefetch_maximum)
			 /* don't increase any further */ ;
		else if (pstate->prefetch_target >= node->prefetch_maximum / 2)
			pstate->prefetch_target = node->prefetch_maximum;
		else if (pstate->prefetch_target > 0)
			pstate->prefetch_target *= 2;
		else
			pstate->prefetch_target++;
		SpinLockRelease(&pstate->mutex);
	}
#endif							/* USE_PREFETCH */
}

/*
 * BitmapPrefetch - Prefetch, if prefetch_pages are behind prefetch_target
 */
static inline void
BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan)
{
#ifdef USE_PREFETCH
	ParallelBitmapHeapState *pstate = node->pstate;

	if (pstate == NULL)
	{
		TBMIterator *prefetch_iterator = node->prefetch_iterator;

		if (prefetch_iterator)
		{
			while (node->prefetch_pages < node->prefetch_target)
			{
				TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
				bool		skip_fetch;

				if (tbmpre == NULL)
				{
					/* No more pages to prefetch */
					tbm_end_iterate(prefetch_iterator);
					node->prefetch_iterator = NULL;
					break;
				}
				node->prefetch_pages++;

				/*
				 * If we expect not to have to actually read this heap page,
				 * skip this prefetch call, but continue to run the prefetch
				 * logic normally.  (Would it be better not to increment
				 * prefetch_pages?)
				 *
				 * This depends on the assumption that the index AM will
				 * report the same recheck flag for this future heap page as
				 * it did for the current heap page; which is not a certainty
				 * but is true in many cases.
				 */
				skip_fetch = (node->can_skip_fetch &&
							  (node->tbmres ? !node->tbmres->recheck : false) &&
							  VM_ALL_VISIBLE(node->ss.ss_currentRelation,
											 tbmpre->blockno,
											 &node->pvmbuffer));

				if (!skip_fetch)
					PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
			}
		}

		return;
	}

	if (pstate->prefetch_pages < pstate->prefetch_target)
	{
		TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;

		if (prefetch_iterator)
		{
			while (1)
			{
				TBMIterateResult *tbmpre;
				bool		do_prefetch = false;
				bool		skip_fetch;

				/*
				 * Recheck under the mutex. If some other process has already
				 * done enough prefetching then we need not do anything.
				 */
				SpinLockAcquire(&pstate->mutex);
				if (pstate->prefetch_pages < pstate->prefetch_target)
				{
					pstate->prefetch_pages++;
					do_prefetch = true;
				}
				SpinLockRelease(&pstate->mutex);

				if (!do_prefetch)
					return;

				tbmpre = tbm_shared_iterate(prefetch_iterator);
				if (tbmpre == NULL)
				{
					/* No more pages to prefetch */
					tbm_end_shared_iterate(prefetch_iterator);
					node->shared_prefetch_iterator = NULL;
					break;
				}

				/* As above, skip prefetch if we expect not to need page */
				skip_fetch = (node->can_skip_fetch &&
							  (node->tbmres ? !node->tbmres->recheck : false) &&
							  VM_ALL_VISIBLE(node->ss.ss_currentRelation,
											 tbmpre->blockno,
											 &node->pvmbuffer));

				if (!skip_fetch)
					PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
			}
		}
	}
#endif							/* USE_PREFETCH */
}

/*
 * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
 */
static bool
BitmapHeapRecheck(BitmapHeapScanState *node, TupleTableSlot *slot)
{
	ExprContext *econtext;

	/*
	 * extract necessary information from index scan node
	 */
	econtext = node->ss.ps.ps_ExprContext;

	/* Does the tuple meet the original qual conditions? */
	econtext->ecxt_scantuple = slot;
	return ExecQualAndReset(node->bitmapqualorig, econtext);
}

/* ----------------------------------------------------------------
 *		ExecBitmapHeapScan(node)
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
ExecBitmapHeapScan(PlanState *pstate)
{
	BitmapHeapScanState *node = castNode(BitmapHeapScanState, pstate);

	return ExecScan(&node->ss,
					(ExecScanAccessMtd) BitmapHeapNext,
					(ExecScanRecheckMtd) BitmapHeapRecheck);
}

/* ----------------------------------------------------------------
 *		ExecReScanBitmapHeapScan(node)
 * ----------------------------------------------------------------
 */
void
ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
{
	PlanState  *outerPlan = outerPlanState(node);

	/* rescan to release any page pin */
	table_rescan(node->ss.ss_currentScanDesc, NULL);

	/* release bitmaps and buffers if any */
	if (node->tbmiterator)
		tbm_end_iterate(node->tbmiterator);
	if (node->prefetch_iterator)
		tbm_end_iterate(node->prefetch_iterator);
	if (node->shared_tbmiterator)
		tbm_end_shared_iterate(node->shared_tbmiterator);
	if (node->shared_prefetch_iterator)
		tbm_end_shared_iterate(node->shared_prefetch_iterator);
	if (node->tbm)
		tbm_free(node->tbm);
	if (node->vmbuffer != InvalidBuffer)
		ReleaseBuffer(node->vmbuffer);
	if (node->pvmbuffer != InvalidBuffer)
		ReleaseBuffer(node->pvmbuffer);
	node->tbm = NULL;
	node->tbmiterator = NULL;
	node->tbmres = NULL;
	node->prefetch_iterator = NULL;
	node->initialized = false;
	node->shared_tbmiterator = NULL;
	node->shared_prefetch_iterator = NULL;
	node->vmbuffer = InvalidBuffer;
	node->pvmbuffer = InvalidBuffer;

	ExecScanReScan(&node->ss);

	/*
	 * if chgParam of subnode is not null then plan will be re-scanned by
	 * first ExecProcNode.
	 */
	if (outerPlan->chgParam == NULL)
		ExecReScan(outerPlan);
}

/* ----------------------------------------------------------------
 *		ExecEndBitmapHeapScan
 * ----------------------------------------------------------------
 */
void
ExecEndBitmapHeapScan(BitmapHeapScanState *node)
{
	TableScanDesc scanDesc;

	/*
	 * extract information from the node
	 */
	scanDesc = node->ss.ss_currentScanDesc;

	/*
	 * Free the exprcontext
	 */
	ExecFreeExprContext(&node->ss.ps);

	/*
	 * clear out tuple table slots
	 */
	if (node->ss.ps.ps_ResultTupleSlot)
		ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
	ExecClearTuple(node->ss.ss_ScanTupleSlot);

	/*
	 * close down subplans
	 */
	ExecEndNode(outerPlanState(node));

	/*
	 * release bitmaps and buffers if any
	 */
	if (node->tbmiterator)
		tbm_end_iterate(node->tbmiterator);
	if (node->prefetch_iterator)
		tbm_end_iterate(node->prefetch_iterator);
	if (node->tbm)
		tbm_free(node->tbm);
	if (node->shared_tbmiterator)
		tbm_end_shared_iterate(node->shared_tbmiterator);
	if (node->shared_prefetch_iterator)
		tbm_end_shared_iterate(node->shared_prefetch_iterator);
	if (node->vmbuffer != InvalidBuffer)
		ReleaseBuffer(node->vmbuffer);
	if (node->pvmbuffer != InvalidBuffer)
		ReleaseBuffer(node->pvmbuffer);

	/*
	 * close heap scan
	 */
	table_endscan(scanDesc);
}

/* ----------------------------------------------------------------
 *		ExecInitBitmapHeapScan
 *
 *		Initializes the scan's state information.
 * ----------------------------------------------------------------
 */
BitmapHeapScanState *
ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
{
	BitmapHeapScanState *scanstate;
	Relation	currentRelation;

	/* check for unsupported flags */
	Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));

	/*
	 * Assert caller didn't ask for an unsafe snapshot --- see comments at
	 * head of file.
	 */
	Assert(IsMVCCSnapshot(estate->es_snapshot));

	/*
	 * create state structure
	 */
	scanstate = makeNode(BitmapHeapScanState);
	scanstate->ss.ps.plan = (Plan *) node;
	scanstate->ss.ps.state = estate;
	scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan;

	scanstate->tbm = NULL;
	scanstate->tbmiterator = NULL;
	scanstate->tbmres = NULL;
	scanstate->return_empty_tuples = 0;
	scanstate->vmbuffer = InvalidBuffer;
	scanstate->pvmbuffer = InvalidBuffer;
	scanstate->exact_pages = 0;
	scanstate->lossy_pages = 0;
	scanstate->prefetch_iterator = NULL;
	scanstate->prefetch_pages = 0;
	scanstate->prefetch_target = 0;
	scanstate->pscan_len = 0;
	scanstate->initialized = false;
	scanstate->shared_tbmiterator = NULL;
	scanstate->shared_prefetch_iterator = NULL;
	scanstate->pstate = NULL;

	/*
	 * We can potentially skip fetching heap pages if we do not need any
	 * columns of the table, either for checking non-indexable quals or for
	 * returning data.  This test is a bit simplistic, as it checks the
	 * stronger condition that there's no qual or return tlist at all.  But in
	 * most cases it's probably not worth working harder than that.
	 */
	scanstate->can_skip_fetch = (node->scan.plan.qual == NIL &&
								 node->scan.plan.targetlist == NIL);
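
	/*
	 * For example, an aggregate such as count(*) whose quals are all
	 * indexable typically needs no heap columns at this node, allowing
	 * all-visible pages to be skipped entirely.
	 */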

	/*
	 * Miscellaneous initialization
	 *
	 * create expression context for node
	 */
	ExecAssignExprContext(estate, &scanstate->ss.ps);

	/*
	 * open the scan relation
	 */
	currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);

	/*
	 * initialize child nodes
	 */
	outerPlanState(scanstate) = ExecInitNode(outerPlan(node), estate, eflags);

	/*
	 * get the scan type from the relation descriptor.
	 */
	ExecInitScanTupleSlot(estate, &scanstate->ss,
						  RelationGetDescr(currentRelation),
						  table_slot_callbacks(currentRelation));

	/*
	 * Initialize result type and projection.
	 */
	ExecInitResultTypeTL(&scanstate->ss.ps);
	ExecAssignScanProjectionInfo(&scanstate->ss);

	/*
	 * initialize child expressions
	 */
	scanstate->ss.ps.qual =
		ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate);
	scanstate->bitmapqualorig =
		ExecInitQual(node->bitmapqualorig, (PlanState *) scanstate);

	/*
	 * Maximum number of prefetches for the tablespace if configured,
	 * otherwise the current value of the effective_io_concurrency GUC.
	 */
	scanstate->prefetch_maximum =
		get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace);

	scanstate->ss.ss_currentRelation = currentRelation;

	scanstate->ss.ss_currentScanDesc = table_beginscan_bm(currentRelation,
														  estate->es_snapshot,
														  0,
														  NULL);

	/*
	 * all done.
	 */
	return scanstate;
}

/*----------------
 *		BitmapShouldInitializeSharedState
 *
 *		The first process to come here and see the state as BM_INITIAL
 *		will become the leader for the parallel bitmap scan and will be
 *		responsible for populating the TIDBitmap.  The other processes will
 *		be blocked by the condition variable until the leader wakes them up.
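 *
 *		The shared state advances from BM_INITIAL to BM_INPROGRESS when a
 *		leader is chosen, and to BM_FINISHED once the leader has built the
 *		bitmap (see BitmapDoneInitializingSharedState).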
 * ---------------
 */
static bool
BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate)
{
	SharedBitmapState state;

	while (1)
	{
		SpinLockAcquire(&pstate->mutex);
		state = pstate->state;
		if (pstate->state == BM_INITIAL)
			pstate->state = BM_INPROGRESS;
		SpinLockRelease(&pstate->mutex);

		/* Exit if bitmap is done, or if we're the leader. */
		if (state != BM_INPROGRESS)
			break;

		/* Wait for the leader to wake us up. */
		ConditionVariableSleep(&pstate->cv, WAIT_EVENT_PARALLEL_BITMAP_SCAN);
	}

	ConditionVariableCancelSleep();

	return (state == BM_INITIAL);
}

/* ----------------------------------------------------------------
 *		ExecBitmapHeapEstimate
 *
 *		Compute the amount of space we'll need in the parallel
 *		query DSM, and inform pcxt->estimator about our needs.
 * ----------------------------------------------------------------
 */
void
ExecBitmapHeapEstimate(BitmapHeapScanState *node,
					   ParallelContext *pcxt)
{
	EState	   *estate = node->ss.ps.state;

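	/*
	 * The shared state is variable-size: the serialized snapshot is stored
	 * in the flexible phs_snapshot_data array at the end of
	 * ParallelBitmapHeapState, hence the offsetof-based size computation.
	 */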
	node->pscan_len = add_size(offsetof(ParallelBitmapHeapState,
										phs_snapshot_data),
							   EstimateSnapshotSpace(estate->es_snapshot));

	shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len);
	shm_toc_estimate_keys(&pcxt->estimator, 1);
}

/* ----------------------------------------------------------------
 *		ExecBitmapHeapInitializeDSM
 *
 *		Set up a parallel bitmap heap scan descriptor.
 * ----------------------------------------------------------------
 */
void
ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node,
							ParallelContext *pcxt)
{
	ParallelBitmapHeapState *pstate;
	EState	   *estate = node->ss.ps.state;
	dsa_area   *dsa = node->ss.ps.state->es_query_dsa;

	/* If there's no DSA, there are no workers; initialize nothing. */
	if (dsa == NULL)
		return;

	pstate = shm_toc_allocate(pcxt->toc, node->pscan_len);

	pstate->tbmiterator = 0;
	pstate->prefetch_iterator = 0;

	/* Initialize the mutex */
	SpinLockInit(&pstate->mutex);
	pstate->prefetch_pages = 0;
	pstate->prefetch_target = 0;
	pstate->state = BM_INITIAL;

	ConditionVariableInit(&pstate->cv);
	SerializeSnapshot(estate->es_snapshot, pstate->phs_snapshot_data);

	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pstate);
	node->pstate = pstate;
}

/* ----------------------------------------------------------------
 *		ExecBitmapHeapReInitializeDSM
 *
 *		Reset shared state before beginning a fresh scan.
 * ----------------------------------------------------------------
 */
void
ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node,
							  ParallelContext *pcxt)
{
	ParallelBitmapHeapState *pstate = node->pstate;
	dsa_area   *dsa = node->ss.ps.state->es_query_dsa;

	/* If there's no DSA, there are no workers; do nothing. */
	if (dsa == NULL)
		return;

	pstate->state = BM_INITIAL;

	if (DsaPointerIsValid(pstate->tbmiterator))
		tbm_free_shared_area(dsa, pstate->tbmiterator);

	if (DsaPointerIsValid(pstate->prefetch_iterator))
		tbm_free_shared_area(dsa, pstate->prefetch_iterator);

	pstate->tbmiterator = InvalidDsaPointer;
	pstate->prefetch_iterator = InvalidDsaPointer;
}

/* ----------------------------------------------------------------
 *		ExecBitmapHeapInitializeWorker
 *
 *		Copy relevant information from TOC into planstate.
 * ----------------------------------------------------------------
 */
void
ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node,
							   ParallelWorkerContext *pwcxt)
{
	ParallelBitmapHeapState *pstate;
	Snapshot	snapshot;

	Assert(node->ss.ps.state->es_query_dsa != NULL);

	pstate = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
	node->pstate = pstate;

	snapshot = RestoreSnapshot(pstate->phs_snapshot_data);
	table_scan_update_snapshot(node->ss.ss_currentScanDesc, snapshot);
}