/*-------------------------------------------------------------------------
 *
 * nodeBitmapHeapscan.c
 *	  Routines to support bitmapped scans of relations
 *
 * NOTE: it is critical that this plan type only be used with MVCC-compliant
 * snapshots (ie, regular snapshots, not SnapshotAny or one of the other
 * special snapshots).  The reason is that since index and heap scans are
 * decoupled, there can be no assurance that the index tuple prompting a
 * visit to a particular heap TID still exists when the visit is made.
 * Therefore the tuple might not exist anymore either (which is OK because
 * heap_fetch will cope) --- but worse, the tuple slot could have been
 * re-used for a newer tuple.  With an MVCC snapshot the newer tuple is
 * certain to fail the time qual and so it will not be mistakenly returned,
 * but with anything else we might return a tuple that doesn't meet the
 * required index qual conditions.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/executor/nodeBitmapHeapscan.c
 *
 *-------------------------------------------------------------------------
 */
/*
 * INTERFACE ROUTINES
 *		ExecBitmapHeapScan			scans a relation using bitmap info
 *		ExecBitmapHeapNext			workhorse for above
 *		ExecInitBitmapHeapScan		creates and initializes state info.
 *		ExecReScanBitmapHeapScan	prepares to rescan the plan.
 *		ExecEndBitmapHeapScan		releases all storage.
 */
#include "postgres.h"

#include <math.h>

#include "access/relscan.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/visibilitymap.h"
#include "executor/execdebug.h"
#include "executor/nodeBitmapHeapscan.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "storage/predicate.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/spccache.h"
#include "utils/snapmgr.h"


static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
static inline void BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate);
static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
												TBMIterateResult *tbmres);
static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node);
static inline void BitmapPrefetch(BitmapHeapScanState *node,
								  TableScanDesc scan);
static bool BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate);


/* ----------------------------------------------------------------
 *		BitmapHeapNext
 *
 *		Retrieve next tuple from the BitmapHeapScan node's currentRelation
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
BitmapHeapNext(BitmapHeapScanState *node)
{
	ExprContext *econtext;
	TableScanDesc scan;
	TIDBitmap  *tbm;
	TBMIterator *tbmiterator = NULL;
	TBMSharedIterator *shared_tbmiterator = NULL;
	TBMIterateResult *tbmres;
	TupleTableSlot *slot;
	ParallelBitmapHeapState *pstate = node->pstate;
	dsa_area   *dsa = node->ss.ps.state->es_query_dsa;

	/*
	 * extract necessary information from the bitmap heap scan node
	 */
	econtext = node->ss.ps.ps_ExprContext;
	slot = node->ss.ss_ScanTupleSlot;
	scan = node->ss.ss_currentScanDesc;
	tbm = node->tbm;
	if (pstate == NULL)
		tbmiterator = node->tbmiterator;
	else
		shared_tbmiterator = node->shared_tbmiterator;
	tbmres = node->tbmres;

	/*
	 * If we haven't yet performed the underlying index scan, do it, and begin
	 * the iteration over the bitmap.
	 *
	 * For prefetching, we use *two* iterators, one for the pages we are
	 * actually scanning and another that runs ahead of the first for
	 * prefetching.  node->prefetch_pages tracks exactly how many pages ahead
	 * the prefetch iterator is.  Also, node->prefetch_target tracks the
	 * desired prefetch distance, which starts small and increases up to the
	 * node->prefetch_maximum.  This is to avoid doing a lot of prefetching in
	 * a scan that stops after a few tuples because of a LIMIT.
	 */
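	/*
	 * A sketch of the invariant the prefetch machinery maintains (assuming
	 * USE_PREFETCH): if the main iterator is on block[i], the prefetch
	 * iterator is on block[i + prefetch_pages], with
	 * 0 <= prefetch_pages <= prefetch_target <= prefetch_maximum.
	 * BitmapAdjustPrefetchIterator keeps that distance consistent as the
	 * main iterator advances, and BitmapPrefetch issues prefetch requests
	 * until the distance reaches prefetch_target.
	 */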
	if (!node->initialized)
	{
		if (!pstate)
		{
			tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));

			if (!tbm || !IsA(tbm, TIDBitmap))
				elog(ERROR, "unrecognized result from subplan");

			node->tbm = tbm;
			node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
			node->tbmres = tbmres = NULL;

#ifdef USE_PREFETCH
			if (node->prefetch_maximum > 0)
			{
				node->prefetch_iterator = tbm_begin_iterate(tbm);
				node->prefetch_pages = 0;
				node->prefetch_target = -1;
			}
#endif							/* USE_PREFETCH */
		}
		else
		{
			/*
			 * The leader will immediately come out of the function, but the
			 * others will be blocked until the leader populates the TBM and
			 * wakes them up.
			 */
			if (BitmapShouldInitializeSharedState(pstate))
			{
				tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
				if (!tbm || !IsA(tbm, TIDBitmap))
					elog(ERROR, "unrecognized result from subplan");

				node->tbm = tbm;

				/*
				 * Prepare to iterate over the TBM. This will return the
				 * dsa_pointer of the iterator state which will be used by
				 * multiple processes to iterate jointly.
				 */
				pstate->tbmiterator = tbm_prepare_shared_iterate(tbm);
#ifdef USE_PREFETCH
				if (node->prefetch_maximum > 0)
				{
					pstate->prefetch_iterator =
						tbm_prepare_shared_iterate(tbm);

					/*
					 * We don't need the mutex here as we haven't yet woken
					 * up the others.
					 */
					pstate->prefetch_pages = 0;
					pstate->prefetch_target = -1;
				}
#endif

				/* We have initialized the shared state so wake up others. */
				BitmapDoneInitializingSharedState(pstate);
			}

			/* Allocate a private iterator and attach the shared state to it */
			node->shared_tbmiterator = shared_tbmiterator =
				tbm_attach_shared_iterate(dsa, pstate->tbmiterator);
			node->tbmres = tbmres = NULL;

#ifdef USE_PREFETCH
			if (node->prefetch_maximum > 0)
			{
				node->shared_prefetch_iterator =
					tbm_attach_shared_iterate(dsa, pstate->prefetch_iterator);
			}
#endif							/* USE_PREFETCH */
		}
		node->initialized = true;
	}

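	/*
	 * Main loop: when tbmres is NULL, advance to the next page of bitmap
	 * results; decide whether the heap page needs to be read at all; then
	 * return matching tuples one at a time until the page is exhausted.
	 */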
	for (;;)
	{
		bool		skip_fetch;

		CHECK_FOR_INTERRUPTS();

		/*
		 * Get next page of results if needed
		 */
		if (tbmres == NULL)
		{
			if (!pstate)
				node->tbmres = tbmres = tbm_iterate(tbmiterator);
			else
				node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator);
			if (tbmres == NULL)
			{
				/* no more entries in the bitmap */
				break;
			}

			BitmapAdjustPrefetchIterator(node, tbmres);

			/*
			 * We can skip fetching the heap page if we don't need any fields
			 * from the heap, and the bitmap entries don't need rechecking,
			 * and all tuples on the page are visible to our transaction.
			 *
			 * XXX: It's a layering violation that we do these checks above
			 * tableam; they should probably be moved below it at some point.
			 */
			skip_fetch = (node->can_skip_fetch &&
						  !tbmres->recheck &&
						  VM_ALL_VISIBLE(node->ss.ss_currentRelation,
										 tbmres->blockno,
										 &node->vmbuffer));

			if (skip_fetch)
			{
				/* can't be lossy in the skip_fetch case */
				Assert(tbmres->ntuples >= 0);

				/*
				 * The number of tuples on this page is put into
				 * node->return_empty_tuples.
				 */
				node->return_empty_tuples = tbmres->ntuples;
			}
			else if (!table_scan_bitmap_next_block(scan, tbmres))
			{
				/* AM doesn't think this block is valid, skip */
				continue;
			}

			if (tbmres->ntuples >= 0)
				node->exact_pages++;
			else
				node->lossy_pages++;

			/* Adjust the prefetch target */
			BitmapAdjustPrefetchTarget(node);
		}
		else
		{
			/*
			 * Continuing in previously obtained page.
			 */

#ifdef USE_PREFETCH

			/*
			 * Try to prefetch at least a few pages even before we get to the
			 * second page if we don't stop reading after the first tuple.
			 */
			if (!pstate)
			{
				if (node->prefetch_target < node->prefetch_maximum)
					node->prefetch_target++;
			}
			else if (pstate->prefetch_target < node->prefetch_maximum)
			{
				/* take spinlock while updating shared state */
				SpinLockAcquire(&pstate->mutex);
				if (pstate->prefetch_target < node->prefetch_maximum)
					pstate->prefetch_target++;
				SpinLockRelease(&pstate->mutex);
			}
#endif							/* USE_PREFETCH */
		}

		/*
		 * We issue prefetch requests *after* fetching the current page to try
		 * to avoid having prefetching interfere with the main I/O. Also, this
		 * should happen only when we have determined there is still something
		 * to do on the current page, else we may uselessly prefetch the same
		 * page we are just about to request for real.
		 *
		 * XXX: It's a layering violation that we do these checks above
		 * tableam; they should probably be moved below it at some point.
		 */
		BitmapPrefetch(node, scan);

		if (node->return_empty_tuples > 0)
		{
			/*
			 * If we don't have to fetch the tuple, just return nulls.
			 */
			ExecStoreAllNullTuple(slot);

			if (--node->return_empty_tuples == 0)
			{
				/* no more tuples to return in the next round */
				node->tbmres = tbmres = NULL;
			}
		}
		else
		{
			/*
			 * Attempt to fetch tuple from AM.
			 */
			if (!table_scan_bitmap_next_tuple(scan, tbmres, slot))
			{
				/* nothing more to look at on this page */
				node->tbmres = tbmres = NULL;
				continue;
			}

			/*
			 * If we are using lossy info, we have to recheck the qual
			 * conditions at every tuple.
			 */
			if (tbmres->recheck)
			{
				econtext->ecxt_scantuple = slot;
				if (!ExecQualAndReset(node->bitmapqualorig, econtext))
				{
					/* Fails recheck, so drop it and loop back for another */
					InstrCountFiltered2(node, 1);
					ExecClearTuple(slot);
					continue;
				}
			}
		}

		/* OK to return this tuple */
		return slot;
	}

	/*
	 * if we get here it means we are at the end of the scan.
	 */
	return ExecClearTuple(slot);
}

/*
 *	BitmapDoneInitializingSharedState - Shared state is initialized
 *
 *	By this time the leader has already populated the TBM and initialized the
 *	shared state so wake up other processes.
 */
static inline void
BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate)
{
	SpinLockAcquire(&pstate->mutex);
	pstate->state = BM_FINISHED;
	SpinLockRelease(&pstate->mutex);
	ConditionVariableBroadcast(&pstate->cv);
}

/*
 *	BitmapAdjustPrefetchIterator - Adjust the prefetch iterator
 */
static inline void
BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
							 TBMIterateResult *tbmres)
{
#ifdef USE_PREFETCH
	ParallelBitmapHeapState *pstate = node->pstate;

	if (pstate == NULL)
	{
		TBMIterator *prefetch_iterator = node->prefetch_iterator;

		if (node->prefetch_pages > 0)
		{
			/* The main iterator has closed the distance by one page */
			node->prefetch_pages--;
		}
		else if (prefetch_iterator)
		{
			/* Do not let the prefetch iterator get behind the main one */
			TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);

			if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno)
				elog(ERROR, "prefetch and main iterators are out of sync");
		}
		return;
	}

	if (node->prefetch_maximum > 0)
	{
		TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;

		SpinLockAcquire(&pstate->mutex);
		if (pstate->prefetch_pages > 0)
		{
			pstate->prefetch_pages--;
			SpinLockRelease(&pstate->mutex);
		}
		else
		{
			/* Release the mutex before iterating */
			SpinLockRelease(&pstate->mutex);

			/*
			 * In shared mode, we cannot ensure that the current blockno of
			 * the main iterator and that of the prefetch iterator are the
			 * same.  It's possible that whatever blockno we are prefetching
			 * will be processed by another process.  Therefore, we don't
			 * validate the blockno here as we do in the non-parallel case.
			 */
			if (prefetch_iterator)
				tbm_shared_iterate(prefetch_iterator);
		}
	}
#endif							/* USE_PREFETCH */
}

/*
 * BitmapAdjustPrefetchTarget - Adjust the prefetch target
 *
 * Increase prefetch target if it's not yet at the max.  Note that
 * we will increase it to zero after fetching the very first
 * page/tuple, then to one after the second tuple is fetched, then
 * it doubles as later pages are fetched.
 */
static inline void
BitmapAdjustPrefetchTarget(BitmapHeapScanState *node)
{
#ifdef USE_PREFETCH
	ParallelBitmapHeapState *pstate = node->pstate;

	if (pstate == NULL)
	{
		if (node->prefetch_target >= node->prefetch_maximum)
			 /* don't increase any further */ ;
		else if (node->prefetch_target >= node->prefetch_maximum / 2)
			node->prefetch_target = node->prefetch_maximum;
		else if (node->prefetch_target > 0)
			node->prefetch_target *= 2;
		else
			node->prefetch_target++;
		return;
	}

	/* Do an unlocked check first to save spinlock acquisitions. */
	if (pstate->prefetch_target < node->prefetch_maximum)
	{
		SpinLockAcquire(&pstate->mutex);
		if (pstate->prefetch_target >= node->prefetch_maximum)
			 /* don't increase any further */ ;
		else if (pstate->prefetch_target >= node->prefetch_maximum / 2)
			pstate->prefetch_target = node->prefetch_maximum;
		else if (pstate->prefetch_target > 0)
			pstate->prefetch_target *= 2;
		else
			pstate->prefetch_target++;
		SpinLockRelease(&pstate->mutex);
	}
#endif							/* USE_PREFETCH */
}

/*
 * BitmapPrefetch - Prefetch, if prefetch_pages are behind prefetch_target
 */
static inline void
BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan)
{
#ifdef USE_PREFETCH
	ParallelBitmapHeapState *pstate = node->pstate;

	if (pstate == NULL)
	{
		TBMIterator *prefetch_iterator = node->prefetch_iterator;

		if (prefetch_iterator)
		{
			while (node->prefetch_pages < node->prefetch_target)
			{
				TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
				bool		skip_fetch;

				if (tbmpre == NULL)
				{
					/* No more pages to prefetch */
					tbm_end_iterate(prefetch_iterator);
					node->prefetch_iterator = NULL;
					break;
				}
				node->prefetch_pages++;

				/*
				 * If we expect not to have to actually read this heap page,
				 * skip this prefetch call, but continue to run the prefetch
				 * logic normally.  (Would it be better not to increment
				 * prefetch_pages?)
				 *
				 * This depends on the assumption that the index AM will
				 * report the same recheck flag for this future heap page as
				 * it did for the current heap page; which is not a certainty
				 * but is true in many cases.
				 */
				skip_fetch = (node->can_skip_fetch &&
							  (node->tbmres ? !node->tbmres->recheck : false) &&
							  VM_ALL_VISIBLE(node->ss.ss_currentRelation,
											 tbmpre->blockno,
											 &node->pvmbuffer));

				if (!skip_fetch)
					PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
			}
		}

		return;
	}

	if (pstate->prefetch_pages < pstate->prefetch_target)
	{
		TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;

		if (prefetch_iterator)
		{
			while (1)
			{
				TBMIterateResult *tbmpre;
				bool		do_prefetch = false;
				bool		skip_fetch;

				/*
				 * Recheck under the mutex. If some other process has already
				 * done enough prefetching then we need not do anything.
				 */
				SpinLockAcquire(&pstate->mutex);
				if (pstate->prefetch_pages < pstate->prefetch_target)
				{
					pstate->prefetch_pages++;
					do_prefetch = true;
				}
				SpinLockRelease(&pstate->mutex);

				if (!do_prefetch)
					return;

				tbmpre = tbm_shared_iterate(prefetch_iterator);
				if (tbmpre == NULL)
				{
					/* No more pages to prefetch */
					tbm_end_shared_iterate(prefetch_iterator);
					node->shared_prefetch_iterator = NULL;
					break;
				}

				/* As above, skip prefetch if we expect not to need page */
				skip_fetch = (node->can_skip_fetch &&
							  (node->tbmres ? !node->tbmres->recheck : false) &&
							  VM_ALL_VISIBLE(node->ss.ss_currentRelation,
											 tbmpre->blockno,
											 &node->pvmbuffer));

				if (!skip_fetch)
					PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
			}
		}
	}
#endif							/* USE_PREFETCH */
}

/*
 * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
 */
static bool
BitmapHeapRecheck(BitmapHeapScanState *node, TupleTableSlot *slot)
{
	ExprContext *econtext;

	/*
	 * extract necessary information from the bitmap heap scan node
	 */
	econtext = node->ss.ps.ps_ExprContext;

	/* Does the tuple meet the original qual conditions? */
	econtext->ecxt_scantuple = slot;
	return ExecQualAndReset(node->bitmapqualorig, econtext);
}

/* ----------------------------------------------------------------
 *		ExecBitmapHeapScan(node)
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
ExecBitmapHeapScan(PlanState *pstate)
{
	BitmapHeapScanState *node = castNode(BitmapHeapScanState, pstate);

	return ExecScan(&node->ss,
					(ExecScanAccessMtd) BitmapHeapNext,
					(ExecScanRecheckMtd) BitmapHeapRecheck);
}

/* ----------------------------------------------------------------
 *		ExecReScanBitmapHeapScan(node)
 * ----------------------------------------------------------------
 */
void
ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
{
	PlanState  *outerPlan = outerPlanState(node);

	/* rescan to release any page pin */
	table_rescan(node->ss.ss_currentScanDesc, NULL);

	/* release bitmaps and buffers if any */
	if (node->tbmiterator)
		tbm_end_iterate(node->tbmiterator);
	if (node->prefetch_iterator)
		tbm_end_iterate(node->prefetch_iterator);
	if (node->shared_tbmiterator)
		tbm_end_shared_iterate(node->shared_tbmiterator);
	if (node->shared_prefetch_iterator)
		tbm_end_shared_iterate(node->shared_prefetch_iterator);
	if (node->tbm)
		tbm_free(node->tbm);
	if (node->vmbuffer != InvalidBuffer)
		ReleaseBuffer(node->vmbuffer);
	if (node->pvmbuffer != InvalidBuffer)
		ReleaseBuffer(node->pvmbuffer);
	node->tbm = NULL;
	node->tbmiterator = NULL;
	node->tbmres = NULL;
	node->prefetch_iterator = NULL;
	node->initialized = false;
	node->shared_tbmiterator = NULL;
	node->shared_prefetch_iterator = NULL;
	node->vmbuffer = InvalidBuffer;
	node->pvmbuffer = InvalidBuffer;

	ExecScanReScan(&node->ss);

	/*
	 * if chgParam of the subnode is not null then the plan will be
	 * re-scanned by the first ExecProcNode.
	 */
	if (outerPlan->chgParam == NULL)
		ExecReScan(outerPlan);
}

/* ----------------------------------------------------------------
 *		ExecEndBitmapHeapScan
 * ----------------------------------------------------------------
 */
void
ExecEndBitmapHeapScan(BitmapHeapScanState *node)
{
	TableScanDesc scanDesc;

	/*
	 * extract information from the node
	 */
	scanDesc = node->ss.ss_currentScanDesc;

	/*
	 * Free the exprcontext
	 */
	ExecFreeExprContext(&node->ss.ps);

	/*
	 * clear out tuple table slots
	 */
	if (node->ss.ps.ps_ResultTupleSlot)
		ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
	ExecClearTuple(node->ss.ss_ScanTupleSlot);

	/*
	 * close down subplans
	 */
	ExecEndNode(outerPlanState(node));

	/*
	 * release bitmaps and buffers if any
	 */
	if (node->tbmiterator)
		tbm_end_iterate(node->tbmiterator);
	if (node->prefetch_iterator)
		tbm_end_iterate(node->prefetch_iterator);
	if (node->tbm)
		tbm_free(node->tbm);
	if (node->shared_tbmiterator)
		tbm_end_shared_iterate(node->shared_tbmiterator);
	if (node->shared_prefetch_iterator)
		tbm_end_shared_iterate(node->shared_prefetch_iterator);
	if (node->vmbuffer != InvalidBuffer)
		ReleaseBuffer(node->vmbuffer);
	if (node->pvmbuffer != InvalidBuffer)
		ReleaseBuffer(node->pvmbuffer);

	/*
	 * close heap scan
	 */
	table_endscan(scanDesc);
}

/* ----------------------------------------------------------------
 *		ExecInitBitmapHeapScan
 *
 *		Initializes the scan's state information.
 * ----------------------------------------------------------------
 */
BitmapHeapScanState *
ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
{
	BitmapHeapScanState *scanstate;
	Relation	currentRelation;
	int			io_concurrency;

	/* check for unsupported flags */
	Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));

	/*
	 * Assert caller didn't ask for an unsafe snapshot --- see comments at
	 * head of file.
	 */
	Assert(IsMVCCSnapshot(estate->es_snapshot));

	/*
	 * create state structure
	 */
	scanstate = makeNode(BitmapHeapScanState);
	scanstate->ss.ps.plan = (Plan *) node;
	scanstate->ss.ps.state = estate;
	scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan;

	scanstate->tbm = NULL;
	scanstate->tbmiterator = NULL;
	scanstate->tbmres = NULL;
	scanstate->return_empty_tuples = 0;
	scanstate->vmbuffer = InvalidBuffer;
	scanstate->pvmbuffer = InvalidBuffer;
	scanstate->exact_pages = 0;
	scanstate->lossy_pages = 0;
	scanstate->prefetch_iterator = NULL;
	scanstate->prefetch_pages = 0;
	scanstate->prefetch_target = 0;
	/* may be updated below */
	scanstate->prefetch_maximum = target_prefetch_pages;
	scanstate->pscan_len = 0;
	scanstate->initialized = false;
	scanstate->shared_tbmiterator = NULL;
	scanstate->shared_prefetch_iterator = NULL;
	scanstate->pstate = NULL;

	/*
	 * We can potentially skip fetching heap pages if we do not need any
	 * columns of the table, either for checking non-indexable quals or for
	 * returning data.  This test is a bit simplistic, as it checks the
	 * stronger condition that there's no qual or return tlist at all.  But in
	 * most cases it's probably not worth working harder than that.
	 */
	scanstate->can_skip_fetch = (node->scan.plan.qual == NIL &&
								 node->scan.plan.targetlist == NIL);
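	/*
	 * For example (a sketch): a query such as "SELECT count(*) FROM t WHERE
	 * a = 42" executed as a bitmap scan on an index over a leaves this node
	 * with no extra qual and no targetlist entries, so all-visible pages
	 * whose bitmap entries need no recheck can be counted without reading
	 * the heap at all.
	 */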

	/*
	 * Miscellaneous initialization
	 *
	 * create expression context for node
	 */
	ExecAssignExprContext(estate, &scanstate->ss.ps);

	/*
	 * open the scan relation
	 */
	currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);

	/*
	 * initialize child nodes
	 */
	outerPlanState(scanstate) = ExecInitNode(outerPlan(node), estate, eflags);

	/*
	 * get the scan type from the relation descriptor.
	 */
	ExecInitScanTupleSlot(estate, &scanstate->ss,
						  RelationGetDescr(currentRelation),
						  table_slot_callbacks(currentRelation));

	/*
	 * Initialize result type and projection.
	 */
	ExecInitResultTypeTL(&scanstate->ss.ps);
	ExecAssignScanProjectionInfo(&scanstate->ss);

	/*
	 * initialize child expressions
	 */
	scanstate->ss.ps.qual =
		ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate);
	scanstate->bitmapqualorig =
		ExecInitQual(node->bitmapqualorig, (PlanState *) scanstate);

	/*
	 * Determine the maximum for prefetch_target.  If the tablespace has a
	 * specific IO concurrency set, use that to compute the corresponding
	 * maximum value; otherwise, we already initialized to the value computed
	 * by the GUC machinery.
	 */
	io_concurrency =
		get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace);
	if (io_concurrency != effective_io_concurrency)
	{
		double		maximum;

		if (ComputeIoConcurrency(io_concurrency, &maximum))
			scanstate->prefetch_maximum = rint(maximum);
	}

	scanstate->ss.ss_currentRelation = currentRelation;

	scanstate->ss.ss_currentScanDesc = table_beginscan_bm(currentRelation,
														  estate->es_snapshot,
														  0,
														  NULL);

	/*
	 * all done.
	 */
	return scanstate;
}

/*----------------
 *		BitmapShouldInitializeSharedState
 *
 *		The first process to come here and find the state set to BM_INITIAL
 *		will become the leader for the parallel bitmap scan and will be
 *		responsible for populating the TIDBitmap.  The other processes will
 *		be blocked by the condition variable until the leader wakes them up.
 * ---------------
 */
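/*
 * A sketch of the shared state machine, as driven by this function and
 * BitmapDoneInitializingSharedState:
 *
 *   BM_INITIAL    --(first arriver becomes leader)-->  BM_INPROGRESS
 *   BM_INPROGRESS --(leader finishes building TBM)-->  BM_FINISHED
 *
 * Non-leaders sleep on the condition variable while the state is
 * BM_INPROGRESS and return false once it reaches BM_FINISHED.
 */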
static bool
BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate)
{
	SharedBitmapState state;

	while (1)
	{
		SpinLockAcquire(&pstate->mutex);
		state = pstate->state;
		if (pstate->state == BM_INITIAL)
			pstate->state = BM_INPROGRESS;
		SpinLockRelease(&pstate->mutex);

		/* Exit if bitmap is done, or if we're the leader. */
		if (state != BM_INPROGRESS)
			break;

		/* Wait for the leader to wake us up. */
		ConditionVariableSleep(&pstate->cv, WAIT_EVENT_PARALLEL_BITMAP_SCAN);
	}

	ConditionVariableCancelSleep();

	return (state == BM_INITIAL);
}

/* ----------------------------------------------------------------
 *		ExecBitmapHeapEstimate
 *
 *		Compute the amount of space we'll need in the parallel
 *		query DSM, and inform pcxt->estimator about our needs.
 * ----------------------------------------------------------------
 */
void
ExecBitmapHeapEstimate(BitmapHeapScanState *node,
					   ParallelContext *pcxt)
{
	EState	   *estate = node->ss.ps.state;

	node->pscan_len = add_size(offsetof(ParallelBitmapHeapState,
										phs_snapshot_data),
							   EstimateSnapshotSpace(estate->es_snapshot));

	shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len);
	shm_toc_estimate_keys(&pcxt->estimator, 1);
}
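
/*
 * A sketch of the chunk estimated above: a ParallelBitmapHeapState header
 * followed by the serialized snapshot in the trailing phs_snapshot_data
 * array, i.e. offsetof(ParallelBitmapHeapState, phs_snapshot_data) plus
 * EstimateSnapshotSpace(snapshot) bytes in total.
 */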

/* ----------------------------------------------------------------
 *		ExecBitmapHeapInitializeDSM
 *
 *		Set up a parallel bitmap heap scan descriptor.
 * ----------------------------------------------------------------
 */
void
ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node,
							ParallelContext *pcxt)
{
	ParallelBitmapHeapState *pstate;
	EState	   *estate = node->ss.ps.state;
	dsa_area   *dsa = node->ss.ps.state->es_query_dsa;

	/* If there's no DSA, there are no workers; initialize nothing. */
	if (dsa == NULL)
		return;

	pstate = shm_toc_allocate(pcxt->toc, node->pscan_len);

	pstate->tbmiterator = 0;
	pstate->prefetch_iterator = 0;

	/* Initialize the mutex */
	SpinLockInit(&pstate->mutex);
	pstate->prefetch_pages = 0;
	pstate->prefetch_target = 0;
	pstate->state = BM_INITIAL;

	ConditionVariableInit(&pstate->cv);
	SerializeSnapshot(estate->es_snapshot, pstate->phs_snapshot_data);

	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pstate);
	node->pstate = pstate;
}

/* ----------------------------------------------------------------
 *		ExecBitmapHeapReInitializeDSM
 *
 *		Reset shared state before beginning a fresh scan.
 * ----------------------------------------------------------------
 */
void
ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node,
							  ParallelContext *pcxt)
{
	ParallelBitmapHeapState *pstate = node->pstate;
	dsa_area   *dsa = node->ss.ps.state->es_query_dsa;

	/* If there's no DSA, there are no workers; do nothing. */
	if (dsa == NULL)
		return;

	pstate->state = BM_INITIAL;

	if (DsaPointerIsValid(pstate->tbmiterator))
		tbm_free_shared_area(dsa, pstate->tbmiterator);

	if (DsaPointerIsValid(pstate->prefetch_iterator))
		tbm_free_shared_area(dsa, pstate->prefetch_iterator);

	pstate->tbmiterator = InvalidDsaPointer;
	pstate->prefetch_iterator = InvalidDsaPointer;
}

/* ----------------------------------------------------------------
 *		ExecBitmapHeapInitializeWorker
 *
 *		Copy relevant information from TOC into planstate.
 * ----------------------------------------------------------------
 */
void
ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node,
							   ParallelWorkerContext *pwcxt)
{
	ParallelBitmapHeapState *pstate;
	Snapshot	snapshot;

	Assert(node->ss.ps.state->es_query_dsa != NULL);

	pstate = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
	node->pstate = pstate;

	snapshot = RestoreSnapshot(pstate->phs_snapshot_data);
	table_scan_update_snapshot(node->ss.ss_currentScanDesc, snapshot);
}