/*-------------------------------------------------------------------------
 *
 * nodeBitmapHeapscan.c
 *	  Routines to support bitmapped scans of relations
 *
 * NOTE: it is critical that this plan type only be used with MVCC-compliant
 * snapshots (ie, regular snapshots, not SnapshotAny or one of the other
 * special snapshots).  The reason is that since index and heap scans are
 * decoupled, there can be no assurance that the index tuple prompting a
 * visit to a particular heap TID still exists when the visit is made.
 * Therefore the tuple might not exist anymore either (which is OK because
 * heap_fetch will cope) --- but worse, the tuple slot could have been
 * re-used for a newer tuple.  With an MVCC snapshot the newer tuple is
 * certain to fail the time qual and so it will not be mistakenly returned,
 * but with anything else we might return a tuple that doesn't meet the
 * required index qual conditions.
 *
 *
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/executor/nodeBitmapHeapscan.c
 *
 *-------------------------------------------------------------------------
 */
/*
 * INTERFACE ROUTINES
 *		ExecBitmapHeapScan			scans a relation using bitmap info
 *		ExecBitmapHeapNext			workhorse for above
 *		ExecInitBitmapHeapScan		creates and initializes state info.
 *		ExecReScanBitmapHeapScan	prepares to rescan the plan.
 *		ExecEndBitmapHeapScan		releases all storage.
 */
#include "postgres.h"

#include <math.h>

#include "access/relscan.h"
#include "access/transam.h"
#include "executor/execdebug.h"
#include "executor/nodeBitmapHeapscan.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "storage/predicate.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/spccache.h"
#include "utils/snapmgr.h"
#include "utils/tqual.h"


static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
static void bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres);
static inline void BitmapDoneInitializingSharedState(
								  ParallelBitmapHeapState *pstate);
static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
							 TBMIterateResult *tbmres);
static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node);
static inline void BitmapPrefetch(BitmapHeapScanState *node,
			   HeapScanDesc scan);
static bool BitmapShouldInitializeSharedState(
								  ParallelBitmapHeapState *pstate);


/* ----------------------------------------------------------------
 *		BitmapHeapNext
 *
 *		Retrieve next tuple from the BitmapHeapScan node's currentRelation
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
BitmapHeapNext(BitmapHeapScanState *node)
{
	ExprContext *econtext;
	HeapScanDesc scan;
	TIDBitmap  *tbm;
	TBMIterator *tbmiterator = NULL;
	TBMSharedIterator *shared_tbmiterator = NULL;
	TBMIterateResult *tbmres;
	OffsetNumber targoffset;
	TupleTableSlot *slot;
	ParallelBitmapHeapState *pstate = node->pstate;
	dsa_area   *dsa = node->ss.ps.state->es_query_dsa;

	/*
	 * extract necessary information from index scan node
	 */
	econtext = node->ss.ps.ps_ExprContext;
	slot = node->ss.ss_ScanTupleSlot;
	scan = node->ss.ss_currentScanDesc;
	tbm = node->tbm;
	if (pstate == NULL)
		tbmiterator = node->tbmiterator;
	else
		shared_tbmiterator = node->shared_tbmiterator;
	tbmres = node->tbmres;

	/*
	 * If we haven't yet performed the underlying index scan, do it, and begin
	 * the iteration over the bitmap.
	 *
	 * For prefetching, we use *two* iterators, one for the pages we are
	 * actually scanning and another that runs ahead of the first for
	 * prefetching.  node->prefetch_pages tracks exactly how many pages ahead
	 * the prefetch iterator is.  Also, node->prefetch_target tracks the
	 * desired prefetch distance, which starts small and increases up to
	 * node->prefetch_maximum.  This is to avoid doing a lot of prefetching in
	 * a scan that stops after a few tuples because of a LIMIT.
	 */
	if (!node->initialized)
	{
		if (!pstate)
		{
			tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));

			if (!tbm || !IsA(tbm, TIDBitmap))
				elog(ERROR, "unrecognized result from subplan");

			node->tbm = tbm;
			node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
			node->tbmres = tbmres = NULL;

#ifdef USE_PREFETCH
			if (node->prefetch_maximum > 0)
			{
				node->prefetch_iterator = tbm_begin_iterate(tbm);
				node->prefetch_pages = 0;
				node->prefetch_target = -1;
			}
#endif							/* USE_PREFETCH */
		}
		else
		{
			/*
			 * The leader will immediately come out of the function, but
			 * others will be blocked until the leader populates the TBM and
			 * wakes them up.
			 */
			if (BitmapShouldInitializeSharedState(pstate))
			{
				tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
				if (!tbm || !IsA(tbm, TIDBitmap))
					elog(ERROR, "unrecognized result from subplan");

				node->tbm = tbm;

				/*
				 * Prepare to iterate over the TBM. This will return the
				 * dsa_pointer of the iterator state which will be used by
				 * multiple processes to iterate jointly.
				 */
				pstate->tbmiterator = tbm_prepare_shared_iterate(tbm);
#ifdef USE_PREFETCH
				if (node->prefetch_maximum > 0)
				{
					pstate->prefetch_iterator =
						tbm_prepare_shared_iterate(tbm);

					/*
					 * We don't need the mutex here as we haven't yet woken up
					 * the others.
					 */
					pstate->prefetch_pages = 0;
					pstate->prefetch_target = -1;
				}
#endif

				/* We have initialized the shared state so wake up others. */
				BitmapDoneInitializingSharedState(pstate);
			}

			/* Allocate a private iterator and attach the shared state to it */
			node->shared_tbmiterator = shared_tbmiterator =
				tbm_attach_shared_iterate(dsa, pstate->tbmiterator);
			node->tbmres = tbmres = NULL;

#ifdef USE_PREFETCH
			if (node->prefetch_maximum > 0)
			{
				node->shared_prefetch_iterator =
					tbm_attach_shared_iterate(dsa, pstate->prefetch_iterator);
			}
#endif							/* USE_PREFETCH */
		}
		node->initialized = true;
	}

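	/*
	 * Main loop: fetch the next bitmap page whenever the current one is
	 * exhausted, then return its visible tuples one at a time.
	 */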
	for (;;)
	{
		Page		dp;
		ItemId		lp;

		CHECK_FOR_INTERRUPTS();

		/*
		 * Get next page of results if needed
		 */
		if (tbmres == NULL)
		{
			if (!pstate)
				node->tbmres = tbmres = tbm_iterate(tbmiterator);
			else
				node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator);
			if (tbmres == NULL)
			{
				/* no more entries in the bitmap */
				break;
			}

			BitmapAdjustPrefetchIterator(node, tbmres);

			/*
			 * Ignore any claimed entries past what we think is the end of the
			 * relation.  (This is probably not necessary given that we got at
			 * least AccessShareLock on the table before performing any of the
			 * indexscans, but let's be safe.)
			 */
			if (tbmres->blockno >= scan->rs_nblocks)
			{
				node->tbmres = tbmres = NULL;
				continue;
			}

			/*
			 * Fetch the current heap page and identify candidate tuples.
			 */
			bitgetpage(scan, tbmres);

			if (tbmres->ntuples >= 0)
				node->exact_pages++;
			else
				node->lossy_pages++;

			/*
			 * Set rs_cindex to first slot to examine
			 */
			scan->rs_cindex = 0;

			/* Adjust the prefetch target */
			BitmapAdjustPrefetchTarget(node);
		}
		else
		{
			/*
			 * Continuing in previously obtained page; advance rs_cindex
			 */
			scan->rs_cindex++;

#ifdef USE_PREFETCH

			/*
			 * Try to prefetch at least a few pages even before we get to the
			 * second page if we don't stop reading after the first tuple.
			 */
			if (!pstate)
			{
				if (node->prefetch_target < node->prefetch_maximum)
					node->prefetch_target++;
			}
			else if (pstate->prefetch_target < node->prefetch_maximum)
			{
				/* take spinlock while updating shared state */
				SpinLockAcquire(&pstate->mutex);
				if (pstate->prefetch_target < node->prefetch_maximum)
					pstate->prefetch_target++;
				SpinLockRelease(&pstate->mutex);
			}
#endif							/* USE_PREFETCH */
		}

		/*
		 * Out of range?  If so, nothing more to look at on this page
		 */
		if (scan->rs_cindex < 0 || scan->rs_cindex >= scan->rs_ntuples)
		{
			node->tbmres = tbmres = NULL;
			continue;
		}

		/*
		 * We issue prefetch requests *after* fetching the current page to try
		 * to avoid having prefetching interfere with the main I/O. Also, this
		 * should happen only when we have determined there is still something
		 * to do on the current page, else we may uselessly prefetch the same
		 * page we are just about to request for real.
		 */
		BitmapPrefetch(node, scan);

		/*
		 * Okay to fetch the tuple
		 */
		targoffset = scan->rs_vistuples[scan->rs_cindex];
		dp = (Page) BufferGetPage(scan->rs_cbuf);
		lp = PageGetItemId(dp, targoffset);
		Assert(ItemIdIsNormal(lp));

		scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
		scan->rs_ctup.t_len = ItemIdGetLength(lp);
		scan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
		ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset);

		pgstat_count_heap_fetch(scan->rs_rd);

		/*
		 * Set up the result slot to point to this tuple. Note that the slot
		 * acquires a pin on the buffer.
		 */
		ExecStoreTuple(&scan->rs_ctup,
					   slot,
					   scan->rs_cbuf,
					   false);

		/*
		 * If we are using lossy info, we have to recheck the qual conditions
		 * at every tuple.
		 */
		if (tbmres->recheck)
		{
			econtext->ecxt_scantuple = slot;
			ResetExprContext(econtext);

			if (!ExecQual(node->bitmapqualorig, econtext))
			{
				/* Fails recheck, so drop it and loop back for another */
				InstrCountFiltered2(node, 1);
				ExecClearTuple(slot);
				continue;
			}
		}

		/* OK to return this tuple */
		return slot;
	}

	/*
	 * if we get here it means we are at the end of the scan.
	 */
	return ExecClearTuple(slot);
}

/*
 * bitgetpage - subroutine for BitmapHeapNext()
 *
 * This routine reads and pins the specified page of the relation, then
 * builds an array indicating which tuples on the page are both potentially
 * interesting according to the bitmap, and visible according to the snapshot.
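 * On return, scan->rs_vistuples[] contains the page offsets of those tuples
 * and scan->rs_ntuples gives their count.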
 */
static void
bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
{
	BlockNumber page = tbmres->blockno;
	Buffer		buffer;
	Snapshot	snapshot;
	int			ntup;

	/*
	 * Acquire pin on the target heap page, trading in any pin we held before.
	 */
	Assert(page < scan->rs_nblocks);

	scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
										 scan->rs_rd,
										 page);
	buffer = scan->rs_cbuf;
	snapshot = scan->rs_snapshot;

	ntup = 0;

	/*
	 * Prune and repair fragmentation for the whole page, if possible.
	 */
	heap_page_prune_opt(scan->rs_rd, buffer);

	/*
	 * We must hold share lock on the buffer content while examining tuple
	 * visibility.  Afterwards, however, the tuples we have found to be
	 * visible are guaranteed good as long as we hold the buffer pin.
	 */
	LockBuffer(buffer, BUFFER_LOCK_SHARE);

	/*
	 * We need two separate strategies for lossy and non-lossy cases.
	 */
	if (tbmres->ntuples >= 0)
	{
		/*
		 * Bitmap is non-lossy, so we just look through the offsets listed in
		 * tbmres; but we have to follow any HOT chain starting at each such
		 * offset.
		 */
		int			curslot;

		for (curslot = 0; curslot < tbmres->ntuples; curslot++)
		{
			OffsetNumber offnum = tbmres->offsets[curslot];
			ItemPointerData tid;
			HeapTupleData heapTuple;

			ItemPointerSet(&tid, page, offnum);
			if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
									   &heapTuple, NULL, true))
				scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
		}
	}
	else
	{
		/*
		 * Bitmap is lossy, so we must examine each item pointer on the page.
		 * But we can ignore HOT chains, since we'll check each tuple anyway.
		 */
		Page		dp = (Page) BufferGetPage(buffer);
		OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
		OffsetNumber offnum;

		for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
		{
			ItemId		lp;
			HeapTupleData loctup;
			bool		valid;

			lp = PageGetItemId(dp, offnum);
			if (!ItemIdIsNormal(lp))
				continue;
			loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
			loctup.t_len = ItemIdGetLength(lp);
			loctup.t_tableOid = scan->rs_rd->rd_id;
			ItemPointerSet(&loctup.t_self, page, offnum);
			valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
			if (valid)
			{
				scan->rs_vistuples[ntup++] = offnum;
				PredicateLockTuple(scan->rs_rd, &loctup, snapshot);
			}
			CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
											buffer, snapshot);
		}
	}

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

	Assert(ntup <= MaxHeapTuplesPerPage);
	scan->rs_ntuples = ntup;
}

/*
 *	BitmapDoneInitializingSharedState - Shared state is initialized
 *
 *	By this time the leader has already populated the TBM and initialized the
 *	shared state so wake up other processes.
 */
static inline void
BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate)
{
	SpinLockAcquire(&pstate->mutex);
	pstate->state = BM_FINISHED;
	SpinLockRelease(&pstate->mutex);
	ConditionVariableBroadcast(&pstate->cv);
}

/*
 *	BitmapAdjustPrefetchIterator - Adjust the prefetch iterator
 */
static inline void
BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
							 TBMIterateResult *tbmres)
{
#ifdef USE_PREFETCH
	ParallelBitmapHeapState *pstate = node->pstate;

	if (pstate == NULL)
	{
		TBMIterator *prefetch_iterator = node->prefetch_iterator;

		if (node->prefetch_pages > 0)
		{
			/* The main iterator has closed the distance by one page */
			node->prefetch_pages--;
		}
		else if (prefetch_iterator)
		{
			/* Do not let the prefetch iterator get behind the main one */
			TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);

			if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno)
				elog(ERROR, "prefetch and main iterators are out of sync");
		}
		return;
	}

	if (node->prefetch_maximum > 0)
	{
		TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;

		SpinLockAcquire(&pstate->mutex);
		if (pstate->prefetch_pages > 0)
		{
			pstate->prefetch_pages--;
			SpinLockRelease(&pstate->mutex);
		}
		else
		{
			/* Release the mutex before iterating */
			SpinLockRelease(&pstate->mutex);

			/*
			 * In shared mode, we cannot ensure that the current blockno of
			 * the main iterator and that of the prefetch iterator are the
			 * same.  It's possible that whatever blockno we are prefetching
			 * will be processed by another process.  Therefore, we don't
			 * validate the blockno here as we do in the non-parallel case.
			 */
			if (prefetch_iterator)
				tbm_shared_iterate(prefetch_iterator);
		}
	}
#endif							/* USE_PREFETCH */
}

/*
 * BitmapAdjustPrefetchTarget - Adjust the prefetch target
 *
 * Increase prefetch target if it's not yet at the max.  Note that
 * we will increase it to zero after fetching the very first
 * page/tuple, then to one after the second tuple is fetched, then
 * it doubles as later pages are fetched.
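 * For example, with prefetch_maximum = 8 the target, starting from its
 * initial value of -1, advances 0, 1, 2, 4, 8 on successive calls here.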
 */
static inline void
BitmapAdjustPrefetchTarget(BitmapHeapScanState *node)
{
#ifdef USE_PREFETCH
	ParallelBitmapHeapState *pstate = node->pstate;

	if (pstate == NULL)
	{
		if (node->prefetch_target >= node->prefetch_maximum)
			 /* don't increase any further */ ;
		else if (node->prefetch_target >= node->prefetch_maximum / 2)
			node->prefetch_target = node->prefetch_maximum;
		else if (node->prefetch_target > 0)
			node->prefetch_target *= 2;
		else
			node->prefetch_target++;
		return;
	}

	/* Do an unlocked check first to save spinlock acquisitions. */
	if (pstate->prefetch_target < node->prefetch_maximum)
	{
		SpinLockAcquire(&pstate->mutex);
		if (pstate->prefetch_target >= node->prefetch_maximum)
			 /* don't increase any further */ ;
		else if (pstate->prefetch_target >= node->prefetch_maximum / 2)
			pstate->prefetch_target = node->prefetch_maximum;
		else if (pstate->prefetch_target > 0)
			pstate->prefetch_target *= 2;
		else
			pstate->prefetch_target++;
		SpinLockRelease(&pstate->mutex);
	}
#endif							/* USE_PREFETCH */
}

/*
 * BitmapPrefetch - Prefetch, if prefetch_pages are behind prefetch_target
 */
static inline void
BitmapPrefetch(BitmapHeapScanState *node, HeapScanDesc scan)
{
#ifdef USE_PREFETCH
	ParallelBitmapHeapState *pstate = node->pstate;

	if (pstate == NULL)
	{
		TBMIterator *prefetch_iterator = node->prefetch_iterator;

		if (prefetch_iterator)
		{
			while (node->prefetch_pages < node->prefetch_target)
			{
				TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);

				if (tbmpre == NULL)
				{
					/* No more pages to prefetch */
					tbm_end_iterate(prefetch_iterator);
					node->prefetch_iterator = NULL;
					break;
				}
				node->prefetch_pages++;
				PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
			}
		}

		return;
	}

	if (pstate->prefetch_pages < pstate->prefetch_target)
	{
		TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;

		if (prefetch_iterator)
		{
			while (1)
			{
				TBMIterateResult *tbmpre;
				bool		do_prefetch = false;

				/*
				 * Recheck under the mutex.  If some other process has already
				 * done enough prefetching then we need not do anything.
				 */
				SpinLockAcquire(&pstate->mutex);
				if (pstate->prefetch_pages < pstate->prefetch_target)
				{
					pstate->prefetch_pages++;
					do_prefetch = true;
				}
				SpinLockRelease(&pstate->mutex);

				if (!do_prefetch)
					return;

				tbmpre = tbm_shared_iterate(prefetch_iterator);
				if (tbmpre == NULL)
				{
					/* No more pages to prefetch */
					tbm_end_shared_iterate(prefetch_iterator);
					node->shared_prefetch_iterator = NULL;
					break;
				}

				PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
			}
		}
	}
#endif							/* USE_PREFETCH */
}

/*
 * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
 */
static bool
BitmapHeapRecheck(BitmapHeapScanState *node, TupleTableSlot *slot)
{
	ExprContext *econtext;

	/*
	 * extract necessary information from index scan node
	 */
	econtext = node->ss.ps.ps_ExprContext;

	/* Does the tuple meet the original qual conditions? */
	econtext->ecxt_scantuple = slot;

	ResetExprContext(econtext);

	return ExecQual(node->bitmapqualorig, econtext);
}

/* ----------------------------------------------------------------
 *		ExecBitmapHeapScan(node)
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
ExecBitmapHeapScan(PlanState *pstate)
{
	BitmapHeapScanState *node = castNode(BitmapHeapScanState, pstate);

	return ExecScan(&node->ss,
					(ExecScanAccessMtd) BitmapHeapNext,
					(ExecScanRecheckMtd) BitmapHeapRecheck);
}

/* ----------------------------------------------------------------
 *		ExecReScanBitmapHeapScan(node)
 * ----------------------------------------------------------------
 */
void
ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
{
	PlanState  *outerPlan = outerPlanState(node);

	/* rescan to release any page pin */
	heap_rescan(node->ss.ss_currentScanDesc, NULL);

	if (node->tbmiterator)
		tbm_end_iterate(node->tbmiterator);
	if (node->prefetch_iterator)
		tbm_end_iterate(node->prefetch_iterator);
	if (node->shared_tbmiterator)
		tbm_end_shared_iterate(node->shared_tbmiterator);
	if (node->shared_prefetch_iterator)
		tbm_end_shared_iterate(node->shared_prefetch_iterator);
	if (node->tbm)
		tbm_free(node->tbm);
	node->tbm = NULL;
	node->tbmiterator = NULL;
	node->tbmres = NULL;
	node->prefetch_iterator = NULL;
	node->initialized = false;
	node->shared_tbmiterator = NULL;
	node->shared_prefetch_iterator = NULL;

	ExecScanReScan(&node->ss);

	/*
	 * If chgParam of the subnode is not null, then the plan will be
	 * re-scanned by the first ExecProcNode.
	 */
	if (outerPlan->chgParam == NULL)
		ExecReScan(outerPlan);
}

/* ----------------------------------------------------------------
 *		ExecEndBitmapHeapScan
 * ----------------------------------------------------------------
 */
void
ExecEndBitmapHeapScan(BitmapHeapScanState *node)
{
	Relation	relation;
	HeapScanDesc scanDesc;

	/*
	 * extract information from the node
	 */
	relation = node->ss.ss_currentRelation;
	scanDesc = node->ss.ss_currentScanDesc;

	/*
	 * Free the exprcontext
	 */
	ExecFreeExprContext(&node->ss.ps);

	/*
	 * clear out tuple table slots
	 */
	ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
	ExecClearTuple(node->ss.ss_ScanTupleSlot);

	/*
	 * close down subplans
	 */
	ExecEndNode(outerPlanState(node));

	/*
	 * release bitmap if any
	 */
	if (node->tbmiterator)
		tbm_end_iterate(node->tbmiterator);
	if (node->prefetch_iterator)
		tbm_end_iterate(node->prefetch_iterator);
	if (node->tbm)
		tbm_free(node->tbm);
	if (node->shared_tbmiterator)
		tbm_end_shared_iterate(node->shared_tbmiterator);
	if (node->shared_prefetch_iterator)
		tbm_end_shared_iterate(node->shared_prefetch_iterator);

	/*
	 * close heap scan
	 */
	heap_endscan(scanDesc);

	/*
	 * close the heap relation.
	 */
	ExecCloseScanRelation(relation);
}

/* ----------------------------------------------------------------
 *		ExecInitBitmapHeapScan
 *
 *		Initializes the scan's state information.
 * ----------------------------------------------------------------
 */
BitmapHeapScanState *
ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
{
	BitmapHeapScanState *scanstate;
	Relation	currentRelation;
	int			io_concurrency;

	/* check for unsupported flags */
	Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));

	/*
	 * Assert caller didn't ask for an unsafe snapshot --- see comments at
	 * head of file.
	 */
	Assert(IsMVCCSnapshot(estate->es_snapshot));

	/*
	 * create state structure
	 */
	scanstate = makeNode(BitmapHeapScanState);
	scanstate->ss.ps.plan = (Plan *) node;
	scanstate->ss.ps.state = estate;
	scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan;

	scanstate->tbm = NULL;
	scanstate->tbmiterator = NULL;
	scanstate->tbmres = NULL;
	scanstate->exact_pages = 0;
	scanstate->lossy_pages = 0;
	scanstate->prefetch_iterator = NULL;
	scanstate->prefetch_pages = 0;
	scanstate->prefetch_target = 0;
	/* may be updated below */
	scanstate->prefetch_maximum = target_prefetch_pages;
	scanstate->pscan_len = 0;
	scanstate->initialized = false;
	scanstate->shared_tbmiterator = NULL;
	scanstate->pstate = NULL;

	/*
	 * Miscellaneous initialization
	 *
	 * create expression context for node
	 */
	ExecAssignExprContext(estate, &scanstate->ss.ps);

	/*
	 * initialize child expressions
	 */
	scanstate->ss.ps.qual =
		ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate);
	scanstate->bitmapqualorig =
		ExecInitQual(node->bitmapqualorig, (PlanState *) scanstate);

	/*
	 * tuple table initialization
	 */
	ExecInitResultTupleSlot(estate, &scanstate->ss.ps);
	ExecInitScanTupleSlot(estate, &scanstate->ss);

	/*
	 * open the base relation and acquire appropriate lock on it.
	 */
	currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);

	/*
	 * Determine the maximum for prefetch_target.  If the tablespace has a
	 * specific IO concurrency set, use that to compute the corresponding
	 * maximum value; otherwise, we already initialized to the value computed
	 * by the GUC machinery.
	 */
	io_concurrency =
		get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace);
	if (io_concurrency != effective_io_concurrency)
	{
		double		maximum;

		if (ComputeIoConcurrency(io_concurrency, &maximum))
			scanstate->prefetch_maximum = rint(maximum);
	}

	scanstate->ss.ss_currentRelation = currentRelation;

	/*
	 * Even though we aren't going to do a conventional seqscan, it is useful
	 * to create a HeapScanDesc --- most of the fields in it are usable.
	 */
	scanstate->ss.ss_currentScanDesc = heap_beginscan_bm(currentRelation,
														 estate->es_snapshot,
														 0,
														 NULL);

	/*
	 * get the scan type from the relation descriptor.
	 */
	ExecAssignScanType(&scanstate->ss, RelationGetDescr(currentRelation));

	/*
	 * Initialize result tuple type and projection info.
	 */
	ExecAssignResultTypeFromTL(&scanstate->ss.ps);
	ExecAssignScanProjectionInfo(&scanstate->ss);

	/*
	 * initialize child nodes
	 *
	 * We do this last because the child nodes will open indexscans on our
	 * relation's indexes, and we want to be sure we have acquired a lock on
	 * the relation first.
	 */
	outerPlanState(scanstate) = ExecInitNode(outerPlan(node), estate, eflags);

	/*
	 * all done.
	 */
	return scanstate;
}

/*----------------
 *		BitmapShouldInitializeSharedState
 *
 *		The first process to come here and see the state BM_INITIAL will
 *		become the leader for the parallel bitmap scan and will be
 *		responsible for populating the TIDBitmap.  The other processes will
 *		be blocked by the condition variable until the leader wakes them up.
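 *
 *		Returns true only in the process that should build the bitmap (the
 *		leader); false in all other processes.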
 * ---------------
 */
static bool
BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate)
{
	SharedBitmapState state;

	while (1)
	{
		SpinLockAcquire(&pstate->mutex);
		state = pstate->state;
		if (pstate->state == BM_INITIAL)
			pstate->state = BM_INPROGRESS;
		SpinLockRelease(&pstate->mutex);

		/* Exit if bitmap is done, or if we're the leader. */
		if (state != BM_INPROGRESS)
			break;

		/* Wait for the leader to wake us up. */
		ConditionVariableSleep(&pstate->cv, WAIT_EVENT_PARALLEL_BITMAP_SCAN);
	}

	ConditionVariableCancelSleep();

	return (state == BM_INITIAL);
}

/* ----------------------------------------------------------------
 *		ExecBitmapHeapEstimate
 *
 *		estimates the space required to serialize bitmap scan node.
 * ----------------------------------------------------------------
 */
void
ExecBitmapHeapEstimate(BitmapHeapScanState *node,
					   ParallelContext *pcxt)
{
	EState	   *estate = node->ss.ps.state;

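	/*
	 * The shared state is a ParallelBitmapHeapState whose flexible
	 * phs_snapshot_data array holds the serialized snapshot.
	 */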
	node->pscan_len = add_size(offsetof(ParallelBitmapHeapState,
										phs_snapshot_data),
							   EstimateSnapshotSpace(estate->es_snapshot));

	shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len);
	shm_toc_estimate_keys(&pcxt->estimator, 1);
}

/* ----------------------------------------------------------------
 *		ExecBitmapHeapInitializeDSM
 *
 *		Set up a parallel bitmap heap scan descriptor.
 * ----------------------------------------------------------------
 */
void
ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node,
							ParallelContext *pcxt)
{
	ParallelBitmapHeapState *pstate;
	EState	   *estate = node->ss.ps.state;
	dsa_area   *dsa = node->ss.ps.state->es_query_dsa;

	/* If there's no DSA, there are no workers; initialize nothing. */
	if (dsa == NULL)
		return;

	pstate = shm_toc_allocate(pcxt->toc, node->pscan_len);

	pstate->tbmiterator = 0;
	pstate->prefetch_iterator = 0;

	/* Initialize the mutex */
	SpinLockInit(&pstate->mutex);
	pstate->prefetch_pages = 0;
	pstate->prefetch_target = 0;
	pstate->state = BM_INITIAL;

	ConditionVariableInit(&pstate->cv);
	SerializeSnapshot(estate->es_snapshot, pstate->phs_snapshot_data);

	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pstate);
	node->pstate = pstate;
}

/* ----------------------------------------------------------------
 *		ExecBitmapHeapReInitializeDSM
 *
 *		Reset shared state before beginning a fresh scan.
 * ----------------------------------------------------------------
 */
void
ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node,
							  ParallelContext *pcxt)
{
	ParallelBitmapHeapState *pstate = node->pstate;
	dsa_area   *dsa = node->ss.ps.state->es_query_dsa;

	/* If there's no DSA, there are no workers; do nothing. */
	if (dsa == NULL)
		return;

	pstate->state = BM_INITIAL;

	if (DsaPointerIsValid(pstate->tbmiterator))
		tbm_free_shared_area(dsa, pstate->tbmiterator);

	if (DsaPointerIsValid(pstate->prefetch_iterator))
		tbm_free_shared_area(dsa, pstate->prefetch_iterator);

	pstate->tbmiterator = InvalidDsaPointer;
	pstate->prefetch_iterator = InvalidDsaPointer;
}

/* ----------------------------------------------------------------
 *		ExecBitmapHeapInitializeWorker
 *
 *		Copy relevant information from TOC into planstate.
 * ----------------------------------------------------------------
 */
void
ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node, shm_toc *toc)
{
	ParallelBitmapHeapState *pstate;
	Snapshot	snapshot;

	Assert(node->ss.ps.state->es_query_dsa != NULL);

	pstate = shm_toc_lookup(toc, node->ss.ps.plan->plan_node_id, false);
	node->pstate = pstate;

	snapshot = RestoreSnapshot(pstate->phs_snapshot_data);
	heap_update_snapshot(node->ss.ss_currentScanDesc, snapshot);
}