1 /*-------------------------------------------------------------------------
2  *
3  * nodeBitmapHeapscan.c
4  *	  Routines to support bitmapped scans of relations
5  *
6  * NOTE: it is critical that this plan type only be used with MVCC-compliant
7  * snapshots (ie, regular snapshots, not SnapshotAny or one of the other
8  * special snapshots).  The reason is that since index and heap scans are
9  * decoupled, there can be no assurance that the index tuple prompting a
10  * visit to a particular heap TID still exists when the visit is made.
11  * Therefore the tuple might not exist anymore either (which is OK because
12  * heap_fetch will cope) --- but worse, the tuple slot could have been
13  * re-used for a newer tuple.  With an MVCC snapshot the newer tuple is
14  * certain to fail the time qual and so it will not be mistakenly returned,
15  * but with anything else we might return a tuple that doesn't meet the
16  * required index qual conditions.
17  *
18  *
19  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
20  * Portions Copyright (c) 1994, Regents of the University of California
21  *
22  *
23  * IDENTIFICATION
24  *	  src/backend/executor/nodeBitmapHeapscan.c
25  *
26  *-------------------------------------------------------------------------
27  */
28 /*
29  * INTERFACE ROUTINES
30  *		ExecBitmapHeapScan			scans a relation using bitmap info
31  *		ExecBitmapHeapNext			workhorse for above
32  *		ExecInitBitmapHeapScan		creates and initializes state info.
33  *		ExecReScanBitmapHeapScan	prepares to rescan the plan.
34  *		ExecEndBitmapHeapScan		releases all storage.
35  */
36 #include "postgres.h"
37 
38 #include "access/relscan.h"
39 #include "access/transam.h"
40 #include "executor/execdebug.h"
41 #include "executor/nodeBitmapHeapscan.h"
42 #include "pgstat.h"
43 #include "storage/bufmgr.h"
44 #include "storage/predicate.h"
45 #include "utils/memutils.h"
46 #include "utils/rel.h"
47 #include "utils/spccache.h"
48 #include "utils/snapmgr.h"
49 #include "utils/tqual.h"
50 
51 
52 static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
53 static void bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres);
54 
55 
56 /* ----------------------------------------------------------------
57  *		BitmapHeapNext
58  *
59  *		Retrieve next tuple from the BitmapHeapScan node's currentRelation
60  * ----------------------------------------------------------------
61  */
62 static TupleTableSlot *
BitmapHeapNext(BitmapHeapScanState * node)63 BitmapHeapNext(BitmapHeapScanState *node)
64 {
65 	ExprContext *econtext;
66 	HeapScanDesc scan;
67 	TIDBitmap  *tbm;
68 	TBMIterator *tbmiterator;
69 	TBMIterateResult *tbmres;
70 
71 #ifdef USE_PREFETCH
72 	TBMIterator *prefetch_iterator;
73 #endif
74 	OffsetNumber targoffset;
75 	TupleTableSlot *slot;
76 
77 	/*
78 	 * extract necessary information from index scan node
79 	 */
80 	econtext = node->ss.ps.ps_ExprContext;
81 	slot = node->ss.ss_ScanTupleSlot;
82 	scan = node->ss.ss_currentScanDesc;
83 	tbm = node->tbm;
84 	tbmiterator = node->tbmiterator;
85 	tbmres = node->tbmres;
86 #ifdef USE_PREFETCH
87 	prefetch_iterator = node->prefetch_iterator;
88 #endif
89 
90 	/*
91 	 * If we haven't yet performed the underlying index scan, do it, and begin
92 	 * the iteration over the bitmap.
93 	 *
94 	 * For prefetching, we use *two* iterators, one for the pages we are
95 	 * actually scanning and another that runs ahead of the first for
96 	 * prefetching.  node->prefetch_pages tracks exactly how many pages ahead
97 	 * the prefetch iterator is.  Also, node->prefetch_target tracks the
98 	 * desired prefetch distance, which starts small and increases up to the
99 	 * node->prefetch_maximum.  This is to avoid doing a lot of prefetching in
100 	 * a scan that stops after a few tuples because of a LIMIT.
101 	 */
102 	if (tbm == NULL)
103 	{
104 		tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
105 
106 		if (!tbm || !IsA(tbm, TIDBitmap))
107 			elog(ERROR, "unrecognized result from subplan");
108 
109 		node->tbm = tbm;
110 		node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
111 		node->tbmres = tbmres = NULL;
112 
113 #ifdef USE_PREFETCH
114 		if (node->prefetch_maximum > 0)
115 		{
116 			node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm);
117 			node->prefetch_pages = 0;
118 			node->prefetch_target = -1;
119 		}
120 #endif   /* USE_PREFETCH */
121 	}
122 
123 	for (;;)
124 	{
125 		Page		dp;
126 		ItemId		lp;
127 
128 		/*
129 		 * Get next page of results if needed
130 		 */
131 		if (tbmres == NULL)
132 		{
133 			node->tbmres = tbmres = tbm_iterate(tbmiterator);
134 			if (tbmres == NULL)
135 			{
136 				/* no more entries in the bitmap */
137 				break;
138 			}
139 
140 #ifdef USE_PREFETCH
141 			if (node->prefetch_pages > 0)
142 			{
143 				/* The main iterator has closed the distance by one page */
144 				node->prefetch_pages--;
145 			}
146 			else if (prefetch_iterator)
147 			{
148 				/* Do not let the prefetch iterator get behind the main one */
149 				TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
150 
151 				if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno)
152 					elog(ERROR, "prefetch and main iterators are out of sync");
153 			}
154 #endif   /* USE_PREFETCH */
155 
156 			/*
157 			 * Ignore any claimed entries past what we think is the end of the
158 			 * relation.  (This is probably not necessary given that we got at
159 			 * least AccessShareLock on the table before performing any of the
160 			 * indexscans, but let's be safe.)
161 			 */
162 			if (tbmres->blockno >= scan->rs_nblocks)
163 			{
164 				node->tbmres = tbmres = NULL;
165 				continue;
166 			}
167 
168 			/*
169 			 * Fetch the current heap page and identify candidate tuples.
170 			 */
171 			bitgetpage(scan, tbmres);
172 
173 			if (tbmres->ntuples >= 0)
174 				node->exact_pages++;
175 			else
176 				node->lossy_pages++;
177 
178 			/*
179 			 * Set rs_cindex to first slot to examine
180 			 */
181 			scan->rs_cindex = 0;
182 
183 #ifdef USE_PREFETCH
184 
185 			/*
186 			 * Increase prefetch target if it's not yet at the max.  Note that
187 			 * we will increase it to zero after fetching the very first
188 			 * page/tuple, then to one after the second tuple is fetched, then
189 			 * it doubles as later pages are fetched.
190 			 */
191 			if (node->prefetch_target >= node->prefetch_maximum)
192 				 /* don't increase any further */ ;
193 			else if (node->prefetch_target >= node->prefetch_maximum / 2)
194 				node->prefetch_target = node->prefetch_maximum;
195 			else if (node->prefetch_target > 0)
196 				node->prefetch_target *= 2;
197 			else
198 				node->prefetch_target++;
199 #endif   /* USE_PREFETCH */
200 		}
201 		else
202 		{
203 			/*
204 			 * Continuing in previously obtained page; advance rs_cindex
205 			 */
206 			scan->rs_cindex++;
207 
208 #ifdef USE_PREFETCH
209 
210 			/*
211 			 * Try to prefetch at least a few pages even before we get to the
212 			 * second page if we don't stop reading after the first tuple.
213 			 */
214 			if (node->prefetch_target < node->prefetch_maximum)
215 				node->prefetch_target++;
216 #endif   /* USE_PREFETCH */
217 		}
218 
219 		/*
220 		 * Out of range?  If so, nothing more to look at on this page
221 		 */
222 		if (scan->rs_cindex < 0 || scan->rs_cindex >= scan->rs_ntuples)
223 		{
224 			node->tbmres = tbmres = NULL;
225 			continue;
226 		}
227 
228 #ifdef USE_PREFETCH
229 
230 		/*
231 		 * We issue prefetch requests *after* fetching the current page to try
232 		 * to avoid having prefetching interfere with the main I/O. Also, this
233 		 * should happen only when we have determined there is still something
234 		 * to do on the current page, else we may uselessly prefetch the same
235 		 * page we are just about to request for real.
236 		 */
237 		if (prefetch_iterator)
238 		{
239 			while (node->prefetch_pages < node->prefetch_target)
240 			{
241 				TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
242 
243 				if (tbmpre == NULL)
244 				{
245 					/* No more pages to prefetch */
246 					tbm_end_iterate(prefetch_iterator);
247 					node->prefetch_iterator = prefetch_iterator = NULL;
248 					break;
249 				}
250 				node->prefetch_pages++;
251 				PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
252 			}
253 		}
254 #endif   /* USE_PREFETCH */
255 
256 		/*
257 		 * Okay to fetch the tuple
258 		 */
259 		targoffset = scan->rs_vistuples[scan->rs_cindex];
260 		dp = (Page) BufferGetPage(scan->rs_cbuf);
261 		lp = PageGetItemId(dp, targoffset);
262 		Assert(ItemIdIsNormal(lp));
263 
264 		scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
265 		scan->rs_ctup.t_len = ItemIdGetLength(lp);
266 		scan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
267 		ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset);
268 
269 		pgstat_count_heap_fetch(scan->rs_rd);
270 
271 		/*
272 		 * Set up the result slot to point to this tuple. Note that the slot
273 		 * acquires a pin on the buffer.
274 		 */
275 		ExecStoreTuple(&scan->rs_ctup,
276 					   slot,
277 					   scan->rs_cbuf,
278 					   false);
279 
280 		/*
281 		 * If we are using lossy info, we have to recheck the qual conditions
282 		 * at every tuple.
283 		 */
284 		if (tbmres->recheck)
285 		{
286 			econtext->ecxt_scantuple = slot;
287 			ResetExprContext(econtext);
288 
289 			if (!ExecQual(node->bitmapqualorig, econtext, false))
290 			{
291 				/* Fails recheck, so drop it and loop back for another */
292 				InstrCountFiltered2(node, 1);
293 				ExecClearTuple(slot);
294 				continue;
295 			}
296 		}
297 
298 		/* OK to return this tuple */
299 		return slot;
300 	}
301 
302 	/*
303 	 * if we get here it means we are at the end of the scan..
304 	 */
305 	return ExecClearTuple(slot);
306 }
307 
308 /*
309  * bitgetpage - subroutine for BitmapHeapNext()
310  *
311  * This routine reads and pins the specified page of the relation, then
312  * builds an array indicating which tuples on the page are both potentially
313  * interesting according to the bitmap, and visible according to the snapshot.
314  */
315 static void
bitgetpage(HeapScanDesc scan,TBMIterateResult * tbmres)316 bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
317 {
318 	BlockNumber page = tbmres->blockno;
319 	Buffer		buffer;
320 	Snapshot	snapshot;
321 	int			ntup;
322 
323 	/*
324 	 * Acquire pin on the target heap page, trading in any pin we held before.
325 	 */
326 	Assert(page < scan->rs_nblocks);
327 
328 	scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
329 										 scan->rs_rd,
330 										 page);
331 	buffer = scan->rs_cbuf;
332 	snapshot = scan->rs_snapshot;
333 
334 	ntup = 0;
335 
336 	/*
337 	 * Prune and repair fragmentation for the whole page, if possible.
338 	 */
339 	heap_page_prune_opt(scan->rs_rd, buffer);
340 
341 	/*
342 	 * We must hold share lock on the buffer content while examining tuple
343 	 * visibility.  Afterwards, however, the tuples we have found to be
344 	 * visible are guaranteed good as long as we hold the buffer pin.
345 	 */
346 	LockBuffer(buffer, BUFFER_LOCK_SHARE);
347 
348 	/*
349 	 * We need two separate strategies for lossy and non-lossy cases.
350 	 */
351 	if (tbmres->ntuples >= 0)
352 	{
353 		/*
354 		 * Bitmap is non-lossy, so we just look through the offsets listed in
355 		 * tbmres; but we have to follow any HOT chain starting at each such
356 		 * offset.
357 		 */
358 		int			curslot;
359 
360 		for (curslot = 0; curslot < tbmres->ntuples; curslot++)
361 		{
362 			OffsetNumber offnum = tbmres->offsets[curslot];
363 			ItemPointerData tid;
364 			HeapTupleData heapTuple;
365 
366 			ItemPointerSet(&tid, page, offnum);
367 			if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
368 									   &heapTuple, NULL, true))
369 				scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
370 		}
371 	}
372 	else
373 	{
374 		/*
375 		 * Bitmap is lossy, so we must examine each item pointer on the page.
376 		 * But we can ignore HOT chains, since we'll check each tuple anyway.
377 		 */
378 		Page		dp = (Page) BufferGetPage(buffer);
379 		OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
380 		OffsetNumber offnum;
381 
382 		for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
383 		{
384 			ItemId		lp;
385 			HeapTupleData loctup;
386 			bool		valid;
387 
388 			lp = PageGetItemId(dp, offnum);
389 			if (!ItemIdIsNormal(lp))
390 				continue;
391 			loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
392 			loctup.t_len = ItemIdGetLength(lp);
393 			loctup.t_tableOid = scan->rs_rd->rd_id;
394 			ItemPointerSet(&loctup.t_self, page, offnum);
395 			valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
396 			if (valid)
397 			{
398 				scan->rs_vistuples[ntup++] = offnum;
399 				PredicateLockTuple(scan->rs_rd, &loctup, snapshot);
400 			}
401 			CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
402 											buffer, snapshot);
403 		}
404 	}
405 
406 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
407 
408 	Assert(ntup <= MaxHeapTuplesPerPage);
409 	scan->rs_ntuples = ntup;
410 }
411 
412 /*
413  * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
414  */
415 static bool
BitmapHeapRecheck(BitmapHeapScanState * node,TupleTableSlot * slot)416 BitmapHeapRecheck(BitmapHeapScanState *node, TupleTableSlot *slot)
417 {
418 	ExprContext *econtext;
419 
420 	/*
421 	 * extract necessary information from index scan node
422 	 */
423 	econtext = node->ss.ps.ps_ExprContext;
424 
425 	/* Does the tuple meet the original qual conditions? */
426 	econtext->ecxt_scantuple = slot;
427 
428 	ResetExprContext(econtext);
429 
430 	return ExecQual(node->bitmapqualorig, econtext, false);
431 }
432 
433 /* ----------------------------------------------------------------
434  *		ExecBitmapHeapScan(node)
435  * ----------------------------------------------------------------
436  */
437 TupleTableSlot *
ExecBitmapHeapScan(BitmapHeapScanState * node)438 ExecBitmapHeapScan(BitmapHeapScanState *node)
439 {
440 	return ExecScan(&node->ss,
441 					(ExecScanAccessMtd) BitmapHeapNext,
442 					(ExecScanRecheckMtd) BitmapHeapRecheck);
443 }
444 
445 /* ----------------------------------------------------------------
446  *		ExecReScanBitmapHeapScan(node)
447  * ----------------------------------------------------------------
448  */
449 void
ExecReScanBitmapHeapScan(BitmapHeapScanState * node)450 ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
451 {
452 	PlanState  *outerPlan = outerPlanState(node);
453 
454 	/* rescan to release any page pin */
455 	heap_rescan(node->ss.ss_currentScanDesc, NULL);
456 
457 	if (node->tbmiterator)
458 		tbm_end_iterate(node->tbmiterator);
459 	if (node->prefetch_iterator)
460 		tbm_end_iterate(node->prefetch_iterator);
461 	if (node->tbm)
462 		tbm_free(node->tbm);
463 	node->tbm = NULL;
464 	node->tbmiterator = NULL;
465 	node->tbmres = NULL;
466 	node->prefetch_iterator = NULL;
467 
468 	ExecScanReScan(&node->ss);
469 
470 	/*
471 	 * if chgParam of subnode is not null then plan will be re-scanned by
472 	 * first ExecProcNode.
473 	 */
474 	if (outerPlan->chgParam == NULL)
475 		ExecReScan(outerPlan);
476 }
477 
478 /* ----------------------------------------------------------------
479  *		ExecEndBitmapHeapScan
480  * ----------------------------------------------------------------
481  */
482 void
ExecEndBitmapHeapScan(BitmapHeapScanState * node)483 ExecEndBitmapHeapScan(BitmapHeapScanState *node)
484 {
485 	Relation	relation;
486 	HeapScanDesc scanDesc;
487 
488 	/*
489 	 * extract information from the node
490 	 */
491 	relation = node->ss.ss_currentRelation;
492 	scanDesc = node->ss.ss_currentScanDesc;
493 
494 	/*
495 	 * Free the exprcontext
496 	 */
497 	ExecFreeExprContext(&node->ss.ps);
498 
499 	/*
500 	 * clear out tuple table slots
501 	 */
502 	ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
503 	ExecClearTuple(node->ss.ss_ScanTupleSlot);
504 
505 	/*
506 	 * close down subplans
507 	 */
508 	ExecEndNode(outerPlanState(node));
509 
510 	/*
511 	 * release bitmap if any
512 	 */
513 	if (node->tbmiterator)
514 		tbm_end_iterate(node->tbmiterator);
515 	if (node->prefetch_iterator)
516 		tbm_end_iterate(node->prefetch_iterator);
517 	if (node->tbm)
518 		tbm_free(node->tbm);
519 
520 	/*
521 	 * close heap scan
522 	 */
523 	heap_endscan(scanDesc);
524 
525 	/*
526 	 * close the heap relation.
527 	 */
528 	ExecCloseScanRelation(relation);
529 }
530 
531 /* ----------------------------------------------------------------
532  *		ExecInitBitmapHeapScan
533  *
534  *		Initializes the scan's state information.
535  * ----------------------------------------------------------------
536  */
537 BitmapHeapScanState *
ExecInitBitmapHeapScan(BitmapHeapScan * node,EState * estate,int eflags)538 ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
539 {
540 	BitmapHeapScanState *scanstate;
541 	Relation	currentRelation;
542 	int			io_concurrency;
543 
544 	/* check for unsupported flags */
545 	Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
546 
547 	/*
548 	 * Assert caller didn't ask for an unsafe snapshot --- see comments at
549 	 * head of file.
550 	 */
551 	Assert(IsMVCCSnapshot(estate->es_snapshot));
552 
553 	/*
554 	 * create state structure
555 	 */
556 	scanstate = makeNode(BitmapHeapScanState);
557 	scanstate->ss.ps.plan = (Plan *) node;
558 	scanstate->ss.ps.state = estate;
559 
560 	scanstate->tbm = NULL;
561 	scanstate->tbmiterator = NULL;
562 	scanstate->tbmres = NULL;
563 	scanstate->exact_pages = 0;
564 	scanstate->lossy_pages = 0;
565 	scanstate->prefetch_iterator = NULL;
566 	scanstate->prefetch_pages = 0;
567 	scanstate->prefetch_target = 0;
568 	/* may be updated below */
569 	scanstate->prefetch_maximum = target_prefetch_pages;
570 
571 	/*
572 	 * Miscellaneous initialization
573 	 *
574 	 * create expression context for node
575 	 */
576 	ExecAssignExprContext(estate, &scanstate->ss.ps);
577 
578 	scanstate->ss.ps.ps_TupFromTlist = false;
579 
580 	/*
581 	 * initialize child expressions
582 	 */
583 	scanstate->ss.ps.targetlist = (List *)
584 		ExecInitExpr((Expr *) node->scan.plan.targetlist,
585 					 (PlanState *) scanstate);
586 	scanstate->ss.ps.qual = (List *)
587 		ExecInitExpr((Expr *) node->scan.plan.qual,
588 					 (PlanState *) scanstate);
589 	scanstate->bitmapqualorig = (List *)
590 		ExecInitExpr((Expr *) node->bitmapqualorig,
591 					 (PlanState *) scanstate);
592 
593 	/*
594 	 * tuple table initialization
595 	 */
596 	ExecInitResultTupleSlot(estate, &scanstate->ss.ps);
597 	ExecInitScanTupleSlot(estate, &scanstate->ss);
598 
599 	/*
600 	 * open the base relation and acquire appropriate lock on it.
601 	 */
602 	currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
603 
604 	/*
605 	 * Determine the maximum for prefetch_target.  If the tablespace has a
606 	 * specific IO concurrency set, use that to compute the corresponding
607 	 * maximum value; otherwise, we already initialized to the value computed
608 	 * by the GUC machinery.
609 	 */
610 	io_concurrency =
611 		get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace);
612 	if (io_concurrency != effective_io_concurrency)
613 	{
614 		double		maximum;
615 
616 		if (ComputeIoConcurrency(io_concurrency, &maximum))
617 			scanstate->prefetch_maximum = rint(maximum);
618 	}
619 
620 	scanstate->ss.ss_currentRelation = currentRelation;
621 
622 	/*
623 	 * Even though we aren't going to do a conventional seqscan, it is useful
624 	 * to create a HeapScanDesc --- most of the fields in it are usable.
625 	 */
626 	scanstate->ss.ss_currentScanDesc = heap_beginscan_bm(currentRelation,
627 														 estate->es_snapshot,
628 														 0,
629 														 NULL);
630 
631 	/*
632 	 * get the scan type from the relation descriptor.
633 	 */
634 	ExecAssignScanType(&scanstate->ss, RelationGetDescr(currentRelation));
635 
636 	/*
637 	 * Initialize result tuple type and projection info.
638 	 */
639 	ExecAssignResultTypeFromTL(&scanstate->ss.ps);
640 	ExecAssignScanProjectionInfo(&scanstate->ss);
641 
642 	/*
643 	 * initialize child nodes
644 	 *
645 	 * We do this last because the child nodes will open indexscans on our
646 	 * relation's indexes, and we want to be sure we have acquired a lock on
647 	 * the relation first.
648 	 */
649 	outerPlanState(scanstate) = ExecInitNode(outerPlan(node), estate, eflags);
650 
651 	/*
652 	 * all done.
653 	 */
654 	return scanstate;
655 }
656