1 /*-------------------------------------------------------------------------
2  *
3  * nodeSamplescan.c
4  *	  Support routines for sample scans of relations (table sampling).
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *	  src/backend/executor/nodeSamplescan.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include "access/hash.h"
18 #include "access/relscan.h"
19 #include "access/tsmapi.h"
20 #include "executor/executor.h"
21 #include "executor/nodeSamplescan.h"
22 #include "miscadmin.h"
23 #include "pgstat.h"
24 #include "storage/predicate.h"
25 #include "utils/builtins.h"
26 #include "utils/rel.h"
27 #include "utils/tqual.h"
28 
29 static void InitScanRelation(SampleScanState *node, EState *estate, int eflags);
30 static TupleTableSlot *SampleNext(SampleScanState *node);
31 static void tablesample_init(SampleScanState *scanstate);
32 static HeapTuple tablesample_getnext(SampleScanState *scanstate);
33 static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset,
34 				   HeapScanDesc scan);
35 
36 /* ----------------------------------------------------------------
37  *						Scan Support
38  * ----------------------------------------------------------------
39  */
40 
41 /* ----------------------------------------------------------------
42  *		SampleNext
43  *
44  *		This is a workhorse for ExecSampleScan
45  * ----------------------------------------------------------------
46  */
47 static TupleTableSlot *
SampleNext(SampleScanState * node)48 SampleNext(SampleScanState *node)
49 {
50 	HeapTuple	tuple;
51 	TupleTableSlot *slot;
52 
53 	/*
54 	 * if this is first call within a scan, initialize
55 	 */
56 	if (!node->begun)
57 		tablesample_init(node);
58 
59 	/*
60 	 * get the next tuple, and store it in our result slot
61 	 */
62 	tuple = tablesample_getnext(node);
63 
64 	slot = node->ss.ss_ScanTupleSlot;
65 
66 	if (tuple)
67 		ExecStoreTuple(tuple,	/* tuple to store */
68 					   slot,	/* slot to store in */
69 					   node->ss.ss_currentScanDesc->rs_cbuf,	/* tuple's buffer */
70 					   false);	/* don't pfree this pointer */
71 	else
72 		ExecClearTuple(slot);
73 
74 	return slot;
75 }
76 
77 /*
78  * SampleRecheck -- access method routine to recheck a tuple in EvalPlanQual
79  */
80 static bool
SampleRecheck(SampleScanState * node,TupleTableSlot * slot)81 SampleRecheck(SampleScanState *node, TupleTableSlot *slot)
82 {
83 	/*
84 	 * No need to recheck for SampleScan, since like SeqScan we don't pass any
85 	 * checkable keys to heap_beginscan.
86 	 */
87 	return true;
88 }
89 
90 /* ----------------------------------------------------------------
91  *		ExecSampleScan(node)
92  *
93  *		Scans the relation using the sampling method and returns
94  *		the next qualifying tuple.
95  *		We call the ExecScan() routine and pass it the appropriate
96  *		access method functions.
97  * ----------------------------------------------------------------
98  */
99 static TupleTableSlot *
ExecSampleScan(PlanState * pstate)100 ExecSampleScan(PlanState *pstate)
101 {
102 	SampleScanState *node = castNode(SampleScanState, pstate);
103 
104 	return ExecScan(&node->ss,
105 					(ExecScanAccessMtd) SampleNext,
106 					(ExecScanRecheckMtd) SampleRecheck);
107 }
108 
109 /* ----------------------------------------------------------------
110  *		InitScanRelation
111  *
112  *		Set up to access the scan relation.
113  * ----------------------------------------------------------------
114  */
115 static void
InitScanRelation(SampleScanState * node,EState * estate,int eflags)116 InitScanRelation(SampleScanState *node, EState *estate, int eflags)
117 {
118 	Relation	currentRelation;
119 
120 	/*
121 	 * get the relation object id from the relid'th entry in the range table,
122 	 * open that relation and acquire appropriate lock on it.
123 	 */
124 	currentRelation = ExecOpenScanRelation(estate,
125 										   ((SampleScan *) node->ss.ps.plan)->scan.scanrelid,
126 										   eflags);
127 
128 	node->ss.ss_currentRelation = currentRelation;
129 
130 	/* we won't set up the HeapScanDesc till later */
131 	node->ss.ss_currentScanDesc = NULL;
132 
133 	/* and report the scan tuple slot's rowtype */
134 	ExecAssignScanType(&node->ss, RelationGetDescr(currentRelation));
135 }
136 
137 
138 /* ----------------------------------------------------------------
139  *		ExecInitSampleScan
140  * ----------------------------------------------------------------
141  */
142 SampleScanState *
ExecInitSampleScan(SampleScan * node,EState * estate,int eflags)143 ExecInitSampleScan(SampleScan *node, EState *estate, int eflags)
144 {
145 	SampleScanState *scanstate;
146 	TableSampleClause *tsc = node->tablesample;
147 	TsmRoutine *tsm;
148 
149 	Assert(outerPlan(node) == NULL);
150 	Assert(innerPlan(node) == NULL);
151 
152 	/*
153 	 * create state structure
154 	 */
155 	scanstate = makeNode(SampleScanState);
156 	scanstate->ss.ps.plan = (Plan *) node;
157 	scanstate->ss.ps.state = estate;
158 	scanstate->ss.ps.ExecProcNode = ExecSampleScan;
159 
160 	/*
161 	 * Miscellaneous initialization
162 	 *
163 	 * create expression context for node
164 	 */
165 	ExecAssignExprContext(estate, &scanstate->ss.ps);
166 
167 	/*
168 	 * initialize child expressions
169 	 */
170 	scanstate->ss.ps.qual =
171 		ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate);
172 
173 	scanstate->args = ExecInitExprList(tsc->args, (PlanState *) scanstate);
174 	scanstate->repeatable =
175 		ExecInitExpr(tsc->repeatable, (PlanState *) scanstate);
176 
177 	/*
178 	 * tuple table initialization
179 	 */
180 	ExecInitResultTupleSlot(estate, &scanstate->ss.ps);
181 	ExecInitScanTupleSlot(estate, &scanstate->ss);
182 
183 	/*
184 	 * initialize scan relation
185 	 */
186 	InitScanRelation(scanstate, estate, eflags);
187 
188 	/*
189 	 * Initialize result tuple type and projection info.
190 	 */
191 	ExecAssignResultTypeFromTL(&scanstate->ss.ps);
192 	ExecAssignScanProjectionInfo(&scanstate->ss);
193 
194 	/*
195 	 * If we don't have a REPEATABLE clause, select a random seed.  We want to
196 	 * do this just once, since the seed shouldn't change over rescans.
197 	 */
198 	if (tsc->repeatable == NULL)
199 		scanstate->seed = random();
200 
201 	/*
202 	 * Finally, initialize the TABLESAMPLE method handler.
203 	 */
204 	tsm = GetTsmRoutine(tsc->tsmhandler);
205 	scanstate->tsmroutine = tsm;
206 	scanstate->tsm_state = NULL;
207 
208 	if (tsm->InitSampleScan)
209 		tsm->InitSampleScan(scanstate, eflags);
210 
211 	/* We'll do BeginSampleScan later; we can't evaluate params yet */
212 	scanstate->begun = false;
213 
214 	return scanstate;
215 }
216 
217 /* ----------------------------------------------------------------
218  *		ExecEndSampleScan
219  *
220  *		frees any storage allocated through C routines.
221  * ----------------------------------------------------------------
222  */
223 void
ExecEndSampleScan(SampleScanState * node)224 ExecEndSampleScan(SampleScanState *node)
225 {
226 	/*
227 	 * Tell sampling function that we finished the scan.
228 	 */
229 	if (node->tsmroutine->EndSampleScan)
230 		node->tsmroutine->EndSampleScan(node);
231 
232 	/*
233 	 * Free the exprcontext
234 	 */
235 	ExecFreeExprContext(&node->ss.ps);
236 
237 	/*
238 	 * clean out the tuple table
239 	 */
240 	ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
241 	ExecClearTuple(node->ss.ss_ScanTupleSlot);
242 
243 	/*
244 	 * close heap scan
245 	 */
246 	if (node->ss.ss_currentScanDesc)
247 		heap_endscan(node->ss.ss_currentScanDesc);
248 
249 	/*
250 	 * close the heap relation.
251 	 */
252 	ExecCloseScanRelation(node->ss.ss_currentRelation);
253 }
254 
255 /* ----------------------------------------------------------------
256  *		ExecReScanSampleScan
257  *
258  *		Rescans the relation.
259  *
260  * ----------------------------------------------------------------
261  */
262 void
ExecReScanSampleScan(SampleScanState * node)263 ExecReScanSampleScan(SampleScanState *node)
264 {
265 	/* Remember we need to do BeginSampleScan again (if we did it at all) */
266 	node->begun = false;
267 
268 	ExecScanReScan(&node->ss);
269 }
270 
271 
272 /*
273  * Initialize the TABLESAMPLE method: evaluate params and call BeginSampleScan.
274  */
275 static void
tablesample_init(SampleScanState * scanstate)276 tablesample_init(SampleScanState *scanstate)
277 {
278 	TsmRoutine *tsm = scanstate->tsmroutine;
279 	ExprContext *econtext = scanstate->ss.ps.ps_ExprContext;
280 	Datum	   *params;
281 	Datum		datum;
282 	bool		isnull;
283 	uint32		seed;
284 	bool		allow_sync;
285 	int			i;
286 	ListCell   *arg;
287 
288 	params = (Datum *) palloc(list_length(scanstate->args) * sizeof(Datum));
289 
290 	i = 0;
291 	foreach(arg, scanstate->args)
292 	{
293 		ExprState  *argstate = (ExprState *) lfirst(arg);
294 
295 		params[i] = ExecEvalExprSwitchContext(argstate,
296 											  econtext,
297 											  &isnull);
298 		if (isnull)
299 			ereport(ERROR,
300 					(errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
301 					 errmsg("TABLESAMPLE parameter cannot be null")));
302 		i++;
303 	}
304 
305 	if (scanstate->repeatable)
306 	{
307 		datum = ExecEvalExprSwitchContext(scanstate->repeatable,
308 										  econtext,
309 										  &isnull);
310 		if (isnull)
311 			ereport(ERROR,
312 					(errcode(ERRCODE_INVALID_TABLESAMPLE_REPEAT),
313 					 errmsg("TABLESAMPLE REPEATABLE parameter cannot be null")));
314 
315 		/*
316 		 * The REPEATABLE parameter has been coerced to float8 by the parser.
317 		 * The reason for using float8 at the SQL level is that it will
318 		 * produce unsurprising results both for users used to databases that
319 		 * accept only integers in the REPEATABLE clause and for those who
320 		 * might expect that REPEATABLE works like setseed() (a float in the
321 		 * range from -1 to 1).
322 		 *
323 		 * We use hashfloat8() to convert the supplied value into a suitable
324 		 * seed.  For regression-testing purposes, that has the convenient
325 		 * property that REPEATABLE(0) gives a machine-independent result.
326 		 */
327 		seed = DatumGetUInt32(DirectFunctionCall1(hashfloat8, datum));
328 	}
329 	else
330 	{
331 		/* Use the seed selected by ExecInitSampleScan */
332 		seed = scanstate->seed;
333 	}
334 
335 	/* Set default values for params that BeginSampleScan can adjust */
336 	scanstate->use_bulkread = true;
337 	scanstate->use_pagemode = true;
338 
339 	/* Let tablesample method do its thing */
340 	tsm->BeginSampleScan(scanstate,
341 						 params,
342 						 list_length(scanstate->args),
343 						 seed);
344 
345 	/* We'll use syncscan if there's no NextSampleBlock function */
346 	allow_sync = (tsm->NextSampleBlock == NULL);
347 
348 	/* Now we can create or reset the HeapScanDesc */
349 	if (scanstate->ss.ss_currentScanDesc == NULL)
350 	{
351 		scanstate->ss.ss_currentScanDesc =
352 			heap_beginscan_sampling(scanstate->ss.ss_currentRelation,
353 									scanstate->ss.ps.state->es_snapshot,
354 									0, NULL,
355 									scanstate->use_bulkread,
356 									allow_sync,
357 									scanstate->use_pagemode);
358 	}
359 	else
360 	{
361 		heap_rescan_set_params(scanstate->ss.ss_currentScanDesc, NULL,
362 							   scanstate->use_bulkread,
363 							   allow_sync,
364 							   scanstate->use_pagemode);
365 	}
366 
367 	pfree(params);
368 
369 	/* And we're initialized. */
370 	scanstate->begun = true;
371 }
372 
373 /*
374  * Get next tuple from TABLESAMPLE method.
375  *
376  * Note: an awful lot of this is copied-and-pasted from heapam.c.  It would
377  * perhaps be better to refactor to share more code.
378  */
379 static HeapTuple
tablesample_getnext(SampleScanState * scanstate)380 tablesample_getnext(SampleScanState *scanstate)
381 {
382 	TsmRoutine *tsm = scanstate->tsmroutine;
383 	HeapScanDesc scan = scanstate->ss.ss_currentScanDesc;
384 	HeapTuple	tuple = &(scan->rs_ctup);
385 	Snapshot	snapshot = scan->rs_snapshot;
386 	bool		pagemode = scan->rs_pageatatime;
387 	BlockNumber blockno;
388 	Page		page;
389 	bool		all_visible;
390 	OffsetNumber maxoffset;
391 
392 	if (!scan->rs_inited)
393 	{
394 		/*
395 		 * return null immediately if relation is empty
396 		 */
397 		if (scan->rs_nblocks == 0)
398 		{
399 			Assert(!BufferIsValid(scan->rs_cbuf));
400 			tuple->t_data = NULL;
401 			return NULL;
402 		}
403 		if (tsm->NextSampleBlock)
404 		{
405 			blockno = tsm->NextSampleBlock(scanstate);
406 			if (!BlockNumberIsValid(blockno))
407 			{
408 				tuple->t_data = NULL;
409 				return NULL;
410 			}
411 		}
412 		else
413 			blockno = scan->rs_startblock;
414 		Assert(blockno < scan->rs_nblocks);
415 		heapgetpage(scan, blockno);
416 		scan->rs_inited = true;
417 	}
418 	else
419 	{
420 		/* continue from previously returned page/tuple */
421 		blockno = scan->rs_cblock;	/* current page */
422 	}
423 
424 	/*
425 	 * When not using pagemode, we must lock the buffer during tuple
426 	 * visibility checks.
427 	 */
428 	if (!pagemode)
429 		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
430 
431 	page = (Page) BufferGetPage(scan->rs_cbuf);
432 	all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
433 	maxoffset = PageGetMaxOffsetNumber(page);
434 
435 	for (;;)
436 	{
437 		OffsetNumber tupoffset;
438 		bool		finished;
439 
440 		CHECK_FOR_INTERRUPTS();
441 
442 		/* Ask the tablesample method which tuples to check on this page. */
443 		tupoffset = tsm->NextSampleTuple(scanstate,
444 										 blockno,
445 										 maxoffset);
446 
447 		if (OffsetNumberIsValid(tupoffset))
448 		{
449 			ItemId		itemid;
450 			bool		visible;
451 
452 			/* Skip invalid tuple pointers. */
453 			itemid = PageGetItemId(page, tupoffset);
454 			if (!ItemIdIsNormal(itemid))
455 				continue;
456 
457 			tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid);
458 			tuple->t_len = ItemIdGetLength(itemid);
459 			ItemPointerSet(&(tuple->t_self), blockno, tupoffset);
460 
461 			if (all_visible)
462 				visible = true;
463 			else
464 				visible = SampleTupleVisible(tuple, tupoffset, scan);
465 
466 			/* in pagemode, heapgetpage did this for us */
467 			if (!pagemode)
468 				CheckForSerializableConflictOut(visible, scan->rs_rd, tuple,
469 												scan->rs_cbuf, snapshot);
470 
471 			if (visible)
472 			{
473 				/* Found visible tuple, return it. */
474 				if (!pagemode)
475 					LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
476 				break;
477 			}
478 			else
479 			{
480 				/* Try next tuple from same page. */
481 				continue;
482 			}
483 		}
484 
485 		/*
486 		 * if we get here, it means we've exhausted the items on this page and
487 		 * it's time to move to the next.
488 		 */
489 		if (!pagemode)
490 			LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
491 
492 		if (tsm->NextSampleBlock)
493 		{
494 			blockno = tsm->NextSampleBlock(scanstate);
495 			Assert(!scan->rs_syncscan);
496 			finished = !BlockNumberIsValid(blockno);
497 		}
498 		else
499 		{
500 			/* Without NextSampleBlock, just do a plain forward seqscan. */
501 			blockno++;
502 			if (blockno >= scan->rs_nblocks)
503 				blockno = 0;
504 
505 			/*
506 			 * Report our new scan position for synchronization purposes.
507 			 *
508 			 * Note: we do this before checking for end of scan so that the
509 			 * final state of the position hint is back at the start of the
510 			 * rel.  That's not strictly necessary, but otherwise when you run
511 			 * the same query multiple times the starting position would shift
512 			 * a little bit backwards on every invocation, which is confusing.
513 			 * We don't guarantee any specific ordering in general, though.
514 			 */
515 			if (scan->rs_syncscan)
516 				ss_report_location(scan->rs_rd, blockno);
517 
518 			finished = (blockno == scan->rs_startblock);
519 		}
520 
521 		/*
522 		 * Reached end of scan?
523 		 */
524 		if (finished)
525 		{
526 			if (BufferIsValid(scan->rs_cbuf))
527 				ReleaseBuffer(scan->rs_cbuf);
528 			scan->rs_cbuf = InvalidBuffer;
529 			scan->rs_cblock = InvalidBlockNumber;
530 			tuple->t_data = NULL;
531 			scan->rs_inited = false;
532 			return NULL;
533 		}
534 
535 		Assert(blockno < scan->rs_nblocks);
536 		heapgetpage(scan, blockno);
537 
538 		/* Re-establish state for new page */
539 		if (!pagemode)
540 			LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
541 
542 		page = (Page) BufferGetPage(scan->rs_cbuf);
543 		all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
544 		maxoffset = PageGetMaxOffsetNumber(page);
545 	}
546 
547 	/* Count successfully-fetched tuples as heap fetches */
548 	pgstat_count_heap_getnext(scan->rs_rd);
549 
550 	return &(scan->rs_ctup);
551 }
552 
553 /*
554  * Check visibility of the tuple.
555  */
556 static bool
SampleTupleVisible(HeapTuple tuple,OffsetNumber tupoffset,HeapScanDesc scan)557 SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan)
558 {
559 	if (scan->rs_pageatatime)
560 	{
561 		/*
562 		 * In pageatatime mode, heapgetpage() already did visibility checks,
563 		 * so just look at the info it left in rs_vistuples[].
564 		 *
565 		 * We use a binary search over the known-sorted array.  Note: we could
566 		 * save some effort if we insisted that NextSampleTuple select tuples
567 		 * in increasing order, but it's not clear that there would be enough
568 		 * gain to justify the restriction.
569 		 */
570 		int			start = 0,
571 					end = scan->rs_ntuples - 1;
572 
573 		while (start <= end)
574 		{
575 			int			mid = (start + end) / 2;
576 			OffsetNumber curoffset = scan->rs_vistuples[mid];
577 
578 			if (tupoffset == curoffset)
579 				return true;
580 			else if (tupoffset < curoffset)
581 				end = mid - 1;
582 			else
583 				start = mid + 1;
584 		}
585 
586 		return false;
587 	}
588 	else
589 	{
590 		/* Otherwise, we have to check the tuple individually. */
591 		return HeapTupleSatisfiesVisibility(tuple,
592 											scan->rs_snapshot,
593 											scan->rs_cbuf);
594 	}
595 }
596