1 /*-------------------------------------------------------------------------
2 *
3 * nodeSamplescan.c
4 * Support routines for sample scans of relations (table sampling).
5 *
6 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/executor/nodeSamplescan.c
12 *
13 *-------------------------------------------------------------------------
14 */
15 #include "postgres.h"
16
17 #include "access/hash.h"
18 #include "access/relscan.h"
19 #include "access/tsmapi.h"
20 #include "executor/executor.h"
21 #include "executor/nodeSamplescan.h"
22 #include "miscadmin.h"
23 #include "pgstat.h"
24 #include "storage/predicate.h"
25 #include "utils/builtins.h"
26 #include "utils/rel.h"
27 #include "utils/tqual.h"
28
29 static void InitScanRelation(SampleScanState *node, EState *estate, int eflags);
30 static TupleTableSlot *SampleNext(SampleScanState *node);
31 static void tablesample_init(SampleScanState *scanstate);
32 static HeapTuple tablesample_getnext(SampleScanState *scanstate);
33 static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset,
34 HeapScanDesc scan);
35
36 /* ----------------------------------------------------------------
37 * Scan Support
38 * ----------------------------------------------------------------
39 */
40
41 /* ----------------------------------------------------------------
42 * SampleNext
43 *
44 * This is a workhorse for ExecSampleScan
45 * ----------------------------------------------------------------
46 */
47 static TupleTableSlot *
SampleNext(SampleScanState * node)48 SampleNext(SampleScanState *node)
49 {
50 HeapTuple tuple;
51 TupleTableSlot *slot;
52
53 /*
54 * if this is first call within a scan, initialize
55 */
56 if (!node->begun)
57 tablesample_init(node);
58
59 /*
60 * get the next tuple, and store it in our result slot
61 */
62 tuple = tablesample_getnext(node);
63
64 slot = node->ss.ss_ScanTupleSlot;
65
66 if (tuple)
67 ExecStoreTuple(tuple, /* tuple to store */
68 slot, /* slot to store in */
69 node->ss.ss_currentScanDesc->rs_cbuf, /* tuple's buffer */
70 false); /* don't pfree this pointer */
71 else
72 ExecClearTuple(slot);
73
74 return slot;
75 }
76
77 /*
78 * SampleRecheck -- access method routine to recheck a tuple in EvalPlanQual
79 */
80 static bool
SampleRecheck(SampleScanState * node,TupleTableSlot * slot)81 SampleRecheck(SampleScanState *node, TupleTableSlot *slot)
82 {
83 /*
84 * No need to recheck for SampleScan, since like SeqScan we don't pass any
85 * checkable keys to heap_beginscan.
86 */
87 return true;
88 }
89
90 /* ----------------------------------------------------------------
91 * ExecSampleScan(node)
92 *
93 * Scans the relation using the sampling method and returns
94 * the next qualifying tuple.
95 * We call the ExecScan() routine and pass it the appropriate
96 * access method functions.
97 * ----------------------------------------------------------------
98 */
99 static TupleTableSlot *
ExecSampleScan(PlanState * pstate)100 ExecSampleScan(PlanState *pstate)
101 {
102 SampleScanState *node = castNode(SampleScanState, pstate);
103
104 return ExecScan(&node->ss,
105 (ExecScanAccessMtd) SampleNext,
106 (ExecScanRecheckMtd) SampleRecheck);
107 }
108
109 /* ----------------------------------------------------------------
110 * InitScanRelation
111 *
112 * Set up to access the scan relation.
113 * ----------------------------------------------------------------
114 */
115 static void
InitScanRelation(SampleScanState * node,EState * estate,int eflags)116 InitScanRelation(SampleScanState *node, EState *estate, int eflags)
117 {
118 Relation currentRelation;
119
120 /*
121 * get the relation object id from the relid'th entry in the range table,
122 * open that relation and acquire appropriate lock on it.
123 */
124 currentRelation = ExecOpenScanRelation(estate,
125 ((SampleScan *) node->ss.ps.plan)->scan.scanrelid,
126 eflags);
127
128 node->ss.ss_currentRelation = currentRelation;
129
130 /* we won't set up the HeapScanDesc till later */
131 node->ss.ss_currentScanDesc = NULL;
132
133 /* and report the scan tuple slot's rowtype */
134 ExecAssignScanType(&node->ss, RelationGetDescr(currentRelation));
135 }
136
137
138 /* ----------------------------------------------------------------
139 * ExecInitSampleScan
140 * ----------------------------------------------------------------
141 */
142 SampleScanState *
ExecInitSampleScan(SampleScan * node,EState * estate,int eflags)143 ExecInitSampleScan(SampleScan *node, EState *estate, int eflags)
144 {
145 SampleScanState *scanstate;
146 TableSampleClause *tsc = node->tablesample;
147 TsmRoutine *tsm;
148
149 Assert(outerPlan(node) == NULL);
150 Assert(innerPlan(node) == NULL);
151
152 /*
153 * create state structure
154 */
155 scanstate = makeNode(SampleScanState);
156 scanstate->ss.ps.plan = (Plan *) node;
157 scanstate->ss.ps.state = estate;
158 scanstate->ss.ps.ExecProcNode = ExecSampleScan;
159
160 /*
161 * Miscellaneous initialization
162 *
163 * create expression context for node
164 */
165 ExecAssignExprContext(estate, &scanstate->ss.ps);
166
167 /*
168 * initialize child expressions
169 */
170 scanstate->ss.ps.qual =
171 ExecInitQual(node->scan.plan.qual, (PlanState *) scanstate);
172
173 scanstate->args = ExecInitExprList(tsc->args, (PlanState *) scanstate);
174 scanstate->repeatable =
175 ExecInitExpr(tsc->repeatable, (PlanState *) scanstate);
176
177 /*
178 * tuple table initialization
179 */
180 ExecInitResultTupleSlot(estate, &scanstate->ss.ps);
181 ExecInitScanTupleSlot(estate, &scanstate->ss);
182
183 /*
184 * initialize scan relation
185 */
186 InitScanRelation(scanstate, estate, eflags);
187
188 /*
189 * Initialize result tuple type and projection info.
190 */
191 ExecAssignResultTypeFromTL(&scanstate->ss.ps);
192 ExecAssignScanProjectionInfo(&scanstate->ss);
193
194 /*
195 * If we don't have a REPEATABLE clause, select a random seed. We want to
196 * do this just once, since the seed shouldn't change over rescans.
197 */
198 if (tsc->repeatable == NULL)
199 scanstate->seed = random();
200
201 /*
202 * Finally, initialize the TABLESAMPLE method handler.
203 */
204 tsm = GetTsmRoutine(tsc->tsmhandler);
205 scanstate->tsmroutine = tsm;
206 scanstate->tsm_state = NULL;
207
208 if (tsm->InitSampleScan)
209 tsm->InitSampleScan(scanstate, eflags);
210
211 /* We'll do BeginSampleScan later; we can't evaluate params yet */
212 scanstate->begun = false;
213
214 return scanstate;
215 }
216
217 /* ----------------------------------------------------------------
218 * ExecEndSampleScan
219 *
220 * frees any storage allocated through C routines.
221 * ----------------------------------------------------------------
222 */
223 void
ExecEndSampleScan(SampleScanState * node)224 ExecEndSampleScan(SampleScanState *node)
225 {
226 /*
227 * Tell sampling function that we finished the scan.
228 */
229 if (node->tsmroutine->EndSampleScan)
230 node->tsmroutine->EndSampleScan(node);
231
232 /*
233 * Free the exprcontext
234 */
235 ExecFreeExprContext(&node->ss.ps);
236
237 /*
238 * clean out the tuple table
239 */
240 ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
241 ExecClearTuple(node->ss.ss_ScanTupleSlot);
242
243 /*
244 * close heap scan
245 */
246 if (node->ss.ss_currentScanDesc)
247 heap_endscan(node->ss.ss_currentScanDesc);
248
249 /*
250 * close the heap relation.
251 */
252 ExecCloseScanRelation(node->ss.ss_currentRelation);
253 }
254
255 /* ----------------------------------------------------------------
256 * ExecReScanSampleScan
257 *
258 * Rescans the relation.
259 *
260 * ----------------------------------------------------------------
261 */
262 void
ExecReScanSampleScan(SampleScanState * node)263 ExecReScanSampleScan(SampleScanState *node)
264 {
265 /* Remember we need to do BeginSampleScan again (if we did it at all) */
266 node->begun = false;
267
268 ExecScanReScan(&node->ss);
269 }
270
271
272 /*
273 * Initialize the TABLESAMPLE method: evaluate params and call BeginSampleScan.
274 */
275 static void
tablesample_init(SampleScanState * scanstate)276 tablesample_init(SampleScanState *scanstate)
277 {
278 TsmRoutine *tsm = scanstate->tsmroutine;
279 ExprContext *econtext = scanstate->ss.ps.ps_ExprContext;
280 Datum *params;
281 Datum datum;
282 bool isnull;
283 uint32 seed;
284 bool allow_sync;
285 int i;
286 ListCell *arg;
287
288 params = (Datum *) palloc(list_length(scanstate->args) * sizeof(Datum));
289
290 i = 0;
291 foreach(arg, scanstate->args)
292 {
293 ExprState *argstate = (ExprState *) lfirst(arg);
294
295 params[i] = ExecEvalExprSwitchContext(argstate,
296 econtext,
297 &isnull);
298 if (isnull)
299 ereport(ERROR,
300 (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
301 errmsg("TABLESAMPLE parameter cannot be null")));
302 i++;
303 }
304
305 if (scanstate->repeatable)
306 {
307 datum = ExecEvalExprSwitchContext(scanstate->repeatable,
308 econtext,
309 &isnull);
310 if (isnull)
311 ereport(ERROR,
312 (errcode(ERRCODE_INVALID_TABLESAMPLE_REPEAT),
313 errmsg("TABLESAMPLE REPEATABLE parameter cannot be null")));
314
315 /*
316 * The REPEATABLE parameter has been coerced to float8 by the parser.
317 * The reason for using float8 at the SQL level is that it will
318 * produce unsurprising results both for users used to databases that
319 * accept only integers in the REPEATABLE clause and for those who
320 * might expect that REPEATABLE works like setseed() (a float in the
321 * range from -1 to 1).
322 *
323 * We use hashfloat8() to convert the supplied value into a suitable
324 * seed. For regression-testing purposes, that has the convenient
325 * property that REPEATABLE(0) gives a machine-independent result.
326 */
327 seed = DatumGetUInt32(DirectFunctionCall1(hashfloat8, datum));
328 }
329 else
330 {
331 /* Use the seed selected by ExecInitSampleScan */
332 seed = scanstate->seed;
333 }
334
335 /* Set default values for params that BeginSampleScan can adjust */
336 scanstate->use_bulkread = true;
337 scanstate->use_pagemode = true;
338
339 /* Let tablesample method do its thing */
340 tsm->BeginSampleScan(scanstate,
341 params,
342 list_length(scanstate->args),
343 seed);
344
345 /* We'll use syncscan if there's no NextSampleBlock function */
346 allow_sync = (tsm->NextSampleBlock == NULL);
347
348 /* Now we can create or reset the HeapScanDesc */
349 if (scanstate->ss.ss_currentScanDesc == NULL)
350 {
351 scanstate->ss.ss_currentScanDesc =
352 heap_beginscan_sampling(scanstate->ss.ss_currentRelation,
353 scanstate->ss.ps.state->es_snapshot,
354 0, NULL,
355 scanstate->use_bulkread,
356 allow_sync,
357 scanstate->use_pagemode);
358 }
359 else
360 {
361 heap_rescan_set_params(scanstate->ss.ss_currentScanDesc, NULL,
362 scanstate->use_bulkread,
363 allow_sync,
364 scanstate->use_pagemode);
365 }
366
367 pfree(params);
368
369 /* And we're initialized. */
370 scanstate->begun = true;
371 }
372
373 /*
374 * Get next tuple from TABLESAMPLE method.
375 *
376 * Note: an awful lot of this is copied-and-pasted from heapam.c. It would
377 * perhaps be better to refactor to share more code.
378 */
379 static HeapTuple
tablesample_getnext(SampleScanState * scanstate)380 tablesample_getnext(SampleScanState *scanstate)
381 {
382 TsmRoutine *tsm = scanstate->tsmroutine;
383 HeapScanDesc scan = scanstate->ss.ss_currentScanDesc;
384 HeapTuple tuple = &(scan->rs_ctup);
385 Snapshot snapshot = scan->rs_snapshot;
386 bool pagemode = scan->rs_pageatatime;
387 BlockNumber blockno;
388 Page page;
389 bool all_visible;
390 OffsetNumber maxoffset;
391
392 if (!scan->rs_inited)
393 {
394 /*
395 * return null immediately if relation is empty
396 */
397 if (scan->rs_nblocks == 0)
398 {
399 Assert(!BufferIsValid(scan->rs_cbuf));
400 tuple->t_data = NULL;
401 return NULL;
402 }
403 if (tsm->NextSampleBlock)
404 {
405 blockno = tsm->NextSampleBlock(scanstate);
406 if (!BlockNumberIsValid(blockno))
407 {
408 tuple->t_data = NULL;
409 return NULL;
410 }
411 }
412 else
413 blockno = scan->rs_startblock;
414 Assert(blockno < scan->rs_nblocks);
415 heapgetpage(scan, blockno);
416 scan->rs_inited = true;
417 }
418 else
419 {
420 /* continue from previously returned page/tuple */
421 blockno = scan->rs_cblock; /* current page */
422 }
423
424 /*
425 * When not using pagemode, we must lock the buffer during tuple
426 * visibility checks.
427 */
428 if (!pagemode)
429 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
430
431 page = (Page) BufferGetPage(scan->rs_cbuf);
432 all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
433 maxoffset = PageGetMaxOffsetNumber(page);
434
435 for (;;)
436 {
437 OffsetNumber tupoffset;
438 bool finished;
439
440 CHECK_FOR_INTERRUPTS();
441
442 /* Ask the tablesample method which tuples to check on this page. */
443 tupoffset = tsm->NextSampleTuple(scanstate,
444 blockno,
445 maxoffset);
446
447 if (OffsetNumberIsValid(tupoffset))
448 {
449 ItemId itemid;
450 bool visible;
451
452 /* Skip invalid tuple pointers. */
453 itemid = PageGetItemId(page, tupoffset);
454 if (!ItemIdIsNormal(itemid))
455 continue;
456
457 tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid);
458 tuple->t_len = ItemIdGetLength(itemid);
459 ItemPointerSet(&(tuple->t_self), blockno, tupoffset);
460
461 if (all_visible)
462 visible = true;
463 else
464 visible = SampleTupleVisible(tuple, tupoffset, scan);
465
466 /* in pagemode, heapgetpage did this for us */
467 if (!pagemode)
468 CheckForSerializableConflictOut(visible, scan->rs_rd, tuple,
469 scan->rs_cbuf, snapshot);
470
471 if (visible)
472 {
473 /* Found visible tuple, return it. */
474 if (!pagemode)
475 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
476 break;
477 }
478 else
479 {
480 /* Try next tuple from same page. */
481 continue;
482 }
483 }
484
485 /*
486 * if we get here, it means we've exhausted the items on this page and
487 * it's time to move to the next.
488 */
489 if (!pagemode)
490 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
491
492 if (tsm->NextSampleBlock)
493 {
494 blockno = tsm->NextSampleBlock(scanstate);
495 Assert(!scan->rs_syncscan);
496 finished = !BlockNumberIsValid(blockno);
497 }
498 else
499 {
500 /* Without NextSampleBlock, just do a plain forward seqscan. */
501 blockno++;
502 if (blockno >= scan->rs_nblocks)
503 blockno = 0;
504
505 /*
506 * Report our new scan position for synchronization purposes.
507 *
508 * Note: we do this before checking for end of scan so that the
509 * final state of the position hint is back at the start of the
510 * rel. That's not strictly necessary, but otherwise when you run
511 * the same query multiple times the starting position would shift
512 * a little bit backwards on every invocation, which is confusing.
513 * We don't guarantee any specific ordering in general, though.
514 */
515 if (scan->rs_syncscan)
516 ss_report_location(scan->rs_rd, blockno);
517
518 finished = (blockno == scan->rs_startblock);
519 }
520
521 /*
522 * Reached end of scan?
523 */
524 if (finished)
525 {
526 if (BufferIsValid(scan->rs_cbuf))
527 ReleaseBuffer(scan->rs_cbuf);
528 scan->rs_cbuf = InvalidBuffer;
529 scan->rs_cblock = InvalidBlockNumber;
530 tuple->t_data = NULL;
531 scan->rs_inited = false;
532 return NULL;
533 }
534
535 Assert(blockno < scan->rs_nblocks);
536 heapgetpage(scan, blockno);
537
538 /* Re-establish state for new page */
539 if (!pagemode)
540 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
541
542 page = (Page) BufferGetPage(scan->rs_cbuf);
543 all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
544 maxoffset = PageGetMaxOffsetNumber(page);
545 }
546
547 /* Count successfully-fetched tuples as heap fetches */
548 pgstat_count_heap_getnext(scan->rs_rd);
549
550 return &(scan->rs_ctup);
551 }
552
553 /*
554 * Check visibility of the tuple.
555 */
556 static bool
SampleTupleVisible(HeapTuple tuple,OffsetNumber tupoffset,HeapScanDesc scan)557 SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan)
558 {
559 if (scan->rs_pageatatime)
560 {
561 /*
562 * In pageatatime mode, heapgetpage() already did visibility checks,
563 * so just look at the info it left in rs_vistuples[].
564 *
565 * We use a binary search over the known-sorted array. Note: we could
566 * save some effort if we insisted that NextSampleTuple select tuples
567 * in increasing order, but it's not clear that there would be enough
568 * gain to justify the restriction.
569 */
570 int start = 0,
571 end = scan->rs_ntuples - 1;
572
573 while (start <= end)
574 {
575 int mid = (start + end) / 2;
576 OffsetNumber curoffset = scan->rs_vistuples[mid];
577
578 if (tupoffset == curoffset)
579 return true;
580 else if (tupoffset < curoffset)
581 end = mid - 1;
582 else
583 start = mid + 1;
584 }
585
586 return false;
587 }
588 else
589 {
590 /* Otherwise, we have to check the tuple individually. */
591 return HeapTupleSatisfiesVisibility(tuple,
592 scan->rs_snapshot,
593 scan->rs_cbuf);
594 }
595 }
596