/*----------------------------------------------------------------------
 *
 * tableam.c
 *		Table access method routines too big to be inline functions.
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/table/tableam.c
 *
 * NOTES
 *	  Note that most functions in here are documented in tableam.h, rather
 *	  than here.  That's because there are a lot of inline functions in
 *	  tableam.h and it'd be harder to follow if one constantly had to switch
 *	  between files.
 *
 *----------------------------------------------------------------------
 */
#include "postgres.h"

#include <math.h>

#include "access/syncscan.h"
#include "access/tableam.h"
#include "access/xact.h"
#include "optimizer/plancat.h"
#include "port/pg_bitutils.h"
#include "storage/bufmgr.h"
#include "storage/shmem.h"
#include "storage/smgr.h"

/*
 * Constants to control the behavior of block allocation to parallel workers
 * during a parallel seqscan.  Technically these values do not need to be
 * powers of 2, but having them as powers of 2 makes the math simpler and
 * makes the ramp-down stepping more even.
 */

/* The number of I/O chunks we try to break a parallel seqscan down into */
#define PARALLEL_SEQSCAN_NCHUNKS			2048
/* Ramp down size of allocations when we've only this number of chunks left */
#define PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS	64
/* Cap the size of parallel I/O chunks to this number of blocks */
#define PARALLEL_SEQSCAN_MAX_CHUNK_SIZE		8192

/* GUC variables */
char	   *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD;
bool		synchronize_seqscans = true;

/* ----------------------------------------------------------------------------
 * Slot functions.
 * ----------------------------------------------------------------------------
 */

const TupleTableSlotOps *
table_slot_callbacks(Relation relation)
{
	const TupleTableSlotOps *tts_cb;

	if (relation->rd_tableam)
		tts_cb = relation->rd_tableam->slot_callbacks(relation);
	else if (relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
	{
		/*
		 * Historically FDWs expect to store heap tuples in slots. Continue
		 * handing them one, to make it less painful to adapt FDWs to new
		 * versions. The cost of a heap slot over a virtual slot is pretty
		 * small.
		 */
		tts_cb = &TTSOpsHeapTuple;
	}
	else
	{
		/*
		 * These need to be supported, as some parts of the code (like COPY)
		 * need to create slots for such relations too. It seems better to
		 * centralize the knowledge that a heap slot is the right thing in
		 * that case here.
		 */
		Assert(relation->rd_rel->relkind == RELKIND_VIEW ||
			   relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
		tts_cb = &TTSOpsVirtual;
	}

	return tts_cb;
}

TupleTableSlot *
table_slot_create(Relation relation, List **reglist)
{
	const TupleTableSlotOps *tts_cb;
	TupleTableSlot *slot;

	tts_cb = table_slot_callbacks(relation);
	slot = MakeSingleTupleTableSlot(RelationGetDescr(relation), tts_cb);

	if (reglist)
		*reglist = lappend(*reglist, slot);

	return slot;
}
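
/*
 * Illustrative usage (not part of this file): executor code that wants the
 * slot released automatically along with the plan state typically passes a
 * registration list, assuming an "estate" is in scope:
 *
 *		TupleTableSlot *slot = table_slot_create(rel, &estate->es_tupleTable);
 *
 * Callers that pass NULL instead are responsible for calling
 * ExecDropSingleTupleTableSlot(slot) themselves when done.
 */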


/* ----------------------------------------------------------------------------
 * Table scan functions.
 * ----------------------------------------------------------------------------
 */

TableScanDesc
table_beginscan_catalog(Relation relation, int nkeys, struct ScanKeyData *key)
{
	uint32		flags = SO_TYPE_SEQSCAN |
	SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE | SO_TEMP_SNAPSHOT;
	Oid			relid = RelationGetRelid(relation);
	Snapshot	snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));

	return relation->rd_tableam->scan_begin(relation, snapshot, nkeys, key,
											NULL, flags);
}
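
/*
 * Illustrative usage (a hypothetical caller, not part of this file).  The
 * snapshot registered above is marked SO_TEMP_SNAPSHOT, so table_endscan()
 * will unregister it for us:
 *
 *		TableScanDesc scan = table_beginscan_catalog(rel, 0, NULL);
 *		TupleTableSlot *slot = table_slot_create(rel, NULL);
 *
 *		while (table_scan_getnextslot(scan, ForwardScanDirection, slot))
 *		{
 *			... examine the tuple stored in "slot" ...
 *		}
 *
 *		ExecDropSingleTupleTableSlot(slot);
 *		table_endscan(scan);
 */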

void
table_scan_update_snapshot(TableScanDesc scan, Snapshot snapshot)
{
	Assert(IsMVCCSnapshot(snapshot));

	RegisterSnapshot(snapshot);
	scan->rs_snapshot = snapshot;
	scan->rs_flags |= SO_TEMP_SNAPSHOT;
}


/* ----------------------------------------------------------------------------
 * Parallel table scan related functions.
 * ----------------------------------------------------------------------------
 */

Size
table_parallelscan_estimate(Relation rel, Snapshot snapshot)
{
	Size		sz = 0;

	if (IsMVCCSnapshot(snapshot))
		sz = add_size(sz, EstimateSnapshotSpace(snapshot));
	else
		Assert(snapshot == SnapshotAny);

	sz = add_size(sz, rel->rd_tableam->parallelscan_estimate(rel));

	return sz;
}

void
table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan,
							  Snapshot snapshot)
{
	Size		snapshot_off = rel->rd_tableam->parallelscan_initialize(rel, pscan);

	pscan->phs_snapshot_off = snapshot_off;

	if (IsMVCCSnapshot(snapshot))
	{
		SerializeSnapshot(snapshot, (char *) pscan + pscan->phs_snapshot_off);
		pscan->phs_snapshot_any = false;
	}
	else
	{
		Assert(snapshot == SnapshotAny);
		pscan->phs_snapshot_any = true;
	}
}

TableScanDesc
table_beginscan_parallel(Relation relation, ParallelTableScanDesc parallel_scan)
{
	Snapshot	snapshot;
	uint32		flags = SO_TYPE_SEQSCAN |
	SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;

	Assert(RelationGetRelid(relation) == parallel_scan->phs_relid);

	if (!parallel_scan->phs_snapshot_any)
	{
		/* Snapshot was serialized -- restore it */
		snapshot = RestoreSnapshot((char *) parallel_scan +
								   parallel_scan->phs_snapshot_off);
		RegisterSnapshot(snapshot);
		flags |= SO_TEMP_SNAPSHOT;
	}
	else
	{
		/* SnapshotAny passed by caller (not serialized) */
		snapshot = SnapshotAny;
	}

	return relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL,
											parallel_scan, flags);
}
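
/*
 * Illustrative leader/worker flow (a simplified sketch loosely following
 * nodeSeqscan.c; "toc" and "plan_node_id" are assumed to come from the
 * surrounding parallel-context code, not from this file):
 *
 *		(leader, while sizing and filling the DSM segment)
 *		Size	pscan_len = table_parallelscan_estimate(rel, snapshot);
 *		ParallelTableScanDesc pscan = shm_toc_allocate(toc, pscan_len);
 *		table_parallelscan_initialize(rel, pscan, snapshot);
 *		shm_toc_insert(toc, plan_node_id, pscan);
 *		scan = table_beginscan_parallel(rel, pscan);
 *
 *		(each worker)
 *		ParallelTableScanDesc pscan = shm_toc_lookup(toc, plan_node_id, false);
 *		scan = table_beginscan_parallel(rel, pscan);
 */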


/* ----------------------------------------------------------------------------
 * Index scan related functions.
 * ----------------------------------------------------------------------------
 */

/*
 * To perform the visibility check for a TID, simply start an index fetch,
 * create the necessary slot, do the table lookup, and shut everything down
 * again.  This could be optimized, but is unlikely to matter from a
 * performance POV.  If there frequently are live index pointers also
 * matching a unique index key, the CPU overhead of this routine is unlikely
 * to matter.
 *
 * Note that *tid may be modified when we return true if the AM supports
 * storing multiple row versions reachable via a single index entry (like
 * heap's HOT).
 */
bool
table_index_fetch_tuple_check(Relation rel,
							  ItemPointer tid,
							  Snapshot snapshot,
							  bool *all_dead)
{
	IndexFetchTableData *scan;
	TupleTableSlot *slot;
	bool		call_again = false;
	bool		found;

	slot = table_slot_create(rel, NULL);
	scan = table_index_fetch_begin(rel);
	found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again,
									all_dead);
	table_index_fetch_end(scan);
	ExecDropSingleTupleTableSlot(slot);

	return found;
}


/* ------------------------------------------------------------------------
 * Functions for non-modifying operations on individual tuples
 * ------------------------------------------------------------------------
 */

void
table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid)
{
	Relation	rel = scan->rs_rd;
	const TableAmRoutine *tableam = rel->rd_tableam;

	/*
	 * We don't expect direct calls to table_tuple_get_latest_tid with valid
	 * CheckXidAlive for catalog or regular tables.  See detailed comments in
	 * xact.c where these variables are declared.
	 */
	if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
		elog(ERROR, "unexpected table_tuple_get_latest_tid call during logical decoding");

	/*
	 * Since this can be called with user-supplied TID, don't trust the input
	 * too much.
	 */
	if (!tableam->tuple_tid_valid(scan, tid))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("tid (%u, %u) is not valid for relation \"%s\"",
						ItemPointerGetBlockNumberNoCheck(tid),
						ItemPointerGetOffsetNumberNoCheck(tid),
						RelationGetRelationName(rel))));

	tableam->tuple_get_latest_tid(scan, tid);
}


/* ----------------------------------------------------------------------------
 * Functions to make modifications a bit simpler.
 * ----------------------------------------------------------------------------
 */

/*
 * simple_table_tuple_insert - insert a tuple
 *
 * Currently, this routine differs from table_tuple_insert only in supplying a
 * default command ID and not allowing access to the speedup options.
 */
void
simple_table_tuple_insert(Relation rel, TupleTableSlot *slot)
{
	table_tuple_insert(rel, slot, GetCurrentCommandId(true), 0, NULL);
}

/*
 * simple_table_tuple_delete - delete a tuple
 *
 * This routine may be used to delete a tuple when concurrent updates of
 * the target tuple are not expected (for example, because we have a lock
 * on the relation associated with the tuple).  Any failure is reported
 * via ereport().
 */
void
simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot)
{
	TM_Result	result;
	TM_FailureData tmfd;

	result = table_tuple_delete(rel, tid,
								GetCurrentCommandId(true),
								snapshot, InvalidSnapshot,
								true /* wait for commit */ ,
								&tmfd, false /* changingPart */ );

	switch (result)
	{
		case TM_SelfModified:
			/* Tuple was already updated in current command? */
			elog(ERROR, "tuple already updated by self");
			break;

		case TM_Ok:
			/* done successfully */
			break;

		case TM_Updated:
			elog(ERROR, "tuple concurrently updated");
			break;

		case TM_Deleted:
			elog(ERROR, "tuple concurrently deleted");
			break;

		default:
			elog(ERROR, "unrecognized table_tuple_delete status: %u", result);
			break;
	}
}

/*
 * simple_table_tuple_update - replace a tuple
 *
 * This routine may be used to update a tuple when concurrent updates of
 * the target tuple are not expected (for example, because we have a lock
 * on the relation associated with the tuple).  Any failure is reported
 * via ereport().
 */
void
simple_table_tuple_update(Relation rel, ItemPointer otid,
						  TupleTableSlot *slot,
						  Snapshot snapshot,
						  bool *update_indexes)
{
	TM_Result	result;
	TM_FailureData tmfd;
	LockTupleMode lockmode;

	result = table_tuple_update(rel, otid, slot,
								GetCurrentCommandId(true),
								snapshot, InvalidSnapshot,
								true /* wait for commit */ ,
								&tmfd, &lockmode, update_indexes);

	switch (result)
	{
		case TM_SelfModified:
			/* Tuple was already updated in current command? */
			elog(ERROR, "tuple already updated by self");
			break;

		case TM_Ok:
			/* done successfully */
			break;

		case TM_Updated:
			elog(ERROR, "tuple concurrently updated");
			break;

		case TM_Deleted:
			elog(ERROR, "tuple concurrently deleted");
			break;

		default:
			elog(ERROR, "unrecognized table_tuple_update status: %u", result);
			break;
	}
}


/* ----------------------------------------------------------------------------
 * Helper functions to implement parallel scans for block oriented AMs.
 * ----------------------------------------------------------------------------
 */

Size
table_block_parallelscan_estimate(Relation rel)
{
	return sizeof(ParallelBlockTableScanDescData);
}

Size
table_block_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan)
{
	ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;

	bpscan->base.phs_relid = RelationGetRelid(rel);
	bpscan->phs_nblocks = RelationGetNumberOfBlocks(rel);
	/* compare phs_syncscan initialization to similar logic in initscan */
	bpscan->base.phs_syncscan = synchronize_seqscans &&
		!RelationUsesLocalBuffers(rel) &&
		bpscan->phs_nblocks > NBuffers / 4;
	SpinLockInit(&bpscan->phs_mutex);
	bpscan->phs_startblock = InvalidBlockNumber;
	pg_atomic_init_u64(&bpscan->phs_nallocated, 0);

	return sizeof(ParallelBlockTableScanDescData);
}

void
table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
{
	ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;

	pg_atomic_write_u64(&bpscan->phs_nallocated, 0);
}

/*
 * find and set the scan's startblock
 *
 * Determine where the parallel seq scan should start.  This function may be
 * called many times, once by each parallel worker.  We must be careful only
 * to set the startblock once.
 */
void
table_block_parallelscan_startblock_init(Relation rel,
										 ParallelBlockTableScanWorker pbscanwork,
										 ParallelBlockTableScanDesc pbscan)
{
	BlockNumber sync_startpage = InvalidBlockNumber;

	/* Reset the state we use for controlling allocation size. */
	memset(pbscanwork, 0, sizeof(*pbscanwork));

	StaticAssertStmt(MaxBlockNumber <= 0xFFFFFFFE,
					 "pg_nextpower2_32 may be too small for non-standard BlockNumber width");

	/*
	 * We determine the chunk size based on the size of the relation. First we
	 * split the relation into PARALLEL_SEQSCAN_NCHUNKS chunks, but we then
	 * round the chunk size up to the next highest power of 2.  This means we
	 * split the relation into somewhere between PARALLEL_SEQSCAN_NCHUNKS and
	 * PARALLEL_SEQSCAN_NCHUNKS / 2 chunks.
	 */
	pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(pbscan->phs_nblocks /
													   PARALLEL_SEQSCAN_NCHUNKS, 1));

	/*
	 * Ensure we don't go over the maximum chunk size with larger tables. This
	 * means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger
	 * tables.  Too large a chunk size has been shown to be detrimental to
	 * synchronous scan performance.
	 */
	pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size,
									  PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
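
	/*
	 * Illustrative numbers (not from the original source): a 1,000,000 block
	 * relation gives 1,000,000 / 2048 = 488, which pg_nextpower2_32 rounds up
	 * to a 512 block chunk size; a 100,000,000 block relation would yield
	 * 65,536 and be clamped to PARALLEL_SEQSCAN_MAX_CHUNK_SIZE (8192) here.
	 */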

retry:
	/* Grab the spinlock. */
	SpinLockAcquire(&pbscan->phs_mutex);

	/*
	 * If the scan's startblock has not yet been initialized, we must do so
	 * now.  If this is not a synchronized scan, we just start at block 0, but
	 * if it is a synchronized scan, we must get the starting position from
	 * the synchronized scan machinery.  We can't hold the spinlock while
	 * doing that, though, so release the spinlock, get the information we
	 * need, and retry.  If nobody else has initialized the scan in the
	 * meantime, we'll fill in the value we fetched on the second time
	 * through.
	 */
	if (pbscan->phs_startblock == InvalidBlockNumber)
	{
		if (!pbscan->base.phs_syncscan)
			pbscan->phs_startblock = 0;
		else if (sync_startpage != InvalidBlockNumber)
			pbscan->phs_startblock = sync_startpage;
		else
		{
			SpinLockRelease(&pbscan->phs_mutex);
			sync_startpage = ss_get_location(rel, pbscan->phs_nblocks);
			goto retry;
		}
	}
	SpinLockRelease(&pbscan->phs_mutex);
}

/*
 * get the next page to scan
 *
 * Get the next page to scan.  Even if there are no pages left to scan,
 * another backend could have grabbed a page to scan and not yet finished
 * looking at it, so it doesn't follow that the scan is done when the first
 * backend gets an InvalidBlockNumber return.
 */
BlockNumber
table_block_parallelscan_nextpage(Relation rel,
								  ParallelBlockTableScanWorker pbscanwork,
								  ParallelBlockTableScanDesc pbscan)
{
	BlockNumber page;
	uint64		nallocated;

	/*
	 * The logic below allocates block numbers out to parallel workers in a
	 * way that each worker will receive a set of consecutive block numbers to
	 * scan.  Earlier versions of this would allocate the next highest block
	 * number to the next worker to call this function.  This would generally
	 * result in workers never receiving consecutive block numbers.  Some
	 * operating systems would not detect the sequential I/O pattern due to
	 * each backend being a different process which could result in poor
	 * performance due to inefficient or no readahead.  To work around this
	 * issue, we now allocate a range of block numbers for each worker and
	 * when they come back for another block, we give them the next one in
	 * that range until the range is complete.  When the worker completes the
	 * range of blocks we then allocate another range for it and return the
	 * first block number from that range.
	 *
	 * Here we name these ranges of blocks "chunks".  The initial size of
	 * these chunks is determined in table_block_parallelscan_startblock_init
	 * based on the size of the relation.  Towards the end of the scan, we
	 * start making reductions in the size of the chunks in order to attempt
	 * to divide the remaining work over all the workers as evenly as
	 * possible.
	 *
	 * Here pbscanwork is local worker memory.  phsw_chunk_remaining tracks
	 * the number of blocks remaining in the chunk.  When that reaches 0 then
	 * we must allocate a new chunk for the worker.
	 *
	 * phs_nallocated tracks how many blocks have been allocated to workers
	 * already.  When phs_nallocated >= rs_nblocks, all blocks have been
	 * allocated.
	 *
	 * Because we use an atomic fetch-and-add to fetch the current value, the
	 * phs_nallocated counter will exceed rs_nblocks: workers keep
	 * incrementing it when they try to allocate the next block even though
	 * all blocks have already been allocated.  Because of that, the counter
	 * must be 64 bits wide to avoid wrapping around when rs_nblocks is close
	 * to 2^32.
	 *
	 * The actual block to return is calculated by adding the counter to the
	 * starting block number, modulo nblocks.
	 */

	/*
	 * First check if we have any remaining blocks in a previous chunk for
	 * this worker.  We must consume all of the blocks from that before we
	 * allocate a new chunk to the worker.
	 */
	if (pbscanwork->phsw_chunk_remaining > 0)
	{
		/*
		 * Give them the next block in the range and update the remaining
		 * number of blocks.
		 */
		nallocated = ++pbscanwork->phsw_nallocated;
		pbscanwork->phsw_chunk_remaining--;
	}
	else
	{
		/*
		 * When we've only got PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks
		 * remaining in the scan, we halve the chunk size.  Since we reduce
		 * the chunk size here, we'll hit this again after doing another
		 * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks at the new size.  After a
		 * few iterations of this, we'll end up doing the last few blocks
		 * with the chunk size set to 1.
		 */
		if (pbscanwork->phsw_chunk_size > 1 &&
			pbscanwork->phsw_nallocated > pbscan->phs_nblocks -
			(pbscanwork->phsw_chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS))
			pbscanwork->phsw_chunk_size >>= 1;
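
		/*
		 * For instance (illustrative numbers only): with an 8192 block chunk
		 * size, the ramp-down starts once roughly 8192 * 64 = 524,288 blocks
		 * remain unallocated, and the chunk size keeps halving until the
		 * final blocks are handed out one at a time.
		 */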

		nallocated = pbscanwork->phsw_nallocated =
			pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,
									pbscanwork->phsw_chunk_size);

		/*
		 * Set the remaining number of blocks in this chunk so that subsequent
		 * calls from this worker continue on with this chunk until it's done.
		 */
		pbscanwork->phsw_chunk_remaining = pbscanwork->phsw_chunk_size - 1;
	}

	if (nallocated >= pbscan->phs_nblocks)
		page = InvalidBlockNumber;	/* all blocks have been allocated */
	else
		page = (nallocated + pbscan->phs_startblock) % pbscan->phs_nblocks;
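
	/*
	 * For example (illustrative numbers only): with phs_startblock = 70 and
	 * phs_nblocks = 100, allocation counters 0..99 map to blocks 70..99
	 * followed by 0..69, i.e. the wrap-around order of a synchronized scan.
	 */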

	/*
	 * Report scan location.  Normally, we report the current page number.
	 * When we reach the end of the scan, though, we report the starting page,
	 * not the ending page, just so the starting positions for later scans
	 * don't slew backwards.  We only report the position at the end of the
	 * scan once: subsequent callers will report nothing.
	 */
	if (pbscan->base.phs_syncscan)
	{
		if (page != InvalidBlockNumber)
			ss_report_location(rel, page);
		else if (nallocated == pbscan->phs_nblocks)
			ss_report_location(rel, pbscan->phs_startblock);
	}

	return page;
}

/* ----------------------------------------------------------------------------
 * Helper functions to implement relation sizing for block oriented AMs.
 * ----------------------------------------------------------------------------
 */

/*
 * table_block_relation_size
 *
 * If a table AM uses the various relation forks as the sole place where data
 * is stored, and if it uses them in the expected manner (e.g. the actual data
 * is in the main fork rather than some other), it can use this implementation
 * of the relation_size callback rather than implementing its own.
 */
uint64
table_block_relation_size(Relation rel, ForkNumber forkNumber)
{
	uint64		nblocks = 0;

	/* Open it at the smgr level if not already done */
	RelationOpenSmgr(rel);

	/* InvalidForkNumber indicates returning the size for all forks */
	if (forkNumber == InvalidForkNumber)
	{
		for (int i = 0; i < MAX_FORKNUM; i++)
			nblocks += smgrnblocks(rel->rd_smgr, i);
	}
	else
		nblocks = smgrnblocks(rel->rd_smgr, forkNumber);

	return nblocks * BLCKSZ;
}
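
/*
 * For example, a block-based AM can simply point its relation_size callback
 * at this function in its TableAmRoutine (heap does exactly that in
 * heapam_handler.c):
 *
 *		.relation_size = table_block_relation_size,
 */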

/*
 * table_block_relation_estimate_size
 *
 * This function can't be directly used as the implementation of the
 * relation_estimate_size callback, because it has a few additional parameters.
 * Instead, it is intended to be used as a helper function; the caller can
 * pass through the arguments to its relation_estimate_size function plus the
 * additional values required here.
 *
 * overhead_bytes_per_tuple should contain the approximate number of bytes
 * of storage required to store a tuple above and beyond what is required for
 * the tuple data proper. Typically, this would include things like the
 * size of the tuple header and item pointer. This is only used for query
 * planning, so a table AM where the value is not constant could choose to
 * pass a "best guess".
 *
 * usable_bytes_per_page should contain the approximate number of bytes per
 * page usable for tuple data, excluding the page header and any anticipated
 * special space.
 */
void
table_block_relation_estimate_size(Relation rel, int32 *attr_widths,
								   BlockNumber *pages, double *tuples,
								   double *allvisfrac,
								   Size overhead_bytes_per_tuple,
								   Size usable_bytes_per_page)
{
	BlockNumber curpages;
	BlockNumber relpages;
	double		reltuples;
	BlockNumber relallvisible;
	double		density;

	/* it should have storage, so we can call the smgr */
	curpages = RelationGetNumberOfBlocks(rel);

	/* coerce values in pg_class to more desirable types */
	relpages = (BlockNumber) rel->rd_rel->relpages;
	reltuples = (double) rel->rd_rel->reltuples;
	relallvisible = (BlockNumber) rel->rd_rel->relallvisible;

	/*
	 * HACK: if the relation has never yet been vacuumed, use a minimum size
	 * estimate of 10 pages.  The idea here is to avoid assuming a
	 * newly-created table is really small, even if it currently is, because
	 * that may not be true once some data gets loaded into it.  Once a vacuum
	 * or analyze cycle has been done on it, it's more reasonable to believe
	 * the size is somewhat stable.
	 *
	 * (Note that this is only an issue if the plan gets cached and used again
	 * after the table has been filled.  What we're trying to avoid is using a
	 * nestloop-type plan on a table that has grown substantially since the
	 * plan was made.  Normally, autovacuum/autoanalyze will occur once enough
	 * inserts have happened and cause cached-plan invalidation; but that
	 * doesn't happen instantaneously, and it won't happen at all for cases
	 * such as temporary tables.)
	 *
	 * We test "never vacuumed" by seeing whether reltuples < 0.
	 *
	 * If the table has inheritance children, we don't apply this heuristic.
	 * Totally empty parent tables are quite common, so we should be willing
	 * to believe that they are empty.
	 */
	if (curpages < 10 &&
		reltuples < 0 &&
		!rel->rd_rel->relhassubclass)
		curpages = 10;

	/* report estimated # pages */
	*pages = curpages;
	/* quick exit if rel is clearly empty */
	if (curpages == 0)
	{
		*tuples = 0;
		*allvisfrac = 0;
		return;
	}

	/* estimate number of tuples from previous tuple density */
	if (reltuples >= 0 && relpages > 0)
		density = reltuples / (double) relpages;
	else
	{
		/*
		 * When we have no data because the relation was never yet vacuumed,
		 * estimate tuple width from attribute datatypes.  We assume here that
		 * the pages are completely full, which is OK for tables but is
		 * probably an overestimate for indexes.  Fortunately
		 * get_relation_info() can clamp the overestimate to the parent
		 * table's size.
		 *
		 * Note: this code intentionally disregards alignment considerations,
		 * because (a) that would be gilding the lily considering how crude
		 * the estimate is, (b) it creates platform dependencies in the
		 * default plans which are kind of a headache for regression testing,
		 * and (c) different table AMs might use different padding schemes.
		 */
		int32		tuple_width;

		tuple_width = get_rel_data_width(rel, attr_widths);
		tuple_width += overhead_bytes_per_tuple;
		/* note: integer division is intentional here */
		density = usable_bytes_per_page / tuple_width;
	}
	*tuples = rint(density * (double) curpages);
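
	/*
	 * Illustrative numbers (assuming heap-like parameters, roughly 8168
	 * usable bytes per page and 28 bytes of per-tuple overhead): a 32 byte
	 * data width gives density = 8168 / 60 = 136 tuples per page, so a
	 * 1000 page relation would be estimated to hold 136,000 tuples.
	 */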

	/*
	 * We use relallvisible as-is, rather than scaling it up like we do for
	 * the pages and tuples counts, on the theory that any pages added since
	 * the last VACUUM are most likely not marked all-visible.  But costsize.c
	 * wants it converted to a fraction.
	 */
	if (relallvisible == 0 || curpages <= 0)
		*allvisfrac = 0;
	else if ((double) relallvisible >= curpages)
		*allvisfrac = 1;
	else
		*allvisfrac = (double) relallvisible / curpages;
}