/*----------------------------------------------------------------------
 *
 * tableam.c
 *		Table access method routines too big to be inline functions.
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *		src/backend/access/table/tableam.c
 *
 * NOTES
 *	  Note that most functions in here are documented in tableam.h, rather
 *	  than here. That's because there are a lot of inline functions in
 *	  tableam.h and it'd be harder to understand if one constantly had to
 *	  switch between files.
 *
 *----------------------------------------------------------------------
 */
#include "postgres.h"

#include <math.h>

#include "access/syncscan.h"
#include "access/tableam.h"
#include "access/xact.h"
#include "optimizer/plancat.h"
#include "port/pg_bitutils.h"
#include "storage/bufmgr.h"
#include "storage/shmem.h"
#include "storage/smgr.h"

/*
 * Constants to control the behavior of block allocation to parallel workers
 * during a parallel seqscan. Technically these values do not need to be
 * powers of 2, but having them as powers of 2 keeps the math cheap and makes
 * the ramp-down stepping more even.
 */

/* The number of I/O chunks we try to break a parallel seqscan down into */
#define PARALLEL_SEQSCAN_NCHUNKS			2048
/* Ramp down size of allocations when we've only this number of chunks left */
#define PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS	64
/* Cap the size of parallel I/O chunks to this number of blocks */
#define PARALLEL_SEQSCAN_MAX_CHUNK_SIZE		8192

/* GUC variables */
char	   *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD;
bool		synchronize_seqscans = true;


/* ----------------------------------------------------------------------------
 * Slot functions.
 * ----------------------------------------------------------------------------
 */

const TupleTableSlotOps *
table_slot_callbacks(Relation relation)
{
	const TupleTableSlotOps *tts_cb;

	if (relation->rd_tableam)
		tts_cb = relation->rd_tableam->slot_callbacks(relation);
	else if (relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
	{
		/*
		 * Historically FDWs expect to store heap tuples in slots. Continue
		 * handing them one, to make it less painful to adapt FDWs to new
		 * versions. The cost of a heap slot over a virtual slot is pretty
		 * small.
		 */
		tts_cb = &TTSOpsHeapTuple;
	}
	else
	{
		/*
		 * These need to be supported, as some parts of the code (like COPY)
		 * need to create slots for such relations too. It seems better to
		 * centralize the knowledge that a heap slot is the right thing in
		 * that case here.
		 */
		Assert(relation->rd_rel->relkind == RELKIND_VIEW ||
			   relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
		tts_cb = &TTSOpsVirtual;
	}

	return tts_cb;
}

TupleTableSlot *
table_slot_create(Relation relation, List **reglist)
{
	const TupleTableSlotOps *tts_cb;
	TupleTableSlot *slot;

	tts_cb = table_slot_callbacks(relation);
	slot = MakeSingleTupleTableSlot(RelationGetDescr(relation), tts_cb);

	if (reglist)
		*reglist = lappend(*reglist, slot);

	return slot;
}
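
/*
 * Example (editor's addition): a caller that manages the slot itself passes
 * NULL for reglist and is then responsible for dropping the slot when done.
 *
 *		TupleTableSlot *slot = table_slot_create(rel, NULL);
 *
 *		... store a tuple in the slot and use it ...
 *
 *		ExecDropSingleTupleTableSlot(slot);
 */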


/* ----------------------------------------------------------------------------
 * Table scan functions.
 * ----------------------------------------------------------------------------
 */

TableScanDesc
table_beginscan_catalog(Relation relation, int nkeys, struct ScanKeyData *key)
{
	uint32		flags = SO_TYPE_SEQSCAN |
	SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE | SO_TEMP_SNAPSHOT;
	Oid			relid = RelationGetRelid(relation);
	Snapshot	snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));

	return relation->rd_tableam->scan_begin(relation, snapshot, nkeys, key,
											NULL, flags);
}

void
table_scan_update_snapshot(TableScanDesc scan, Snapshot snapshot)
{
	Assert(IsMVCCSnapshot(snapshot));

	RegisterSnapshot(snapshot);
	scan->rs_snapshot = snapshot;
	scan->rs_flags |= SO_TEMP_SNAPSHOT;
}


/* ----------------------------------------------------------------------------
 * Parallel table scan related functions.
 * ----------------------------------------------------------------------------
 */

Size
table_parallelscan_estimate(Relation rel, Snapshot snapshot)
{
	Size		sz = 0;

	if (IsMVCCSnapshot(snapshot))
		sz = add_size(sz, EstimateSnapshotSpace(snapshot));
	else
		Assert(snapshot == SnapshotAny);

	sz = add_size(sz, rel->rd_tableam->parallelscan_estimate(rel));

	return sz;
}

void
table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan,
							  Snapshot snapshot)
{
	Size		snapshot_off = rel->rd_tableam->parallelscan_initialize(rel, pscan);

	pscan->phs_snapshot_off = snapshot_off;

	if (IsMVCCSnapshot(snapshot))
	{
		SerializeSnapshot(snapshot, (char *) pscan + pscan->phs_snapshot_off);
		pscan->phs_snapshot_any = false;
	}
	else
	{
		Assert(snapshot == SnapshotAny);
		pscan->phs_snapshot_any = true;
	}
}

TableScanDesc
table_beginscan_parallel(Relation relation, ParallelTableScanDesc parallel_scan)
{
	Snapshot	snapshot;
	uint32		flags = SO_TYPE_SEQSCAN |
	SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;

	Assert(RelationGetRelid(relation) == parallel_scan->phs_relid);

	if (!parallel_scan->phs_snapshot_any)
	{
		/* Snapshot was serialized -- restore it */
		snapshot = RestoreSnapshot((char *) parallel_scan +
								   parallel_scan->phs_snapshot_off);
		RegisterSnapshot(snapshot);
		flags |= SO_TEMP_SNAPSHOT;
	}
	else
	{
		/* SnapshotAny passed by caller (not serialized) */
		snapshot = SnapshotAny;
	}

	return relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL,
											parallel_scan, flags);
}
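
/*
 * Usage sketch (editor's addition; the shm_toc placement is an assumption,
 * callers may put the descriptor in any shared memory): the leader sizes and
 * fills in the shared scan state, then every participant, leader included,
 * attaches to it with table_beginscan_parallel().
 *
 *		Leader:
 *			Size		sz = table_parallelscan_estimate(rel, snapshot);
 *			ParallelTableScanDesc pscan = shm_toc_allocate(toc, sz);
 *
 *			table_parallelscan_initialize(rel, pscan, snapshot);
 *
 *		Each participant:
 *			TableScanDesc scan = table_beginscan_parallel(rel, pscan);
 *
 *			while (table_scan_getnextslot(scan, ForwardScanDirection, slot))
 *				... process the tuple in slot ...
 *			table_endscan(scan);
 */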


/* ----------------------------------------------------------------------------
 * Index scan related functions.
 * ----------------------------------------------------------------------------
 */

/*
 * To perform the visibility check documented in tableam.h, simply start an
 * index scan, create the necessary slot, do the heap lookup, and shut
 * everything down again. This could be optimized, but is unlikely to matter
 * from a performance POV. If there frequently are live index pointers also
 * matching a unique index key, the CPU overhead of this routine is unlikely
 * to matter.
 *
 * Note that *tid may be modified when we return true if the AM supports
 * storing multiple row versions reachable via a single index entry (like
 * heap's HOT).
 */
bool
table_index_fetch_tuple_check(Relation rel,
							  ItemPointer tid,
							  Snapshot snapshot,
							  bool *all_dead)
{
	IndexFetchTableData *scan;
	TupleTableSlot *slot;
	bool		call_again = false;
	bool		found;

	slot = table_slot_create(rel, NULL);
	scan = table_index_fetch_begin(rel);
	found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again,
									all_dead);
	table_index_fetch_end(scan);
	ExecDropSingleTupleTableSlot(slot);

	return found;
}


/* ------------------------------------------------------------------------
 * Functions for non-modifying operations on individual tuples
 * ------------------------------------------------------------------------
 */

void
table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid)
{
	Relation	rel = scan->rs_rd;
	const TableAmRoutine *tableam = rel->rd_tableam;

	/*
	 * We don't expect direct calls to table_tuple_get_latest_tid with valid
	 * CheckXidAlive for catalog or regular tables. See detailed comments in
	 * xact.c where these variables are declared.
	 */
	if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
		elog(ERROR, "unexpected table_tuple_get_latest_tid call during logical decoding");

	/*
	 * Since this can be called with user-supplied TID, don't trust the input
	 * too much.
	 */
	if (!tableam->tuple_tid_valid(scan, tid))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("tid (%u, %u) is not valid for relation \"%s\"",
						ItemPointerGetBlockNumberNoCheck(tid),
						ItemPointerGetOffsetNumberNoCheck(tid),
						RelationGetRelationName(rel))));

	tableam->tuple_get_latest_tid(scan, tid);
}


/* ----------------------------------------------------------------------------
 * Functions to make modifications a bit simpler.
 * ----------------------------------------------------------------------------
 */

/*
 * simple_table_tuple_insert - insert a tuple
 *
 * Currently, this routine differs from table_tuple_insert only in supplying a
 * default command ID and not allowing access to the speedup options.
 */
void
simple_table_tuple_insert(Relation rel, TupleTableSlot *slot)
{
	table_tuple_insert(rel, slot, GetCurrentCommandId(true), 0, NULL);
}

/*
 * simple_table_tuple_delete - delete a tuple
 *
 * This routine may be used to delete a tuple when concurrent updates of
 * the target tuple are not expected (for example, because we have a lock
 * on the relation associated with the tuple). Any failure is reported
 * via ereport().
 */
void
simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot)
{
	TM_Result	result;
	TM_FailureData tmfd;

	result = table_tuple_delete(rel, tid,
								GetCurrentCommandId(true),
								snapshot, InvalidSnapshot,
								true /* wait for commit */ ,
								&tmfd, false /* changingPart */ );

	switch (result)
	{
		case TM_SelfModified:
			/* Tuple was already updated in current command? */
			elog(ERROR, "tuple already updated by self");
			break;

		case TM_Ok:
			/* done successfully */
			break;

		case TM_Updated:
			elog(ERROR, "tuple concurrently updated");
			break;

		case TM_Deleted:
			elog(ERROR, "tuple concurrently deleted");
			break;

		default:
			elog(ERROR, "unrecognized table_tuple_delete status: %u", result);
			break;
	}
}
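
/*
 * Example (editor's addition): deleting a tuple with a known TID when no
 * concurrent updates are expected, using the active snapshot.
 *
 *		simple_table_tuple_delete(rel, &tid, GetActiveSnapshot());
 */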

/*
 * simple_table_tuple_update - replace a tuple
 *
 * This routine may be used to update a tuple when concurrent updates of
 * the target tuple are not expected (for example, because we have a lock
 * on the relation associated with the tuple). Any failure is reported
 * via ereport().
 */
void
simple_table_tuple_update(Relation rel, ItemPointer otid,
						  TupleTableSlot *slot,
						  Snapshot snapshot,
						  bool *update_indexes)
{
	TM_Result	result;
	TM_FailureData tmfd;
	LockTupleMode lockmode;

	result = table_tuple_update(rel, otid, slot,
								GetCurrentCommandId(true),
								snapshot, InvalidSnapshot,
								true /* wait for commit */ ,
								&tmfd, &lockmode, update_indexes);

	switch (result)
	{
		case TM_SelfModified:
			/* Tuple was already updated in current command? */
			elog(ERROR, "tuple already updated by self");
			break;

		case TM_Ok:
			/* done successfully */
			break;

		case TM_Updated:
			elog(ERROR, "tuple concurrently updated");
			break;

		case TM_Deleted:
			elog(ERROR, "tuple concurrently deleted");
			break;

		default:
			elog(ERROR, "unrecognized table_tuple_update status: %u", result);
			break;
	}
}


/* ----------------------------------------------------------------------------
 * Helper functions to implement parallel scans for block oriented AMs.
 * ----------------------------------------------------------------------------
 */

Size
table_block_parallelscan_estimate(Relation rel)
{
	return sizeof(ParallelBlockTableScanDescData);
}

Size
table_block_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan)
{
	ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;

	bpscan->base.phs_relid = RelationGetRelid(rel);
	bpscan->phs_nblocks = RelationGetNumberOfBlocks(rel);
	/* compare phs_syncscan initialization to similar logic in initscan */
	bpscan->base.phs_syncscan = synchronize_seqscans &&
		!RelationUsesLocalBuffers(rel) &&
		bpscan->phs_nblocks > NBuffers / 4;
	SpinLockInit(&bpscan->phs_mutex);
	bpscan->phs_startblock = InvalidBlockNumber;
	pg_atomic_init_u64(&bpscan->phs_nallocated, 0);

	return sizeof(ParallelBlockTableScanDescData);
}

void
table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
{
	ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;

	pg_atomic_write_u64(&bpscan->phs_nallocated, 0);
}

/*
 * find and set the scan's startblock
 *
 * Determine where the parallel seq scan should start. This function may be
 * called many times, once by each parallel worker. We must be careful only
 * to set the startblock once.
 */
void
table_block_parallelscan_startblock_init(Relation rel,
										 ParallelBlockTableScanWorker pbscanwork,
										 ParallelBlockTableScanDesc pbscan)
{
	BlockNumber sync_startpage = InvalidBlockNumber;

	/* Reset the state we use for controlling allocation size. */
	memset(pbscanwork, 0, sizeof(*pbscanwork));

	StaticAssertStmt(MaxBlockNumber <= 0xFFFFFFFE,
					 "pg_nextpower2_32 may be too small for non-standard BlockNumber width");

	/*
	 * We determine the chunk size based on the size of the relation. First
	 * we split the relation into PARALLEL_SEQSCAN_NCHUNKS chunks, but we
	 * then round the chunk size up to the next highest power of 2. This
	 * means we split the relation into somewhere between
	 * PARALLEL_SEQSCAN_NCHUNKS and PARALLEL_SEQSCAN_NCHUNKS / 2 chunks.
	 */
	pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(pbscan->phs_nblocks /
													   PARALLEL_SEQSCAN_NCHUNKS, 1));
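
	/*
	 * Worked example (editor's addition): for a 1,000,000 block relation,
	 * 1000000 / 2048 = 488 (integer division) and pg_nextpower2_32(488) =
	 * 512, so the relation is split into roughly 1953 chunks -- between
	 * PARALLEL_SEQSCAN_NCHUNKS / 2 and PARALLEL_SEQSCAN_NCHUNKS.
	 */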

	/*
	 * Ensure we don't go over the maximum chunk size with larger tables. This
	 * means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger
	 * tables. Too large a chunk size has been shown to be detrimental to
	 * synchronous scan performance.
	 */
	pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size,
									  PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);

retry:
	/* Grab the spinlock. */
	SpinLockAcquire(&pbscan->phs_mutex);

	/*
	 * If the scan's startblock has not yet been initialized, we must do so
	 * now. If this is not a synchronized scan, we just start at block 0, but
	 * if it is a synchronized scan, we must get the starting position from
	 * the synchronized scan machinery. We can't hold the spinlock while
	 * doing that, though, so release the spinlock, get the information we
	 * need, and retry. If nobody else has initialized the scan in the
	 * meantime, we'll fill in the value we fetched on the second time
	 * through.
	 */
	if (pbscan->phs_startblock == InvalidBlockNumber)
	{
		if (!pbscan->base.phs_syncscan)
			pbscan->phs_startblock = 0;
		else if (sync_startpage != InvalidBlockNumber)
			pbscan->phs_startblock = sync_startpage;
		else
		{
			SpinLockRelease(&pbscan->phs_mutex);
			sync_startpage = ss_get_location(rel, pbscan->phs_nblocks);
			goto retry;
		}
	}
	SpinLockRelease(&pbscan->phs_mutex);
}

/*
 * get the next page to scan
 *
 * Get the next page to scan. Even if there are no pages left to scan,
 * another backend could have grabbed a page to scan and not yet finished
 * looking at it, so it doesn't follow that the scan is done when the first
 * backend gets an InvalidBlockNumber return.
 */
BlockNumber
table_block_parallelscan_nextpage(Relation rel,
								  ParallelBlockTableScanWorker pbscanwork,
								  ParallelBlockTableScanDesc pbscan)
{
	BlockNumber page;
	uint64		nallocated;

	/*
	 * The logic below allocates block numbers out to parallel workers in a
	 * way that each worker will receive a set of consecutive block numbers
	 * to scan. Earlier versions of this would allocate the next highest
	 * block number to the next worker to call this function. This would
	 * generally result in workers never receiving consecutive block numbers.
	 * Some operating systems would not detect the sequential I/O pattern due
	 * to each backend being a different process, which could result in poor
	 * performance due to inefficient or no readahead. To work around this
	 * issue, we now allocate a range of block numbers for each worker and
	 * when they come back for another block, we give them the next one in
	 * that range until the range is complete. When the worker completes the
	 * range of blocks we then allocate another range for it and return the
	 * first block number from that range.
	 *
	 * Here we name these ranges of blocks "chunks". The initial size of
	 * these chunks is determined in table_block_parallelscan_startblock_init
	 * based on the size of the relation. Towards the end of the scan, we
	 * start making reductions in the size of the chunks in order to attempt
	 * to divide the remaining work over all the workers as evenly as
	 * possible.
	 *
	 * Here pbscanwork is local worker memory. phsw_chunk_remaining tracks
	 * the number of blocks remaining in the chunk. When that reaches 0 then
	 * we must allocate a new chunk for the worker.
	 *
	 * phs_nallocated tracks how many blocks have been allocated to workers
	 * already. When phs_nallocated >= rs_nblocks, all blocks have been
	 * allocated.
	 *
	 * Because we use an atomic fetch-and-add to fetch the current value, the
	 * phs_nallocated counter will exceed rs_nblocks: workers still increment
	 * the value when they try to allocate the next block even though all
	 * blocks have already been allocated. For that reason the counter must
	 * be 64 bits wide, to avoid wrapping around when rs_nblocks is close to
	 * 2^32.
	 *
	 * The actual block to return is calculated by adding the counter to the
	 * starting block number, modulo nblocks.
	 */

	/*
	 * First check if we have any remaining blocks in a previous chunk for
	 * this worker. We must consume all of the blocks from that before we
	 * allocate a new chunk to the worker.
	 */
	if (pbscanwork->phsw_chunk_remaining > 0)
	{
		/*
		 * Give them the next block in the range and update the remaining
		 * number of blocks.
		 */
		nallocated = ++pbscanwork->phsw_nallocated;
		pbscanwork->phsw_chunk_remaining--;
	}
	else
	{
		/*
		 * When we've only got PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks
		 * remaining in the scan, we halve the chunk size. Since we reduce
		 * the chunk size here, we'll hit this again after doing
		 * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS at the new size. After a few
		 * iterations of this, we'll end up doing the last few blocks with
		 * the chunk size set to 1.
		 */
		if (pbscanwork->phsw_chunk_size > 1 &&
			pbscanwork->phsw_nallocated > pbscan->phs_nblocks -
			(pbscanwork->phsw_chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS))
			pbscanwork->phsw_chunk_size >>= 1;
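
		/*
		 * Worked example (editor's addition): with phs_nblocks = 65536 the
		 * initial chunk size is pg_nextpower2_32(65536 / 2048) = 32. Once
		 * phsw_nallocated exceeds 65536 - 32 * 64 = 63488, the chunk size
		 * halves to 16; roughly 16 * 64 = 1024 blocks later it halves again,
		 * and so on until the final blocks are handed out one at a time.
		 */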

		nallocated = pbscanwork->phsw_nallocated =
			pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,
									pbscanwork->phsw_chunk_size);

		/*
		 * Set the remaining number of blocks in this chunk so that subsequent
		 * calls from this worker continue on with this chunk until it's done.
		 */
		pbscanwork->phsw_chunk_remaining = pbscanwork->phsw_chunk_size - 1;
	}

	if (nallocated >= pbscan->phs_nblocks)
		page = InvalidBlockNumber;	/* all blocks have been allocated */
	else
		page = (nallocated + pbscan->phs_startblock) % pbscan->phs_nblocks;
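
	/*
	 * Example (editor's addition): with phs_startblock = 900 and
	 * phs_nblocks = 1000, counter value 150 maps to block (150 + 900) %
	 * 1000 = 50, i.e. a synchronized scan that started late in the relation
	 * wraps around to the beginning.
	 */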

	/*
	 * Report scan location. Normally, we report the current page number.
	 * When we reach the end of the scan, though, we report the starting page,
	 * not the ending page, just so the starting positions for later scans
	 * don't slew backwards. We only report the position at the end of the
	 * scan once, though: subsequent callers will report nothing.
	 */
	if (pbscan->base.phs_syncscan)
	{
		if (page != InvalidBlockNumber)
			ss_report_location(rel, page);
		else if (nallocated == pbscan->phs_nblocks)
			ss_report_location(rel, pbscan->phs_startblock);
	}

	return page;
}
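
/*
 * Caller sketch (editor's addition; the loop shape is an assumption modeled
 * on how block-based AMs drive this API): each worker resets its local
 * allocator state once, then pulls blocks until the shared counter is
 * exhausted.
 *
 *		table_block_parallelscan_startblock_init(rel, pbscanwork, pbscan);
 *		for (;;)
 *		{
 *			BlockNumber blkno;
 *
 *			blkno = table_block_parallelscan_nextpage(rel, pbscanwork,
 *													  pbscan);
 *			if (blkno == InvalidBlockNumber)
 *				break;
 *			... read block blkno and scan its tuples ...
 *		}
 */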

/* ----------------------------------------------------------------------------
 * Helper functions to implement relation sizing for block oriented AMs.
 * ----------------------------------------------------------------------------
 */

/*
 * table_block_relation_size
 *
 * If a table AM uses the various relation forks as the sole place where data
 * is stored, and if it uses them in the expected manner (e.g. the actual data
 * is in the main fork rather than some other), it can use this implementation
 * of the relation_size callback rather than implementing its own.
 */
uint64
table_block_relation_size(Relation rel, ForkNumber forkNumber)
{
	uint64		nblocks = 0;

	/* Open it at the smgr level if not already done */
	RelationOpenSmgr(rel);

	/* InvalidForkNumber indicates returning the size for all forks */
	if (forkNumber == InvalidForkNumber)
	{
		for (int i = 0; i < MAX_FORKNUM; i++)
			nblocks += smgrnblocks(rel->rd_smgr, i);
	}
	else
		nblocks = smgrnblocks(rel->rd_smgr, forkNumber);

	return nblocks * BLCKSZ;
}

/*
 * table_block_relation_estimate_size
 *
 * This function can't be directly used as the implementation of the
 * relation_estimate_size callback, because it has a few additional parameters.
 * Instead, it is intended to be used as a helper function; the caller can
 * pass through the arguments to its relation_estimate_size function plus the
 * additional values required here.
 *
 * overhead_bytes_per_tuple should contain the approximate number of bytes
 * of storage required to store a tuple above and beyond what is required for
 * the tuple data proper. Typically, this would include things like the
 * size of the tuple header and item pointer. This is only used for query
 * planning, so a table AM where the value is not constant could choose to
 * pass a "best guess".
 *
 * usable_bytes_per_page should contain the approximate number of bytes per
 * page usable for tuple data, excluding the page header and any anticipated
 * special space.
 */
void
table_block_relation_estimate_size(Relation rel, int32 *attr_widths,
								   BlockNumber *pages, double *tuples,
								   double *allvisfrac,
								   Size overhead_bytes_per_tuple,
								   Size usable_bytes_per_page)
{
	BlockNumber curpages;
	BlockNumber relpages;
	double		reltuples;
	BlockNumber relallvisible;
	double		density;

	/* it should have storage, so we can call the smgr */
	curpages = RelationGetNumberOfBlocks(rel);

	/* coerce values in pg_class to more desirable types */
	relpages = (BlockNumber) rel->rd_rel->relpages;
	reltuples = (double) rel->rd_rel->reltuples;
	relallvisible = (BlockNumber) rel->rd_rel->relallvisible;

	/*
	 * HACK: if the relation has never yet been vacuumed, use a minimum size
	 * estimate of 10 pages. The idea here is to avoid assuming a
	 * newly-created table is really small, even if it currently is, because
	 * that may not be true once some data gets loaded into it. Once a vacuum
	 * or analyze cycle has been done on it, it's more reasonable to believe
	 * the size is somewhat stable.
	 *
	 * (Note that this is only an issue if the plan gets cached and used again
	 * after the table has been filled. What we're trying to avoid is using a
	 * nestloop-type plan on a table that has grown substantially since the
	 * plan was made. Normally, autovacuum/autoanalyze will occur once enough
	 * inserts have happened and cause cached-plan invalidation; but that
	 * doesn't happen instantaneously, and it won't happen at all for cases
	 * such as temporary tables.)
	 *
	 * We test "never vacuumed" by seeing whether reltuples < 0.
	 *
	 * If the table has inheritance children, we don't apply this heuristic.
	 * Totally empty parent tables are quite common, so we should be willing
	 * to believe that they are empty.
	 */
	if (curpages < 10 &&
		reltuples < 0 &&
		!rel->rd_rel->relhassubclass)
		curpages = 10;

	/* report estimated # pages */
	*pages = curpages;
	/* quick exit if rel is clearly empty */
	if (curpages == 0)
	{
		*tuples = 0;
		*allvisfrac = 0;
		return;
	}

	/* estimate number of tuples from previous tuple density */
	if (reltuples >= 0 && relpages > 0)
		density = reltuples / (double) relpages;
	else
	{
		/*
		 * When we have no data because the relation was never yet vacuumed,
		 * estimate tuple width from attribute datatypes. We assume here that
		 * the pages are completely full, which is OK for tables but is
		 * probably an overestimate for indexes. Fortunately
		 * get_relation_info() can clamp the overestimate to the parent
		 * table's size.
		 *
		 * Note: this code intentionally disregards alignment considerations,
		 * because (a) that would be gilding the lily considering how crude
		 * the estimate is, (b) it creates platform dependencies in the
		 * default plans which are kind of a headache for regression testing,
		 * and (c) different table AMs might use different padding schemes.
		 */
		int32		tuple_width;

		tuple_width = get_rel_data_width(rel, attr_widths);
		tuple_width += overhead_bytes_per_tuple;
		/* note: integer division is intentional here */
		density = usable_bytes_per_page / tuple_width;
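
		/*
		 * Worked example (editor's addition): with 8168 usable bytes per
		 * page and an estimated tuple width of 28 bytes, the division above
		 * truncates to density = 8168 / 28 = 291 whole tuples per page.
		 */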
	}
	*tuples = rint(density * (double) curpages);

	/*
	 * We use relallvisible as-is, rather than scaling it up like we do for
	 * the pages and tuples counts, on the theory that any pages added since
	 * the last VACUUM are most likely not marked all-visible. But costsize.c
	 * wants it converted to a fraction.
	 */
	if (relallvisible == 0 || curpages <= 0)
		*allvisfrac = 0;
	else if ((double) relallvisible >= curpages)
		*allvisfrac = 1;
	else
		*allvisfrac = (double) relallvisible / curpages;
}
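
/*
 * Example (editor's addition, modeled on heapam_handler.c; the "myam_" name
 * is hypothetical and the overhead/usable-space expressions shown are
 * heap's -- another AM would substitute its own): a block-based AM's
 * relation_estimate_size callback can simply delegate to this helper.
 *
 *		static void
 *		myam_estimate_rel_size(Relation rel, int32 *attr_widths,
 *							   BlockNumber *pages, double *tuples,
 *							   double *allvisfrac)
 *		{
 *			table_block_relation_estimate_size(rel, attr_widths, pages,
 *											   tuples, allvisfrac,
 *											   MAXALIGN(SizeofHeapTupleHeader) +
 *											   sizeof(ItemIdData),
 *											   BLCKSZ - SizeOfPageHeaderData);
 *		}
 */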