/*-------------------------------------------------------------------------
 *
 * tableam.h
 *	  POSTGRES table access method definitions.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/access/tableam.h
 *
 * NOTES
 *		See tableam.sgml for higher level documentation.
 *
 *-------------------------------------------------------------------------
 */
#ifndef TABLEAM_H
#define TABLEAM_H

#include "access/relscan.h"
#include "access/sdir.h"
#include "utils/guc.h"
#include "utils/rel.h"
#include "utils/snapshot.h"


#define DEFAULT_TABLE_ACCESS_METHOD	"heap"

/* GUCs */
extern char *default_table_access_method;
extern bool synchronize_seqscans;


struct BulkInsertStateData;
struct IndexInfo;
struct SampleScanState;
struct TBMIterateResult;
struct VacuumParams;
struct ValidateIndexState;

/*
 * Bitmask values for the flags argument to the scan_begin callback.
 */
typedef enum ScanOptions
{
	/* one of SO_TYPE_* may be specified */
	SO_TYPE_SEQSCAN = 1 << 0,
	SO_TYPE_BITMAPSCAN = 1 << 1,
	SO_TYPE_SAMPLESCAN = 1 << 2,
	SO_TYPE_ANALYZE = 1 << 3,
	/*
	 * NOTE(review): bit 8 deliberately skips past the SO_ALLOW_* and
	 * SO_TEMP_SNAPSHOT bits (4-7) assigned below; presumably this entry was
	 * added after those bits were taken — do not renumber.
	 */
	SO_TYPE_TIDSCAN = 1 << 8,

	/* several of SO_ALLOW_* may be specified */
	/* allow or disallow use of access strategy */
	SO_ALLOW_STRAT = 1 << 4,
	/* report location to syncscan logic? */
	SO_ALLOW_SYNC = 1 << 5,
	/* verify visibility page-at-a-time? */
	SO_ALLOW_PAGEMODE = 1 << 6,

	/* unregister snapshot at scan end? */
	SO_TEMP_SNAPSHOT = 1 << 7
} ScanOptions;

/*
 * Result codes for table_{update,delete,lock_tuple}, and for visibility
 * routines inside table AMs.
 */
typedef enum TM_Result
{
	/*
	 * Signals that the action succeeded (i.e. update/delete performed, lock
	 * was acquired)
	 */
	TM_Ok,

	/* The affected tuple wasn't visible to the relevant snapshot */
	TM_Invisible,

	/* The affected tuple was already modified by the calling backend */
	TM_SelfModified,

	/*
	 * The affected tuple was updated by another transaction. This includes
	 * the case where tuple was moved to another partition.
	 */
	TM_Updated,

	/* The affected tuple was deleted by another transaction */
	TM_Deleted,

	/*
	 * The affected tuple is currently being modified by another session. This
	 * will only be returned if table_(update/delete/lock_tuple) are
	 * instructed not to wait.
	 */
	TM_BeingModified,

	/* lock couldn't be acquired, action skipped. Only used by lock_tuple */
	TM_WouldBlock
} TM_Result;

/*
 * When table_tuple_update, table_tuple_delete, or table_tuple_lock fail
 * because the target tuple is already outdated, they fill in this struct to
 * provide information to the caller about what happened.
 *
 * ctid is the target's ctid link: it is the same as the target's TID if the
 * target was deleted, or the location of the replacement tuple if the target
 * was updated.
 *
 * xmax is the outdating transaction's XID.  If the caller wants to visit the
 * replacement tuple, it must check that this matches before believing the
 * replacement is really a match.
 *
 * cmax is the outdating command's CID, but only when the failure code is
 * TM_SelfModified (i.e., something in the current transaction outdated the
 * tuple); otherwise cmax is zero.  (We make this restriction because
 * HeapTupleHeaderGetCmax doesn't work for tuples outdated in other
 * transactions.)
 */
typedef struct TM_FailureData
{
	ItemPointerData ctid;		/* ctid link of the outdated target tuple */
	TransactionId xmax;			/* XID of the outdating transaction */
	CommandId	cmax;			/* CID, only valid for TM_SelfModified */
	/* presumably true iff an update chain was followed — TODO confirm */
	bool		traversed;
} TM_FailureData;

/* "options" flag bits for table_tuple_insert */
#define TABLE_INSERT_SKIP_WAL		0x0001
#define TABLE_INSERT_SKIP_FSM		0x0002
#define TABLE_INSERT_FROZEN			0x0004
#define TABLE_INSERT_NO_LOGICAL		0x0008

/* flag bits for table_tuple_lock */
/* Follow tuples whose update is in progress if lock modes don't conflict  */
#define TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS	(1 << 0)
/* Follow update chain and lock latest version of tuple */
#define TUPLE_LOCK_FLAG_FIND_LAST_VERSION		(1 << 1)


/* Typedef for callback function for table_index_build_scan */
typedef void (*IndexBuildCallback) (Relation index,
									HeapTuple htup,
									Datum *values,
									bool *isnull,
									bool tupleIsAlive,
									void *state);

/*
 * API struct for a table AM.  Note this must be allocated in a
 * server-lifetime manner, typically as a static const struct, which then gets
 * returned by FormData_pg_am.amhandler.
 *
 * In most cases it's not appropriate to call the callbacks directly, use the
 * table_* wrapper functions instead.
 *
 * GetTableAmRoutine() asserts that required callbacks are filled in, remember
 * to update when adding a callback.
 */
typedef struct TableAmRoutine
{
	/* this must be set to T_TableAmRoutine */
	NodeTag		type;


	/* ------------------------------------------------------------------------
	 * Slot related callbacks.
	 * ------------------------------------------------------------------------
	 */

	/*
	 * Return slot implementation suitable for storing a tuple of this AM.
	 */
	const TupleTableSlotOps *(*slot_callbacks) (Relation rel);


	/* ------------------------------------------------------------------------
	 * Table scan callbacks.
	 * ------------------------------------------------------------------------
	 */

	/*
	 * Start a scan of `rel`.  The callback has to return a TableScanDesc,
	 * which will typically be embedded in a larger, AM specific, struct.
	 *
	 * If nkeys != 0, the results need to be filtered by those scan keys.
	 *
	 * pscan, if not NULL, will have already been initialized with
	 * parallelscan_initialize(), and has to be for the same relation. Will
	 * only be set coming from table_beginscan_parallel().
	 *
	 * `flags` is a bitmask indicating the type of scan (ScanOptions's
	 * SO_TYPE_*, currently only one may be specified), options controlling
	 * the scan's behaviour (ScanOptions's SO_ALLOW_*, several may be
	 * specified, an AM may ignore unsupported ones) and whether the snapshot
	 * needs to be deallocated at scan_end (ScanOptions's SO_TEMP_SNAPSHOT).
	 */
	TableScanDesc (*scan_begin) (Relation rel,
								 Snapshot snapshot,
								 int nkeys, struct ScanKeyData *key,
								 ParallelTableScanDesc pscan,
								 uint32 flags);

	/*
	 * Release resources and deallocate scan. If TableScanDesc.temp_snap,
	 * TableScanDesc.rs_snapshot needs to be unregistered.
	 */
	void		(*scan_end) (TableScanDesc scan);

	/*
	 * Restart relation scan.  If set_params is set to true, allow_{strat,
	 * sync, pagemode} (see scan_begin) changes should be taken into account.
	 */
	void		(*scan_rescan) (TableScanDesc scan, struct ScanKeyData *key,
								bool set_params, bool allow_strat,
								bool allow_sync, bool allow_pagemode);

	/*
	 * Return next tuple from `scan`, store in slot.
	 */
	bool		(*scan_getnextslot) (TableScanDesc scan,
									 ScanDirection direction,
									 TupleTableSlot *slot);


	/* ------------------------------------------------------------------------
	 * Parallel table scan related functions.
	 * ------------------------------------------------------------------------
	 */

	/*
	 * Estimate the size of shared memory needed for a parallel scan of this
	 * relation. The snapshot does not need to be accounted for.
	 */
	Size		(*parallelscan_estimate) (Relation rel);

	/*
	 * Initialize ParallelTableScanDesc for a parallel scan of this relation.
	 * `pscan` will be sized according to parallelscan_estimate() for the same
	 * relation.
	 */
	Size		(*parallelscan_initialize) (Relation rel,
											ParallelTableScanDesc pscan);

	/*
	 * Reinitialize `pscan` for a new scan. `rel` will be the same relation as
	 * when `pscan` was initialized by parallelscan_initialize.
	 */
	void		(*parallelscan_reinitialize) (Relation rel,
											  ParallelTableScanDesc pscan);


	/* ------------------------------------------------------------------------
	 * Index Scan Callbacks
	 * ------------------------------------------------------------------------
	 */

	/*
	 * Prepare to fetch tuples from the relation, as needed when fetching
	 * tuples for an index scan.  The callback has to return an
	 * IndexFetchTableData, which the AM will typically embed in a larger
	 * structure with additional information.
	 *
	 * Tuples for an index scan can then be fetched via index_fetch_tuple.
	 */
	struct IndexFetchTableData *(*index_fetch_begin) (Relation rel);

	/*
	 * Reset index fetch. Typically this will release cross index fetch
	 * resources held in IndexFetchTableData.
	 */
	void		(*index_fetch_reset) (struct IndexFetchTableData *data);

	/*
	 * Release resources and deallocate index fetch.
	 */
	void		(*index_fetch_end) (struct IndexFetchTableData *data);

	/*
	 * Fetch tuple at `tid` into `slot`, after doing a visibility test
	 * according to `snapshot`. If a tuple was found and passed the visibility
	 * test, return true, false otherwise.
	 *
	 * Note that AMs that do not necessarily update indexes when indexed
	 * columns do not change, need to return the current/correct version of
	 * the tuple that is visible to the snapshot, even if the tid points to an
	 * older version of the tuple.
	 *
	 * *call_again is false on the first call to index_fetch_tuple for a tid.
	 * If there potentially is another tuple matching the tid, *call_again
	 * needs be set to true by index_fetch_tuple, signalling to the caller
	 * that index_fetch_tuple should be called again for the same tid.
	 *
	 * *all_dead, if all_dead is not NULL, should be set to true by
	 * index_fetch_tuple iff it is guaranteed that no backend needs to see
	 * that tuple. Index AMs can use that to avoid returning that tid in
	 * future searches.
	 */
	bool		(*index_fetch_tuple) (struct IndexFetchTableData *scan,
									  ItemPointer tid,
									  Snapshot snapshot,
									  TupleTableSlot *slot,
									  bool *call_again, bool *all_dead);


	/* ------------------------------------------------------------------------
	 * Callbacks for non-modifying operations on individual tuples
	 * ------------------------------------------------------------------------
	 */

	/*
	 * Fetch tuple at `tid` into `slot`, after doing a visibility test
	 * according to `snapshot`. If a tuple was found and passed the visibility
	 * test, returns true, false otherwise.
	 */
	bool		(*tuple_fetch_row_version) (Relation rel,
											ItemPointer tid,
											Snapshot snapshot,
											TupleTableSlot *slot);

	/*
	 * Is tid valid for a scan of this relation.
	 */
	bool		(*tuple_tid_valid) (TableScanDesc scan,
									ItemPointer tid);

	/*
	 * Return the latest version of the tuple at `tid`, by updating `tid` to
	 * point at the newest version.
	 */
	void		(*tuple_get_latest_tid) (TableScanDesc scan,
										 ItemPointer tid);

	/*
	 * Does the tuple in `slot` satisfy `snapshot`?  The slot needs to be of
	 * the appropriate type for the AM.
	 */
	bool		(*tuple_satisfies_snapshot) (Relation rel,
											 TupleTableSlot *slot,
											 Snapshot snapshot);

	/* see table_compute_xid_horizon_for_tuples() */
	TransactionId (*compute_xid_horizon_for_tuples) (Relation rel,
													 ItemPointerData *items,
													 int nitems);


	/* ------------------------------------------------------------------------
	 * Manipulations of physical tuples.
	 * ------------------------------------------------------------------------
	 */

	/* see table_tuple_insert() for reference about parameters */
	void		(*tuple_insert) (Relation rel, TupleTableSlot *slot,
								 CommandId cid, int options,
								 struct BulkInsertStateData *bistate);

	/* see table_tuple_insert_speculative() for reference about parameters */
	void		(*tuple_insert_speculative) (Relation rel,
											 TupleTableSlot *slot,
											 CommandId cid,
											 int options,
											 struct BulkInsertStateData *bistate,
											 uint32 specToken);

	/* see table_tuple_complete_speculative() for reference about parameters */
	void		(*tuple_complete_speculative) (Relation rel,
											   TupleTableSlot *slot,
											   uint32 specToken,
											   bool succeeded);

	/* see table_multi_insert() for reference about parameters */
	void		(*multi_insert) (Relation rel, TupleTableSlot **slots, int nslots,
								 CommandId cid, int options,
								 struct BulkInsertStateData *bistate);

	/* see table_tuple_delete() for reference about parameters */
	TM_Result	(*tuple_delete) (Relation rel,
								 ItemPointer tid,
								 CommandId cid,
								 Snapshot snapshot,
								 Snapshot crosscheck,
								 bool wait,
								 TM_FailureData *tmfd,
								 bool changingPart);

	/* see table_tuple_update() for reference about parameters */
	TM_Result	(*tuple_update) (Relation rel,
								 ItemPointer otid,
								 TupleTableSlot *slot,
								 CommandId cid,
								 Snapshot snapshot,
								 Snapshot crosscheck,
								 bool wait,
								 TM_FailureData *tmfd,
								 LockTupleMode *lockmode,
								 bool *update_indexes);

	/* see table_tuple_lock() for reference about parameters */
	TM_Result	(*tuple_lock) (Relation rel,
							   ItemPointer tid,
							   Snapshot snapshot,
							   TupleTableSlot *slot,
							   CommandId cid,
							   LockTupleMode mode,
							   LockWaitPolicy wait_policy,
							   uint8 flags,
							   TM_FailureData *tmfd);

	/*
	 * Perform operations necessary to complete insertions made via
	 * tuple_insert and multi_insert with a BulkInsertState specified. This
	 * may for example be used to flush the relation, when the
	 * TABLE_INSERT_SKIP_WAL option was used.
	 *
	 * Typically callers of tuple_insert and multi_insert will just pass all
	 * the flags that apply to them, and each AM has to decide which of them
	 * make sense for it, and then only take actions in finish_bulk_insert for
	 * those flags, and ignore others.
	 *
	 * Optional callback.
	 */
	void		(*finish_bulk_insert) (Relation rel, int options);


	/* ------------------------------------------------------------------------
	 * DDL related functionality.
	 * ------------------------------------------------------------------------
	 */

	/*
	 * This callback needs to create a new relation filenode for `rel`, with
	 * appropriate durability behaviour for `persistence`.
	 *
	 * Note that only the subset of the relcache filled by
	 * RelationBuildLocalRelation() can be relied upon and that the relation's
	 * catalog entries will either not yet exist (new relation), or will still
	 * reference the old relfilenode.
	 *
	 * As output *freezeXid, *minmulti must be set to the values appropriate
	 * for pg_class.{relfrozenxid, relminmxid}. For AMs that don't need those
	 * fields to be filled they can be set to InvalidTransactionId and
	 * InvalidMultiXactId, respectively.
	 *
	 * See also table_relation_set_new_filenode().
	 */
	void		(*relation_set_new_filenode) (Relation rel,
											  const RelFileNode *newrnode,
											  char persistence,
											  TransactionId *freezeXid,
											  MultiXactId *minmulti);

	/*
	 * This callback needs to remove all contents from `rel`'s current
	 * relfilenode. No provisions for transactional behaviour need to be made.
	 * Often this can be implemented by truncating the underlying storage to
	 * its minimal size.
	 *
	 * See also table_relation_nontransactional_truncate().
	 */
	void		(*relation_nontransactional_truncate) (Relation rel);

	/*
	 * See table_relation_copy_data().
	 *
	 * This can typically be implemented by directly copying the underlying
	 * storage, unless it contains references to the tablespace internally.
	 */
	void		(*relation_copy_data) (Relation rel,
									   const RelFileNode *newrnode);

	/* See table_relation_copy_for_cluster() */
	void		(*relation_copy_for_cluster) (Relation NewTable,
											  Relation OldTable,
											  Relation OldIndex,
											  bool use_sort,
											  TransactionId OldestXmin,
											  TransactionId *xid_cutoff,
											  MultiXactId *multi_cutoff,
											  double *num_tuples,
											  double *tups_vacuumed,
											  double *tups_recently_dead);

	/*
	 * React to VACUUM command on the relation. The VACUUM can be triggered by
	 * a user or by autovacuum. The specific actions performed by the AM will
	 * depend heavily on the individual AM.
	 *
	 * On entry a transaction is already established, and the relation is
	 * locked with a ShareUpdateExclusive lock.
	 *
	 * Note that neither VACUUM FULL (and CLUSTER), nor ANALYZE go through
	 * this routine, even if (for ANALYZE) it is part of the same VACUUM
	 * command.
	 *
	 * There probably, in the future, needs to be a separate callback to
	 * integrate with autovacuum's scheduling.
	 */
	void		(*relation_vacuum) (Relation onerel,
									struct VacuumParams *params,
									BufferAccessStrategy bstrategy);

	/*
	 * Prepare to analyze block `blockno` of `scan`. The scan has been started
	 * with table_beginscan_analyze().  See also
	 * table_scan_analyze_next_block().
	 *
	 * The callback may acquire resources like locks that are held until
	 * table_scan_analyze_next_tuple() returns false. It e.g. can make sense
	 * to hold a lock until all tuples on a block have been analyzed by
	 * scan_analyze_next_tuple.
	 *
	 * The callback can return false if the block is not suitable for
	 * sampling, e.g. because it's a metapage that could never contain tuples.
	 *
	 * XXX: This obviously is primarily suited for block-based AMs. It's not
	 * clear what a good interface for non block based AMs would be, so there
	 * isn't one yet.
	 */
	bool		(*scan_analyze_next_block) (TableScanDesc scan,
											BlockNumber blockno,
											BufferAccessStrategy bstrategy);

	/*
	 * See table_scan_analyze_next_tuple().
	 *
	 * Not every AM might have a meaningful concept of dead rows, in which
	 * case it's OK to not increment *deadrows - but note that that may
	 * influence autovacuum scheduling (see comment for relation_vacuum
	 * callback).
	 */
	bool		(*scan_analyze_next_tuple) (TableScanDesc scan,
											TransactionId OldestXmin,
											double *liverows,
											double *deadrows,
											TupleTableSlot *slot);

	/* see table_index_build_range_scan for reference about parameters */
	double		(*index_build_range_scan) (Relation table_rel,
										   Relation index_rel,
										   struct IndexInfo *index_info,
										   bool allow_sync,
										   bool anyvisible,
										   bool progress,
										   BlockNumber start_blockno,
										   BlockNumber numblocks,
										   IndexBuildCallback callback,
										   void *callback_state,
										   TableScanDesc scan);

	/* see table_index_validate_scan for reference about parameters */
	void		(*index_validate_scan) (Relation table_rel,
										Relation index_rel,
										struct IndexInfo *index_info,
										Snapshot snapshot,
										struct ValidateIndexState *state);


	/* ------------------------------------------------------------------------
	 * Miscellaneous functions.
	 * ------------------------------------------------------------------------
	 */

	/*
	 * See table_relation_size().
	 *
	 * Note that currently a few callers use the MAIN_FORKNUM size to figure
	 * out the range of potentially interesting blocks (brin, analyze). It's
	 * probable that we'll need to revise the interface for those at some
	 * point.
	 */
	uint64		(*relation_size) (Relation rel, ForkNumber forkNumber);


	/*
	 * This callback should return true if the relation requires a TOAST table
	 * and false if it does not.  It may wish to examine the relation's tuple
	 * descriptor before making a decision, but if it uses some other method
	 * of storing large values (or if it does not support them) it can simply
	 * return false.
	 */
	bool		(*relation_needs_toast_table) (Relation rel);


	/* ------------------------------------------------------------------------
	 * Planner related functions.
	 * ------------------------------------------------------------------------
	 */

	/*
	 * See table_relation_estimate_size().
	 *
	 * While block oriented, it shouldn't be too hard for an AM that doesn't
	 * internally use blocks to convert into a usable representation.
	 *
	 * This differs from the relation_size callback by returning size
	 * estimates (both relation size and tuple count) for planning purposes,
	 * rather than returning a currently correct estimate.
	 */
	void		(*relation_estimate_size) (Relation rel, int32 *attr_widths,
										   BlockNumber *pages, double *tuples,
										   double *allvisfrac);


	/* ------------------------------------------------------------------------
	 * Executor related functions.
	 * ------------------------------------------------------------------------
	 */

	/*
	 * Prepare to fetch / check / return tuples from `tbmres->blockno` as part
	 * of a bitmap table scan. `scan` was started via table_beginscan_bm().
	 * Return false if there are no tuples to be found on the page, true
	 * otherwise.
	 *
	 * This will typically read and pin the target block, and do the necessary
	 * work to allow scan_bitmap_next_tuple() to return tuples (e.g. it might
	 * make sense to perform tuple visibility checks at this time). For some
	 * AMs it will make more sense to do all the work referencing `tbmres`
	 * contents here, for others it might be better to defer more work to
	 * scan_bitmap_next_tuple.
	 *
	 * If `tbmres->blockno` is -1, this is a lossy scan and all visible tuples
	 * on the page have to be returned, otherwise the tuples at offsets in
	 * `tbmres->offsets` need to be returned.
	 *
	 * XXX: Currently this may only be implemented if the AM uses md.c as its
	 * storage manager, and uses ItemPointer->ip_blkid in a manner that maps
	 * blockids directly to the underlying storage. nodeBitmapHeapscan.c
	 * performs prefetching directly using that interface. This probably
	 * needs to be rectified at a later point.
	 *
	 * XXX: Currently this may only be implemented if the AM uses the
	 * visibilitymap, as nodeBitmapHeapscan.c unconditionally accesses it to
	 * perform prefetching.  This probably needs to be rectified at a later
	 * point.
	 *
	 * Optional callback, but either both scan_bitmap_next_block and
	 * scan_bitmap_next_tuple need to exist, or neither.
	 */
	bool		(*scan_bitmap_next_block) (TableScanDesc scan,
										   struct TBMIterateResult *tbmres);

	/*
	 * Fetch the next tuple of a bitmap table scan into `slot` and return true
	 * if a visible tuple was found, false otherwise.
	 *
	 * For some AMs it will make more sense to do all the work referencing
	 * `tbmres` contents in scan_bitmap_next_block, for others it might be
	 * better to defer more work to this callback.
	 *
	 * Optional callback, but either both scan_bitmap_next_block and
	 * scan_bitmap_next_tuple need to exist, or neither.
	 */
	bool		(*scan_bitmap_next_tuple) (TableScanDesc scan,
										   struct TBMIterateResult *tbmres,
										   TupleTableSlot *slot);

	/*
	 * Prepare to fetch tuples from the next block in a sample scan. Return
	 * false if the sample scan is finished, true otherwise. `scan` was
	 * started via table_beginscan_sampling().
	 *
	 * Typically this will first determine the target block by calling the
	 * TsmRoutine's NextSampleBlock() callback if not NULL, or alternatively
	 * perform a sequential scan over all blocks.  The determined block is
	 * then typically read and pinned.
	 *
	 * As the TsmRoutine interface is block based, a block needs to be passed
	 * to NextSampleBlock(). If that's not appropriate for an AM, it
	 * internally needs to perform mapping between the internal and a block
	 * based representation.
	 *
	 * Note that it's not acceptable to hold deadlock prone resources such as
	 * lwlocks until scan_sample_next_tuple() has exhausted the tuples on the
	 * block - the tuple is likely to be returned to an upper query node, and
	 * the next call could be off a long while. Holding buffer pins and such
	 * is obviously OK.
	 *
	 * Currently it is required to implement this interface, as there's no
	 * alternative way (contrary e.g. to bitmap scans) to implement sample
	 * scans. If infeasible to implement, the AM may raise an error.
	 */
	bool		(*scan_sample_next_block) (TableScanDesc scan,
										   struct SampleScanState *scanstate);

	/*
	 * This callback, only called after scan_sample_next_block has returned
	 * true, should determine the next tuple to be returned from the selected
	 * block using the TsmRoutine's NextSampleTuple() callback.
	 *
	 * The callback needs to perform visibility checks, and only return
	 * visible tuples. That obviously can mean calling NextSampleTuple()
	 * multiple times.
	 *
	 * The TsmRoutine interface assumes that there's a maximum offset on a
	 * given page, so if that doesn't apply to an AM, it needs to emulate that
	 * assumption somehow.
	 */
	bool		(*scan_sample_next_tuple) (TableScanDesc scan,
										   struct SampleScanState *scanstate,
										   TupleTableSlot *slot);

} TableAmRoutine;


/* ----------------------------------------------------------------------------
 * Slot functions.
 * ----------------------------------------------------------------------------
 */

/*
 * Returns slot callbacks suitable for holding tuples of the appropriate type
 * for the relation.  Works for tables, views, foreign tables and partitioned
 * tables.
 */
extern const TupleTableSlotOps *table_slot_callbacks(Relation rel);

/*
 * Returns slot using the callbacks returned by table_slot_callbacks(), and
 * registers it on *reglist.
723 */ 724 extern TupleTableSlot *table_slot_create(Relation rel, List **reglist); 725 726 727 /* ---------------------------------------------------------------------------- 728 * Table scan functions. 729 * ---------------------------------------------------------------------------- 730 */ 731 732 /* 733 * Start a scan of `rel`. Returned tuples pass a visibility test of 734 * `snapshot`, and if nkeys != 0, the results are filtered by those scan keys. 735 */ 736 static inline TableScanDesc 737 table_beginscan(Relation rel, Snapshot snapshot, 738 int nkeys, struct ScanKeyData *key) 739 { 740 uint32 flags = SO_TYPE_SEQSCAN | 741 SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; 742 743 return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); 744 } 745 746 /* 747 * Like table_beginscan(), but for scanning catalog. It'll automatically use a 748 * snapshot appropriate for scanning catalog relations. 749 */ 750 extern TableScanDesc table_beginscan_catalog(Relation rel, int nkeys, 751 struct ScanKeyData *key); 752 753 /* 754 * Like table_beginscan(), but table_beginscan_strat() offers an extended API 755 * that lets the caller control whether a nondefault buffer access strategy 756 * can be used, and whether syncscan can be chosen (possibly resulting in the 757 * scan not starting from block zero). Both of these default to true with 758 * plain table_beginscan. 759 */ 760 static inline TableScanDesc 761 table_beginscan_strat(Relation rel, Snapshot snapshot, 762 int nkeys, struct ScanKeyData *key, 763 bool allow_strat, bool allow_sync) 764 { 765 uint32 flags = SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE; 766 767 if (allow_strat) 768 flags |= SO_ALLOW_STRAT; 769 if (allow_sync) 770 flags |= SO_ALLOW_SYNC; 771 772 return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); 773 } 774 775 /* 776 * table_beginscan_bm is an alternative entry point for setting up a 777 * TableScanDesc for a bitmap heap scan. 
Although that scan technology is 778 * really quite unlike a standard seqscan, there is just enough commonality to 779 * make it worth using the same data structure. 780 */ 781 static inline TableScanDesc 782 table_beginscan_bm(Relation rel, Snapshot snapshot, 783 int nkeys, struct ScanKeyData *key) 784 { 785 uint32 flags = SO_TYPE_BITMAPSCAN | SO_ALLOW_PAGEMODE; 786 787 return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); 788 } 789 790 /* 791 * table_beginscan_sampling is an alternative entry point for setting up a 792 * TableScanDesc for a TABLESAMPLE scan. As with bitmap scans, it's worth 793 * using the same data structure although the behavior is rather different. 794 * In addition to the options offered by table_beginscan_strat, this call 795 * also allows control of whether page-mode visibility checking is used. 796 */ 797 static inline TableScanDesc 798 table_beginscan_sampling(Relation rel, Snapshot snapshot, 799 int nkeys, struct ScanKeyData *key, 800 bool allow_strat, bool allow_sync, 801 bool allow_pagemode) 802 { 803 uint32 flags = SO_TYPE_SAMPLESCAN; 804 805 if (allow_strat) 806 flags |= SO_ALLOW_STRAT; 807 if (allow_sync) 808 flags |= SO_ALLOW_SYNC; 809 if (allow_pagemode) 810 flags |= SO_ALLOW_PAGEMODE; 811 812 return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); 813 } 814 815 /* 816 * table_beginscan_tid is an alternative entry point for setting up a 817 * TableScanDesc for a Tid scan. As with bitmap scans, it's worth using 818 * the same data structure although the behavior is rather different. 819 */ 820 static inline TableScanDesc 821 table_beginscan_tid(Relation rel, Snapshot snapshot) 822 { 823 uint32 flags = SO_TYPE_TIDSCAN; 824 825 return rel->rd_tableam->scan_begin(rel, snapshot, 0, NULL, NULL, flags); 826 } 827 828 /* 829 * table_beginscan_analyze is an alternative entry point for setting up a 830 * TableScanDesc for an ANALYZE scan. 
As with bitmap scans, it's worth using 831 * the same data structure although the behavior is rather different. 832 */ 833 static inline TableScanDesc 834 table_beginscan_analyze(Relation rel) 835 { 836 uint32 flags = SO_TYPE_ANALYZE; 837 838 return rel->rd_tableam->scan_begin(rel, NULL, 0, NULL, NULL, flags); 839 } 840 841 /* 842 * End relation scan. 843 */ 844 static inline void 845 table_endscan(TableScanDesc scan) 846 { 847 scan->rs_rd->rd_tableam->scan_end(scan); 848 } 849 850 /* 851 * Restart a relation scan. 852 */ 853 static inline void 854 table_rescan(TableScanDesc scan, 855 struct ScanKeyData *key) 856 { 857 scan->rs_rd->rd_tableam->scan_rescan(scan, key, false, false, false, false); 858 } 859 860 /* 861 * Restart a relation scan after changing params. 862 * 863 * This call allows changing the buffer strategy, syncscan, and pagemode 864 * options before starting a fresh scan. Note that although the actual use of 865 * syncscan might change (effectively, enabling or disabling reporting), the 866 * previously selected startblock will be kept. 867 */ 868 static inline void 869 table_rescan_set_params(TableScanDesc scan, struct ScanKeyData *key, 870 bool allow_strat, bool allow_sync, bool allow_pagemode) 871 { 872 scan->rs_rd->rd_tableam->scan_rescan(scan, key, true, 873 allow_strat, allow_sync, 874 allow_pagemode); 875 } 876 877 /* 878 * Update snapshot used by the scan. 879 */ 880 extern void table_scan_update_snapshot(TableScanDesc scan, Snapshot snapshot); 881 882 /* 883 * Return next tuple from `scan`, store in slot. 884 */ 885 static inline bool 886 table_scan_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot) 887 { 888 slot->tts_tableOid = RelationGetRelid(sscan->rs_rd); 889 return sscan->rs_rd->rd_tableam->scan_getnextslot(sscan, direction, slot); 890 } 891 892 893 /* ---------------------------------------------------------------------------- 894 * Parallel table scan related functions. 
 * ----------------------------------------------------------------------------
 */

/*
 * Estimate the size of shared memory needed for a parallel scan of this
 * relation.
 */
extern Size table_parallelscan_estimate(Relation rel, Snapshot snapshot);

/*
 * Initialize ParallelTableScanDesc for a parallel scan of this
 * relation.  `pscan` needs to be sized according to parallelscan_estimate()
 * for the same relation.  Call this just once in the leader process; then,
 * individual workers attach via table_beginscan_parallel.
 */
extern void table_parallelscan_initialize(Relation rel,
										  ParallelTableScanDesc pscan,
										  Snapshot snapshot);

/*
 * Begin a parallel scan.  `pscan` needs to have been initialized with
 * table_parallelscan_initialize(), for the same relation.  The initialization
 * does not need to have happened in this backend.
 *
 * Caller must hold a suitable lock on the relation.
 */
extern TableScanDesc table_beginscan_parallel(Relation rel,
											  ParallelTableScanDesc pscan);

/*
 * Restart a parallel scan.  Call this in the leader process.  Caller is
 * responsible for making sure that all workers have finished the scan
 * beforehand.
 */
static inline void
table_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
{
	rel->rd_tableam->parallelscan_reinitialize(rel, pscan);
}


/* ----------------------------------------------------------------------------
 * Index scan related functions.
 * ----------------------------------------------------------------------------
 */

/*
 * Prepare to fetch tuples from the relation, as needed when fetching tuples
 * for an index scan.
 *
 * Tuples for an index scan can then be fetched via table_index_fetch_tuple().
 */
static inline IndexFetchTableData *
table_index_fetch_begin(Relation rel)
{
	return rel->rd_tableam->index_fetch_begin(rel);
}

/*
 * Reset index fetch.  Typically this will release cross index fetch resources
 * held in IndexFetchTableData.
 */
static inline void
table_index_fetch_reset(struct IndexFetchTableData *scan)
{
	scan->rel->rd_tableam->index_fetch_reset(scan);
}

/*
 * Release resources and deallocate index fetch.
 */
static inline void
table_index_fetch_end(struct IndexFetchTableData *scan)
{
	scan->rel->rd_tableam->index_fetch_end(scan);
}

/*
 * Fetches, as part of an index scan, tuple at `tid` into `slot`, after doing
 * a visibility test according to `snapshot`.  If a tuple was found and passed
 * the visibility test, returns true, false otherwise.
 *
 * *call_again needs to be false on the first call to table_index_fetch_tuple() for
 * a tid.  If there potentially is another tuple matching the tid, *call_again
 * will be set to true, signalling that table_index_fetch_tuple() should be called
 * again for the same tid.
 *
 * *all_dead, if all_dead is not NULL, will be set to true by
 * table_index_fetch_tuple() iff it is guaranteed that no backend needs to see
 * that tuple.  Index AMs can use that to avoid returning that tid in future
 * searches.
 *
 * The difference between this function and table_tuple_fetch_row_version()
 * is that this function returns the currently visible version of a row if
 * the AM supports storing multiple row versions reachable via a single index
 * entry (like heap's HOT).  Whereas table_tuple_fetch_row_version() only
 * evaluates the tuple exactly at `tid`.  Outside of index entry->table tuple
 * lookups, table_tuple_fetch_row_version() is what's usually needed.
 */
static inline bool
table_index_fetch_tuple(struct IndexFetchTableData *scan,
						ItemPointer tid,
						Snapshot snapshot,
						TupleTableSlot *slot,
						bool *call_again, bool *all_dead)
{

	return scan->rel->rd_tableam->index_fetch_tuple(scan, tid, snapshot,
													slot, call_again,
													all_dead);
}

/*
 * This is a convenience wrapper around table_index_fetch_tuple() which
 * returns whether there are table tuple items corresponding to an index
 * entry.  This likely is only useful to verify if there's a conflict in a
 * unique index.
 */
extern bool table_index_fetch_tuple_check(Relation rel,
										  ItemPointer tid,
										  Snapshot snapshot,
										  bool *all_dead);


/* ------------------------------------------------------------------------
 * Functions for non-modifying operations on individual tuples
 * ------------------------------------------------------------------------
 */


/*
 * Fetch tuple at `tid` into `slot`, after doing a visibility test according to
 * `snapshot`.  If a tuple was found and passed the visibility test, returns
 * true, false otherwise.
 *
 * See table_index_fetch_tuple's comment about what the difference between
 * these functions is.  It is correct to use this function outside of index
 * entry->table tuple lookups.
 */
static inline bool
table_tuple_fetch_row_version(Relation rel,
							  ItemPointer tid,
							  Snapshot snapshot,
							  TupleTableSlot *slot)
{
	return rel->rd_tableam->tuple_fetch_row_version(rel, tid, snapshot, slot);
}

/*
 * Verify that `tid` is a potentially valid tuple identifier.  That doesn't
 * mean that the pointed to row needs to exist or be visible, but that
 * attempting to fetch the row (e.g. with table_tuple_get_latest_tid() or
 * table_tuple_fetch_row_version()) should not error out if called with that
 * tid.
 *
 * `scan` needs to have been started via table_beginscan().
 */
static inline bool
table_tuple_tid_valid(TableScanDesc scan, ItemPointer tid)
{
	return scan->rs_rd->rd_tableam->tuple_tid_valid(scan, tid);
}

/*
 * Return the latest version of the tuple at `tid`, by updating `tid` to
 * point at the newest version.
 */
extern void table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid);

/*
 * Return true iff tuple in slot satisfies the snapshot.
 *
 * This assumes the slot's tuple is valid, and of the appropriate type for the
 * AM.
 *
 * Some AMs might modify the data underlying the tuple as a side-effect.  If so
 * they ought to mark the relevant buffer dirty.
 */
static inline bool
table_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot,
							   Snapshot snapshot)
{
	return rel->rd_tableam->tuple_satisfies_snapshot(rel, slot, snapshot);
}

/*
 * Compute the newest xid among the tuples pointed to by items.  This is used
 * to compute what snapshots to conflict with when replaying WAL records for
 * page-level index vacuums.
 */
static inline TransactionId
table_compute_xid_horizon_for_tuples(Relation rel,
									 ItemPointerData *items,
									 int nitems)
{
	return rel->rd_tableam->compute_xid_horizon_for_tuples(rel, items, nitems);
}


/* ----------------------------------------------------------------------------
 * Functions for manipulations of physical tuples.
 * ----------------------------------------------------------------------------
 */

/*
 * Insert a tuple from a slot into table AM routine.
 *
 * The options bitmask allows the caller to specify options that may change the
 * behaviour of the AM.  The AM will ignore options that it does not support.
 *
 * If the TABLE_INSERT_SKIP_WAL option is specified, the new tuple doesn't
 * need to be logged to WAL, even for a non-temp relation.  It is the AMs
 * choice whether this optimization is supported.
 *
 * If the TABLE_INSERT_SKIP_FSM option is specified, AMs are free to not reuse
 * free space in the relation.  This can save some cycles when we know the
 * relation is new and doesn't contain useful amounts of free space.
 * TABLE_INSERT_SKIP_FSM is commonly passed directly to
 * RelationGetBufferForTuple.  See that method for more information.
 *
 * TABLE_INSERT_FROZEN should only be specified for inserts into
 * relfilenodes created during the current subtransaction and when
 * there are no prior snapshots or pre-existing portals open.
 * This causes rows to be frozen, which is an MVCC violation and
 * requires explicit options chosen by user.
 *
 * TABLE_INSERT_NO_LOGICAL force-disables the emitting of logical decoding
 * information for the tuple.  This should solely be used during table rewrites
 * where RelationIsLogicallyLogged(relation) is not yet accurate for the new
 * relation.
 *
 * Note that most of these options will be applied when inserting into the
 * heap's TOAST table, too, if the tuple requires any out-of-line data.
 *
 * The BulkInsertState object (if any; bistate can be NULL for default
 * behavior) is also just passed through to RelationGetBufferForTuple.  If
 * `bistate` is provided, table_finish_bulk_insert() needs to be called.
 *
 * On return the slot's tts_tid and tts_tableOid are updated to reflect the
 * insertion.  But note that any toasting of fields within the slot is NOT
 * reflected in the slots contents.
 */
static inline void
table_tuple_insert(Relation rel, TupleTableSlot *slot, CommandId cid,
				   int options, struct BulkInsertStateData *bistate)
{
	rel->rd_tableam->tuple_insert(rel, slot, cid, options,
								  bistate);
}

/*
 * Perform a "speculative insertion".  These can be backed out afterwards
 * without aborting the whole transaction.  Other sessions can wait for the
 * speculative insertion to be confirmed, turning it into a regular tuple, or
 * aborted, as if it never existed.  Speculatively inserted tuples behave as
 * "value locks" of short duration, used to implement INSERT .. ON CONFLICT.
 *
 * A transaction having performed a speculative insertion has to either abort,
 * or finish the speculative insertion with
 * table_tuple_complete_speculative(succeeded = ...).
 */
static inline void
table_tuple_insert_speculative(Relation rel, TupleTableSlot *slot,
							   CommandId cid, int options,
							   struct BulkInsertStateData *bistate,
							   uint32 specToken)
{
	rel->rd_tableam->tuple_insert_speculative(rel, slot, cid, options,
											  bistate, specToken);
}

/*
 * Complete "speculative insertion" started in the same transaction.  If
 * succeeded is true, the tuple is fully inserted, if false, it's removed.
 */
static inline void
table_tuple_complete_speculative(Relation rel, TupleTableSlot *slot,
								 uint32 specToken, bool succeeded)
{
	rel->rd_tableam->tuple_complete_speculative(rel, slot, specToken,
												succeeded);
}

/*
 * Insert multiple tuples into a table.
 *
 * This is like table_tuple_insert(), but inserts multiple tuples in one
 * operation.  That's often faster than calling table_tuple_insert() in a loop,
 * because e.g. the AM can reduce WAL logging and page locking overhead.
 *
 * Except for taking `nslots` tuples as input, as an array of TupleTableSlots
 * in `slots`, the parameters for table_multi_insert() are the same as for
 * table_tuple_insert().
 *
 * Note: this leaks memory into the current memory context.  You can create a
 * temporary context before calling this, if that's a problem.
 */
static inline void
table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots,
				   CommandId cid, int options, struct BulkInsertStateData *bistate)
{
	rel->rd_tableam->multi_insert(rel, slots, nslots,
								  cid, options, bistate);
}

/*
 * Delete a tuple.
 *
 * NB: do not call this directly unless prepared to deal with
 * concurrent-update conditions.  Use simple_table_tuple_delete instead.
 *
 * Input parameters:
 *	relation - table to be modified (caller must hold suitable lock)
 *	tid - TID of tuple to be deleted
 *	cid - delete command ID (used for visibility test, and stored into
 *		cmax if successful)
 *	crosscheck - if not InvalidSnapshot, also check tuple against this
 *	wait - true if should wait for any conflicting update to commit/abort
 * Output parameters:
 *	tmfd - filled in failure cases (see below)
 *	changingPart - true iff the tuple is being moved to another partition
 *		table due to an update of the partition key.  Otherwise, false.
 *
 * Normal, successful return value is TM_Ok, which means we did actually
 * delete it.  Failure return codes are TM_SelfModified, TM_Updated, and
 * TM_BeingModified (the last only possible if wait == false).
 *
 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
 * t_xmax, and, if possible, t_cmax.  See comments for
 * struct TM_FailureData for additional info.
 */
static inline TM_Result
table_tuple_delete(Relation rel, ItemPointer tid, CommandId cid,
				   Snapshot snapshot, Snapshot crosscheck, bool wait,
				   TM_FailureData *tmfd, bool changingPart)
{
	return rel->rd_tableam->tuple_delete(rel, tid, cid,
										 snapshot, crosscheck,
										 wait, tmfd, changingPart);
}

/*
 * Update a tuple.
 *
 * NB: do not call this directly unless you are prepared to deal with
 * concurrent-update conditions.  Use simple_table_tuple_update instead.
 *
 * Input parameters:
 *	relation - table to be modified (caller must hold suitable lock)
 *	otid - TID of old tuple to be replaced
 *	slot - newly constructed tuple data to store
 *	cid - update command ID (used for visibility test, and stored into
 *		cmax/cmin if successful)
 *	crosscheck - if not InvalidSnapshot, also check old tuple against this
 *	wait - true if should wait for any conflicting update to commit/abort
 * Output parameters:
 *	tmfd - filled in failure cases (see below)
 *	lockmode - filled with lock mode acquired on tuple
 *	update_indexes - in success cases this is set to true if new index entries
 *		are required for this tuple
 *
 * Normal, successful return value is TM_Ok, which means we did actually
 * update it.  Failure return codes are TM_SelfModified, TM_Updated, and
 * TM_BeingModified (the last only possible if wait == false).
 *
 * On success, the slot's tts_tid and tts_tableOid are updated to match the new
 * stored tuple; in particular, slot->tts_tid is set to the TID where the
 * new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT
 * update was done.  However, any TOAST changes in the new tuple's
 * data are not reflected into *newtup.
 *
 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
 * t_xmax, and, if possible, t_cmax.  See comments for struct TM_FailureData
 * for additional info.
 */
static inline TM_Result
table_tuple_update(Relation rel, ItemPointer otid, TupleTableSlot *slot,
				   CommandId cid, Snapshot snapshot, Snapshot crosscheck,
				   bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode,
				   bool *update_indexes)
{
	return rel->rd_tableam->tuple_update(rel, otid, slot,
										 cid, snapshot, crosscheck,
										 wait, tmfd,
										 lockmode, update_indexes);
}

/*
 * Lock a tuple in the specified mode.
 *
 * Input parameters:
 *	relation: relation containing tuple (caller must hold suitable lock)
 *	tid: TID of tuple to lock
 *	snapshot: snapshot to use for visibility determinations
 *	cid: current command ID (used for visibility test, and stored into
 *		tuple's cmax if lock is successful)
 *	mode: lock mode desired
 *	wait_policy: what to do if tuple lock is not available
 *	flags:
 *		If TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS, follow the update chain to
 *		also lock descendant tuples if lock modes don't conflict.
 *		If TUPLE_LOCK_FLAG_FIND_LAST_VERSION, follow the update chain and lock
 *		latest version.
 *
 * Output parameters:
 *	*slot: contains the target tuple
 *	*tmfd: filled in failure cases (see below)
 *
 * Function result may be:
 *	TM_Ok: lock was successfully acquired
 *	TM_Invisible: lock failed because tuple was never visible to us
 *	TM_SelfModified: lock failed because tuple updated by self
 *	TM_Updated: lock failed because tuple updated by other xact
 *	TM_Deleted: lock failed because tuple deleted by other xact
 *	TM_WouldBlock: lock couldn't be acquired and wait_policy is skip
 *
 * In the failure cases other than TM_Invisible and TM_Deleted, the routine
 * fills *tmfd with the tuple's t_ctid, t_xmax, and, if possible, t_cmax.  See
 * comments for struct TM_FailureData for additional info.
 */
static inline TM_Result
table_tuple_lock(Relation rel, ItemPointer tid, Snapshot snapshot,
				 TupleTableSlot *slot, CommandId cid, LockTupleMode mode,
				 LockWaitPolicy wait_policy, uint8 flags,
				 TM_FailureData *tmfd)
{
	return rel->rd_tableam->tuple_lock(rel, tid, snapshot, slot,
									   cid, mode, wait_policy,
									   flags, tmfd);
}

/*
 * Perform operations necessary to complete insertions made via
 * tuple_insert and multi_insert with a BulkInsertState specified.  This may
 * e.g. be used to flush the relation when inserting with
 * TABLE_INSERT_SKIP_WAL specified.
 */
static inline void
table_finish_bulk_insert(Relation rel, int options)
{
	/* optional callback */
	if (rel->rd_tableam && rel->rd_tableam->finish_bulk_insert)
		rel->rd_tableam->finish_bulk_insert(rel, options);
}


/* ------------------------------------------------------------------------
 * DDL related functionality.
 * ------------------------------------------------------------------------
 */

/*
 * Create storage for `rel` in `newrnode`, with persistence set to
 * `persistence`.
 *
 * This is used both during relation creation and various DDL operations to
 * create a new relfilenode that can be filled from scratch.  When creating
 * new storage for an existing relfilenode, this should be called before the
 * relcache entry has been updated.
 *
 * *freezeXid, *minmulti are set to the xid / multixact horizon for the table
 * that pg_class.{relfrozenxid, relminmxid} have to be set to.
 */
static inline void
table_relation_set_new_filenode(Relation rel,
								const RelFileNode *newrnode,
								char persistence,
								TransactionId *freezeXid,
								MultiXactId *minmulti)
{
	rel->rd_tableam->relation_set_new_filenode(rel, newrnode, persistence,
											   freezeXid, minmulti);
}

/*
 * Remove all table contents from `rel`, in a non-transactional manner.
 * Non-transactional meaning that there's no need to support rollbacks.  This
 * commonly only is used to perform truncations for relfilenodes created in the
 * current transaction.
 */
static inline void
table_relation_nontransactional_truncate(Relation rel)
{
	rel->rd_tableam->relation_nontransactional_truncate(rel);
}

/*
 * Copy data from `rel` into the new relfilenode `newrnode`.  The new
 * relfilenode may not have storage associated before this function is
 * called.  This is only supposed to be used for low level operations like
 * changing a relation's tablespace.
 */
static inline void
table_relation_copy_data(Relation rel, const RelFileNode *newrnode)
{
	rel->rd_tableam->relation_copy_data(rel, newrnode);
}

/*
 * Copy data from `OldTable` into `NewTable`, as part of a CLUSTER or VACUUM
 * FULL.
 *
 * Additional Input parameters:
 * - use_sort - if true, the table contents are sorted appropriate for
 *   `OldIndex`; if false and OldIndex is not InvalidOid, the data is copied
 *   in that index's order; if false and OldIndex is InvalidOid, no sorting is
 *   performed
 * - OldIndex - see use_sort
 * - OldestXmin - computed by vacuum_set_xid_limits(), even when
 *   not needed for the relation's AM
 * - *xid_cutoff - ditto
 * - *multi_cutoff - ditto
 *
 * Output parameters:
 * - *xid_cutoff - rel's new relfrozenxid value, may be invalid
 * - *multi_cutoff - rel's new relminmxid value, may be invalid
 * - *tups_vacuumed - stats, for logging, if appropriate for AM
 * - *tups_recently_dead - stats, for logging, if appropriate for AM
 */
static inline void
table_relation_copy_for_cluster(Relation OldTable, Relation NewTable,
								Relation OldIndex,
								bool use_sort,
								TransactionId OldestXmin,
								TransactionId *xid_cutoff,
								MultiXactId *multi_cutoff,
								double *num_tuples,
								double *tups_vacuumed,
								double *tups_recently_dead)
{
	OldTable->rd_tableam->relation_copy_for_cluster(OldTable, NewTable, OldIndex,
													use_sort, OldestXmin,
													xid_cutoff, multi_cutoff,
													num_tuples, tups_vacuumed,
													tups_recently_dead);
}

/*
 * Perform VACUUM on the relation.  The VACUUM can be triggered by a user or by
 * autovacuum.  The specific actions performed by the AM will depend heavily on
 * the individual AM.
 *
 * On entry a transaction needs to already have been established, and the
 * table is locked with a ShareUpdateExclusive lock.
 *
 * Note that neither VACUUM FULL (and CLUSTER), nor ANALYZE go through this
 * routine, even if (for ANALYZE) it is part of the same VACUUM command.
 */
static inline void
table_relation_vacuum(Relation rel, struct VacuumParams *params,
					  BufferAccessStrategy bstrategy)
{
	rel->rd_tableam->relation_vacuum(rel, params, bstrategy);
}

/*
 * Prepare to analyze block `blockno` of `scan`.  The scan needs to have been
 * started with table_beginscan_analyze().  Note that this routine might
 * acquire resources like locks that are held until
 * table_scan_analyze_next_tuple() returns false.
 *
 * Returns false if block is unsuitable for sampling, true otherwise.
 */
static inline bool
table_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
							  BufferAccessStrategy bstrategy)
{
	return scan->rs_rd->rd_tableam->scan_analyze_next_block(scan, blockno,
															bstrategy);
}

/*
 * Iterate over tuples in the block selected with
 * table_scan_analyze_next_block() (which needs to have returned true, and
 * this routine may not have returned false for the same block before).  If a
 * tuple that's suitable for sampling is found, true is returned and a tuple
 * is stored in `slot`.
 *
 * *liverows and *deadrows are incremented according to the encountered
 * tuples.
 */
static inline bool
table_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
							  double *liverows, double *deadrows,
							  TupleTableSlot *slot)
{
	return scan->rs_rd->rd_tableam->scan_analyze_next_tuple(scan, OldestXmin,
															liverows, deadrows,
															slot);
}

/*
 * table_index_build_scan - scan the table to find tuples to be indexed
 *
 * This is called back from an access-method-specific index build procedure
 * after the AM has done whatever setup it needs.  The parent table relation
 * is scanned to find tuples that should be entered into the index.  Each
 * such tuple is passed to the AM's callback routine, which does the right
 * things to add it to the new index.  After we return, the AM's index
 * build procedure does whatever cleanup it needs.
 *
 * The total count of live tuples is returned.  This is for updating pg_class
 * statistics.  (It's annoying not to be able to do that here, but we want to
 * merge that update with others; see index_update_stats.)  Note that the
 * index AM itself must keep track of the number of index tuples; we don't do
 * so here because the AM might reject some of the tuples for its own reasons,
 * such as being unable to store NULLs.
 *
 * If 'progress', the PROGRESS_SCAN_BLOCKS_TOTAL counter is updated when
 * starting the scan, and PROGRESS_SCAN_BLOCKS_DONE is updated as we go along.
 *
 * A side effect is to set indexInfo->ii_BrokenHotChain to true if we detect
 * any potentially broken HOT chains.  Currently, we set this if there are any
 * RECENTLY_DEAD or DELETE_IN_PROGRESS entries in a HOT chain, without trying
 * very hard to detect whether they're really incompatible with the chain tip.
 * This only really makes sense for heap AM, it might need to be generalized
 * for other AMs later.
 */
static inline double
table_index_build_scan(Relation table_rel,
					   Relation index_rel,
					   struct IndexInfo *index_info,
					   bool allow_sync,
					   bool progress,
					   IndexBuildCallback callback,
					   void *callback_state,
					   TableScanDesc scan)
{
	/* whole-table build: range [0, InvalidBlockNumber), anyvisible = false */
	return table_rel->rd_tableam->index_build_range_scan(table_rel,
														 index_rel,
														 index_info,
														 allow_sync,
														 false,
														 progress,
														 0,
														 InvalidBlockNumber,
														 callback,
														 callback_state,
														 scan);
}

/*
 * As table_index_build_scan(), except that instead of scanning the complete
 * table, only the given number of blocks are scanned.  Scan to end-of-rel can
 * be signalled by passing InvalidBlockNumber as numblocks.  Note that
 * restricting the range to scan cannot be done when requesting syncscan.
 *
 * When "anyvisible" mode is requested, all tuples visible to any transaction
 * are indexed and counted as live, including those inserted or deleted by
 * transactions that are still in progress.
 */
static inline double
table_index_build_range_scan(Relation table_rel,
							 Relation index_rel,
							 struct IndexInfo *index_info,
							 bool allow_sync,
							 bool anyvisible,
							 bool progress,
							 BlockNumber start_blockno,
							 BlockNumber numblocks,
							 IndexBuildCallback callback,
							 void *callback_state,
							 TableScanDesc scan)
{
	return table_rel->rd_tableam->index_build_range_scan(table_rel,
														 index_rel,
														 index_info,
														 allow_sync,
														 anyvisible,
														 progress,
														 start_blockno,
														 numblocks,
														 callback,
														 callback_state,
														 scan);
}

/*
 * table_index_validate_scan - second table scan for concurrent index build
 *
 * See validate_index() for an explanation.
 */
static inline void
table_index_validate_scan(Relation table_rel,
						  Relation index_rel,
						  struct IndexInfo *index_info,
						  Snapshot snapshot,
						  struct ValidateIndexState *state)
{
	table_rel->rd_tableam->index_validate_scan(table_rel,
											   index_rel,
											   index_info,
											   snapshot,
											   state);
}


/* ----------------------------------------------------------------------------
 * Miscellaneous functionality
 * ----------------------------------------------------------------------------
 */

/*
 * Return the current size of `rel` in bytes.  If `forkNumber` is
 * InvalidForkNumber, return the relation's overall size, otherwise the size
 * for the indicated fork.
 *
 * Note that the overall size might not be the equivalent of the sum of sizes
 * for the individual forks for some AMs, e.g. because the AMs storage does
 * not neatly map onto the builtin types of forks.
 */
static inline uint64
table_relation_size(Relation rel, ForkNumber forkNumber)
{
	return rel->rd_tableam->relation_size(rel, forkNumber);
}

/*
 * table_relation_needs_toast_table - does this relation need a toast table?
 */
static inline bool
table_relation_needs_toast_table(Relation rel)
{
	return rel->rd_tableam->relation_needs_toast_table(rel);
}


/* ----------------------------------------------------------------------------
 * Planner related functionality
 * ----------------------------------------------------------------------------
 */

/*
 * Estimate the current size of the relation, as an AM specific workhorse for
 * estimate_rel_size().  Look there for an explanation of the parameters.
 */
static inline void
table_relation_estimate_size(Relation rel, int32 *attr_widths,
							 BlockNumber *pages, double *tuples,
							 double *allvisfrac)
{
	rel->rd_tableam->relation_estimate_size(rel, attr_widths, pages, tuples,
											allvisfrac);
}


/* ----------------------------------------------------------------------------
 * Executor related functionality
 * ----------------------------------------------------------------------------
 */

/*
 * Prepare to fetch / check / return tuples from `tbmres->blockno` as part of
 * a bitmap table scan.  `scan` needs to have been started via
 * table_beginscan_bm().  Returns false if there are no tuples to be found on
 * the page, true otherwise.
 *
 * Note, this is an optionally implemented function, therefore should only be
 * used after verifying the presence (at plan time or such).
 */
static inline bool
table_scan_bitmap_next_block(TableScanDesc scan,
							 struct TBMIterateResult *tbmres)
{
	return scan->rs_rd->rd_tableam->scan_bitmap_next_block(scan,
														   tbmres);
}

/*
 * Fetch the next tuple of a bitmap table scan into `slot` and return true if
 * a visible tuple was found, false otherwise.
 * table_scan_bitmap_next_block() needs to previously have selected a
 * block (i.e. returned true), and no previous
 * table_scan_bitmap_next_tuple() for the same block may have
 * returned false.
 */
static inline bool
table_scan_bitmap_next_tuple(TableScanDesc scan,
							 struct TBMIterateResult *tbmres,
							 TupleTableSlot *slot)
{
	return scan->rs_rd->rd_tableam->scan_bitmap_next_tuple(scan,
														   tbmres,
														   slot);
}

/*
 * Prepare to fetch tuples from the next block in a sample scan.  Returns false
 * if the sample scan is finished, true otherwise.  `scan` needs to have been
 * started via table_beginscan_sampling().
 *
 * This will call the TsmRoutine's NextSampleBlock() callback if necessary
 * (i.e. NextSampleBlock is not NULL), or perform a sequential scan over the
 * underlying relation.
 */
static inline bool
table_scan_sample_next_block(TableScanDesc scan,
							 struct SampleScanState *scanstate)
{
	return scan->rs_rd->rd_tableam->scan_sample_next_block(scan, scanstate);
}

/*
 * Fetch the next sample tuple into `slot` and return true if a visible tuple
 * was found, false otherwise.  table_scan_sample_next_block() needs to
 * previously have selected a block (i.e. returned true), and no previous
 * table_scan_sample_next_tuple() for the same block may have returned false.
 *
 * This will call the TsmRoutine's NextSampleTuple() callback.
 */
static inline bool
table_scan_sample_next_tuple(TableScanDesc scan,
							 struct SampleScanState *scanstate,
							 TupleTableSlot *slot)
{
	return scan->rs_rd->rd_tableam->scan_sample_next_tuple(scan, scanstate,
														   slot);
}


/* ----------------------------------------------------------------------------
 * Functions to make modifications a bit simpler.
 * ----------------------------------------------------------------------------
 */

extern void simple_table_tuple_insert(Relation rel, TupleTableSlot *slot);
extern void simple_table_tuple_delete(Relation rel, ItemPointer tid,
									  Snapshot snapshot);
extern void simple_table_tuple_update(Relation rel, ItemPointer otid,
									  TupleTableSlot *slot, Snapshot snapshot,
									  bool *update_indexes);


/* ----------------------------------------------------------------------------
 * Helper functions to implement parallel scans for block oriented AMs.
 * ----------------------------------------------------------------------------
 */

extern Size table_block_parallelscan_estimate(Relation rel);
extern Size table_block_parallelscan_initialize(Relation rel,
												ParallelTableScanDesc pscan);
extern void table_block_parallelscan_reinitialize(Relation rel,
												  ParallelTableScanDesc pscan);
extern BlockNumber table_block_parallelscan_nextpage(Relation rel,
													 ParallelBlockTableScanDesc pbscan);
extern void table_block_parallelscan_startblock_init(Relation rel,
													 ParallelBlockTableScanDesc pbscan);


/* ----------------------------------------------------------------------------
 * Functions in tableamapi.c
 * ----------------------------------------------------------------------------
 */

extern const TableAmRoutine *GetTableAmRoutine(Oid amhandler);
extern const TableAmRoutine *GetHeapamTableAmRoutine(void);
extern bool check_default_table_access_method(char **newval, void **extra,
											  GucSource source);

#endif							/* TABLEAM_H */