1 /*------------------------------------------------------------------------- 2 * 3 * storage.c 4 * code to create and destroy physical storage for relations 5 * 6 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group 7 * Portions Copyright (c) 1994, Regents of the University of California 8 * 9 * 10 * IDENTIFICATION 11 * src/backend/catalog/storage.c 12 * 13 * NOTES 14 * Some of this code used to be in storage/smgr/smgr.c, and the 15 * function names still reflect that. 16 * 17 *------------------------------------------------------------------------- 18 */ 19 20 #include "postgres.h" 21 22 #include "access/parallel.h" 23 #include "access/visibilitymap.h" 24 #include "access/xact.h" 25 #include "access/xlog.h" 26 #include "access/xloginsert.h" 27 #include "access/xlogutils.h" 28 #include "catalog/storage.h" 29 #include "catalog/storage_xlog.h" 30 #include "miscadmin.h" 31 #include "storage/freespace.h" 32 #include "storage/smgr.h" 33 #include "utils/hsearch.h" 34 #include "utils/memutils.h" 35 #include "utils/rel.h" 36 37 /* GUC variables */ 38 int wal_skip_threshold = 2048; /* in kilobytes */ 39 40 /* 41 * We keep a list of all relations (represented as RelFileNode values) 42 * that have been created or deleted in the current transaction. When 43 * a relation is created, we create the physical file immediately, but 44 * remember it so that we can delete the file again if the current 45 * transaction is aborted. Conversely, a deletion request is NOT 46 * executed immediately, but is just entered in the list. When and if 47 * the transaction commits, we can delete the physical file. 48 * 49 * To handle subtransactions, every entry is marked with its transaction 50 * nesting level. At subtransaction commit, we reassign the subtransaction's 51 * entries to the parent nesting level. At subtransaction abort, we can 52 * immediately execute the abort-time actions for all entries of the current 53 * nesting level. 
 *
 * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
 * unbetimes.  It'd probably be OK to keep it in TopTransactionContext,
 * but I'm being paranoid.
 */

typedef struct PendingRelDelete
{
	RelFileNode relnode;		/* relation that may need to be deleted */
	BackendId	backend;		/* InvalidBackendId if not a temp rel */
	bool		atCommit;		/* T=delete at commit; F=delete at abort */
	int			nestLevel;		/* xact nesting level of request */
	struct PendingRelDelete *next;	/* linked-list link */
} PendingRelDelete;

/*
 * Entry of pendingSyncHash, keyed by rnode.  Tracks a relfilenode created
 * in the current transaction whose changes may skip WAL and therefore must
 * be synced (or WAL-logged) at commit; see smgrDoPendingSyncs().
 */
typedef struct PendingRelSync
{
	RelFileNode rnode;			/* hash key */
	bool		is_truncated;	/* Has the file experienced truncation? */
} PendingRelSync;

static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
HTAB	   *pendingSyncHash = NULL;


/*
 * AddPendingSync
 *		Queue an at-commit fsync.
 *
 * The hash table is created on first use in TopTransactionContext, so it
 * vanishes automatically at transaction end.  Callers must not queue the
 * same relfilenode twice (asserted below).
 */
static void
AddPendingSync(const RelFileNode *rnode)
{
	PendingRelSync *pending;
	bool		found;

	/* create the hash if not yet */
	if (!pendingSyncHash)
	{
		HASHCTL		ctl;

		/* only the fields selected by the HASH_* flags below are read */
		ctl.keysize = sizeof(RelFileNode);
		ctl.entrysize = sizeof(PendingRelSync);
		ctl.hcxt = TopTransactionContext;
		pendingSyncHash = hash_create("pending sync hash", 16, &ctl,
									  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
	}

	pending = hash_search(pendingSyncHash, rnode, HASH_ENTER, &found);
	Assert(!found);
	pending->is_truncated = false;
}

/*
 * RelationCreateStorage
 *		Create physical storage for a relation.
 *
 * Create the underlying disk file storage for the relation. This only
 * creates the main fork; additional forks are created lazily by the
 * modules that need them.
 *
 * This function is transactional. The creation is WAL-logged, and if the
 * transaction aborts later on, the storage will be destroyed.
 */
SMgrRelation
RelationCreateStorage(RelFileNode rnode, char relpersistence)
{
	PendingRelDelete *pending;
	SMgrRelation srel;
	BackendId	backend;
	bool		needs_wal;

	Assert(!IsInParallelMode());	/* couldn't update pendingSyncHash */

	/* Persistence determines both the backend tag and whether to WAL-log */
	switch (relpersistence)
	{
		case RELPERSISTENCE_TEMP:
			backend = BackendIdForTempRelations();
			needs_wal = false;
			break;
		case RELPERSISTENCE_UNLOGGED:
			backend = InvalidBackendId;
			needs_wal = false;
			break;
		case RELPERSISTENCE_PERMANENT:
			backend = InvalidBackendId;
			needs_wal = true;
			break;
		default:
			elog(ERROR, "invalid relpersistence: %c", relpersistence);
			return NULL;		/* placate compiler */
	}

	srel = smgropen(rnode, backend);
	smgrcreate(srel, MAIN_FORKNUM, false);

	if (needs_wal)
		log_smgrcreate(&srel->smgr_rnode.node, MAIN_FORKNUM);

	/* Add the relation to the list of stuff to delete at abort */
	pending = (PendingRelDelete *)
		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
	pending->relnode = rnode;
	pending->backend = backend;
	pending->atCommit = false;	/* delete if abort */
	pending->nestLevel = GetCurrentTransactionNestLevel();
	pending->next = pendingDeletes;
	pendingDeletes = pending;

	/*
	 * With wal_level=minimal (!XLogIsNeeded()), changes to this new
	 * permanent relfilenode may skip WAL; remember that it must be synced
	 * (or WAL-logged wholesale) at commit.
	 */
	if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded())
	{
		Assert(backend == InvalidBackendId);
		AddPendingSync(&rnode);
	}

	return srel;
}

/*
 * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL.
 */
void
log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum)
{
	xl_smgr_create xlrec;

	/*
	 * Make an XLOG entry reporting the file creation.
	 */
	xlrec.rnode = *rnode;
	xlrec.forkNum = forkNum;

	XLogBeginInsert();
	XLogRegisterData((char *) &xlrec, sizeof(xlrec));
	XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE);
}

/*
 * RelationDropStorage
 *		Schedule unlinking of physical storage at transaction commit.
 */
void
RelationDropStorage(Relation rel)
{
	PendingRelDelete *pending;

	/* Add the relation to the list of stuff to delete at commit */
	pending = (PendingRelDelete *)
		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
	pending->relnode = rel->rd_node;
	pending->backend = rel->rd_backend;
	pending->atCommit = true;	/* delete if commit */
	pending->nestLevel = GetCurrentTransactionNestLevel();
	pending->next = pendingDeletes;
	pendingDeletes = pending;

	/*
	 * NOTE: if the relation was created in this transaction, it will now be
	 * present in the pending-delete list twice, once with atCommit true and
	 * once with atCommit false. Hence, it will be physically deleted at end
	 * of xact in either case (and the other entry will be ignored by
	 * smgrDoPendingDeletes, so no error will occur). We could instead remove
	 * the existing list entry and delete the physical file immediately, but
	 * for now I'll keep the logic simple.
	 */

	RelationCloseSmgr(rel);
}

/*
 * RelationPreserveStorage
 *		Mark a relation as not to be deleted after all.
 *
 * We need this function because relation mapping changes are committed
 * separately from commit of the whole transaction, so it's still possible
 * for the transaction to abort after the mapping update is done.
 * When a new physical relation is installed in the map, it would be
 * scheduled for delete-on-abort, so we'd delete it, and be in trouble.
 * The relation mapper fixes this by telling us to not delete such relations
 * after all as part of its commit.
 *
 * We also use this to reuse an old build of an index during ALTER TABLE, this
 * time removing the delete-at-commit entry.
 *
 * No-op if the relation is not among those scheduled for deletion.
 */
void
RelationPreserveStorage(RelFileNode rnode, bool atCommit)
{
	PendingRelDelete *pending;
	PendingRelDelete *prev;
	PendingRelDelete *next;

	/* Scan the whole list; duplicates are possible (see above), drop all */
	prev = NULL;
	for (pending = pendingDeletes; pending != NULL; pending = next)
	{
		next = pending->next;
		if (RelFileNodeEquals(rnode, pending->relnode)
			&& pending->atCommit == atCommit)
		{
			/* unlink and delete list entry */
			if (prev)
				prev->next = next;
			else
				pendingDeletes = next;
			pfree(pending);
			/* prev does not change */
		}
		else
		{
			/* unrelated entry, don't touch it */
			prev = pending;
		}
	}
}

/*
 * RelationTruncate
 *		Physically truncate a relation to the specified number of blocks.
 *
 * This includes getting rid of any buffers for the blocks that are to be
 * dropped.
 */
void
RelationTruncate(Relation rel, BlockNumber nblocks)
{
	bool		fsm;
	bool		vm;
	bool		need_fsm_vacuum = false;
	ForkNumber	forks[MAX_FORKNUM];
	BlockNumber blocks[MAX_FORKNUM];
	int			nforks = 0;

	/* Open it at the smgr level if not already done */
	RelationOpenSmgr(rel);

	/*
	 * Make sure smgr_targblock etc aren't pointing somewhere past new end
	 */
	rel->rd_smgr->smgr_targblock = InvalidBlockNumber;
	for (int i = 0; i <= MAX_FORKNUM; ++i)
		rel->rd_smgr->smgr_cached_nblocks[i] = InvalidBlockNumber;

	/* Prepare for truncation of MAIN fork of the relation */
	forks[nforks] = MAIN_FORKNUM;
	blocks[nforks] = nblocks;
	nforks++;

	/* Prepare for truncation of the FSM if it exists */
	fsm = smgrexists(rel->rd_smgr, FSM_FORKNUM);
	if (fsm)
	{
		blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, nblocks);
		if (BlockNumberIsValid(blocks[nforks]))
		{
			forks[nforks] = FSM_FORKNUM;
			nforks++;
			need_fsm_vacuum = true;
		}
	}

	/* Prepare for truncation of the visibility map too if it exists */
	vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
	if (vm)
	{
		blocks[nforks] = visibilitymap_prepare_truncate(rel, nblocks);
		if (BlockNumberIsValid(blocks[nforks]))
		{
			forks[nforks] = VISIBILITYMAP_FORKNUM;
			nforks++;
		}
	}

	/* Let the WAL-skip machinery know the file is about to shrink */
	RelationPreTruncate(rel);

	/*
	 * We WAL-log the truncation before actually truncating, which means
	 * trouble if the truncation fails. If we then crash, the WAL replay
	 * likely isn't going to succeed in the truncation either, and cause a
	 * PANIC. It's tempting to put a critical section here, but that cure
	 * would be worse than the disease. It would turn a usually harmless
	 * failure to truncate, that might spell trouble at WAL replay, into a
	 * certain PANIC.
	 */
	if (RelationNeedsWAL(rel))
	{
		/*
		 * Make an XLOG entry reporting the file truncation.
		 */
		XLogRecPtr	lsn;
		xl_smgr_truncate xlrec;

		xlrec.blkno = nblocks;
		xlrec.rnode = rel->rd_node;
		xlrec.flags = SMGR_TRUNCATE_ALL;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, sizeof(xlrec));

		lsn = XLogInsert(RM_SMGR_ID,
						 XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);

		/*
		 * Flush, because otherwise the truncation of the main relation might
		 * hit the disk before the WAL record, and the truncation of the FSM
		 * or visibility map. If we crashed during that window, we'd be left
		 * with a truncated heap, but the FSM or visibility map would still
		 * contain entries for the non-existent heap pages.
		 */
		if (fsm || vm)
			XLogFlush(lsn);
	}

	/* Do the real work to truncate relation forks */
	smgrtruncate(rel->rd_smgr, forks, nforks, blocks);

	/*
	 * Update upper-level FSM pages to account for the truncation. This is
	 * important because the just-truncated pages were likely marked as
	 * all-free, and would be preferentially selected.
	 */
	if (need_fsm_vacuum)
		FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);
}

/*
 * RelationPreTruncate
 *		Perform AM-independent work before a physical truncation.
 *
 * If an access method's relation_nontransactional_truncate does not call
 * RelationTruncate(), it must call this before decreasing the table size.
 */
void
RelationPreTruncate(Relation rel)
{
	PendingRelSync *pending;

	/* Nothing to do unless some relfilenode is pending an at-commit sync */
	if (!pendingSyncHash)
		return;
	RelationOpenSmgr(rel);

	/*
	 * Once truncated, the file must be fsync'd (not WAL-logged) at commit;
	 * see the explanation in smgrDoPendingSyncs().
	 */
	pending = hash_search(pendingSyncHash, &(rel->rd_smgr->smgr_rnode.node),
						  HASH_FIND, NULL);
	if (pending)
		pending->is_truncated = true;
}

/*
 * Copy a fork's data, block by block.
 *
 * Note that this requires that there is no dirty data in shared buffers. If
 * it's possible that there are, callers need to flush those using
 * e.g. FlushRelationBuffers(rel).
 */
void
RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
					ForkNumber forkNum, char relpersistence)
{
	PGAlignedBlock buf;			/* page-aligned scratch buffer */
	Page		page;
	bool		use_wal;
	bool		copying_initfork;
	BlockNumber nblocks;
	BlockNumber blkno;

	page = (Page) buf.data;

	/*
	 * The init fork for an unlogged relation in many respects has to be
	 * treated the same as normal relation, changes need to be WAL logged and
	 * it needs to be synced to disk.
	 */
	copying_initfork = relpersistence == RELPERSISTENCE_UNLOGGED &&
		forkNum == INIT_FORKNUM;

	/*
	 * We need to log the copied data in WAL iff WAL archiving/streaming is
	 * enabled AND it's a permanent relation. This gives the same answer as
	 * "RelationNeedsWAL(rel) || copying_initfork", because we know the
	 * current operation created a new relfilenode.
	 */
	use_wal = XLogIsNeeded() &&
		(relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);

	nblocks = smgrnblocks(src, forkNum);

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		/* If we got a cancel signal during the copy of the data, quit */
		CHECK_FOR_INTERRUPTS();

		smgrread(src, forkNum, blkno, buf.data);

		/* Verify checksum/header before propagating the page */
		if (!PageIsVerifiedExtended(page, blkno,
									PIV_LOG_WARNING | PIV_REPORT_STAT))
			ereport(ERROR,
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("invalid page in block %u of relation %s",
							blkno,
							relpathbackend(src->smgr_rnode.node,
										   src->smgr_rnode.backend,
										   forkNum))));

		/*
		 * WAL-log the copied page. Unfortunately we don't know what kind of a
		 * page this is, so we have to log the full page including any unused
		 * space.
		 */
		if (use_wal)
			log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page, false);

		PageSetChecksumInplace(page, blkno);

		/*
		 * Now write the page.  We say skipFsync = true because there's no
		 * need for smgr to schedule an fsync for this write; we'll do it
		 * ourselves below.
		 */
		smgrextend(dst, forkNum, blkno, buf.data, true);
	}

	/*
	 * When we WAL-logged rel pages, we must nonetheless fsync them. The
	 * reason is that since we're copying outside shared buffers, a CHECKPOINT
	 * occurring during the copy has no way to flush the previously written
	 * data to disk (indeed it won't know the new rel even exists). A crash
	 * later on would replay WAL from the checkpoint, therefore it wouldn't
	 * replay our earlier WAL entries. If we do not fsync those pages here,
	 * they might still not be on disk when the crash occurs.
	 */
	if (use_wal || copying_initfork)
		smgrimmedsync(dst, forkNum);
}

/*
 * RelFileNodeSkippingWAL
 *		Check if a BM_PERMANENT relfilenode is using WAL.
 *
 * Changes of certain relfilenodes must not write WAL; see "Skipping WAL for
 * New RelFileNode" in src/backend/access/transam/README. Though it is known
 * from Relation efficiently, this function is intended for the code paths not
 * having access to Relation.
 */
bool
RelFileNodeSkippingWAL(RelFileNode rnode)
{
	/* WAL is skipped iff the node is registered in pendingSyncHash */
	if (!pendingSyncHash ||
		hash_search(pendingSyncHash, &rnode, HASH_FIND, NULL) == NULL)
		return false;

	return true;
}

/*
 * EstimatePendingSyncsSpace
 *		Estimate space needed to pass syncs to parallel workers.
 *
 * The serialized form (see SerializePendingSyncs) is an array of
 * RelFileNodes followed by one zeroed terminator entry; hence "1 + entries".
 */
Size
EstimatePendingSyncsSpace(void)
{
	long		entries;

	entries = pendingSyncHash ? hash_get_num_entries(pendingSyncHash) : 0;
	return mul_size(1 + entries, sizeof(RelFileNode));
}

/*
 * SerializePendingSyncs
 *		Serialize syncs for parallel workers.
 */
void
SerializePendingSyncs(Size maxSize, char *startAddress)
{
	HTAB	   *tmphash;
	HASHCTL		ctl;
	HASH_SEQ_STATUS scan;
	PendingRelSync *sync;
	PendingRelDelete *delete;
	RelFileNode *src;
	RelFileNode *dest = (RelFileNode *) startAddress;

	/* With no pending syncs, emit just the zeroed terminator entry */
	if (!pendingSyncHash)
		goto terminate;

	/* Create temporary hash to collect active relfilenodes */
	ctl.keysize = sizeof(RelFileNode);
	ctl.entrysize = sizeof(RelFileNode);
	ctl.hcxt = CurrentMemoryContext;
	tmphash = hash_create("tmp relfilenodes",
						  hash_get_num_entries(pendingSyncHash), &ctl,
						  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);

	/* collect all rnodes from pending syncs */
	hash_seq_init(&scan, pendingSyncHash);
	while ((sync = (PendingRelSync *) hash_seq_search(&scan)))
		(void) hash_search(tmphash, &sync->rnode, HASH_ENTER, NULL);

	/* remove deleted rnodes */
	for (delete = pendingDeletes; delete != NULL; delete = delete->next)
		if (delete->atCommit)
			(void) hash_search(tmphash, (void *) &delete->relnode,
							   HASH_REMOVE, NULL);

	/* copy the surviving rnodes into the shared-memory array */
	hash_seq_init(&scan, tmphash);
	while ((src = (RelFileNode *) hash_seq_search(&scan)))
		*dest++ = *src;

	hash_destroy(tmphash);

terminate:
	/* zeroed entry marks end-of-array for RestorePendingSyncs */
	MemSet(dest, 0, sizeof(RelFileNode));
}

/*
 * RestorePendingSyncs
 *		Restore syncs within a parallel worker.
 *
 * RelationNeedsWAL() and RelFileNodeSkippingWAL() must offer the correct
 * answer to parallel workers. Only smgrDoPendingSyncs() reads the
 * is_truncated field, at end of transaction. Hence, don't restore it.
 */
void
RestorePendingSyncs(char *startAddress)
{
	RelFileNode *rnode;

	Assert(pendingSyncHash == NULL);
	/* the array is terminated by an entry with relNode == 0 */
	for (rnode = (RelFileNode *) startAddress; rnode->relNode != 0; rnode++)
		AddPendingSync(rnode);
}

/*
 * smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
 *
 * This also runs when aborting a subxact; we want to clean up a failed
 * subxact immediately.
 *
 * Note: It's possible that we're being asked to remove a relation that has
 * no physical storage in any fork. In particular, it's possible that we're
 * cleaning up an old temporary relation for which RemovePgTempFiles has
 * already recovered the physical storage.
 */
void
smgrDoPendingDeletes(bool isCommit)
{
	int			nestLevel = GetCurrentTransactionNestLevel();
	PendingRelDelete *pending;
	PendingRelDelete *prev;
	PendingRelDelete *next;
	int			nrels = 0,
				maxrels = 0;
	SMgrRelation *srels = NULL;

	prev = NULL;
	for (pending = pendingDeletes; pending != NULL; pending = next)
	{
		next = pending->next;
		if (pending->nestLevel < nestLevel)
		{
			/* outer-level entries should not be processed yet */
			prev = pending;
		}
		else
		{
			/* unlink list entry first, so we don't retry on failure */
			if (prev)
				prev->next = next;
			else
				pendingDeletes = next;
			/* do deletion if called for */
			if (pending->atCommit == isCommit)
			{
				SMgrRelation srel;

				srel = smgropen(pending->relnode, pending->backend);

				/* allocate the initial array, or extend it, if needed */
				if (maxrels == 0)
				{
					maxrels = 8;
					srels = palloc(sizeof(SMgrRelation) * maxrels);
				}
				else if (maxrels <= nrels)
				{
					maxrels *= 2;
					srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
				}

				srels[nrels++] = srel;
			}
			/* must explicitly free the list entry */
			pfree(pending);
			/* prev does not change */
		}
	}

	/* unlink all collected relations in one call, then close them */
	if (nrels > 0)
	{
		smgrdounlinkall(srels, nrels, false);

		for (int i = 0; i < nrels; i++)
			smgrclose(srels[i]);

		pfree(srels);
	}
}

/*
 * smgrDoPendingSyncs() -- Take care of relation syncs at end of xact.
662 */ 663 void 664 smgrDoPendingSyncs(bool isCommit, bool isParallelWorker) 665 { 666 PendingRelDelete *pending; 667 int nrels = 0, 668 maxrels = 0; 669 SMgrRelation *srels = NULL; 670 HASH_SEQ_STATUS scan; 671 PendingRelSync *pendingsync; 672 673 Assert(GetCurrentTransactionNestLevel() == 1); 674 675 if (!pendingSyncHash) 676 return; /* no relation needs sync */ 677 678 /* Abort -- just throw away all pending syncs */ 679 if (!isCommit) 680 { 681 pendingSyncHash = NULL; 682 return; 683 } 684 685 AssertPendingSyncs_RelationCache(); 686 687 /* Parallel worker -- just throw away all pending syncs */ 688 if (isParallelWorker) 689 { 690 pendingSyncHash = NULL; 691 return; 692 } 693 694 /* Skip syncing nodes that smgrDoPendingDeletes() will delete. */ 695 for (pending = pendingDeletes; pending != NULL; pending = pending->next) 696 if (pending->atCommit) 697 (void) hash_search(pendingSyncHash, (void *) &pending->relnode, 698 HASH_REMOVE, NULL); 699 700 hash_seq_init(&scan, pendingSyncHash); 701 while ((pendingsync = (PendingRelSync *) hash_seq_search(&scan))) 702 { 703 ForkNumber fork; 704 BlockNumber nblocks[MAX_FORKNUM + 1]; 705 BlockNumber total_blocks = 0; 706 SMgrRelation srel; 707 708 srel = smgropen(pendingsync->rnode, InvalidBackendId); 709 710 /* 711 * We emit newpage WAL records for smaller relations. 712 * 713 * Small WAL records have a chance to be emitted along with other 714 * backends' WAL records. We emit WAL records instead of syncing for 715 * files that are smaller than a certain threshold, expecting faster 716 * commit. The threshold is defined by the GUC wal_skip_threshold. 
717 */ 718 if (!pendingsync->is_truncated) 719 { 720 for (fork = 0; fork <= MAX_FORKNUM; fork++) 721 { 722 if (smgrexists(srel, fork)) 723 { 724 BlockNumber n = smgrnblocks(srel, fork); 725 726 /* we shouldn't come here for unlogged relations */ 727 Assert(fork != INIT_FORKNUM); 728 nblocks[fork] = n; 729 total_blocks += n; 730 } 731 else 732 nblocks[fork] = InvalidBlockNumber; 733 } 734 } 735 736 /* 737 * Sync file or emit WAL records for its contents. 738 * 739 * Although we emit WAL record if the file is small enough, do file 740 * sync regardless of the size if the file has experienced a 741 * truncation. It is because the file would be followed by trailing 742 * garbage blocks after a crash recovery if, while a past longer file 743 * had been flushed out, we omitted syncing-out of the file and 744 * emitted WAL instead. You might think that we could choose WAL if 745 * the current main fork is longer than ever, but there's a case where 746 * main fork is longer than ever but FSM fork gets shorter. 747 */ 748 if (pendingsync->is_truncated || 749 total_blocks * BLCKSZ / 1024 >= wal_skip_threshold) 750 { 751 /* allocate the initial array, or extend it, if needed */ 752 if (maxrels == 0) 753 { 754 maxrels = 8; 755 srels = palloc(sizeof(SMgrRelation) * maxrels); 756 } 757 else if (maxrels <= nrels) 758 { 759 maxrels *= 2; 760 srels = repalloc(srels, sizeof(SMgrRelation) * maxrels); 761 } 762 763 srels[nrels++] = srel; 764 } 765 else 766 { 767 /* Emit WAL records for all blocks. The file is small enough. */ 768 for (fork = 0; fork <= MAX_FORKNUM; fork++) 769 { 770 int n = nblocks[fork]; 771 Relation rel; 772 773 if (!BlockNumberIsValid(n)) 774 continue; 775 776 /* 777 * Emit WAL for the whole file. Unfortunately we don't know 778 * what kind of a page this is, so we have to log the full 779 * page including any unused space. ReadBufferExtended() 780 * counts some pgstat events; unfortunately, we discard them. 
781 */ 782 rel = CreateFakeRelcacheEntry(srel->smgr_rnode.node); 783 log_newpage_range(rel, fork, 0, n, false); 784 FreeFakeRelcacheEntry(rel); 785 } 786 } 787 } 788 789 pendingSyncHash = NULL; 790 791 if (nrels > 0) 792 { 793 smgrdosyncall(srels, nrels); 794 pfree(srels); 795 } 796 } 797 798 /* 799 * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted. 800 * 801 * The return value is the number of relations scheduled for termination. 802 * *ptr is set to point to a freshly-palloc'd array of RelFileNodes. 803 * If there are no relations to be deleted, *ptr is set to NULL. 804 * 805 * Only non-temporary relations are included in the returned list. This is OK 806 * because the list is used only in contexts where temporary relations don't 807 * matter: we're either writing to the two-phase state file (and transactions 808 * that have touched temp tables can't be prepared) or we're writing to xlog 809 * (and all temporary files will be zapped if we restart anyway, so no need 810 * for redo to do it also). 811 * 812 * Note that the list does not include anything scheduled for termination 813 * by upper-level transactions. 
 */
int
smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
{
	int			nestLevel = GetCurrentTransactionNestLevel();
	int			nrels;
	RelFileNode *rptr;
	PendingRelDelete *pending;

	/* first pass: count qualifying entries so we can size the array */
	nrels = 0;
	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
	{
		if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
			&& pending->backend == InvalidBackendId)
			nrels++;
	}
	if (nrels == 0)
	{
		*ptr = NULL;
		return 0;
	}
	rptr = (RelFileNode *) palloc(nrels * sizeof(RelFileNode));
	*ptr = rptr;
	/* second pass: copy the qualifying RelFileNodes into the array */
	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
	{
		if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
			&& pending->backend == InvalidBackendId)
		{
			*rptr = pending->relnode;
			rptr++;
		}
	}
	return nrels;
}

/*
 * PostPrepare_smgr -- Clean up after a successful PREPARE
 *
 * What we have to do here is throw away the in-memory state about pending
 * relation deletes. It's all been recorded in the 2PC state file and
 * it's no longer smgr's job to worry about it.
 */
void
PostPrepare_smgr(void)
{
	PendingRelDelete *pending;
	PendingRelDelete *next;

	/* discard (and free) every entry; no file operations are performed */
	for (pending = pendingDeletes; pending != NULL; pending = next)
	{
		next = pending->next;
		pendingDeletes = next;
		/* must explicitly free the list entry */
		pfree(pending);
	}
}


/*
 * AtSubCommit_smgr() --- Take care of subtransaction commit.
 *
 * Reassign all items in the pending-deletes list to the parent transaction.
 */
void
AtSubCommit_smgr(void)
{
	int			nestLevel = GetCurrentTransactionNestLevel();
	PendingRelDelete *pending;

	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
	{
		if (pending->nestLevel >= nestLevel)
			pending->nestLevel = nestLevel - 1;
	}
}

/*
 * AtSubAbort_smgr() --- Take care of subtransaction abort.
 *
 * Delete created relations and forget about deleted relations.
 * We can execute these operations immediately because we know this
 * subtransaction will not commit.
 */
void
AtSubAbort_smgr(void)
{
	smgrDoPendingDeletes(false);
}

/*
 * smgr_redo() --- WAL replay for smgr resource manager records
 * (XLOG_SMGR_CREATE and XLOG_SMGR_TRUNCATE).
 */
void
smgr_redo(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

	/* Backup blocks are not used in smgr records */
	Assert(!XLogRecHasAnyBlockRefs(record));

	if (info == XLOG_SMGR_CREATE)
	{
		xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
		SMgrRelation reln;

		reln = smgropen(xlrec->rnode, InvalidBackendId);
		/* isRedo = true: tolerate the file already existing */
		smgrcreate(reln, xlrec->forkNum, true);
	}
	else if (info == XLOG_SMGR_TRUNCATE)
	{
		xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
		SMgrRelation reln;
		Relation	rel;
		ForkNumber	forks[MAX_FORKNUM];
		BlockNumber blocks[MAX_FORKNUM];
		int			nforks = 0;
		bool		need_fsm_vacuum = false;

		reln = smgropen(xlrec->rnode, InvalidBackendId);

		/*
		 * Forcibly create relation if it doesn't exist (which suggests that
		 * it was dropped somewhere later in the WAL sequence). As in
		 * XLogReadBufferForRedo, we prefer to recreate the rel and replay the
		 * log as best we can until the drop is seen.
		 */
		smgrcreate(reln, MAIN_FORKNUM, true);

		/*
		 * Before we perform the truncation, update minimum recovery point to
		 * cover this WAL record. Once the relation is truncated, there's no
		 * going back.  The buffer manager enforces the WAL-first rule for
		 * normal updates to relation files, so that the minimum recovery
		 * point is always updated before the corresponding change in the data
		 * file is flushed to disk. We have to do the same manually here.
		 *
		 * Doing this before the truncation means that if the truncation fails
		 * for some reason, you cannot start up the system even after restart,
		 * until you fix the underlying situation so that the truncation will
		 * succeed. Alternatively, we could update the minimum recovery point
		 * after truncation, but that would leave a small window where the
		 * WAL-first rule could be violated.
		 */
		XLogFlush(lsn);

		/* Prepare for truncation of MAIN fork */
		if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
		{
			forks[nforks] = MAIN_FORKNUM;
			blocks[nforks] = xlrec->blkno;
			nforks++;

			/* Also tell xlogutils.c about it */
			XLogTruncateRelation(xlrec->rnode, MAIN_FORKNUM, xlrec->blkno);
		}

		/* Prepare for truncation of FSM and VM too */
		rel = CreateFakeRelcacheEntry(xlrec->rnode);

		if ((xlrec->flags & SMGR_TRUNCATE_FSM) != 0 &&
			smgrexists(reln, FSM_FORKNUM))
		{
			blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, xlrec->blkno);
			if (BlockNumberIsValid(blocks[nforks]))
			{
				forks[nforks] = FSM_FORKNUM;
				nforks++;
				need_fsm_vacuum = true;
			}
		}
		if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0 &&
			smgrexists(reln, VISIBILITYMAP_FORKNUM))
		{
			blocks[nforks] = visibilitymap_prepare_truncate(rel, xlrec->blkno);
			if (BlockNumberIsValid(blocks[nforks]))
			{
				forks[nforks] = VISIBILITYMAP_FORKNUM;
				nforks++;
			}
		}

		/* Do the real work to truncate relation forks */
		if (nforks > 0)
			smgrtruncate(reln, forks, nforks, blocks);

		/*
		 * Update upper-level FSM pages to account for the truncation.  This is
		 * important because the just-truncated pages were likely marked as
		 * all-free, and would be preferentially selected.
		 */
		if (need_fsm_vacuum)
			FreeSpaceMapVacuumRange(rel, xlrec->blkno,
									InvalidBlockNumber);

		FreeFakeRelcacheEntry(rel);
	}
	else
		elog(PANIC, "smgr_redo: unknown op code %u", info);
}