/*-------------------------------------------------------------------------
 *
 * storage.c
 *	  code to create and destroy physical storage for relations
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/catalog/storage.c
 *
 * NOTES
 *	  Some of this code used to be in storage/smgr/smgr.c, and the
 *	  function names still reflect that.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/parallel.h"
#include "access/visibilitymap.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
#include "catalog/storage.h"
#include "catalog/storage_xlog.h"
#include "miscadmin.h"
#include "storage/freespace.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "utils/rel.h"

/* GUC variables */
int			wal_skip_threshold = 2048;	/* in kilobytes */

/*
 * We keep a list of all relations (represented as RelFileNode values)
 * that have been created or deleted in the current transaction.  When
 * a relation is created, we create the physical file immediately, but
 * remember it so that we can delete the file again if the current
 * transaction is aborted.  Conversely, a deletion request is NOT
 * executed immediately, but is just entered in the list.  When and if
 * the transaction commits, we can delete the physical file.
 *
 * To handle subtransactions, every entry is marked with its transaction
 * nesting level.  At subtransaction commit, we reassign the subtransaction's
 * entries to the parent nesting level.  At subtransaction abort, we can
 * immediately execute the abort-time actions for all entries of the current
 * nesting level.
 *
 * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
 * prematurely.  It'd probably be OK to keep it in TopTransactionContext,
 * but I'm being paranoid.
 */

typedef struct PendingRelDelete
{
	RelFileNode relnode;		/* relation that may need to be deleted */
	BackendId	backend;		/* InvalidBackendId if not a temp rel */
	bool		atCommit;		/* T=delete at commit; F=delete at abort */
	int			nestLevel;		/* xact nesting level of request */
	struct PendingRelDelete *next;	/* linked-list link */
} PendingRelDelete;

typedef struct PendingRelSync
{
	RelFileNode rnode;
	bool		is_truncated;	/* Has the file experienced truncation? */
} PendingRelSync;

static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
HTAB	   *pendingSyncHash = NULL;

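/*
 * Illustrative sketch (not part of the original file): the pending-delete
 * list is a plain singly-linked list, so membership tests are linear scans
 * over it.  A hypothetical helper that checks whether a given relfilenode is
 * already scheduled for deletion might look like this (the function name is
 * invented for illustration only):
 *
 *	static bool
 *	RelFileNodePendingDelete(RelFileNode rnode, bool atCommit)
 *	{
 *		PendingRelDelete *p;
 *
 *		for (p = pendingDeletes; p != NULL; p = p->next)
 *			if (RelFileNodeEquals(p->relnode, rnode) &&
 *				p->atCommit == atCommit)
 *				return true;
 *		return false;
 *	}
 */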

/*
 * AddPendingSync
 *		Queue an at-commit fsync.
 */
static void
AddPendingSync(const RelFileNode *rnode)
{
	PendingRelSync *pending;
	bool		found;

	/* create the hash if not yet */
	if (!pendingSyncHash)
	{
		HASHCTL		ctl;

		ctl.keysize = sizeof(RelFileNode);
		ctl.entrysize = sizeof(PendingRelSync);
		ctl.hcxt = TopTransactionContext;
		pendingSyncHash = hash_create("pending sync hash", 16, &ctl,
									  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
	}

	pending = hash_search(pendingSyncHash, rnode, HASH_ENTER, &found);
	Assert(!found);
	pending->is_truncated = false;
}

/*
 * RelationCreateStorage
 *		Create physical storage for a relation.
 *
 * Create the underlying disk file storage for the relation. This only
 * creates the main fork; additional forks are created lazily by the
 * modules that need them.
 *
 * This function is transactional. The creation is WAL-logged, and if the
 * transaction aborts later on, the storage will be destroyed.
 */
SMgrRelation
RelationCreateStorage(RelFileNode rnode, char relpersistence)
{
	PendingRelDelete *pending;
	SMgrRelation srel;
	BackendId	backend;
	bool		needs_wal;

	Assert(!IsInParallelMode());	/* couldn't update pendingSyncHash */

	switch (relpersistence)
	{
		case RELPERSISTENCE_TEMP:
			backend = BackendIdForTempRelations();
			needs_wal = false;
			break;
		case RELPERSISTENCE_UNLOGGED:
			backend = InvalidBackendId;
			needs_wal = false;
			break;
		case RELPERSISTENCE_PERMANENT:
			backend = InvalidBackendId;
			needs_wal = true;
			break;
		default:
			elog(ERROR, "invalid relpersistence: %c", relpersistence);
			return NULL;		/* placate compiler */
	}

	srel = smgropen(rnode, backend);
	smgrcreate(srel, MAIN_FORKNUM, false);

	if (needs_wal)
		log_smgrcreate(&srel->smgr_rnode.node, MAIN_FORKNUM);

	/* Add the relation to the list of stuff to delete at abort */
	pending = (PendingRelDelete *)
		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
	pending->relnode = rnode;
	pending->backend = backend;
	pending->atCommit = false;	/* delete if abort */
	pending->nestLevel = GetCurrentTransactionNestLevel();
	pending->next = pendingDeletes;
	pendingDeletes = pending;

	if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded())
	{
		Assert(backend == InvalidBackendId);
		AddPendingSync(&rnode);
	}

	return srel;
}
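
/*
 * Illustrative sketch (not part of the original file): a typical caller
 * creates storage for a freshly assigned relfilenode and relies on the
 * pending-delete mechanism for cleanup.  Variable names are hypothetical.
 *
 *	SMgrRelation srel;
 *
 *	srel = RelationCreateStorage(new_rnode, RELPERSISTENCE_PERMANENT);
 *	... use srel for further smgr operations on the new file ...
 *
 * If the enclosing transaction later aborts, smgrDoPendingDeletes(false)
 * unlinks the file created above; no explicit cleanup call is needed.
 */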

/*
 * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL.
 */
void
log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum)
{
	xl_smgr_create xlrec;

	/*
	 * Make an XLOG entry reporting the file creation.
	 */
	xlrec.rnode = *rnode;
	xlrec.forkNum = forkNum;

	XLogBeginInsert();
	XLogRegisterData((char *) &xlrec, sizeof(xlrec));
	XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE);
}

/*
 * RelationDropStorage
 *		Schedule unlinking of physical storage at transaction commit.
 */
void
RelationDropStorage(Relation rel)
{
	PendingRelDelete *pending;

	/* Add the relation to the list of stuff to delete at commit */
	pending = (PendingRelDelete *)
		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
	pending->relnode = rel->rd_node;
	pending->backend = rel->rd_backend;
	pending->atCommit = true;	/* delete if commit */
	pending->nestLevel = GetCurrentTransactionNestLevel();
	pending->next = pendingDeletes;
	pendingDeletes = pending;

	/*
	 * NOTE: if the relation was created in this transaction, it will now be
	 * present in the pending-delete list twice, once with atCommit true and
	 * once with atCommit false.  Hence, it will be physically deleted at end
	 * of xact in either case (and the other entry will be ignored by
	 * smgrDoPendingDeletes, so no error will occur).  We could instead remove
	 * the existing list entry and delete the physical file immediately, but
	 * for now I'll keep the logic simple.
	 */

	RelationCloseSmgr(rel);
}

/*
 * RelationPreserveStorage
 *		Mark a relation as not to be deleted after all.
 *
 * We need this function because relation mapping changes are committed
 * separately from commit of the whole transaction, so it's still possible
 * for the transaction to abort after the mapping update is done.
 * When a new physical relation is installed in the map, it would be
 * scheduled for delete-on-abort, so we'd delete it, and be in trouble.
 * The relation mapper fixes this by telling us to not delete such relations
 * after all as part of its commit.
 *
 * We also use this to reuse an old build of an index during ALTER TABLE, this
 * time removing the delete-at-commit entry.
 *
 * No-op if the relation is not among those scheduled for deletion.
 */
void
RelationPreserveStorage(RelFileNode rnode, bool atCommit)
{
	PendingRelDelete *pending;
	PendingRelDelete *prev;
	PendingRelDelete *next;

	prev = NULL;
	for (pending = pendingDeletes; pending != NULL; pending = next)
	{
		next = pending->next;
		if (RelFileNodeEquals(rnode, pending->relnode)
			&& pending->atCommit == atCommit)
		{
			/* unlink and delete list entry */
			if (prev)
				prev->next = next;
			else
				pendingDeletes = next;
			pfree(pending);
			/* prev does not change */
		}
		else
		{
			/* unrelated entry, don't touch it */
			prev = pending;
		}
	}
}

/*
 * RelationTruncate
 *		Physically truncate a relation to the specified number of blocks.
 *
 * This includes getting rid of any buffers for the blocks that are to be
 * dropped.
 */
void
RelationTruncate(Relation rel, BlockNumber nblocks)
{
	bool		fsm;
	bool		vm;
	bool		need_fsm_vacuum = false;
	ForkNumber	forks[MAX_FORKNUM];
	BlockNumber blocks[MAX_FORKNUM];
	int			nforks = 0;

	/* Open it at the smgr level if not already done */
	RelationOpenSmgr(rel);

	/*
	 * Make sure smgr_targblock etc aren't pointing somewhere past new end
	 */
	rel->rd_smgr->smgr_targblock = InvalidBlockNumber;
	for (int i = 0; i <= MAX_FORKNUM; ++i)
		rel->rd_smgr->smgr_cached_nblocks[i] = InvalidBlockNumber;

	/* Prepare for truncation of MAIN fork of the relation */
	forks[nforks] = MAIN_FORKNUM;
	blocks[nforks] = nblocks;
	nforks++;

	/* Prepare for truncation of the FSM if it exists */
	fsm = smgrexists(rel->rd_smgr, FSM_FORKNUM);
	if (fsm)
	{
		blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, nblocks);
		if (BlockNumberIsValid(blocks[nforks]))
		{
			forks[nforks] = FSM_FORKNUM;
			nforks++;
			need_fsm_vacuum = true;
		}
	}

	/* Prepare for truncation of the visibility map too if it exists */
	vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
	if (vm)
	{
		blocks[nforks] = visibilitymap_prepare_truncate(rel, nblocks);
		if (BlockNumberIsValid(blocks[nforks]))
		{
			forks[nforks] = VISIBILITYMAP_FORKNUM;
			nforks++;
		}
	}

	RelationPreTruncate(rel);

	/*
	 * We WAL-log the truncation before actually truncating, which means
	 * trouble if the truncation fails. If we then crash, the WAL replay
	 * likely isn't going to succeed in the truncation either, and will
	 * cause a PANIC. It's tempting to put a critical section here, but that
	 * cure would be worse than the disease. It would turn a usually harmless
	 * failure to truncate, that might spell trouble at WAL replay, into a
	 * certain PANIC.
	 */
	if (RelationNeedsWAL(rel))
	{
		/*
		 * Make an XLOG entry reporting the file truncation.
		 */
		XLogRecPtr	lsn;
		xl_smgr_truncate xlrec;

		xlrec.blkno = nblocks;
		xlrec.rnode = rel->rd_node;
		xlrec.flags = SMGR_TRUNCATE_ALL;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec, sizeof(xlrec));

		lsn = XLogInsert(RM_SMGR_ID,
						 XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);

		/*
		 * Flush, because otherwise the truncation of the main relation might
		 * hit the disk before the WAL record, and before the truncation of
		 * the FSM or visibility map. If we crashed during that window, we'd
		 * be left with a truncated heap, but the FSM or visibility map would
		 * still contain entries for the non-existent heap pages.
		 */
		if (fsm || vm)
			XLogFlush(lsn);
	}

	/* Do the real work to truncate relation forks */
	smgrtruncate(rel->rd_smgr, forks, nforks, blocks);

	/*
	 * Update upper-level FSM pages to account for the truncation. This is
	 * important because the just-truncated pages were likely marked as
	 * all-free, and would be preferentially selected.
	 */
	if (need_fsm_vacuum)
		FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);
}
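
/*
 * Illustrative sketch (not part of the original file): callers must make
 * sure no other backend can be using the pages being cut off, which in
 * practice means holding AccessExclusiveLock on the relation, as VACUUM's
 * truncation phase does.  Variable names below are hypothetical.
 *
 *	LockRelation(rel, AccessExclusiveLock);
 *	new_rel_pages = ...;	(determine how many blocks to keep)
 *	RelationTruncate(rel, new_rel_pages);
 */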

/*
 * RelationPreTruncate
 *		Perform AM-independent work before a physical truncation.
 *
 * If an access method's relation_nontransactional_truncate does not call
 * RelationTruncate(), it must call this before decreasing the table size.
 */
void
RelationPreTruncate(Relation rel)
{
	PendingRelSync *pending;

	if (!pendingSyncHash)
		return;
	RelationOpenSmgr(rel);

	pending = hash_search(pendingSyncHash, &(rel->rd_smgr->smgr_rnode.node),
						  HASH_FIND, NULL);
	if (pending)
		pending->is_truncated = true;
}
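
/*
 * Illustrative sketch (not part of the original file): a table access method
 * whose relation_nontransactional_truncate callback shortens the file
 * without going through RelationTruncate() is expected to call
 * RelationPreTruncate() first, along these lines (hypothetical AM name):
 *
 *	static void
 *	myam_relation_nontransactional_truncate(Relation rel)
 *	{
 *		RelationPreTruncate(rel);
 *		... shorten the physical file by AM-specific means ...
 *	}
 */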

/*
 * Copy a fork's data, block by block.
 *
 * Note that this requires that there is no dirty data in shared buffers. If
 * it's possible that there are, callers need to flush those using
 * e.g. FlushRelationBuffers(rel).
 */
void
RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
					ForkNumber forkNum, char relpersistence)
{
	PGAlignedBlock buf;
	Page		page;
	bool		use_wal;
	bool		copying_initfork;
	BlockNumber nblocks;
	BlockNumber blkno;

	page = (Page) buf.data;

	/*
	 * The init fork for an unlogged relation in many respects has to be
	 * treated the same as a normal relation: changes need to be WAL-logged,
	 * and it needs to be synced to disk.
	 */
	copying_initfork = relpersistence == RELPERSISTENCE_UNLOGGED &&
		forkNum == INIT_FORKNUM;

	/*
	 * We need to log the copied data in WAL iff WAL archiving/streaming is
	 * enabled AND it's a permanent relation.  This gives the same answer as
	 * "RelationNeedsWAL(rel) || copying_initfork", because we know the
	 * current operation created a new relfilenode.
	 */
	use_wal = XLogIsNeeded() &&
		(relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);

	nblocks = smgrnblocks(src, forkNum);

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		/* If we got a cancel signal during the copy of the data, quit */
		CHECK_FOR_INTERRUPTS();

		smgrread(src, forkNum, blkno, buf.data);

		if (!PageIsVerifiedExtended(page, blkno,
									PIV_LOG_WARNING | PIV_REPORT_STAT))
			ereport(ERROR,
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("invalid page in block %u of relation %s",
							blkno,
							relpathbackend(src->smgr_rnode.node,
										   src->smgr_rnode.backend,
										   forkNum))));

		/*
		 * WAL-log the copied page. Unfortunately we don't know what kind of a
		 * page this is, so we have to log the full page including any unused
		 * space.
		 */
		if (use_wal)
			log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page, false);

		PageSetChecksumInplace(page, blkno);

		/*
		 * Now write the page.  We say skipFsync = true because there's no
		 * need for smgr to schedule an fsync for this write; we'll do it
		 * ourselves below.
		 */
		smgrextend(dst, forkNum, blkno, buf.data, true);
	}

	/*
	 * When we WAL-logged rel pages, we must nonetheless fsync them.  The
	 * reason is that since we're copying outside shared buffers, a CHECKPOINT
	 * occurring during the copy has no way to flush the previously written
	 * data to disk (indeed it won't know the new rel even exists).  A crash
	 * later on would replay WAL from the checkpoint, therefore it wouldn't
	 * replay our earlier WAL entries. If we do not fsync those pages here,
	 * they might still not be on disk when the crash occurs.
	 */
	if (use_wal || copying_initfork)
		smgrimmedsync(dst, forkNum);
}
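
/*
 * Illustrative sketch (not part of the original file): callers that copy a
 * whole relation to a new relfilenode (for example when moving it to another
 * tablespace) typically loop over all forks, creating each destination fork
 * before copying it; a permanent relation would also WAL-log each fork
 * creation with log_smgrcreate().  Variable names are hypothetical.
 *
 *	ForkNumber	fork;
 *
 *	for (fork = 0; fork <= MAX_FORKNUM; fork++)
 *	{
 *		if (smgrexists(src, fork))
 *		{
 *			smgrcreate(dst, fork, false);
 *			RelationCopyStorage(src, dst, fork, relpersistence);
 *		}
 *	}
 */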

/*
 * RelFileNodeSkippingWAL
 *		Check if a BM_PERMANENT relfilenode is using WAL.
 *
 * Changes of certain relfilenodes must not write WAL; see "Skipping WAL for
 * New RelFileNode" in src/backend/access/transam/README.  Although this can
 * be determined efficiently from a Relation, this function is intended for
 * code paths that do not have access to one.
 */
bool
RelFileNodeSkippingWAL(RelFileNode rnode)
{
	if (!pendingSyncHash ||
		hash_search(pendingSyncHash, &rnode, HASH_FIND, NULL) == NULL)
		return false;

	return true;
}
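
/*
 * Illustrative sketch (not part of the original file): a code path that only
 * has a RelFileNode (rather than a Relation) can consult this function to
 * decide whether a change to a permanent relfilenode needs to be WAL-logged.
 * Hypothetical variable names:
 *
 *	if (!RelFileNodeSkippingWAL(rnode))
 *		log_newpage(&rnode, forknum, blkno, page, true);
 */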

/*
 * EstimatePendingSyncsSpace
 *		Estimate space needed to pass syncs to parallel workers.
 */
Size
EstimatePendingSyncsSpace(void)
{
	long		entries;

	entries = pendingSyncHash ? hash_get_num_entries(pendingSyncHash) : 0;

	/*
	 * Reserve one extra slot for the zero-filled terminator that
	 * SerializePendingSyncs() appends after the entries.
	 */
	return mul_size(1 + entries, sizeof(RelFileNode));
}

/*
 * SerializePendingSyncs
 *		Serialize syncs for parallel workers.
 */
void
SerializePendingSyncs(Size maxSize, char *startAddress)
{
	HTAB	   *tmphash;
	HASHCTL		ctl;
	HASH_SEQ_STATUS scan;
	PendingRelSync *sync;
	PendingRelDelete *delete;
	RelFileNode *src;
	RelFileNode *dest = (RelFileNode *) startAddress;

	if (!pendingSyncHash)
		goto terminate;

	/* Create temporary hash to collect active relfilenodes */
	ctl.keysize = sizeof(RelFileNode);
	ctl.entrysize = sizeof(RelFileNode);
	ctl.hcxt = CurrentMemoryContext;
	tmphash = hash_create("tmp relfilenodes",
						  hash_get_num_entries(pendingSyncHash), &ctl,
						  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);

	/* collect all rnodes from pending syncs */
	hash_seq_init(&scan, pendingSyncHash);
	while ((sync = (PendingRelSync *) hash_seq_search(&scan)))
		(void) hash_search(tmphash, &sync->rnode, HASH_ENTER, NULL);

	/* remove deleted rnodes */
	for (delete = pendingDeletes; delete != NULL; delete = delete->next)
		if (delete->atCommit)
			(void) hash_search(tmphash, (void *) &delete->relnode,
							   HASH_REMOVE, NULL);

	hash_seq_init(&scan, tmphash);
	while ((src = (RelFileNode *) hash_seq_search(&scan)))
		*dest++ = *src;

	hash_destroy(tmphash);

terminate:
	MemSet(dest, 0, sizeof(RelFileNode));
}

/*
 * RestorePendingSyncs
 *		Restore syncs within a parallel worker.
 *
 * RelationNeedsWAL() and RelFileNodeSkippingWAL() must offer the correct
 * answer to parallel workers.  Only smgrDoPendingSyncs() reads the
 * is_truncated field, at end of transaction.  Hence, don't restore it.
 */
void
RestorePendingSyncs(char *startAddress)
{
	RelFileNode *rnode;

	Assert(pendingSyncHash == NULL);
	for (rnode = (RelFileNode *) startAddress; rnode->relNode != 0; rnode++)
		AddPendingSync(rnode);
}
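
/*
 * Illustrative sketch (not part of the original file): the estimate,
 * serialize and restore functions above are meant to be driven by the
 * parallel infrastructure.  Roughly, the leader sizes and fills a chunk of
 * dynamic shared memory, and each worker reads it back.  The toc key name
 * below is hypothetical.
 *
 *	In the leader:
 *		Size	size = EstimatePendingSyncsSpace();
 *		char   *space = shm_toc_allocate(pcxt->toc, size);
 *
 *		SerializePendingSyncs(size, space);
 *		shm_toc_insert(pcxt->toc, MY_KEY_PENDING_SYNCS, space);
 *
 *	In a worker:
 *		char   *space = shm_toc_lookup(toc, MY_KEY_PENDING_SYNCS, false);
 *
 *		RestorePendingSyncs(space);
 */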

/*
 *	smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
 *
 * This also runs when aborting a subxact; we want to clean up a failed
 * subxact immediately.
 *
 * Note: It's possible that we're being asked to remove a relation that has
 * no physical storage in any fork. In particular, it's possible that we're
 * cleaning up an old temporary relation for which RemovePgTempFiles has
 * already recovered the physical storage.
 */
void
smgrDoPendingDeletes(bool isCommit)
{
	int			nestLevel = GetCurrentTransactionNestLevel();
	PendingRelDelete *pending;
	PendingRelDelete *prev;
	PendingRelDelete *next;
	int			nrels = 0,
				maxrels = 0;
	SMgrRelation *srels = NULL;

	prev = NULL;
	for (pending = pendingDeletes; pending != NULL; pending = next)
	{
		next = pending->next;
		if (pending->nestLevel < nestLevel)
		{
			/* outer-level entries should not be processed yet */
			prev = pending;
		}
		else
		{
			/* unlink list entry first, so we don't retry on failure */
			if (prev)
				prev->next = next;
			else
				pendingDeletes = next;
			/* do deletion if called for */
			if (pending->atCommit == isCommit)
			{
				SMgrRelation srel;

				srel = smgropen(pending->relnode, pending->backend);

				/* allocate the initial array, or extend it, if needed */
				if (maxrels == 0)
				{
					maxrels = 8;
					srels = palloc(sizeof(SMgrRelation) * maxrels);
				}
				else if (maxrels <= nrels)
				{
					maxrels *= 2;
					srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
				}

				srels[nrels++] = srel;
			}
			/* must explicitly free the list entry */
			pfree(pending);
			/* prev does not change */
		}
	}

	if (nrels > 0)
	{
		smgrdounlinkall(srels, nrels, false);

		for (int i = 0; i < nrels; i++)
			smgrclose(srels[i]);

		pfree(srels);
	}
}

/*
 *	smgrDoPendingSyncs() -- Take care of relation syncs at end of xact.
 */
void
smgrDoPendingSyncs(bool isCommit, bool isParallelWorker)
{
	PendingRelDelete *pending;
	int			nrels = 0,
				maxrels = 0;
	SMgrRelation *srels = NULL;
	HASH_SEQ_STATUS scan;
	PendingRelSync *pendingsync;

	Assert(GetCurrentTransactionNestLevel() == 1);

	if (!pendingSyncHash)
		return;					/* no relation needs sync */

	/* Abort -- just throw away all pending syncs */
	if (!isCommit)
	{
		pendingSyncHash = NULL;
		return;
	}

	AssertPendingSyncs_RelationCache();

	/* Parallel worker -- just throw away all pending syncs */
	if (isParallelWorker)
	{
		pendingSyncHash = NULL;
		return;
	}

	/* Skip syncing nodes that smgrDoPendingDeletes() will delete. */
	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
		if (pending->atCommit)
			(void) hash_search(pendingSyncHash, (void *) &pending->relnode,
							   HASH_REMOVE, NULL);

	hash_seq_init(&scan, pendingSyncHash);
	while ((pendingsync = (PendingRelSync *) hash_seq_search(&scan)))
	{
		ForkNumber	fork;
		BlockNumber nblocks[MAX_FORKNUM + 1];
		BlockNumber total_blocks = 0;
		SMgrRelation srel;

		srel = smgropen(pendingsync->rnode, InvalidBackendId);

		/*
		 * We emit newpage WAL records for smaller relations.
		 *
		 * Small WAL records have a chance to be emitted along with other
		 * backends' WAL records.  We emit WAL records instead of syncing for
		 * files that are smaller than a certain threshold, expecting faster
		 * commit.  The threshold is defined by the GUC wal_skip_threshold.
		 */
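		/*
		 * Worked example (illustrative): with the default BLCKSZ of 8192
		 * bytes and the default wal_skip_threshold of 2048kB, a relation
		 * whose forks total fewer than 2048 * 1024 / 8192 = 256 blocks is
		 * WAL-logged page by page below, while anything at or above 256
		 * blocks is queued for an fsync instead.
		 */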
		if (!pendingsync->is_truncated)
		{
			for (fork = 0; fork <= MAX_FORKNUM; fork++)
			{
				if (smgrexists(srel, fork))
				{
					BlockNumber n = smgrnblocks(srel, fork);

					/* we shouldn't come here for unlogged relations */
					Assert(fork != INIT_FORKNUM);
					nblocks[fork] = n;
					total_blocks += n;
				}
				else
					nblocks[fork] = InvalidBlockNumber;
			}
		}

		/*
		 * Sync the file, or emit WAL records for its contents.
		 *
		 * Although we emit WAL records if the file is small enough, we sync
		 * the file regardless of its size if it has experienced a
		 * truncation.  Otherwise, if a longer version of the file had
		 * already been flushed out and we emitted WAL instead of syncing,
		 * crash recovery would leave trailing garbage blocks in the file.
		 * You might think that we could choose WAL whenever the current main
		 * fork is longer than ever before, but there is a case where the
		 * main fork is longer than ever while the FSM fork gets shorter.
		 */
		if (pendingsync->is_truncated ||
			total_blocks * BLCKSZ / 1024 >= wal_skip_threshold)
		{
			/* allocate the initial array, or extend it, if needed */
			if (maxrels == 0)
			{
				maxrels = 8;
				srels = palloc(sizeof(SMgrRelation) * maxrels);
			}
			else if (maxrels <= nrels)
			{
				maxrels *= 2;
				srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
			}

			srels[nrels++] = srel;
		}
		else
		{
			/* Emit WAL records for all blocks.  The file is small enough. */
			for (fork = 0; fork <= MAX_FORKNUM; fork++)
			{
				int			n = nblocks[fork];
				Relation	rel;

				if (!BlockNumberIsValid(n))
					continue;

				/*
				 * Emit WAL for the whole file.  Unfortunately we don't know
				 * what kind of a page this is, so we have to log the full
				 * page including any unused space.  ReadBufferExtended()
				 * counts some pgstat events; unfortunately, we discard them.
				 */
				rel = CreateFakeRelcacheEntry(srel->smgr_rnode.node);
				log_newpage_range(rel, fork, 0, n, false);
				FreeFakeRelcacheEntry(rel);
			}
		}
	}

	pendingSyncHash = NULL;

	if (nrels > 0)
	{
		smgrdosyncall(srels, nrels);
		pfree(srels);
	}
}

/*
 * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
 *
 * The return value is the number of relations scheduled for termination.
 * *ptr is set to point to a freshly-palloc'd array of RelFileNodes.
 * If there are no relations to be deleted, *ptr is set to NULL.
 *
 * Only non-temporary relations are included in the returned list.  This is OK
 * because the list is used only in contexts where temporary relations don't
 * matter: we're either writing to the two-phase state file (and transactions
 * that have touched temp tables can't be prepared) or we're writing to xlog
 * (and all temporary files will be zapped if we restart anyway, so no need
 * for redo to do it also).
 *
 * Note that the list does not include anything scheduled for termination
 * by upper-level transactions.
 */
int
smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
{
	int			nestLevel = GetCurrentTransactionNestLevel();
	int			nrels;
	RelFileNode *rptr;
	PendingRelDelete *pending;

	nrels = 0;
	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
	{
		if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
			&& pending->backend == InvalidBackendId)
			nrels++;
	}
	if (nrels == 0)
	{
		*ptr = NULL;
		return 0;
	}
	rptr = (RelFileNode *) palloc(nrels * sizeof(RelFileNode));
	*ptr = rptr;
	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
	{
		if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
			&& pending->backend == InvalidBackendId)
		{
			*rptr = pending->relnode;
			rptr++;
		}
	}
	return nrels;
}
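
/*
 * Illustrative sketch (not part of the original file): the commit, abort and
 * two-phase code paths consume this list when building their WAL records,
 * roughly as follows (hypothetical variable names):
 *
 *	RelFileNode *rels;
 *	int			nrels;
 *
 *	nrels = smgrGetPendingDeletes(true, &rels);
 *	... include nrels / rels in the commit record ...
 *	if (rels)
 *		pfree(rels);
 */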

/*
 *	PostPrepare_smgr -- Clean up after a successful PREPARE
 *
 * What we have to do here is throw away the in-memory state about pending
 * relation deletes.  It's all been recorded in the 2PC state file and
 * it's no longer smgr's job to worry about it.
 */
void
PostPrepare_smgr(void)
{
	PendingRelDelete *pending;
	PendingRelDelete *next;

	for (pending = pendingDeletes; pending != NULL; pending = next)
	{
		next = pending->next;
		pendingDeletes = next;
		/* must explicitly free the list entry */
		pfree(pending);
	}
}


/*
 * AtSubCommit_smgr() --- Take care of subtransaction commit.
 *
 * Reassign all items in the pending-deletes list to the parent transaction.
 */
void
AtSubCommit_smgr(void)
{
	int			nestLevel = GetCurrentTransactionNestLevel();
	PendingRelDelete *pending;

	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
	{
		if (pending->nestLevel >= nestLevel)
			pending->nestLevel = nestLevel - 1;
	}
}

/*
 * AtSubAbort_smgr() --- Take care of subtransaction abort.
 *
 * Delete created relations and forget about deleted relations.
 * We can execute these operations immediately because we know this
 * subtransaction will not commit.
 */
void
AtSubAbort_smgr(void)
{
	smgrDoPendingDeletes(false);
}

void
smgr_redo(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

	/* Backup blocks are not used in smgr records */
	Assert(!XLogRecHasAnyBlockRefs(record));

	if (info == XLOG_SMGR_CREATE)
	{
		xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
		SMgrRelation reln;

		reln = smgropen(xlrec->rnode, InvalidBackendId);
		smgrcreate(reln, xlrec->forkNum, true);
	}
	else if (info == XLOG_SMGR_TRUNCATE)
	{
		xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
		SMgrRelation reln;
		Relation	rel;
		ForkNumber	forks[MAX_FORKNUM];
		BlockNumber blocks[MAX_FORKNUM];
		int			nforks = 0;
		bool		need_fsm_vacuum = false;

		reln = smgropen(xlrec->rnode, InvalidBackendId);

		/*
		 * Forcibly create relation if it doesn't exist (which suggests that
		 * it was dropped somewhere later in the WAL sequence).  As in
		 * XLogReadBufferForRedo, we prefer to recreate the rel and replay the
		 * log as best we can until the drop is seen.
		 */
		smgrcreate(reln, MAIN_FORKNUM, true);

		/*
		 * Before we perform the truncation, update minimum recovery point to
		 * cover this WAL record. Once the relation is truncated, there's no
		 * going back. The buffer manager enforces the WAL-first rule for
		 * normal updates to relation files, so that the minimum recovery
		 * point is always updated before the corresponding change in the data
		 * file is flushed to disk. We have to do the same manually here.
		 *
		 * Doing this before the truncation means that if the truncation fails
		 * for some reason, you cannot start up the system even after restart,
		 * until you fix the underlying situation so that the truncation will
		 * succeed. Alternatively, we could update the minimum recovery point
		 * after truncation, but that would leave a small window where the
		 * WAL-first rule could be violated.
		 */
		XLogFlush(lsn);

		/* Prepare for truncation of MAIN fork */
		if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
		{
			forks[nforks] = MAIN_FORKNUM;
			blocks[nforks] = xlrec->blkno;
			nforks++;

			/* Also tell xlogutils.c about it */
			XLogTruncateRelation(xlrec->rnode, MAIN_FORKNUM, xlrec->blkno);
		}

		/* Prepare for truncation of FSM and VM too */
		rel = CreateFakeRelcacheEntry(xlrec->rnode);

		if ((xlrec->flags & SMGR_TRUNCATE_FSM) != 0 &&
			smgrexists(reln, FSM_FORKNUM))
		{
			blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, xlrec->blkno);
			if (BlockNumberIsValid(blocks[nforks]))
			{
				forks[nforks] = FSM_FORKNUM;
				nforks++;
				need_fsm_vacuum = true;
			}
		}
		if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0 &&
			smgrexists(reln, VISIBILITYMAP_FORKNUM))
		{
			blocks[nforks] = visibilitymap_prepare_truncate(rel, xlrec->blkno);
			if (BlockNumberIsValid(blocks[nforks]))
			{
				forks[nforks] = VISIBILITYMAP_FORKNUM;
				nforks++;
			}
		}

		/* Do the real work to truncate relation forks */
		if (nforks > 0)
			smgrtruncate(reln, forks, nforks, blocks);

		/*
		 * Update upper-level FSM pages to account for the truncation. This is
		 * important because the just-truncated pages were likely marked as
		 * all-free, and would be preferentially selected.
		 */
		if (need_fsm_vacuum)
			FreeSpaceMapVacuumRange(rel, xlrec->blkno,
									InvalidBlockNumber);

		FreeFakeRelcacheEntry(rel);
	}
	else
		elog(PANIC, "smgr_redo: unknown op code %u", info);
}