1 /*-------------------------------------------------------------------------
2  *
3  * storage.c
4  *	  code to create and destroy physical storage for relations
5  *
6  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *	  src/backend/catalog/storage.c
12  *
13  * NOTES
14  *	  Some of this code used to be in storage/smgr/smgr.c, and the
15  *	  function names still reflect that.
16  *
17  *-------------------------------------------------------------------------
18  */
19 
20 #include "postgres.h"
21 
22 #include "access/visibilitymap.h"
23 #include "access/xact.h"
24 #include "access/xlog.h"
25 #include "access/xloginsert.h"
26 #include "access/xlogutils.h"
27 #include "catalog/storage.h"
28 #include "catalog/storage_xlog.h"
29 #include "storage/freespace.h"
30 #include "storage/smgr.h"
31 #include "utils/memutils.h"
32 #include "utils/rel.h"
33 
34 /*
35  * We keep a list of all relations (represented as RelFileNode values)
36  * that have been created or deleted in the current transaction.  When
37  * a relation is created, we create the physical file immediately, but
38  * remember it so that we can delete the file again if the current
39  * transaction is aborted.  Conversely, a deletion request is NOT
40  * executed immediately, but is just entered in the list.  When and if
41  * the transaction commits, we can delete the physical file.
42  *
43  * To handle subtransactions, every entry is marked with its transaction
44  * nesting level.  At subtransaction commit, we reassign the subtransaction's
45  * entries to the parent nesting level.  At subtransaction abort, we can
46  * immediately execute the abort-time actions for all entries of the current
47  * nesting level.
48  *
49  * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
50  * unbetimes.  It'd probably be OK to keep it in TopTransactionContext,
51  * but I'm being paranoid.
52  */
53 
54 typedef struct PendingRelDelete
55 {
56 	RelFileNode relnode;		/* relation that may need to be deleted */
57 	BackendId	backend;		/* InvalidBackendId if not a temp rel */
58 	bool		atCommit;		/* T=delete at commit; F=delete at abort */
59 	int			nestLevel;		/* xact nesting level of request */
60 	struct PendingRelDelete *next;	/* linked-list link */
61 } PendingRelDelete;
62 
63 static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
64 
65 /*
66  * RelationCreateStorage
67  *		Create physical storage for a relation.
68  *
69  * Create the underlying disk file storage for the relation. This only
70  * creates the main fork; additional forks are created lazily by the
71  * modules that need them.
72  *
73  * This function is transactional. The creation is WAL-logged, and if the
74  * transaction aborts later on, the storage will be destroyed.
75  */
76 void
RelationCreateStorage(RelFileNode rnode,char relpersistence)77 RelationCreateStorage(RelFileNode rnode, char relpersistence)
78 {
79 	PendingRelDelete *pending;
80 	SMgrRelation srel;
81 	BackendId	backend;
82 	bool		needs_wal;
83 
84 	switch (relpersistence)
85 	{
86 		case RELPERSISTENCE_TEMP:
87 			backend = BackendIdForTempRelations();
88 			needs_wal = false;
89 			break;
90 		case RELPERSISTENCE_UNLOGGED:
91 			backend = InvalidBackendId;
92 			needs_wal = false;
93 			break;
94 		case RELPERSISTENCE_PERMANENT:
95 			backend = InvalidBackendId;
96 			needs_wal = true;
97 			break;
98 		default:
99 			elog(ERROR, "invalid relpersistence: %c", relpersistence);
100 			return;				/* placate compiler */
101 	}
102 
103 	srel = smgropen(rnode, backend);
104 	smgrcreate(srel, MAIN_FORKNUM, false);
105 
106 	if (needs_wal)
107 		log_smgrcreate(&srel->smgr_rnode.node, MAIN_FORKNUM);
108 
109 	/* Add the relation to the list of stuff to delete at abort */
110 	pending = (PendingRelDelete *)
111 		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
112 	pending->relnode = rnode;
113 	pending->backend = backend;
114 	pending->atCommit = false;	/* delete if abort */
115 	pending->nestLevel = GetCurrentTransactionNestLevel();
116 	pending->next = pendingDeletes;
117 	pendingDeletes = pending;
118 }
119 
120 /*
121  * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL.
122  */
123 void
log_smgrcreate(RelFileNode * rnode,ForkNumber forkNum)124 log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum)
125 {
126 	xl_smgr_create xlrec;
127 
128 	/*
129 	 * Make an XLOG entry reporting the file creation.
130 	 */
131 	xlrec.rnode = *rnode;
132 	xlrec.forkNum = forkNum;
133 
134 	XLogBeginInsert();
135 	XLogRegisterData((char *) &xlrec, sizeof(xlrec));
136 	XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE);
137 }
138 
139 /*
140  * RelationDropStorage
141  *		Schedule unlinking of physical storage at transaction commit.
142  */
143 void
RelationDropStorage(Relation rel)144 RelationDropStorage(Relation rel)
145 {
146 	PendingRelDelete *pending;
147 
148 	/* Add the relation to the list of stuff to delete at commit */
149 	pending = (PendingRelDelete *)
150 		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
151 	pending->relnode = rel->rd_node;
152 	pending->backend = rel->rd_backend;
153 	pending->atCommit = true;	/* delete if commit */
154 	pending->nestLevel = GetCurrentTransactionNestLevel();
155 	pending->next = pendingDeletes;
156 	pendingDeletes = pending;
157 
158 	/*
159 	 * NOTE: if the relation was created in this transaction, it will now be
160 	 * present in the pending-delete list twice, once with atCommit true and
161 	 * once with atCommit false.  Hence, it will be physically deleted at end
162 	 * of xact in either case (and the other entry will be ignored by
163 	 * smgrDoPendingDeletes, so no error will occur).  We could instead remove
164 	 * the existing list entry and delete the physical file immediately, but
165 	 * for now I'll keep the logic simple.
166 	 */
167 
168 	RelationCloseSmgr(rel);
169 }
170 
171 /*
172  * RelationPreserveStorage
173  *		Mark a relation as not to be deleted after all.
174  *
175  * We need this function because relation mapping changes are committed
176  * separately from commit of the whole transaction, so it's still possible
177  * for the transaction to abort after the mapping update is done.
178  * When a new physical relation is installed in the map, it would be
179  * scheduled for delete-on-abort, so we'd delete it, and be in trouble.
180  * The relation mapper fixes this by telling us to not delete such relations
181  * after all as part of its commit.
182  *
183  * We also use this to reuse an old build of an index during ALTER TABLE, this
184  * time removing the delete-at-commit entry.
185  *
186  * No-op if the relation is not among those scheduled for deletion.
187  */
188 void
RelationPreserveStorage(RelFileNode rnode,bool atCommit)189 RelationPreserveStorage(RelFileNode rnode, bool atCommit)
190 {
191 	PendingRelDelete *pending;
192 	PendingRelDelete *prev;
193 	PendingRelDelete *next;
194 
195 	prev = NULL;
196 	for (pending = pendingDeletes; pending != NULL; pending = next)
197 	{
198 		next = pending->next;
199 		if (RelFileNodeEquals(rnode, pending->relnode)
200 			&& pending->atCommit == atCommit)
201 		{
202 			/* unlink and delete list entry */
203 			if (prev)
204 				prev->next = next;
205 			else
206 				pendingDeletes = next;
207 			pfree(pending);
208 			/* prev does not change */
209 		}
210 		else
211 		{
212 			/* unrelated entry, don't touch it */
213 			prev = pending;
214 		}
215 	}
216 }
217 
218 /*
219  * RelationTruncate
220  *		Physically truncate a relation to the specified number of blocks.
221  *
222  * This includes getting rid of any buffers for the blocks that are to be
223  * dropped.
224  */
225 void
RelationTruncate(Relation rel,BlockNumber nblocks)226 RelationTruncate(Relation rel, BlockNumber nblocks)
227 {
228 	bool		fsm;
229 	bool		vm;
230 
231 	/* Open it at the smgr level if not already done */
232 	RelationOpenSmgr(rel);
233 
234 	/*
235 	 * Make sure smgr_targblock etc aren't pointing somewhere past new end
236 	 */
237 	rel->rd_smgr->smgr_targblock = InvalidBlockNumber;
238 	rel->rd_smgr->smgr_fsm_nblocks = InvalidBlockNumber;
239 	rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber;
240 
241 	/* Truncate the FSM first if it exists */
242 	fsm = smgrexists(rel->rd_smgr, FSM_FORKNUM);
243 	if (fsm)
244 		FreeSpaceMapTruncateRel(rel, nblocks);
245 
246 	/* Truncate the visibility map too if it exists. */
247 	vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
248 	if (vm)
249 		visibilitymap_truncate(rel, nblocks);
250 
251 	/*
252 	 * We WAL-log the truncation before actually truncating, which means
253 	 * trouble if the truncation fails. If we then crash, the WAL replay
254 	 * likely isn't going to succeed in the truncation either, and cause a
255 	 * PANIC. It's tempting to put a critical section here, but that cure
256 	 * would be worse than the disease. It would turn a usually harmless
257 	 * failure to truncate, that might spell trouble at WAL replay, into a
258 	 * certain PANIC.
259 	 */
260 	if (RelationNeedsWAL(rel))
261 	{
262 		/*
263 		 * Make an XLOG entry reporting the file truncation.
264 		 */
265 		XLogRecPtr	lsn;
266 		xl_smgr_truncate xlrec;
267 
268 		xlrec.blkno = nblocks;
269 		xlrec.rnode = rel->rd_node;
270 		xlrec.flags = SMGR_TRUNCATE_ALL;
271 
272 		XLogBeginInsert();
273 		XLogRegisterData((char *) &xlrec, sizeof(xlrec));
274 
275 		lsn = XLogInsert(RM_SMGR_ID,
276 						 XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
277 
278 		/*
279 		 * Flush, because otherwise the truncation of the main relation might
280 		 * hit the disk before the WAL record, and the truncation of the FSM
281 		 * or visibility map. If we crashed during that window, we'd be left
282 		 * with a truncated heap, but the FSM or visibility map would still
283 		 * contain entries for the non-existent heap pages.
284 		 */
285 		if (fsm || vm)
286 			XLogFlush(lsn);
287 	}
288 
289 	/* Do the real work */
290 	smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks);
291 }
292 
293 /*
294  *	smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
295  *
296  * This also runs when aborting a subxact; we want to clean up a failed
297  * subxact immediately.
298  *
299  * Note: It's possible that we're being asked to remove a relation that has
300  * no physical storage in any fork. In particular, it's possible that we're
301  * cleaning up an old temporary relation for which RemovePgTempFiles has
302  * already recovered the physical storage.
303  */
304 void
smgrDoPendingDeletes(bool isCommit)305 smgrDoPendingDeletes(bool isCommit)
306 {
307 	int			nestLevel = GetCurrentTransactionNestLevel();
308 	PendingRelDelete *pending;
309 	PendingRelDelete *prev;
310 	PendingRelDelete *next;
311 	int			nrels = 0,
312 				i = 0,
313 				maxrels = 0;
314 	SMgrRelation *srels = NULL;
315 
316 	prev = NULL;
317 	for (pending = pendingDeletes; pending != NULL; pending = next)
318 	{
319 		next = pending->next;
320 		if (pending->nestLevel < nestLevel)
321 		{
322 			/* outer-level entries should not be processed yet */
323 			prev = pending;
324 		}
325 		else
326 		{
327 			/* unlink list entry first, so we don't retry on failure */
328 			if (prev)
329 				prev->next = next;
330 			else
331 				pendingDeletes = next;
332 			/* do deletion if called for */
333 			if (pending->atCommit == isCommit)
334 			{
335 				SMgrRelation srel;
336 
337 				srel = smgropen(pending->relnode, pending->backend);
338 
339 				/* allocate the initial array, or extend it, if needed */
340 				if (maxrels == 0)
341 				{
342 					maxrels = 8;
343 					srels = palloc(sizeof(SMgrRelation) * maxrels);
344 				}
345 				else if (maxrels <= nrels)
346 				{
347 					maxrels *= 2;
348 					srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
349 				}
350 
351 				srels[nrels++] = srel;
352 			}
353 			/* must explicitly free the list entry */
354 			pfree(pending);
355 			/* prev does not change */
356 		}
357 	}
358 
359 	if (nrels > 0)
360 	{
361 		smgrdounlinkall(srels, nrels, false);
362 
363 		for (i = 0; i < nrels; i++)
364 			smgrclose(srels[i]);
365 
366 		pfree(srels);
367 	}
368 }
369 
370 /*
371  * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
372  *
373  * The return value is the number of relations scheduled for termination.
374  * *ptr is set to point to a freshly-palloc'd array of RelFileNodes.
375  * If there are no relations to be deleted, *ptr is set to NULL.
376  *
377  * Only non-temporary relations are included in the returned list.  This is OK
378  * because the list is used only in contexts where temporary relations don't
379  * matter: we're either writing to the two-phase state file (and transactions
380  * that have touched temp tables can't be prepared) or we're writing to xlog
381  * (and all temporary files will be zapped if we restart anyway, so no need
382  * for redo to do it also).
383  *
384  * Note that the list does not include anything scheduled for termination
385  * by upper-level transactions.
386  */
387 int
smgrGetPendingDeletes(bool forCommit,RelFileNode ** ptr)388 smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
389 {
390 	int			nestLevel = GetCurrentTransactionNestLevel();
391 	int			nrels;
392 	RelFileNode *rptr;
393 	PendingRelDelete *pending;
394 
395 	nrels = 0;
396 	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
397 	{
398 		if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
399 			&& pending->backend == InvalidBackendId)
400 			nrels++;
401 	}
402 	if (nrels == 0)
403 	{
404 		*ptr = NULL;
405 		return 0;
406 	}
407 	rptr = (RelFileNode *) palloc(nrels * sizeof(RelFileNode));
408 	*ptr = rptr;
409 	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
410 	{
411 		if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
412 			&& pending->backend == InvalidBackendId)
413 		{
414 			*rptr = pending->relnode;
415 			rptr++;
416 		}
417 	}
418 	return nrels;
419 }
420 
421 /*
422  *	PostPrepare_smgr -- Clean up after a successful PREPARE
423  *
424  * What we have to do here is throw away the in-memory state about pending
425  * relation deletes.  It's all been recorded in the 2PC state file and
426  * it's no longer smgr's job to worry about it.
427  */
428 void
PostPrepare_smgr(void)429 PostPrepare_smgr(void)
430 {
431 	PendingRelDelete *pending;
432 	PendingRelDelete *next;
433 
434 	for (pending = pendingDeletes; pending != NULL; pending = next)
435 	{
436 		next = pending->next;
437 		pendingDeletes = next;
438 		/* must explicitly free the list entry */
439 		pfree(pending);
440 	}
441 }
442 
443 
444 /*
445  * AtSubCommit_smgr() --- Take care of subtransaction commit.
446  *
447  * Reassign all items in the pending-deletes list to the parent transaction.
448  */
449 void
AtSubCommit_smgr(void)450 AtSubCommit_smgr(void)
451 {
452 	int			nestLevel = GetCurrentTransactionNestLevel();
453 	PendingRelDelete *pending;
454 
455 	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
456 	{
457 		if (pending->nestLevel >= nestLevel)
458 			pending->nestLevel = nestLevel - 1;
459 	}
460 }
461 
/*
 * AtSubAbort_smgr() --- Take care of subtransaction abort.
 *
 * Relations created by the subtransaction are deleted and its deletion
 * requests are forgotten.  This can be done immediately, because it is
 * known that this subtransaction will not commit.
 */
void
AtSubAbort_smgr(void)
{
	const bool	isCommit = false;	/* run the abort-time actions */

	smgrDoPendingDeletes(isCommit);
}
474 
475 void
smgr_redo(XLogReaderState * record)476 smgr_redo(XLogReaderState *record)
477 {
478 	XLogRecPtr	lsn = record->EndRecPtr;
479 	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
480 
481 	/* Backup blocks are not used in smgr records */
482 	Assert(!XLogRecHasAnyBlockRefs(record));
483 
484 	if (info == XLOG_SMGR_CREATE)
485 	{
486 		xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
487 		SMgrRelation reln;
488 
489 		reln = smgropen(xlrec->rnode, InvalidBackendId);
490 		smgrcreate(reln, xlrec->forkNum, true);
491 	}
492 	else if (info == XLOG_SMGR_TRUNCATE)
493 	{
494 		xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
495 		SMgrRelation reln;
496 		Relation	rel;
497 
498 		reln = smgropen(xlrec->rnode, InvalidBackendId);
499 
500 		/*
501 		 * Forcibly create relation if it doesn't exist (which suggests that
502 		 * it was dropped somewhere later in the WAL sequence).  As in
503 		 * XLogReadBufferForRedo, we prefer to recreate the rel and replay the
504 		 * log as best we can until the drop is seen.
505 		 */
506 		smgrcreate(reln, MAIN_FORKNUM, true);
507 
508 		/*
509 		 * Before we perform the truncation, update minimum recovery point to
510 		 * cover this WAL record. Once the relation is truncated, there's no
511 		 * going back. The buffer manager enforces the WAL-first rule for
512 		 * normal updates to relation files, so that the minimum recovery
513 		 * point is always updated before the corresponding change in the data
514 		 * file is flushed to disk. We have to do the same manually here.
515 		 *
516 		 * Doing this before the truncation means that if the truncation fails
517 		 * for some reason, you cannot start up the system even after restart,
518 		 * until you fix the underlying situation so that the truncation will
519 		 * succeed. Alternatively, we could update the minimum recovery point
520 		 * after truncation, but that would leave a small window where the
521 		 * WAL-first rule could be violated.
522 		 */
523 		XLogFlush(lsn);
524 
525 		if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
526 		{
527 			smgrtruncate(reln, MAIN_FORKNUM, xlrec->blkno);
528 
529 			/* Also tell xlogutils.c about it */
530 			XLogTruncateRelation(xlrec->rnode, MAIN_FORKNUM, xlrec->blkno);
531 		}
532 
533 		/* Truncate FSM and VM too */
534 		rel = CreateFakeRelcacheEntry(xlrec->rnode);
535 
536 		if ((xlrec->flags & SMGR_TRUNCATE_FSM) != 0 &&
537 			smgrexists(reln, FSM_FORKNUM))
538 			FreeSpaceMapTruncateRel(rel, xlrec->blkno);
539 		if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0 &&
540 			smgrexists(reln, VISIBILITYMAP_FORKNUM))
541 			visibilitymap_truncate(rel, xlrec->blkno);
542 
543 		FreeFakeRelcacheEntry(rel);
544 	}
545 	else
546 		elog(PANIC, "smgr_redo: unknown op code %u", info);
547 }
548