1 /*-------------------------------------------------------------------------
2  *
3  * storage.c
4  *	  code to create and destroy physical storage for relations
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *	  src/backend/catalog/storage.c
12  *
13  * NOTES
14  *	  Some of this code used to be in storage/smgr/smgr.c, and the
15  *	  function names still reflect that.
16  *
17  *-------------------------------------------------------------------------
18  */
19 
20 #include "postgres.h"
21 
22 #include "access/visibilitymap.h"
23 #include "access/xact.h"
24 #include "access/xlog.h"
25 #include "access/xloginsert.h"
26 #include "access/xlogutils.h"
27 #include "catalog/catalog.h"
28 #include "catalog/storage.h"
29 #include "catalog/storage_xlog.h"
30 #include "storage/freespace.h"
31 #include "storage/smgr.h"
32 #include "utils/memutils.h"
33 #include "utils/rel.h"
34 
35 /*
36  * We keep a list of all relations (represented as RelFileNode values)
37  * that have been created or deleted in the current transaction.  When
38  * a relation is created, we create the physical file immediately, but
39  * remember it so that we can delete the file again if the current
40  * transaction is aborted.  Conversely, a deletion request is NOT
41  * executed immediately, but is just entered in the list.  When and if
42  * the transaction commits, we can delete the physical file.
43  *
44  * To handle subtransactions, every entry is marked with its transaction
45  * nesting level.  At subtransaction commit, we reassign the subtransaction's
46  * entries to the parent nesting level.  At subtransaction abort, we can
47  * immediately execute the abort-time actions for all entries of the current
48  * nesting level.
49  *
50  * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
51  * unbetimes.  It'd probably be OK to keep it in TopTransactionContext,
52  * but I'm being paranoid.
53  */
54 
55 typedef struct PendingRelDelete
56 {
57 	RelFileNode relnode;		/* relation that may need to be deleted */
58 	BackendId	backend;		/* InvalidBackendId if not a temp rel */
59 	bool		atCommit;		/* T=delete at commit; F=delete at abort */
60 	int			nestLevel;		/* xact nesting level of request */
61 	struct PendingRelDelete *next;	/* linked-list link */
62 } PendingRelDelete;
63 
64 static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
65 
66 /*
67  * RelationCreateStorage
68  *		Create physical storage for a relation.
69  *
70  * Create the underlying disk file storage for the relation. This only
71  * creates the main fork; additional forks are created lazily by the
72  * modules that need them.
73  *
74  * This function is transactional. The creation is WAL-logged, and if the
75  * transaction aborts later on, the storage will be destroyed.
76  */
77 void
RelationCreateStorage(RelFileNode rnode,char relpersistence)78 RelationCreateStorage(RelFileNode rnode, char relpersistence)
79 {
80 	PendingRelDelete *pending;
81 	SMgrRelation srel;
82 	BackendId	backend;
83 	bool		needs_wal;
84 
85 	switch (relpersistence)
86 	{
87 		case RELPERSISTENCE_TEMP:
88 			backend = BackendIdForTempRelations();
89 			needs_wal = false;
90 			break;
91 		case RELPERSISTENCE_UNLOGGED:
92 			backend = InvalidBackendId;
93 			needs_wal = false;
94 			break;
95 		case RELPERSISTENCE_PERMANENT:
96 			backend = InvalidBackendId;
97 			needs_wal = true;
98 			break;
99 		default:
100 			elog(ERROR, "invalid relpersistence: %c", relpersistence);
101 			return;				/* placate compiler */
102 	}
103 
104 	srel = smgropen(rnode, backend);
105 	smgrcreate(srel, MAIN_FORKNUM, false);
106 
107 	if (needs_wal)
108 		log_smgrcreate(&srel->smgr_rnode.node, MAIN_FORKNUM);
109 
110 	/* Add the relation to the list of stuff to delete at abort */
111 	pending = (PendingRelDelete *)
112 		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
113 	pending->relnode = rnode;
114 	pending->backend = backend;
115 	pending->atCommit = false;	/* delete if abort */
116 	pending->nestLevel = GetCurrentTransactionNestLevel();
117 	pending->next = pendingDeletes;
118 	pendingDeletes = pending;
119 }
120 
121 /*
122  * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL.
123  */
124 void
log_smgrcreate(RelFileNode * rnode,ForkNumber forkNum)125 log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum)
126 {
127 	xl_smgr_create xlrec;
128 
129 	/*
130 	 * Make an XLOG entry reporting the file creation.
131 	 */
132 	xlrec.rnode = *rnode;
133 	xlrec.forkNum = forkNum;
134 
135 	XLogBeginInsert();
136 	XLogRegisterData((char *) &xlrec, sizeof(xlrec));
137 	XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE);
138 }
139 
140 /*
141  * RelationDropStorage
142  *		Schedule unlinking of physical storage at transaction commit.
143  */
144 void
RelationDropStorage(Relation rel)145 RelationDropStorage(Relation rel)
146 {
147 	PendingRelDelete *pending;
148 
149 	/* Add the relation to the list of stuff to delete at commit */
150 	pending = (PendingRelDelete *)
151 		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
152 	pending->relnode = rel->rd_node;
153 	pending->backend = rel->rd_backend;
154 	pending->atCommit = true;	/* delete if commit */
155 	pending->nestLevel = GetCurrentTransactionNestLevel();
156 	pending->next = pendingDeletes;
157 	pendingDeletes = pending;
158 
159 	/*
160 	 * NOTE: if the relation was created in this transaction, it will now be
161 	 * present in the pending-delete list twice, once with atCommit true and
162 	 * once with atCommit false.  Hence, it will be physically deleted at end
163 	 * of xact in either case (and the other entry will be ignored by
164 	 * smgrDoPendingDeletes, so no error will occur).  We could instead remove
165 	 * the existing list entry and delete the physical file immediately, but
166 	 * for now I'll keep the logic simple.
167 	 */
168 
169 	RelationCloseSmgr(rel);
170 }
171 
172 /*
173  * RelationPreserveStorage
174  *		Mark a relation as not to be deleted after all.
175  *
176  * We need this function because relation mapping changes are committed
177  * separately from commit of the whole transaction, so it's still possible
178  * for the transaction to abort after the mapping update is done.
179  * When a new physical relation is installed in the map, it would be
180  * scheduled for delete-on-abort, so we'd delete it, and be in trouble.
181  * The relation mapper fixes this by telling us to not delete such relations
182  * after all as part of its commit.
183  *
184  * We also use this to reuse an old build of an index during ALTER TABLE, this
185  * time removing the delete-at-commit entry.
186  *
187  * No-op if the relation is not among those scheduled for deletion.
188  */
189 void
RelationPreserveStorage(RelFileNode rnode,bool atCommit)190 RelationPreserveStorage(RelFileNode rnode, bool atCommit)
191 {
192 	PendingRelDelete *pending;
193 	PendingRelDelete *prev;
194 	PendingRelDelete *next;
195 
196 	prev = NULL;
197 	for (pending = pendingDeletes; pending != NULL; pending = next)
198 	{
199 		next = pending->next;
200 		if (RelFileNodeEquals(rnode, pending->relnode)
201 			&& pending->atCommit == atCommit)
202 		{
203 			/* unlink and delete list entry */
204 			if (prev)
205 				prev->next = next;
206 			else
207 				pendingDeletes = next;
208 			pfree(pending);
209 			/* prev does not change */
210 		}
211 		else
212 		{
213 			/* unrelated entry, don't touch it */
214 			prev = pending;
215 		}
216 	}
217 }
218 
219 /*
220  * RelationTruncate
221  *		Physically truncate a relation to the specified number of blocks.
222  *
223  * This includes getting rid of any buffers for the blocks that are to be
224  * dropped.
225  */
226 void
RelationTruncate(Relation rel,BlockNumber nblocks)227 RelationTruncate(Relation rel, BlockNumber nblocks)
228 {
229 	bool		fsm;
230 	bool		vm;
231 
232 	/* Open it at the smgr level if not already done */
233 	RelationOpenSmgr(rel);
234 
235 	/*
236 	 * Make sure smgr_targblock etc aren't pointing somewhere past new end
237 	 */
238 	rel->rd_smgr->smgr_targblock = InvalidBlockNumber;
239 	rel->rd_smgr->smgr_fsm_nblocks = InvalidBlockNumber;
240 	rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber;
241 
242 	/* Truncate the FSM first if it exists */
243 	fsm = smgrexists(rel->rd_smgr, FSM_FORKNUM);
244 	if (fsm)
245 		FreeSpaceMapTruncateRel(rel, nblocks);
246 
247 	/* Truncate the visibility map too if it exists. */
248 	vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
249 	if (vm)
250 		visibilitymap_truncate(rel, nblocks);
251 
252 	/*
253 	 * We WAL-log the truncation before actually truncating, which means
254 	 * trouble if the truncation fails. If we then crash, the WAL replay
255 	 * likely isn't going to succeed in the truncation either, and cause a
256 	 * PANIC. It's tempting to put a critical section here, but that cure
257 	 * would be worse than the disease. It would turn a usually harmless
258 	 * failure to truncate, that might spell trouble at WAL replay, into a
259 	 * certain PANIC.
260 	 */
261 	if (RelationNeedsWAL(rel))
262 	{
263 		/*
264 		 * Make an XLOG entry reporting the file truncation.
265 		 */
266 		XLogRecPtr	lsn;
267 		xl_smgr_truncate xlrec;
268 
269 		xlrec.blkno = nblocks;
270 		xlrec.rnode = rel->rd_node;
271 		xlrec.flags = SMGR_TRUNCATE_ALL;
272 
273 		XLogBeginInsert();
274 		XLogRegisterData((char *) &xlrec, sizeof(xlrec));
275 
276 		lsn = XLogInsert(RM_SMGR_ID,
277 						 XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
278 
279 		/*
280 		 * Flush, because otherwise the truncation of the main relation might
281 		 * hit the disk before the WAL record, and the truncation of the FSM
282 		 * or visibility map. If we crashed during that window, we'd be left
283 		 * with a truncated heap, but the FSM or visibility map would still
284 		 * contain entries for the non-existent heap pages.
285 		 */
286 		if (fsm || vm)
287 			XLogFlush(lsn);
288 	}
289 
290 	/* Do the real work */
291 	smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks);
292 }
293 
294 /*
295  *	smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
296  *
297  * This also runs when aborting a subxact; we want to clean up a failed
298  * subxact immediately.
299  *
300  * Note: It's possible that we're being asked to remove a relation that has
301  * no physical storage in any fork. In particular, it's possible that we're
302  * cleaning up an old temporary relation for which RemovePgTempFiles has
303  * already recovered the physical storage.
304  */
305 void
smgrDoPendingDeletes(bool isCommit)306 smgrDoPendingDeletes(bool isCommit)
307 {
308 	int			nestLevel = GetCurrentTransactionNestLevel();
309 	PendingRelDelete *pending;
310 	PendingRelDelete *prev;
311 	PendingRelDelete *next;
312 	int			nrels = 0,
313 				i = 0,
314 				maxrels = 0;
315 	SMgrRelation *srels = NULL;
316 
317 	prev = NULL;
318 	for (pending = pendingDeletes; pending != NULL; pending = next)
319 	{
320 		next = pending->next;
321 		if (pending->nestLevel < nestLevel)
322 		{
323 			/* outer-level entries should not be processed yet */
324 			prev = pending;
325 		}
326 		else
327 		{
328 			/* unlink list entry first, so we don't retry on failure */
329 			if (prev)
330 				prev->next = next;
331 			else
332 				pendingDeletes = next;
333 			/* do deletion if called for */
334 			if (pending->atCommit == isCommit)
335 			{
336 				SMgrRelation srel;
337 
338 				srel = smgropen(pending->relnode, pending->backend);
339 
340 				/* allocate the initial array, or extend it, if needed */
341 				if (maxrels == 0)
342 				{
343 					maxrels = 8;
344 					srels = palloc(sizeof(SMgrRelation) * maxrels);
345 				}
346 				else if (maxrels <= nrels)
347 				{
348 					maxrels *= 2;
349 					srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
350 				}
351 
352 				srels[nrels++] = srel;
353 			}
354 			/* must explicitly free the list entry */
355 			pfree(pending);
356 			/* prev does not change */
357 		}
358 	}
359 
360 	if (nrels > 0)
361 	{
362 		smgrdounlinkall(srels, nrels, false);
363 
364 		for (i = 0; i < nrels; i++)
365 			smgrclose(srels[i]);
366 
367 		pfree(srels);
368 	}
369 }
370 
371 /*
372  * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
373  *
374  * The return value is the number of relations scheduled for termination.
375  * *ptr is set to point to a freshly-palloc'd array of RelFileNodes.
376  * If there are no relations to be deleted, *ptr is set to NULL.
377  *
378  * Only non-temporary relations are included in the returned list.  This is OK
379  * because the list is used only in contexts where temporary relations don't
380  * matter: we're either writing to the two-phase state file (and transactions
381  * that have touched temp tables can't be prepared) or we're writing to xlog
382  * (and all temporary files will be zapped if we restart anyway, so no need
383  * for redo to do it also).
384  *
385  * Note that the list does not include anything scheduled for termination
386  * by upper-level transactions.
387  */
388 int
smgrGetPendingDeletes(bool forCommit,RelFileNode ** ptr)389 smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
390 {
391 	int			nestLevel = GetCurrentTransactionNestLevel();
392 	int			nrels;
393 	RelFileNode *rptr;
394 	PendingRelDelete *pending;
395 
396 	nrels = 0;
397 	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
398 	{
399 		if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
400 			&& pending->backend == InvalidBackendId)
401 			nrels++;
402 	}
403 	if (nrels == 0)
404 	{
405 		*ptr = NULL;
406 		return 0;
407 	}
408 	rptr = (RelFileNode *) palloc(nrels * sizeof(RelFileNode));
409 	*ptr = rptr;
410 	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
411 	{
412 		if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
413 			&& pending->backend == InvalidBackendId)
414 		{
415 			*rptr = pending->relnode;
416 			rptr++;
417 		}
418 	}
419 	return nrels;
420 }
421 
422 /*
423  *	PostPrepare_smgr -- Clean up after a successful PREPARE
424  *
425  * What we have to do here is throw away the in-memory state about pending
426  * relation deletes.  It's all been recorded in the 2PC state file and
427  * it's no longer smgr's job to worry about it.
428  */
429 void
PostPrepare_smgr(void)430 PostPrepare_smgr(void)
431 {
432 	PendingRelDelete *pending;
433 	PendingRelDelete *next;
434 
435 	for (pending = pendingDeletes; pending != NULL; pending = next)
436 	{
437 		next = pending->next;
438 		pendingDeletes = next;
439 		/* must explicitly free the list entry */
440 		pfree(pending);
441 	}
442 }
443 
444 
445 /*
446  * AtSubCommit_smgr() --- Take care of subtransaction commit.
447  *
448  * Reassign all items in the pending-deletes list to the parent transaction.
449  */
450 void
AtSubCommit_smgr(void)451 AtSubCommit_smgr(void)
452 {
453 	int			nestLevel = GetCurrentTransactionNestLevel();
454 	PendingRelDelete *pending;
455 
456 	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
457 	{
458 		if (pending->nestLevel >= nestLevel)
459 			pending->nestLevel = nestLevel - 1;
460 	}
461 }
462 
463 /*
464  * AtSubAbort_smgr() --- Take care of subtransaction abort.
465  *
466  * Delete created relations and forget about deleted relations.
467  * We can execute these operations immediately because we know this
468  * subtransaction will not commit.
469  */
470 void
AtSubAbort_smgr(void)471 AtSubAbort_smgr(void)
472 {
473 	smgrDoPendingDeletes(false);
474 }
475 
476 void
smgr_redo(XLogReaderState * record)477 smgr_redo(XLogReaderState *record)
478 {
479 	XLogRecPtr	lsn = record->EndRecPtr;
480 	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
481 
482 	/* Backup blocks are not used in smgr records */
483 	Assert(!XLogRecHasAnyBlockRefs(record));
484 
485 	if (info == XLOG_SMGR_CREATE)
486 	{
487 		xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
488 		SMgrRelation reln;
489 
490 		reln = smgropen(xlrec->rnode, InvalidBackendId);
491 		smgrcreate(reln, xlrec->forkNum, true);
492 	}
493 	else if (info == XLOG_SMGR_TRUNCATE)
494 	{
495 		xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
496 		SMgrRelation reln;
497 		Relation	rel;
498 
499 		reln = smgropen(xlrec->rnode, InvalidBackendId);
500 
501 		/*
502 		 * Forcibly create relation if it doesn't exist (which suggests that
503 		 * it was dropped somewhere later in the WAL sequence).  As in
504 		 * XLogReadBufferForRedo, we prefer to recreate the rel and replay the
505 		 * log as best we can until the drop is seen.
506 		 */
507 		smgrcreate(reln, MAIN_FORKNUM, true);
508 
509 		/*
510 		 * Before we perform the truncation, update minimum recovery point to
511 		 * cover this WAL record. Once the relation is truncated, there's no
512 		 * going back. The buffer manager enforces the WAL-first rule for
513 		 * normal updates to relation files, so that the minimum recovery
514 		 * point is always updated before the corresponding change in the data
515 		 * file is flushed to disk. We have to do the same manually here.
516 		 *
517 		 * Doing this before the truncation means that if the truncation fails
518 		 * for some reason, you cannot start up the system even after restart,
519 		 * until you fix the underlying situation so that the truncation will
520 		 * succeed. Alternatively, we could update the minimum recovery point
521 		 * after truncation, but that would leave a small window where the
522 		 * WAL-first rule could be violated.
523 		 */
524 		XLogFlush(lsn);
525 
526 		if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
527 		{
528 			smgrtruncate(reln, MAIN_FORKNUM, xlrec->blkno);
529 
530 			/* Also tell xlogutils.c about it */
531 			XLogTruncateRelation(xlrec->rnode, MAIN_FORKNUM, xlrec->blkno);
532 		}
533 
534 		/* Truncate FSM and VM too */
535 		rel = CreateFakeRelcacheEntry(xlrec->rnode);
536 
537 		if ((xlrec->flags & SMGR_TRUNCATE_FSM) != 0 &&
538 			smgrexists(reln, FSM_FORKNUM))
539 			FreeSpaceMapTruncateRel(rel, xlrec->blkno);
540 		if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0 &&
541 			smgrexists(reln, VISIBILITYMAP_FORKNUM))
542 			visibilitymap_truncate(rel, xlrec->blkno);
543 
544 		FreeFakeRelcacheEntry(rel);
545 	}
546 	else
547 		elog(PANIC, "smgr_redo: unknown op code %u", info);
548 }
549