1 /*-------------------------------------------------------------------------
2 *
3 * storage.c
4 * code to create and destroy physical storage for relations
5 *
6 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/catalog/storage.c
12 *
13 * NOTES
14 * Some of this code used to be in storage/smgr/smgr.c, and the
15 * function names still reflect that.
16 *
17 *-------------------------------------------------------------------------
18 */
19
20 #include "postgres.h"
21
22 #include "access/visibilitymap.h"
23 #include "access/xact.h"
24 #include "access/xlog.h"
25 #include "access/xloginsert.h"
26 #include "access/xlogutils.h"
27 #include "catalog/storage.h"
28 #include "catalog/storage_xlog.h"
29 #include "storage/freespace.h"
30 #include "storage/smgr.h"
31 #include "utils/memutils.h"
32 #include "utils/rel.h"
33
34 /*
35 * We keep a list of all relations (represented as RelFileNode values)
36 * that have been created or deleted in the current transaction. When
37 * a relation is created, we create the physical file immediately, but
38 * remember it so that we can delete the file again if the current
39 * transaction is aborted. Conversely, a deletion request is NOT
40 * executed immediately, but is just entered in the list. When and if
41 * the transaction commits, we can delete the physical file.
42 *
43 * To handle subtransactions, every entry is marked with its transaction
44 * nesting level. At subtransaction commit, we reassign the subtransaction's
45 * entries to the parent nesting level. At subtransaction abort, we can
46 * immediately execute the abort-time actions for all entries of the current
47 * nesting level.
48 *
 * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
 * prematurely.  It would probably be OK to keep it in TopTransactionContext
 * instead, but TopMemoryContext is the deliberately cautious choice.
52 */
53
/*
 * PendingRelDelete
 *		One request to physically remove a relation's storage at the end of
 *		the current (sub)transaction.
 *
 * Entries are allocated in TopMemoryContext and pushed onto the front of
 * the singly-linked pendingDeletes list; they must be pfree'd explicitly
 * when they are unlinked from the list.
 */
typedef struct PendingRelDelete
{
	RelFileNode relnode;		/* relation that may need to be deleted */
	BackendId	backend;		/* InvalidBackendId if not a temp rel */
	bool		atCommit;		/* T=delete at commit; F=delete at abort */
	int			nestLevel;		/* xact nesting level of request, as reported
								 * by GetCurrentTransactionNestLevel() when
								 * the entry was made */
	struct PendingRelDelete *next;	/* linked-list link; NULL at end of list */
} PendingRelDelete;

/* head of linked list of pending deletes; NULL when the list is empty */
static PendingRelDelete *pendingDeletes = NULL;
64
65 /*
66 * RelationCreateStorage
67 * Create physical storage for a relation.
68 *
69 * Create the underlying disk file storage for the relation. This only
70 * creates the main fork; additional forks are created lazily by the
71 * modules that need them.
72 *
73 * This function is transactional. The creation is WAL-logged, and if the
74 * transaction aborts later on, the storage will be destroyed.
75 */
76 void
RelationCreateStorage(RelFileNode rnode,char relpersistence)77 RelationCreateStorage(RelFileNode rnode, char relpersistence)
78 {
79 PendingRelDelete *pending;
80 SMgrRelation srel;
81 BackendId backend;
82 bool needs_wal;
83
84 switch (relpersistence)
85 {
86 case RELPERSISTENCE_TEMP:
87 backend = BackendIdForTempRelations();
88 needs_wal = false;
89 break;
90 case RELPERSISTENCE_UNLOGGED:
91 backend = InvalidBackendId;
92 needs_wal = false;
93 break;
94 case RELPERSISTENCE_PERMANENT:
95 backend = InvalidBackendId;
96 needs_wal = true;
97 break;
98 default:
99 elog(ERROR, "invalid relpersistence: %c", relpersistence);
100 return; /* placate compiler */
101 }
102
103 srel = smgropen(rnode, backend);
104 smgrcreate(srel, MAIN_FORKNUM, false);
105
106 if (needs_wal)
107 log_smgrcreate(&srel->smgr_rnode.node, MAIN_FORKNUM);
108
109 /* Add the relation to the list of stuff to delete at abort */
110 pending = (PendingRelDelete *)
111 MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
112 pending->relnode = rnode;
113 pending->backend = backend;
114 pending->atCommit = false; /* delete if abort */
115 pending->nestLevel = GetCurrentTransactionNestLevel();
116 pending->next = pendingDeletes;
117 pendingDeletes = pending;
118 }
119
120 /*
121 * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL.
122 */
123 void
log_smgrcreate(RelFileNode * rnode,ForkNumber forkNum)124 log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum)
125 {
126 xl_smgr_create xlrec;
127
128 /*
129 * Make an XLOG entry reporting the file creation.
130 */
131 xlrec.rnode = *rnode;
132 xlrec.forkNum = forkNum;
133
134 XLogBeginInsert();
135 XLogRegisterData((char *) &xlrec, sizeof(xlrec));
136 XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE);
137 }
138
139 /*
140 * RelationDropStorage
141 * Schedule unlinking of physical storage at transaction commit.
142 */
143 void
RelationDropStorage(Relation rel)144 RelationDropStorage(Relation rel)
145 {
146 PendingRelDelete *pending;
147
148 /* Add the relation to the list of stuff to delete at commit */
149 pending = (PendingRelDelete *)
150 MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
151 pending->relnode = rel->rd_node;
152 pending->backend = rel->rd_backend;
153 pending->atCommit = true; /* delete if commit */
154 pending->nestLevel = GetCurrentTransactionNestLevel();
155 pending->next = pendingDeletes;
156 pendingDeletes = pending;
157
158 /*
159 * NOTE: if the relation was created in this transaction, it will now be
160 * present in the pending-delete list twice, once with atCommit true and
161 * once with atCommit false. Hence, it will be physically deleted at end
162 * of xact in either case (and the other entry will be ignored by
163 * smgrDoPendingDeletes, so no error will occur). We could instead remove
164 * the existing list entry and delete the physical file immediately, but
165 * for now I'll keep the logic simple.
166 */
167
168 RelationCloseSmgr(rel);
169 }
170
171 /*
172 * RelationPreserveStorage
173 * Mark a relation as not to be deleted after all.
174 *
175 * We need this function because relation mapping changes are committed
176 * separately from commit of the whole transaction, so it's still possible
177 * for the transaction to abort after the mapping update is done.
178 * When a new physical relation is installed in the map, it would be
179 * scheduled for delete-on-abort, so we'd delete it, and be in trouble.
180 * The relation mapper fixes this by telling us to not delete such relations
181 * after all as part of its commit.
182 *
183 * We also use this to reuse an old build of an index during ALTER TABLE, this
184 * time removing the delete-at-commit entry.
185 *
186 * No-op if the relation is not among those scheduled for deletion.
187 */
188 void
RelationPreserveStorage(RelFileNode rnode,bool atCommit)189 RelationPreserveStorage(RelFileNode rnode, bool atCommit)
190 {
191 PendingRelDelete *pending;
192 PendingRelDelete *prev;
193 PendingRelDelete *next;
194
195 prev = NULL;
196 for (pending = pendingDeletes; pending != NULL; pending = next)
197 {
198 next = pending->next;
199 if (RelFileNodeEquals(rnode, pending->relnode)
200 && pending->atCommit == atCommit)
201 {
202 /* unlink and delete list entry */
203 if (prev)
204 prev->next = next;
205 else
206 pendingDeletes = next;
207 pfree(pending);
208 /* prev does not change */
209 }
210 else
211 {
212 /* unrelated entry, don't touch it */
213 prev = pending;
214 }
215 }
216 }
217
218 /*
219 * RelationTruncate
220 * Physically truncate a relation to the specified number of blocks.
221 *
222 * This includes getting rid of any buffers for the blocks that are to be
223 * dropped.
224 */
225 void
RelationTruncate(Relation rel,BlockNumber nblocks)226 RelationTruncate(Relation rel, BlockNumber nblocks)
227 {
228 bool fsm;
229 bool vm;
230
231 /* Open it at the smgr level if not already done */
232 RelationOpenSmgr(rel);
233
234 /*
235 * Make sure smgr_targblock etc aren't pointing somewhere past new end
236 */
237 rel->rd_smgr->smgr_targblock = InvalidBlockNumber;
238 rel->rd_smgr->smgr_fsm_nblocks = InvalidBlockNumber;
239 rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber;
240
241 /* Truncate the FSM first if it exists */
242 fsm = smgrexists(rel->rd_smgr, FSM_FORKNUM);
243 if (fsm)
244 FreeSpaceMapTruncateRel(rel, nblocks);
245
246 /* Truncate the visibility map too if it exists. */
247 vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
248 if (vm)
249 visibilitymap_truncate(rel, nblocks);
250
251 /*
252 * We WAL-log the truncation before actually truncating, which means
253 * trouble if the truncation fails. If we then crash, the WAL replay
254 * likely isn't going to succeed in the truncation either, and cause a
255 * PANIC. It's tempting to put a critical section here, but that cure
256 * would be worse than the disease. It would turn a usually harmless
257 * failure to truncate, that might spell trouble at WAL replay, into a
258 * certain PANIC.
259 */
260 if (RelationNeedsWAL(rel))
261 {
262 /*
263 * Make an XLOG entry reporting the file truncation.
264 */
265 XLogRecPtr lsn;
266 xl_smgr_truncate xlrec;
267
268 xlrec.blkno = nblocks;
269 xlrec.rnode = rel->rd_node;
270 xlrec.flags = SMGR_TRUNCATE_ALL;
271
272 XLogBeginInsert();
273 XLogRegisterData((char *) &xlrec, sizeof(xlrec));
274
275 lsn = XLogInsert(RM_SMGR_ID,
276 XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
277
278 /*
279 * Flush, because otherwise the truncation of the main relation might
280 * hit the disk before the WAL record, and the truncation of the FSM
281 * or visibility map. If we crashed during that window, we'd be left
282 * with a truncated heap, but the FSM or visibility map would still
283 * contain entries for the non-existent heap pages.
284 */
285 if (fsm || vm)
286 XLogFlush(lsn);
287 }
288
289 /* Do the real work */
290 smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks);
291 }
292
293 /*
294 * smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
295 *
296 * This also runs when aborting a subxact; we want to clean up a failed
297 * subxact immediately.
298 *
299 * Note: It's possible that we're being asked to remove a relation that has
300 * no physical storage in any fork. In particular, it's possible that we're
301 * cleaning up an old temporary relation for which RemovePgTempFiles has
302 * already recovered the physical storage.
303 */
304 void
smgrDoPendingDeletes(bool isCommit)305 smgrDoPendingDeletes(bool isCommit)
306 {
307 int nestLevel = GetCurrentTransactionNestLevel();
308 PendingRelDelete *pending;
309 PendingRelDelete *prev;
310 PendingRelDelete *next;
311 int nrels = 0,
312 i = 0,
313 maxrels = 0;
314 SMgrRelation *srels = NULL;
315
316 prev = NULL;
317 for (pending = pendingDeletes; pending != NULL; pending = next)
318 {
319 next = pending->next;
320 if (pending->nestLevel < nestLevel)
321 {
322 /* outer-level entries should not be processed yet */
323 prev = pending;
324 }
325 else
326 {
327 /* unlink list entry first, so we don't retry on failure */
328 if (prev)
329 prev->next = next;
330 else
331 pendingDeletes = next;
332 /* do deletion if called for */
333 if (pending->atCommit == isCommit)
334 {
335 SMgrRelation srel;
336
337 srel = smgropen(pending->relnode, pending->backend);
338
339 /* allocate the initial array, or extend it, if needed */
340 if (maxrels == 0)
341 {
342 maxrels = 8;
343 srels = palloc(sizeof(SMgrRelation) * maxrels);
344 }
345 else if (maxrels <= nrels)
346 {
347 maxrels *= 2;
348 srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
349 }
350
351 srels[nrels++] = srel;
352 }
353 /* must explicitly free the list entry */
354 pfree(pending);
355 /* prev does not change */
356 }
357 }
358
359 if (nrels > 0)
360 {
361 smgrdounlinkall(srels, nrels, false);
362
363 for (i = 0; i < nrels; i++)
364 smgrclose(srels[i]);
365
366 pfree(srels);
367 }
368 }
369
370 /*
371 * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
372 *
373 * The return value is the number of relations scheduled for termination.
374 * *ptr is set to point to a freshly-palloc'd array of RelFileNodes.
375 * If there are no relations to be deleted, *ptr is set to NULL.
376 *
377 * Only non-temporary relations are included in the returned list. This is OK
378 * because the list is used only in contexts where temporary relations don't
379 * matter: we're either writing to the two-phase state file (and transactions
380 * that have touched temp tables can't be prepared) or we're writing to xlog
381 * (and all temporary files will be zapped if we restart anyway, so no need
382 * for redo to do it also).
383 *
384 * Note that the list does not include anything scheduled for termination
385 * by upper-level transactions.
386 */
387 int
smgrGetPendingDeletes(bool forCommit,RelFileNode ** ptr)388 smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
389 {
390 int nestLevel = GetCurrentTransactionNestLevel();
391 int nrels;
392 RelFileNode *rptr;
393 PendingRelDelete *pending;
394
395 nrels = 0;
396 for (pending = pendingDeletes; pending != NULL; pending = pending->next)
397 {
398 if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
399 && pending->backend == InvalidBackendId)
400 nrels++;
401 }
402 if (nrels == 0)
403 {
404 *ptr = NULL;
405 return 0;
406 }
407 rptr = (RelFileNode *) palloc(nrels * sizeof(RelFileNode));
408 *ptr = rptr;
409 for (pending = pendingDeletes; pending != NULL; pending = pending->next)
410 {
411 if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
412 && pending->backend == InvalidBackendId)
413 {
414 *rptr = pending->relnode;
415 rptr++;
416 }
417 }
418 return nrels;
419 }
420
421 /*
422 * PostPrepare_smgr -- Clean up after a successful PREPARE
423 *
424 * What we have to do here is throw away the in-memory state about pending
425 * relation deletes. It's all been recorded in the 2PC state file and
426 * it's no longer smgr's job to worry about it.
427 */
428 void
PostPrepare_smgr(void)429 PostPrepare_smgr(void)
430 {
431 PendingRelDelete *pending;
432 PendingRelDelete *next;
433
434 for (pending = pendingDeletes; pending != NULL; pending = next)
435 {
436 next = pending->next;
437 pendingDeletes = next;
438 /* must explicitly free the list entry */
439 pfree(pending);
440 }
441 }
442
443
444 /*
445 * AtSubCommit_smgr() --- Take care of subtransaction commit.
446 *
447 * Reassign all items in the pending-deletes list to the parent transaction.
448 */
449 void
AtSubCommit_smgr(void)450 AtSubCommit_smgr(void)
451 {
452 int nestLevel = GetCurrentTransactionNestLevel();
453 PendingRelDelete *pending;
454
455 for (pending = pendingDeletes; pending != NULL; pending = pending->next)
456 {
457 if (pending->nestLevel >= nestLevel)
458 pending->nestLevel = nestLevel - 1;
459 }
460 }
461
462 /*
463 * AtSubAbort_smgr() --- Take care of subtransaction abort.
464 *
465 * Delete created relations and forget about deleted relations.
466 * We can execute these operations immediately because we know this
467 * subtransaction will not commit.
468 */
469 void
AtSubAbort_smgr(void)470 AtSubAbort_smgr(void)
471 {
472 smgrDoPendingDeletes(false);
473 }
474
475 void
smgr_redo(XLogReaderState * record)476 smgr_redo(XLogReaderState *record)
477 {
478 XLogRecPtr lsn = record->EndRecPtr;
479 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
480
481 /* Backup blocks are not used in smgr records */
482 Assert(!XLogRecHasAnyBlockRefs(record));
483
484 if (info == XLOG_SMGR_CREATE)
485 {
486 xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
487 SMgrRelation reln;
488
489 reln = smgropen(xlrec->rnode, InvalidBackendId);
490 smgrcreate(reln, xlrec->forkNum, true);
491 }
492 else if (info == XLOG_SMGR_TRUNCATE)
493 {
494 xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
495 SMgrRelation reln;
496 Relation rel;
497
498 reln = smgropen(xlrec->rnode, InvalidBackendId);
499
500 /*
501 * Forcibly create relation if it doesn't exist (which suggests that
502 * it was dropped somewhere later in the WAL sequence). As in
503 * XLogReadBufferForRedo, we prefer to recreate the rel and replay the
504 * log as best we can until the drop is seen.
505 */
506 smgrcreate(reln, MAIN_FORKNUM, true);
507
508 /*
509 * Before we perform the truncation, update minimum recovery point to
510 * cover this WAL record. Once the relation is truncated, there's no
511 * going back. The buffer manager enforces the WAL-first rule for
512 * normal updates to relation files, so that the minimum recovery
513 * point is always updated before the corresponding change in the data
514 * file is flushed to disk. We have to do the same manually here.
515 *
516 * Doing this before the truncation means that if the truncation fails
517 * for some reason, you cannot start up the system even after restart,
518 * until you fix the underlying situation so that the truncation will
519 * succeed. Alternatively, we could update the minimum recovery point
520 * after truncation, but that would leave a small window where the
521 * WAL-first rule could be violated.
522 */
523 XLogFlush(lsn);
524
525 if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
526 {
527 smgrtruncate(reln, MAIN_FORKNUM, xlrec->blkno);
528
529 /* Also tell xlogutils.c about it */
530 XLogTruncateRelation(xlrec->rnode, MAIN_FORKNUM, xlrec->blkno);
531 }
532
533 /* Truncate FSM and VM too */
534 rel = CreateFakeRelcacheEntry(xlrec->rnode);
535
536 if ((xlrec->flags & SMGR_TRUNCATE_FSM) != 0 &&
537 smgrexists(reln, FSM_FORKNUM))
538 FreeSpaceMapTruncateRel(rel, xlrec->blkno);
539 if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0 &&
540 smgrexists(reln, VISIBILITYMAP_FORKNUM))
541 visibilitymap_truncate(rel, xlrec->blkno);
542
543 FreeFakeRelcacheEntry(rel);
544 }
545 else
546 elog(PANIC, "smgr_redo: unknown op code %u", info);
547 }
548