1 /*-------------------------------------------------------------------------
2 *
3 * relmapper.c
4 * Catalog-to-filenode mapping
5 *
6 * For most tables, the physical file underlying the table is specified by
7 * pg_class.relfilenode. However, that obviously won't work for pg_class
8 * itself, nor for the other "nailed" catalogs for which we have to be able
9 * to set up working Relation entries without access to pg_class. It also
10 * does not work for shared catalogs, since there is no practical way to
11 * update other databases' pg_class entries when relocating a shared catalog.
12 * Therefore, for these special catalogs (henceforth referred to as "mapped
13 * catalogs") we rely on a separately maintained file that shows the mapping
14 * from catalog OIDs to filenode numbers. Each database has a map file for
15 * its local mapped catalogs, and there is a separate map file for shared
16 * catalogs. Mapped catalogs have zero in their pg_class.relfilenode entries.
17 *
18 * Relocation of a normal table is committed (ie, the new physical file becomes
19 * authoritative) when the pg_class row update commits. For mapped catalogs,
20 * the act of updating the map file is effectively commit of the relocation.
21 * We postpone the file update till just before commit of the transaction
22 * doing the rewrite, but there is necessarily a window between. Therefore
23 * mapped catalogs can only be relocated by operations such as VACUUM FULL
24 * and CLUSTER, which make no transactionally-significant changes: it must be
25 * safe for the new file to replace the old, even if the transaction itself
26 * aborts. An important factor here is that the indexes and toast table of
27 * a mapped catalog must also be mapped, so that the rewrites/relocations of
28 * all these files commit in a single map file update rather than being tied
29 * to transaction commit.
30 *
31 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
32 * Portions Copyright (c) 1994, Regents of the University of California
33 *
34 *
35 * IDENTIFICATION
36 * src/backend/utils/cache/relmapper.c
37 *
38 *-------------------------------------------------------------------------
39 */
40 #include "postgres.h"
41
42 #include <fcntl.h>
43 #include <sys/stat.h>
44 #include <unistd.h>
45
46 #include "access/xact.h"
47 #include "access/xlog.h"
48 #include "access/xloginsert.h"
49 #include "catalog/catalog.h"
50 #include "catalog/pg_tablespace.h"
51 #include "catalog/storage.h"
52 #include "miscadmin.h"
53 #include "storage/fd.h"
54 #include "storage/lwlock.h"
55 #include "utils/inval.h"
56 #include "utils/relmapper.h"
57
58
59 /*
60 * The map file is critical data: we have no automatic method for recovering
61 * from loss or corruption of it. We use a CRC so that we can detect
62 * corruption. To minimize the risk of failed updates, the map file should
63 * be kept to no more than one standard-size disk sector (ie 512 bytes),
64 * and we use overwrite-in-place rather than playing renaming games.
65 * The struct layout below is designed to occupy exactly 512 bytes, which
66 * might make filesystem updates a bit more efficient.
67 *
68 * Entries in the mappings[] array are in no particular order. We could
69 * speed searching by insisting on OID order, but it really shouldn't be
70 * worth the trouble given the intended size of the mapping sets.
71 */
/* Base name of every map file; it is opened as "global/<name>" or "<dbpath>/<name>" */
#define RELMAPPER_FILENAME		"pg_filenode.map"

/* Value stored in RelMapFile.magic; a loaded map with any other value is rejected */
#define RELMAPPER_FILEMAGIC		0x592717	/* version ID value */

/* Capacity of RelMapFile.mappings[] */
#define MAX_MAPPINGS			62	/* 62 * 8 + 16 = 512 */
77
/*
 * One entry of a relation map: pairs a mapped catalog's OID with the
 * filenode number currently assigned to it.
 */
typedef struct RelMapping
{
	Oid			mapoid;			/* OID of a catalog */
	Oid			mapfilenode;	/* its filenode number */
} RelMapping;
83
/*
 * Image of a map file, read and written as a single unit of
 * sizeof(RelMapFile) bytes.  The CRC covers every byte that precedes the
 * crc field (offsetof(RelMapFile, crc)).  The same struct is also used for
 * the in-memory sets of uncommitted updates, where only num_mappings and
 * mappings[] are meaningful.
 */
typedef struct RelMapFile
{
	int32		magic;			/* always RELMAPPER_FILEMAGIC */
	int32		num_mappings;	/* number of valid RelMapping entries */
	RelMapping	mappings[MAX_MAPPINGS];
	pg_crc32c	crc;			/* CRC of all above */
	int32		pad;			/* to make the struct size be 512 exactly */
} RelMapFile;
92
93 /*
94 * The currently known contents of the shared map file and our database's
95 * local map file are stored here. These can be reloaded from disk
96 * immediately whenever we receive an update sinval message.
97 */
static RelMapFile shared_map;	/* image of global/pg_filenode.map */
static RelMapFile local_map;	/* image of <DatabasePath>/pg_filenode.map */
100
101 /*
102 * We use the same RelMapFile data structure to track uncommitted local
103 * changes in the mappings (but note the magic and crc fields are not made
104 * valid in these variables). Currently, map updates are not allowed within
105 * subtransactions, so one set of transaction-level changes is sufficient.
106 *
107 * The active_xxx variables contain updates that are valid in our transaction
108 * and should be honored by RelationMapOidToFilenode. The pending_xxx
109 * variables contain updates we have been told about that aren't active yet;
110 * they will become active at the next CommandCounterIncrement. This setup
111 * lets map updates act similarly to updates of pg_class rows, ie, they
112 * become visible only at the next CommandCounterIncrement boundary.
113 */
static RelMapFile active_shared_updates;	/* honored by lookups, shared rels */
static RelMapFile active_local_updates;		/* honored by lookups, local rels */
static RelMapFile pending_shared_updates;	/* moved to active at next CCI */
static RelMapFile pending_local_updates;	/* moved to active at next CCI */
118
119
120 /* non-export function prototypes */
/* manipulation of in-memory maps */
static void apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode,
				 bool add_okay);
static void merge_map_updates(RelMapFile *map, const RelMapFile *updates,
				  bool add_okay);
/* map file input/output */
static void load_relmap_file(bool shared, bool lock_held);
static void write_relmap_file(bool shared, RelMapFile *newmap,
				  bool write_wal, bool send_sinval, bool preserve_files,
				  Oid dbid, Oid tsid, const char *dbpath);
/* locked reload + merge + write, used at transaction commit */
static void perform_relmap_update(bool shared, const RelMapFile *updates);
130
131
132 /*
133 * RelationMapOidToFilenode
134 *
135 * The raison d' etre ... given a relation OID, look up its filenode.
136 *
137 * Although shared and local relation OIDs should never overlap, the caller
138 * always knows which we need --- so pass that information to avoid useless
139 * searching.
140 *
141 * Returns InvalidOid if the OID is not known (which should never happen,
142 * but the caller is in a better position to report a meaningful error).
143 */
144 Oid
RelationMapOidToFilenode(Oid relationId,bool shared)145 RelationMapOidToFilenode(Oid relationId, bool shared)
146 {
147 const RelMapFile *map;
148 int32 i;
149
150 /* If there are active updates, believe those over the main maps */
151 if (shared)
152 {
153 map = &active_shared_updates;
154 for (i = 0; i < map->num_mappings; i++)
155 {
156 if (relationId == map->mappings[i].mapoid)
157 return map->mappings[i].mapfilenode;
158 }
159 map = &shared_map;
160 for (i = 0; i < map->num_mappings; i++)
161 {
162 if (relationId == map->mappings[i].mapoid)
163 return map->mappings[i].mapfilenode;
164 }
165 }
166 else
167 {
168 map = &active_local_updates;
169 for (i = 0; i < map->num_mappings; i++)
170 {
171 if (relationId == map->mappings[i].mapoid)
172 return map->mappings[i].mapfilenode;
173 }
174 map = &local_map;
175 for (i = 0; i < map->num_mappings; i++)
176 {
177 if (relationId == map->mappings[i].mapoid)
178 return map->mappings[i].mapfilenode;
179 }
180 }
181
182 return InvalidOid;
183 }
184
185 /*
186 * RelationMapFilenodeToOid
187 *
188 * Do the reverse of the normal direction of mapping done in
189 * RelationMapOidToFilenode.
190 *
191 * This is not supposed to be used during normal running but rather for
192 * information purposes when looking at the filesystem or xlog.
193 *
194 * Returns InvalidOid if the OID is not known; this can easily happen if the
195 * relfilenode doesn't pertain to a mapped relation.
196 */
197 Oid
RelationMapFilenodeToOid(Oid filenode,bool shared)198 RelationMapFilenodeToOid(Oid filenode, bool shared)
199 {
200 const RelMapFile *map;
201 int32 i;
202
203 /* If there are active updates, believe those over the main maps */
204 if (shared)
205 {
206 map = &active_shared_updates;
207 for (i = 0; i < map->num_mappings; i++)
208 {
209 if (filenode == map->mappings[i].mapfilenode)
210 return map->mappings[i].mapoid;
211 }
212 map = &shared_map;
213 for (i = 0; i < map->num_mappings; i++)
214 {
215 if (filenode == map->mappings[i].mapfilenode)
216 return map->mappings[i].mapoid;
217 }
218 }
219 else
220 {
221 map = &active_local_updates;
222 for (i = 0; i < map->num_mappings; i++)
223 {
224 if (filenode == map->mappings[i].mapfilenode)
225 return map->mappings[i].mapoid;
226 }
227 map = &local_map;
228 for (i = 0; i < map->num_mappings; i++)
229 {
230 if (filenode == map->mappings[i].mapfilenode)
231 return map->mappings[i].mapoid;
232 }
233 }
234
235 return InvalidOid;
236 }
237
238 /*
239 * RelationMapUpdateMap
240 *
241 * Install a new relfilenode mapping for the specified relation.
242 *
243 * If immediate is true (or we're bootstrapping), the mapping is activated
244 * immediately. Otherwise it is made pending until CommandCounterIncrement.
245 */
246 void
RelationMapUpdateMap(Oid relationId,Oid fileNode,bool shared,bool immediate)247 RelationMapUpdateMap(Oid relationId, Oid fileNode, bool shared,
248 bool immediate)
249 {
250 RelMapFile *map;
251
252 if (IsBootstrapProcessingMode())
253 {
254 /*
255 * In bootstrap mode, the mapping gets installed in permanent map.
256 */
257 if (shared)
258 map = &shared_map;
259 else
260 map = &local_map;
261 }
262 else
263 {
264 /*
265 * We don't currently support map changes within subtransactions. This
266 * could be done with more bookkeeping infrastructure, but it doesn't
267 * presently seem worth it.
268 */
269 if (GetCurrentTransactionNestLevel() > 1)
270 elog(ERROR, "cannot change relation mapping within subtransaction");
271
272 if (immediate)
273 {
274 /* Make it active, but only locally */
275 if (shared)
276 map = &active_shared_updates;
277 else
278 map = &active_local_updates;
279 }
280 else
281 {
282 /* Make it pending */
283 if (shared)
284 map = &pending_shared_updates;
285 else
286 map = &pending_local_updates;
287 }
288 }
289 apply_map_update(map, relationId, fileNode, true);
290 }
291
292 /*
293 * apply_map_update
294 *
295 * Insert a new mapping into the given map variable, replacing any existing
296 * mapping for the same relation.
297 *
298 * In some cases the caller knows there must be an existing mapping; pass
299 * add_okay = false to draw an error if not.
300 */
301 static void
apply_map_update(RelMapFile * map,Oid relationId,Oid fileNode,bool add_okay)302 apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode, bool add_okay)
303 {
304 int32 i;
305
306 /* Replace any existing mapping */
307 for (i = 0; i < map->num_mappings; i++)
308 {
309 if (relationId == map->mappings[i].mapoid)
310 {
311 map->mappings[i].mapfilenode = fileNode;
312 return;
313 }
314 }
315
316 /* Nope, need to add a new mapping */
317 if (!add_okay)
318 elog(ERROR, "attempt to apply a mapping to unmapped relation %u",
319 relationId);
320 if (map->num_mappings >= MAX_MAPPINGS)
321 elog(ERROR, "ran out of space in relation map");
322 map->mappings[map->num_mappings].mapoid = relationId;
323 map->mappings[map->num_mappings].mapfilenode = fileNode;
324 map->num_mappings++;
325 }
326
327 /*
328 * merge_map_updates
329 *
330 * Merge all the updates in the given pending-update map into the target map.
331 * This is just a bulk form of apply_map_update.
332 */
333 static void
merge_map_updates(RelMapFile * map,const RelMapFile * updates,bool add_okay)334 merge_map_updates(RelMapFile *map, const RelMapFile *updates, bool add_okay)
335 {
336 int32 i;
337
338 for (i = 0; i < updates->num_mappings; i++)
339 {
340 apply_map_update(map,
341 updates->mappings[i].mapoid,
342 updates->mappings[i].mapfilenode,
343 add_okay);
344 }
345 }
346
347 /*
348 * RelationMapRemoveMapping
349 *
350 * Remove a relation's entry in the map. This is only allowed for "active"
351 * (but not committed) local mappings. We need it so we can back out the
352 * entry for the transient target file when doing VACUUM FULL/CLUSTER on
353 * a mapped relation.
354 */
355 void
RelationMapRemoveMapping(Oid relationId)356 RelationMapRemoveMapping(Oid relationId)
357 {
358 RelMapFile *map = &active_local_updates;
359 int32 i;
360
361 for (i = 0; i < map->num_mappings; i++)
362 {
363 if (relationId == map->mappings[i].mapoid)
364 {
365 /* Found it, collapse it out */
366 map->mappings[i] = map->mappings[map->num_mappings - 1];
367 map->num_mappings--;
368 return;
369 }
370 }
371 elog(ERROR, "could not find temporary mapping for relation %u",
372 relationId);
373 }
374
375 /*
376 * RelationMapInvalidate
377 *
378 * This routine is invoked for SI cache flush messages. We must re-read
379 * the indicated map file. However, we might receive a SI message in a
380 * process that hasn't yet, and might never, load the mapping files;
381 * for example the autovacuum launcher, which *must not* try to read
382 * a local map since it is attached to no particular database.
383 * So, re-read only if the map is valid now.
384 */
385 void
RelationMapInvalidate(bool shared)386 RelationMapInvalidate(bool shared)
387 {
388 if (shared)
389 {
390 if (shared_map.magic == RELMAPPER_FILEMAGIC)
391 load_relmap_file(true, false);
392 }
393 else
394 {
395 if (local_map.magic == RELMAPPER_FILEMAGIC)
396 load_relmap_file(false, false);
397 }
398 }
399
400 /*
401 * RelationMapInvalidateAll
402 *
403 * Reload all map files. This is used to recover from SI message buffer
404 * overflow: we can't be sure if we missed an inval message.
405 * Again, reload only currently-valid maps.
406 */
407 void
RelationMapInvalidateAll(void)408 RelationMapInvalidateAll(void)
409 {
410 if (shared_map.magic == RELMAPPER_FILEMAGIC)
411 load_relmap_file(true, false);
412 if (local_map.magic == RELMAPPER_FILEMAGIC)
413 load_relmap_file(false, false);
414 }
415
416 /*
417 * AtCCI_RelationMap
418 *
419 * Activate any "pending" relation map updates at CommandCounterIncrement time.
420 */
421 void
AtCCI_RelationMap(void)422 AtCCI_RelationMap(void)
423 {
424 if (pending_shared_updates.num_mappings != 0)
425 {
426 merge_map_updates(&active_shared_updates,
427 &pending_shared_updates,
428 true);
429 pending_shared_updates.num_mappings = 0;
430 }
431 if (pending_local_updates.num_mappings != 0)
432 {
433 merge_map_updates(&active_local_updates,
434 &pending_local_updates,
435 true);
436 pending_local_updates.num_mappings = 0;
437 }
438 }
439
440 /*
441 * AtEOXact_RelationMap
442 *
443 * Handle relation mapping at main-transaction commit or abort.
444 *
445 * During commit, this must be called as late as possible before the actual
446 * transaction commit, so as to minimize the window where the transaction
447 * could still roll back after committing map changes. Although nothing
448 * critically bad happens in such a case, we still would prefer that it
449 * not happen, since we'd possibly be losing useful updates to the relations'
450 * pg_class row(s).
451 *
452 * During abort, we just have to throw away any pending map changes.
453 * Normal post-abort cleanup will take care of fixing relcache entries.
454 */
455 void
AtEOXact_RelationMap(bool isCommit)456 AtEOXact_RelationMap(bool isCommit)
457 {
458 if (isCommit)
459 {
460 /*
461 * We should not get here with any "pending" updates. (We could
462 * logically choose to treat such as committed, but in the current
463 * code this should never happen.)
464 */
465 Assert(pending_shared_updates.num_mappings == 0);
466 Assert(pending_local_updates.num_mappings == 0);
467
468 /*
469 * Write any active updates to the actual map files, then reset them.
470 */
471 if (active_shared_updates.num_mappings != 0)
472 {
473 perform_relmap_update(true, &active_shared_updates);
474 active_shared_updates.num_mappings = 0;
475 }
476 if (active_local_updates.num_mappings != 0)
477 {
478 perform_relmap_update(false, &active_local_updates);
479 active_local_updates.num_mappings = 0;
480 }
481 }
482 else
483 {
484 /* Abort --- drop all local and pending updates */
485 active_shared_updates.num_mappings = 0;
486 active_local_updates.num_mappings = 0;
487 pending_shared_updates.num_mappings = 0;
488 pending_local_updates.num_mappings = 0;
489 }
490 }
491
492 /*
493 * AtPrepare_RelationMap
494 *
495 * Handle relation mapping at PREPARE.
496 *
497 * Currently, we don't support preparing any transaction that changes the map.
498 */
499 void
AtPrepare_RelationMap(void)500 AtPrepare_RelationMap(void)
501 {
502 if (active_shared_updates.num_mappings != 0 ||
503 active_local_updates.num_mappings != 0 ||
504 pending_shared_updates.num_mappings != 0 ||
505 pending_local_updates.num_mappings != 0)
506 ereport(ERROR,
507 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
508 errmsg("cannot PREPARE a transaction that modified relation mapping")));
509 }
510
/*
 * CheckPointRelationMap
 *
 * Called during a checkpoint.  It must ensure that any relation map update
 * that was WAL-logged before the start of the checkpoint is securely on
 * disk and will not need to be replayed later.
 *
 * This is unlikely to be performance-critical, so the method is simple:
 * take RelationMappingLock in shared mode and release it again.  A map
 * update holds the lock exclusively from before its WAL record is written
 * until after write_relmap_file has fsync'd the file (see
 * perform_relmap_update), so getting the lock here means no already-logged
 * update is still in progress.
 */
void
CheckPointRelationMap(void)
{
	/* Acquire-and-release acts purely as a barrier; nothing is read */
	LWLockAcquire(RelationMappingLock, LW_SHARED);
	LWLockRelease(RelationMappingLock);
}
528
/*
 * RelationMapFinishBootstrap
 *
 * Write out the initial relation mapping files at the completion of
 * bootstrap.  All mapped relations should already have been registered in
 * shared_map/local_map through RelationMapUpdateMap calls, which store
 * directly into the permanent maps in bootstrap mode.
 */
void
RelationMapFinishBootstrap(void)
{
	Assert(IsBootstrapProcessingMode());

	/* Bootstrap never uses the transactional update sets */
	Assert(active_shared_updates.num_mappings == 0);
	Assert(active_local_updates.num_mappings == 0);
	Assert(pending_shared_updates.num_mappings == 0);
	Assert(pending_local_updates.num_mappings == 0);

	/*
	 * Write the files; no WAL or sinval needed, and no files to preserve
	 * (write_wal, send_sinval and preserve_files are all false).  The shared
	 * map needs no dbpath since its file name is fixed under "global/".
	 */
	write_relmap_file(true, &shared_map, false, false, false,
					  InvalidOid, GLOBALTABLESPACE_OID, NULL);
	write_relmap_file(false, &local_map, false, false, false,
					  MyDatabaseId, MyDatabaseTableSpace, DatabasePath);
}
553
554 /*
555 * RelationMapInitialize
556 *
557 * This initializes the mapper module at process startup. We can't access the
558 * database yet, so just make sure the maps are empty.
559 */
560 void
RelationMapInitialize(void)561 RelationMapInitialize(void)
562 {
563 /* The static variables should initialize to zeroes, but let's be sure */
564 shared_map.magic = 0; /* mark it not loaded */
565 local_map.magic = 0;
566 shared_map.num_mappings = 0;
567 local_map.num_mappings = 0;
568 active_shared_updates.num_mappings = 0;
569 active_local_updates.num_mappings = 0;
570 pending_shared_updates.num_mappings = 0;
571 pending_local_updates.num_mappings = 0;
572 }
573
574 /*
575 * RelationMapInitializePhase2
576 *
577 * This is called to prepare for access to pg_database during startup.
578 * We should be able to read the shared map file now.
579 */
580 void
RelationMapInitializePhase2(void)581 RelationMapInitializePhase2(void)
582 {
583 /*
584 * In bootstrap mode, the map file isn't there yet, so do nothing.
585 */
586 if (IsBootstrapProcessingMode())
587 return;
588
589 /*
590 * Load the shared map file, die on error.
591 */
592 load_relmap_file(true, false);
593 }
594
595 /*
596 * RelationMapInitializePhase3
597 *
598 * This is called as soon as we have determined MyDatabaseId and set up
599 * DatabasePath. At this point we should be able to read the local map file.
600 */
601 void
RelationMapInitializePhase3(void)602 RelationMapInitializePhase3(void)
603 {
604 /*
605 * In bootstrap mode, the map file isn't there yet, so do nothing.
606 */
607 if (IsBootstrapProcessingMode())
608 return;
609
610 /*
611 * Load the local map file, die on error.
612 */
613 load_relmap_file(false, false);
614 }
615
616 /*
617 * load_relmap_file -- load data from the shared or local map file
618 *
619 * Because the map file is essential for access to core system catalogs,
620 * failure to read it is a fatal error.
621 *
622 * Note that the local case requires DatabasePath to be set up.
623 */
624 static void
load_relmap_file(bool shared,bool lock_held)625 load_relmap_file(bool shared, bool lock_held)
626 {
627 RelMapFile *map;
628 char mapfilename[MAXPGPATH];
629 pg_crc32c crc;
630 int fd;
631
632 if (shared)
633 {
634 snprintf(mapfilename, sizeof(mapfilename), "global/%s",
635 RELMAPPER_FILENAME);
636 map = &shared_map;
637 }
638 else
639 {
640 snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
641 DatabasePath, RELMAPPER_FILENAME);
642 map = &local_map;
643 }
644
645 /* Read data ... */
646 fd = OpenTransientFile(mapfilename,
647 O_RDONLY | PG_BINARY, S_IRUSR | S_IWUSR);
648 if (fd < 0)
649 ereport(FATAL,
650 (errcode_for_file_access(),
651 errmsg("could not open relation mapping file \"%s\": %m",
652 mapfilename)));
653
654 /*
655 * Grab the lock to prevent the file from being updated while we read it,
656 * unless the caller is already holding the lock. If the file is updated
657 * shortly after we look, the sinval signaling mechanism will make us
658 * re-read it before we are able to access any relation that's affected by
659 * the change.
660 */
661 if (!lock_held)
662 LWLockAcquire(RelationMappingLock, LW_SHARED);
663
664 if (read(fd, map, sizeof(RelMapFile)) != sizeof(RelMapFile))
665 ereport(FATAL,
666 (errcode_for_file_access(),
667 errmsg("could not read relation mapping file \"%s\": %m",
668 mapfilename)));
669
670 if (!lock_held)
671 LWLockRelease(RelationMappingLock);
672
673 CloseTransientFile(fd);
674
675 /* check for correct magic number, etc */
676 if (map->magic != RELMAPPER_FILEMAGIC ||
677 map->num_mappings < 0 ||
678 map->num_mappings > MAX_MAPPINGS)
679 ereport(FATAL,
680 (errmsg("relation mapping file \"%s\" contains invalid data",
681 mapfilename)));
682
683 /* verify the CRC */
684 INIT_CRC32C(crc);
685 COMP_CRC32C(crc, (char *) map, offsetof(RelMapFile, crc));
686 FIN_CRC32C(crc);
687
688 if (!EQ_CRC32C(crc, map->crc))
689 ereport(FATAL,
690 (errmsg("relation mapping file \"%s\" contains incorrect checksum",
691 mapfilename)));
692 }
693
694 /*
695 * Write out a new shared or local map file with the given contents.
696 *
697 * The magic number and CRC are automatically updated in *newmap. On
698 * success, we copy the data to the appropriate permanent static variable.
699 *
700 * If write_wal is TRUE then an appropriate WAL message is emitted.
701 * (It will be false for bootstrap and WAL replay cases.)
702 *
703 * If send_sinval is TRUE then a SI invalidation message is sent.
704 * (This should be true except in bootstrap case.)
705 *
706 * If preserve_files is TRUE then the storage manager is warned not to
707 * delete the files listed in the map.
708 *
709 * Because this may be called during WAL replay when MyDatabaseId,
710 * DatabasePath, etc aren't valid, we require the caller to pass in suitable
711 * values. The caller is also responsible for being sure no concurrent
712 * map update could be happening.
713 */
714 static void
write_relmap_file(bool shared,RelMapFile * newmap,bool write_wal,bool send_sinval,bool preserve_files,Oid dbid,Oid tsid,const char * dbpath)715 write_relmap_file(bool shared, RelMapFile *newmap,
716 bool write_wal, bool send_sinval, bool preserve_files,
717 Oid dbid, Oid tsid, const char *dbpath)
718 {
719 int fd;
720 RelMapFile *realmap;
721 char mapfilename[MAXPGPATH];
722
723 /*
724 * Fill in the overhead fields and update CRC.
725 */
726 newmap->magic = RELMAPPER_FILEMAGIC;
727 if (newmap->num_mappings < 0 || newmap->num_mappings > MAX_MAPPINGS)
728 elog(ERROR, "attempt to write bogus relation mapping");
729
730 INIT_CRC32C(newmap->crc);
731 COMP_CRC32C(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc));
732 FIN_CRC32C(newmap->crc);
733
734 /*
735 * Open the target file. We prefer to do this before entering the
736 * critical section, so that an open() failure need not force PANIC.
737 */
738 if (shared)
739 {
740 snprintf(mapfilename, sizeof(mapfilename), "global/%s",
741 RELMAPPER_FILENAME);
742 realmap = &shared_map;
743 }
744 else
745 {
746 snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
747 dbpath, RELMAPPER_FILENAME);
748 realmap = &local_map;
749 }
750
751 fd = OpenTransientFile(mapfilename,
752 O_WRONLY | O_CREAT | PG_BINARY,
753 S_IRUSR | S_IWUSR);
754 if (fd < 0)
755 ereport(ERROR,
756 (errcode_for_file_access(),
757 errmsg("could not open relation mapping file \"%s\": %m",
758 mapfilename)));
759
760 if (write_wal)
761 {
762 xl_relmap_update xlrec;
763 XLogRecPtr lsn;
764
765 /* now errors are fatal ... */
766 START_CRIT_SECTION();
767
768 xlrec.dbid = dbid;
769 xlrec.tsid = tsid;
770 xlrec.nbytes = sizeof(RelMapFile);
771
772 XLogBeginInsert();
773 XLogRegisterData((char *) (&xlrec), MinSizeOfRelmapUpdate);
774 XLogRegisterData((char *) newmap, sizeof(RelMapFile));
775
776 lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE);
777
778 /* As always, WAL must hit the disk before the data update does */
779 XLogFlush(lsn);
780 }
781
782 errno = 0;
783 if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
784 {
785 /* if write didn't set errno, assume problem is no disk space */
786 if (errno == 0)
787 errno = ENOSPC;
788 ereport(ERROR,
789 (errcode_for_file_access(),
790 errmsg("could not write to relation mapping file \"%s\": %m",
791 mapfilename)));
792 }
793
794 /*
795 * We choose to fsync the data to disk before considering the task done.
796 * It would be possible to relax this if it turns out to be a performance
797 * issue, but it would complicate checkpointing --- see notes for
798 * CheckPointRelationMap.
799 */
800 if (pg_fsync(fd) != 0)
801 ereport(data_sync_elevel(ERROR),
802 (errcode_for_file_access(),
803 errmsg("could not fsync relation mapping file \"%s\": %m",
804 mapfilename)));
805
806 if (CloseTransientFile(fd))
807 ereport(ERROR,
808 (errcode_for_file_access(),
809 errmsg("could not close relation mapping file \"%s\": %m",
810 mapfilename)));
811
812 /*
813 * Now that the file is safely on disk, send sinval message to let other
814 * backends know to re-read it. We must do this inside the critical
815 * section: if for some reason we fail to send the message, we have to
816 * force a database-wide PANIC. Otherwise other backends might continue
817 * execution with stale mapping information, which would be catastrophic
818 * as soon as others began to use the now-committed data.
819 */
820 if (send_sinval)
821 CacheInvalidateRelmap(dbid);
822
823 /*
824 * Make sure that the files listed in the map are not deleted if the outer
825 * transaction aborts. This had better be within the critical section
826 * too: it's not likely to fail, but if it did, we'd arrive at transaction
827 * abort with the files still vulnerable. PANICing will leave things in a
828 * good state on-disk.
829 *
830 * Note: we're cheating a little bit here by assuming that mapped files
831 * are either in pg_global or the database's default tablespace.
832 */
833 if (preserve_files)
834 {
835 int32 i;
836
837 for (i = 0; i < newmap->num_mappings; i++)
838 {
839 RelFileNode rnode;
840
841 rnode.spcNode = tsid;
842 rnode.dbNode = dbid;
843 rnode.relNode = newmap->mappings[i].mapfilenode;
844 RelationPreserveStorage(rnode, false);
845 }
846 }
847
848 /*
849 * Success, update permanent copy. During bootstrap, we might be working
850 * on the permanent copy itself, in which case skip the memcpy() to avoid
851 * invoking nominally-undefined behavior.
852 */
853 if (realmap != newmap)
854 memcpy(realmap, newmap, sizeof(RelMapFile));
855 else
856 Assert(!send_sinval); /* must be bootstrapping */
857
858 /* Critical section done */
859 if (write_wal)
860 END_CRIT_SECTION();
861 }
862
863 /*
864 * Merge the specified updates into the appropriate "real" map,
865 * and write out the changes. This function must be used for committing
866 * updates during normal multiuser operation.
867 */
868 static void
perform_relmap_update(bool shared,const RelMapFile * updates)869 perform_relmap_update(bool shared, const RelMapFile *updates)
870 {
871 RelMapFile newmap;
872
873 /*
874 * Anyone updating a relation's mapping info should take exclusive lock on
875 * that rel and hold it until commit. This ensures that there will not be
876 * concurrent updates on the same mapping value; but there could easily be
877 * concurrent updates on different values in the same file. We cover that
878 * by acquiring the RelationMappingLock, re-reading the target file to
879 * ensure it's up to date, applying the updates, and writing the data
880 * before releasing RelationMappingLock.
881 *
882 * There is only one RelationMappingLock. In principle we could try to
883 * have one per mapping file, but it seems unlikely to be worth the
884 * trouble.
885 */
886 LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
887
888 /* Be certain we see any other updates just made */
889 load_relmap_file(shared, true);
890
891 /* Prepare updated data in a local variable */
892 if (shared)
893 memcpy(&newmap, &shared_map, sizeof(RelMapFile));
894 else
895 memcpy(&newmap, &local_map, sizeof(RelMapFile));
896
897 /*
898 * Apply the updates to newmap. No new mappings should appear, unless
899 * somebody is adding indexes to system catalogs.
900 */
901 merge_map_updates(&newmap, updates, allowSystemTableMods);
902
903 /* Write out the updated map and do other necessary tasks */
904 write_relmap_file(shared, &newmap, true, true, true,
905 (shared ? InvalidOid : MyDatabaseId),
906 (shared ? GLOBALTABLESPACE_OID : MyDatabaseTableSpace),
907 DatabasePath);
908
909 /* Now we can release the lock */
910 LWLockRelease(RelationMappingLock);
911 }
912
913 /*
914 * RELMAP resource manager's routines
915 */
916 void
relmap_redo(XLogReaderState * record)917 relmap_redo(XLogReaderState *record)
918 {
919 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
920
921 /* Backup blocks are not used in relmap records */
922 Assert(!XLogRecHasAnyBlockRefs(record));
923
924 if (info == XLOG_RELMAP_UPDATE)
925 {
926 xl_relmap_update *xlrec = (xl_relmap_update *) XLogRecGetData(record);
927 RelMapFile newmap;
928 char *dbpath;
929
930 if (xlrec->nbytes != sizeof(RelMapFile))
931 elog(PANIC, "relmap_redo: wrong size %u in relmap update record",
932 xlrec->nbytes);
933 memcpy(&newmap, xlrec->data, sizeof(newmap));
934
935 /* We need to construct the pathname for this database */
936 dbpath = GetDatabasePath(xlrec->dbid, xlrec->tsid);
937
938 /*
939 * Write out the new map and send sinval, but of course don't write a
940 * new WAL entry. There's no surrounding transaction to tell to
941 * preserve files, either.
942 *
943 * There shouldn't be anyone else updating relmaps during WAL replay,
944 * but grab the lock to interlock against load_relmap_file().
945 */
946 LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
947 write_relmap_file((xlrec->dbid == InvalidOid), &newmap,
948 false, true, false,
949 xlrec->dbid, xlrec->tsid, dbpath);
950 LWLockRelease(RelationMappingLock);
951
952 pfree(dbpath);
953 }
954 else
955 elog(PANIC, "relmap_redo: unknown op code %u", info);
956 }
957