1 /*-------------------------------------------------------------------------
2 *
3 * cluster.c
4 * CLUSTER a table on an index. This is now also used for VACUUM FULL.
5 *
6 * There is hardly anything left of Paul Brown's original implementation...
7 *
8 *
9 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
10 * Portions Copyright (c) 1994-5, Regents of the University of California
11 *
12 *
13 * IDENTIFICATION
14 * src/backend/commands/cluster.c
15 *
16 *-------------------------------------------------------------------------
17 */
18 #include "postgres.h"
19
20 #include "access/amapi.h"
21 #include "access/heapam.h"
22 #include "access/multixact.h"
23 #include "access/relscan.h"
24 #include "access/tableam.h"
25 #include "access/toast_internals.h"
26 #include "access/transam.h"
27 #include "access/xact.h"
28 #include "access/xlog.h"
29 #include "catalog/catalog.h"
30 #include "catalog/dependency.h"
31 #include "catalog/heap.h"
32 #include "catalog/index.h"
33 #include "catalog/namespace.h"
34 #include "catalog/objectaccess.h"
35 #include "catalog/pg_am.h"
36 #include "catalog/toasting.h"
37 #include "commands/cluster.h"
38 #include "commands/progress.h"
39 #include "commands/tablecmds.h"
40 #include "commands/vacuum.h"
41 #include "miscadmin.h"
42 #include "optimizer/optimizer.h"
43 #include "pgstat.h"
44 #include "storage/bufmgr.h"
45 #include "storage/lmgr.h"
46 #include "storage/predicate.h"
47 #include "utils/acl.h"
48 #include "utils/fmgroids.h"
49 #include "utils/inval.h"
50 #include "utils/lsyscache.h"
51 #include "utils/memutils.h"
52 #include "utils/pg_rusage.h"
53 #include "utils/relmapper.h"
54 #include "utils/snapmgr.h"
55 #include "utils/syscache.h"
56 #include "utils/tuplesort.h"
57
58 /*
59 * This struct is used to pass around the information on tables to be
60 * clustered. We need this so we can make a list of them when invoked without
61 * a specific table/index pair.
62 */
typedef struct
{
    Oid         tableOid;       /* OID of the table to be clustered */
    Oid         indexOid;       /* OID of the index to cluster the table by */
} RelToCluster;
68
69
70 static void rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose);
71 static void copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
72 bool verbose, bool *pSwapToastByContent,
73 TransactionId *pFreezeXid, MultiXactId *pCutoffMulti);
74 static List *get_tables_to_cluster(MemoryContext cluster_context);
75
76
77 /*---------------------------------------------------------------------------
78 * This cluster code allows for clustering multiple tables at once. Because
79 * of this, we cannot just run everything on a single transaction, or we
80 * would be forced to acquire exclusive locks on all the tables being
81 * clustered, simultaneously --- very likely leading to deadlock.
82 *
83 * To solve this we follow a similar strategy to VACUUM code,
84 * clustering each relation in a separate transaction. For this to work,
85 * we need to:
86 * - provide a separate memory context so that we can pass information in
87 * a way that survives across transactions
88 * - start a new transaction every time a new relation is clustered
89 * - check for validity of the information on to-be-clustered relations,
90 * as someone might have deleted a relation behind our back, or
91 * clustered one on a different index
92 * - end the transaction
93 *
94 * The single-relation case does not have any such overhead.
95 *
96 * We also allow a relation to be specified without index. In that case,
97 * the indisclustered bit will be looked up, and an ERROR will be thrown
98 * if there is no index with the bit set.
99 *---------------------------------------------------------------------------
100 */
void
cluster(ClusterStmt *stmt, bool isTopLevel)
{
    if (stmt->relation != NULL)
    {
        /*
         * This is the single-relation case: CLUSTER <table> [USING <index>].
         * Everything runs in the current transaction.
         */
        Oid         tableOid,
                    indexOid = InvalidOid;
        Relation    rel;

        /* Find, lock, and check permissions on the table */
        tableOid = RangeVarGetRelidExtended(stmt->relation,
                                            AccessExclusiveLock,
                                            0,
                                            RangeVarCallbackOwnsTable, NULL);
        rel = table_open(tableOid, NoLock);

        /*
         * Reject clustering a remote temp table ... their local buffer
         * manager is not going to cope.
         */
        if (RELATION_IS_OTHER_TEMP(rel))
            ereport(ERROR,
                    (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                     errmsg("cannot cluster temporary tables of other sessions")));

        /*
         * Reject clustering a partitioned table.
         */
        if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
            ereport(ERROR,
                    (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                     errmsg("cannot cluster a partitioned table")));

        if (stmt->indexname == NULL)
        {
            ListCell   *index;

            /* We need to find the index that has indisclustered set. */
            foreach(index, RelationGetIndexList(rel))
            {
                indexOid = lfirst_oid(index);
                if (get_index_isclustered(indexOid))
                    break;
                indexOid = InvalidOid;  /* keep looking */
            }

            /* No index has ever been clustered on: nothing to re-cluster by */
            if (!OidIsValid(indexOid))
                ereport(ERROR,
                        (errcode(ERRCODE_UNDEFINED_OBJECT),
                         errmsg("there is no previously clustered index for table \"%s\"",
                                stmt->relation->relname)));
        }
        else
        {
            /*
             * The index is expected to be in the same namespace as the
             * relation.
             */
            indexOid = get_relname_relid(stmt->indexname,
                                         rel->rd_rel->relnamespace);
            if (!OidIsValid(indexOid))
                ereport(ERROR,
                        (errcode(ERRCODE_UNDEFINED_OBJECT),
                         errmsg("index \"%s\" for table \"%s\" does not exist",
                                stmt->indexname, stmt->relation->relname)));
        }

        /* close relation, keep lock till commit */
        table_close(rel, NoLock);

        /* Do the job. */
        cluster_rel(tableOid, indexOid, stmt->options);
    }
    else
    {
        /*
         * This is the "multi relation" case. We need to cluster all tables
         * that have some index with indisclustered set.
         */
        MemoryContext cluster_context;
        List       *rvs;
        ListCell   *rv;

        /*
         * We cannot run this form of CLUSTER inside a user transaction block;
         * we'd be holding locks way too long.
         */
        PreventInTransactionBlock(isTopLevel, "CLUSTER");

        /*
         * Create special memory context for cross-transaction storage.
         *
         * Since it is a child of PortalContext, it will go away even in case
         * of error.
         */
        cluster_context = AllocSetContextCreate(PortalContext,
                                                "Cluster",
                                                ALLOCSET_DEFAULT_SIZES);

        /*
         * Build the list of relations to cluster. Note that this lives in
         * cluster_context.
         */
        rvs = get_tables_to_cluster(cluster_context);

        /* Commit to get out of starting transaction */
        PopActiveSnapshot();
        CommitTransactionCommand();

        /* Ok, now that we've got them all, cluster them one by one */
        foreach(rv, rvs)
        {
            RelToCluster *rvtc = (RelToCluster *) lfirst(rv);

            /* Start a new transaction for each relation. */
            StartTransactionCommand();
            /* functions in indexes may want a snapshot set */
            PushActiveSnapshot(GetTransactionSnapshot());
            /*
             * Do the job.  CLUOPT_RECHECK makes cluster_rel re-verify that
             * the rel/index pair is still valid, since it was collected in a
             * previous transaction and may have changed behind our back.
             */
            cluster_rel(rvtc->tableOid, rvtc->indexOid,
                        stmt->options | CLUOPT_RECHECK);
            PopActiveSnapshot();
            CommitTransactionCommand();
        }

        /* Start a new transaction for the cleanup work. */
        StartTransactionCommand();

        /* Clean up working storage */
        MemoryContextDelete(cluster_context);
    }
}
234
235 /*
236 * cluster_rel
237 *
238 * This clusters the table by creating a new, clustered table and
239 * swapping the relfilenodes of the new table and the old table, so
240 * the OID of the original table is preserved. Thus we do not lose
241 * GRANT, inheritance nor references to this table (this was a bug
242 * in releases through 7.3).
243 *
244 * Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading
245 * the new table, it's better to create the indexes afterwards than to fill
246 * them incrementally while we load the table.
247 *
248 * If indexOid is InvalidOid, the table will be rewritten in physical order
249 * instead of index order. This is the new implementation of VACUUM FULL,
250 * and error messages should refer to the operation as VACUUM not CLUSTER.
251 */
void
cluster_rel(Oid tableOid, Oid indexOid, int options)
{
    Relation    OldHeap;
    bool        verbose = ((options & CLUOPT_VERBOSE) != 0);
    bool        recheck = ((options & CLUOPT_RECHECK) != 0);

    /* Check for user-requested abort. */
    CHECK_FOR_INTERRUPTS();

    /*
     * Report command progress: an invalid indexOid means this is really a
     * VACUUM FULL rewrite rather than a CLUSTER.
     */
    pgstat_progress_start_command(PROGRESS_COMMAND_CLUSTER, tableOid);
    if (OidIsValid(indexOid))
        pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
                                     PROGRESS_CLUSTER_COMMAND_CLUSTER);
    else
        pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
                                     PROGRESS_CLUSTER_COMMAND_VACUUM_FULL);

    /*
     * We grab exclusive access to the target rel and index for the duration
     * of the transaction. (This is redundant for the single-transaction
     * case, since cluster() already did it.) The index lock is taken inside
     * check_index_is_clusterable.
     */
    OldHeap = try_relation_open(tableOid, AccessExclusiveLock);

    /* If the table has gone away, we can skip processing it */
    if (!OldHeap)
    {
        pgstat_progress_end_command();
        return;
    }

    /*
     * Since we may open a new transaction for each relation, we have to check
     * that the relation still is what we think it is.
     *
     * If this is a single-transaction CLUSTER, we can skip these tests. We
     * *must* skip the one on indisclustered since it would reject an attempt
     * to cluster a not-previously-clustered index.
     */
    if (recheck)
    {
        /* Check that the user still owns the relation */
        if (!pg_class_ownercheck(tableOid, GetUserId()))
        {
            relation_close(OldHeap, AccessExclusiveLock);
            pgstat_progress_end_command();
            return;
        }

        /*
         * Silently skip a temp table for a remote session. Only doing this
         * check in the "recheck" case is appropriate (which currently means
         * somebody is executing a database-wide CLUSTER), because there is
         * another check in cluster() which will stop any attempt to cluster
         * remote temp tables by name. There is another check in cluster_rel
         * which is redundant, but we leave it for extra safety.
         */
        if (RELATION_IS_OTHER_TEMP(OldHeap))
        {
            relation_close(OldHeap, AccessExclusiveLock);
            pgstat_progress_end_command();
            return;
        }

        if (OidIsValid(indexOid))
        {
            /*
             * Check that the index still exists
             */
            if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid)))
            {
                relation_close(OldHeap, AccessExclusiveLock);
                pgstat_progress_end_command();
                return;
            }

            /*
             * Check that the index is still the one with indisclustered set.
             */
            if (!get_index_isclustered(indexOid))
            {
                relation_close(OldHeap, AccessExclusiveLock);
                pgstat_progress_end_command();
                return;
            }
        }
    }

    /*
     * We allow VACUUM FULL, but not CLUSTER, on shared catalogs. CLUSTER
     * would work in most respects, but the index would only get marked as
     * indisclustered in the current database, leading to unexpected behavior
     * if CLUSTER were later invoked in another database.
     */
    if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared)
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("cannot cluster a shared catalog")));

    /*
     * Don't process temp tables of other backends ... their local buffer
     * manager is not going to cope.  Phrase the error per the operation the
     * user actually requested.
     */
    if (RELATION_IS_OTHER_TEMP(OldHeap))
    {
        if (OidIsValid(indexOid))
            ereport(ERROR,
                    (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                     errmsg("cannot cluster temporary tables of other sessions")));
        else
            ereport(ERROR,
                    (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                     errmsg("cannot vacuum temporary tables of other sessions")));
    }

    /*
     * Also check for active uses of the relation in the current transaction,
     * including open scans and pending AFTER trigger events.
     */
    CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM");

    /* Check heap and index are valid to cluster on */
    if (OidIsValid(indexOid))
        check_index_is_clusterable(OldHeap, indexOid, recheck, AccessExclusiveLock);

    /*
     * Quietly ignore the request if this is a materialized view which has not
     * been populated from its query. No harm is done because there is no data
     * to deal with, and we don't want to throw an error if this is part of a
     * multi-relation request -- for example, CLUSTER was run on the entire
     * database.
     */
    if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW &&
        !RelationIsPopulated(OldHeap))
    {
        relation_close(OldHeap, AccessExclusiveLock);
        pgstat_progress_end_command();
        return;
    }

    /*
     * All predicate locks on the tuples or pages are about to be made
     * invalid, because we move tuples around. Promote them to relation
     * locks. Predicate locks on indexes will be promoted when they are
     * reindexed.
     */
    TransferPredicateLocksToHeapRelation(OldHeap);

    /* rebuild_relation does all the dirty work */
    rebuild_relation(OldHeap, indexOid, verbose);

    /* NB: rebuild_relation does table_close() on OldHeap */

    pgstat_progress_end_command();
}
409
410 /*
411 * Verify that the specified heap and index are valid to cluster on
412 *
413 * Side effect: obtains lock on the index. The caller may
414 * in some cases already have AccessExclusiveLock on the table, but
415 * not in all cases so we can't rely on the table-level lock for
416 * protection here.
417 */
418 void
check_index_is_clusterable(Relation OldHeap,Oid indexOid,bool recheck,LOCKMODE lockmode)419 check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck, LOCKMODE lockmode)
420 {
421 Relation OldIndex;
422
423 OldIndex = index_open(indexOid, lockmode);
424
425 /*
426 * Check that index is in fact an index on the given relation
427 */
428 if (OldIndex->rd_index == NULL ||
429 OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap))
430 ereport(ERROR,
431 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
432 errmsg("\"%s\" is not an index for table \"%s\"",
433 RelationGetRelationName(OldIndex),
434 RelationGetRelationName(OldHeap))));
435
436 /* Index AM must allow clustering */
437 if (!OldIndex->rd_indam->amclusterable)
438 ereport(ERROR,
439 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
440 errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
441 RelationGetRelationName(OldIndex))));
442
443 /*
444 * Disallow clustering on incomplete indexes (those that might not index
445 * every row of the relation). We could relax this by making a separate
446 * seqscan pass over the table to copy the missing rows, but that seems
447 * expensive and tedious.
448 */
449 if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred, NULL))
450 ereport(ERROR,
451 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
452 errmsg("cannot cluster on partial index \"%s\"",
453 RelationGetRelationName(OldIndex))));
454
455 /*
456 * Disallow if index is left over from a failed CREATE INDEX CONCURRENTLY;
457 * it might well not contain entries for every heap row, or might not even
458 * be internally consistent. (But note that we don't check indcheckxmin;
459 * the worst consequence of following broken HOT chains would be that we
460 * might put recently-dead tuples out-of-order in the new table, and there
461 * is little harm in that.)
462 */
463 if (!OldIndex->rd_index->indisvalid)
464 ereport(ERROR,
465 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
466 errmsg("cannot cluster on invalid index \"%s\"",
467 RelationGetRelationName(OldIndex))));
468
469 /* Drop relcache refcnt on OldIndex, but keep lock */
470 index_close(OldIndex, NoLock);
471 }
472
473 /*
474 * mark_index_clustered: mark the specified index as the one clustered on
475 *
476 * With indexOid == InvalidOid, will mark all indexes of rel not-clustered.
477 */
478 void
mark_index_clustered(Relation rel,Oid indexOid,bool is_internal)479 mark_index_clustered(Relation rel, Oid indexOid, bool is_internal)
480 {
481 HeapTuple indexTuple;
482 Form_pg_index indexForm;
483 Relation pg_index;
484 ListCell *index;
485
486 /* Disallow applying to a partitioned table */
487 if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
488 ereport(ERROR,
489 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
490 errmsg("cannot mark index clustered in partitioned table")));
491
492 /*
493 * If the index is already marked clustered, no need to do anything.
494 */
495 if (OidIsValid(indexOid))
496 {
497 if (get_index_isclustered(indexOid))
498 return;
499 }
500
501 /*
502 * Check each index of the relation and set/clear the bit as needed.
503 */
504 pg_index = table_open(IndexRelationId, RowExclusiveLock);
505
506 foreach(index, RelationGetIndexList(rel))
507 {
508 Oid thisIndexOid = lfirst_oid(index);
509
510 indexTuple = SearchSysCacheCopy1(INDEXRELID,
511 ObjectIdGetDatum(thisIndexOid));
512 if (!HeapTupleIsValid(indexTuple))
513 elog(ERROR, "cache lookup failed for index %u", thisIndexOid);
514 indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
515
516 /*
517 * Unset the bit if set. We know it's wrong because we checked this
518 * earlier.
519 */
520 if (indexForm->indisclustered)
521 {
522 indexForm->indisclustered = false;
523 CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
524 }
525 else if (thisIndexOid == indexOid)
526 {
527 /* this was checked earlier, but let's be real sure */
528 if (!indexForm->indisvalid)
529 elog(ERROR, "cannot cluster on invalid index %u", indexOid);
530 indexForm->indisclustered = true;
531 CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
532 }
533
534 InvokeObjectPostAlterHookArg(IndexRelationId, thisIndexOid, 0,
535 InvalidOid, is_internal);
536
537 heap_freetuple(indexTuple);
538 }
539
540 table_close(pg_index, RowExclusiveLock);
541 }
542
543 /*
544 * rebuild_relation: rebuild an existing relation in index or physical order
545 *
546 * OldHeap: table to rebuild --- must be opened and exclusive-locked!
547 * indexOid: index to cluster by, or InvalidOid to rewrite in physical order.
548 *
549 * NB: this routine closes OldHeap at the right time; caller should not.
550 */
551 static void
rebuild_relation(Relation OldHeap,Oid indexOid,bool verbose)552 rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose)
553 {
554 Oid tableOid = RelationGetRelid(OldHeap);
555 Oid tableSpace = OldHeap->rd_rel->reltablespace;
556 Oid OIDNewHeap;
557 char relpersistence;
558 bool is_system_catalog;
559 bool swap_toast_by_content;
560 TransactionId frozenXid;
561 MultiXactId cutoffMulti;
562
563 /* Mark the correct index as clustered */
564 if (OidIsValid(indexOid))
565 mark_index_clustered(OldHeap, indexOid, true);
566
567 /* Remember info about rel before closing OldHeap */
568 relpersistence = OldHeap->rd_rel->relpersistence;
569 is_system_catalog = IsSystemRelation(OldHeap);
570
571 /* Close relcache entry, but keep lock until transaction commit */
572 table_close(OldHeap, NoLock);
573
574 /* Create the transient table that will receive the re-ordered data */
575 OIDNewHeap = make_new_heap(tableOid, tableSpace,
576 relpersistence,
577 AccessExclusiveLock);
578
579 /* Copy the heap data into the new table in the desired order */
580 copy_table_data(OIDNewHeap, tableOid, indexOid, verbose,
581 &swap_toast_by_content, &frozenXid, &cutoffMulti);
582
583 /*
584 * Swap the physical files of the target and transient tables, then
585 * rebuild the target's indexes and throw away the transient table.
586 */
587 finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog,
588 swap_toast_by_content, false, true,
589 frozenXid, cutoffMulti,
590 relpersistence);
591 }
592
593
594 /*
595 * Create the transient table that will be filled with new data during
596 * CLUSTER, ALTER TABLE, and similar operations. The transient table
597 * duplicates the logical structure of the OldHeap, but is placed in
598 * NewTableSpace which might be different from OldHeap's. Also, it's built
599 * with the specified persistence, which might differ from the original's.
600 *
601 * After this, the caller should load the new heap with transferred/modified
602 * data, then call finish_heap_swap to complete the operation.
603 */
Oid
make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, char relpersistence,
              LOCKMODE lockmode)
{
    TupleDesc   OldHeapDesc;
    char        NewHeapName[NAMEDATALEN];
    Oid         OIDNewHeap;
    Oid         toastid;
    Relation    OldHeap;
    HeapTuple   tuple;
    Datum       reloptions;
    bool        isNull;
    Oid         namespaceid;

    OldHeap = table_open(OIDOldHeap, lockmode);
    OldHeapDesc = RelationGetDescr(OldHeap);

    /*
     * Note that the NewHeap will not receive any of the defaults or
     * constraints associated with the OldHeap; we don't need 'em, and there's
     * no reason to spend cycles inserting them into the catalogs only to
     * delete them.
     */

    /*
     * But we do want to use reloptions of the old heap for new heap.
     */
    tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(OIDOldHeap));
    if (!HeapTupleIsValid(tuple))
        elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
    reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
                                 &isNull);
    if (isNull)
        reloptions = (Datum) 0;

    /*
     * A temp rewrite target must live in the pg_temp namespace; otherwise
     * keep the original table's namespace (see comment below on why).
     */
    if (relpersistence == RELPERSISTENCE_TEMP)
        namespaceid = LookupCreationNamespace("pg_temp");
    else
        namespaceid = RelationGetNamespace(OldHeap);

    /*
     * Create the new heap, using a temporary name in the same namespace as
     * the existing table. NOTE: there is some risk of collision with user
     * relnames. Working around this seems more trouble than it's worth; in
     * particular, we can't create the new heap in a different namespace from
     * the old, or we will have problems with the TEMP status of temp tables.
     *
     * Note: the new heap is not a shared relation, even if we are rebuilding
     * a shared rel. However, we do make the new heap mapped if the source is
     * mapped. This simplifies swap_relation_files, and is absolutely
     * necessary for rebuilding pg_class, for reasons explained there.
     */
    snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap);

    OIDNewHeap = heap_create_with_catalog(NewHeapName,
                                          namespaceid,
                                          NewTableSpace,
                                          InvalidOid,   /* relid: let it be assigned */
                                          InvalidOid,   /* reltypeid */
                                          InvalidOid,   /* reloftypeid */
                                          OldHeap->rd_rel->relowner,
                                          OldHeap->rd_rel->relam,
                                          OldHeapDesc,
                                          NIL,          /* no cooked constraints */
                                          RELKIND_RELATION,
                                          relpersistence,
                                          false,        /* not a shared relation */
                                          RelationIsMapped(OldHeap),    /* mapped iff source is */
                                          ONCOMMIT_NOOP,
                                          reloptions,
                                          false,        /* don't apply user ACLs */
                                          true,         /* allow_system_table_mods */
                                          true,         /* is_internal */
                                          OIDOldHeap,   /* relrewrite: link back to source */
                                          NULL);        /* typaddress not wanted */
    Assert(OIDNewHeap != InvalidOid);

    ReleaseSysCache(tuple);

    /*
     * Advance command counter so that the newly-created relation's catalog
     * tuples will be visible to table_open.
     */
    CommandCounterIncrement();

    /*
     * If necessary, create a TOAST table for the new relation.
     *
     * If the relation doesn't have a TOAST table already, we can't need one
     * for the new relation. The other way around is possible though: if some
     * wide columns have been dropped, NewHeapCreateToastTable can decide that
     * no TOAST table is needed for the new table.
     *
     * Note that NewHeapCreateToastTable ends with CommandCounterIncrement, so
     * that the TOAST table will be visible for insertion.
     */
    toastid = OldHeap->rd_rel->reltoastrelid;
    if (OidIsValid(toastid))
    {
        /* keep the existing toast table's reloptions, if any */
        tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(toastid));
        if (!HeapTupleIsValid(tuple))
            elog(ERROR, "cache lookup failed for relation %u", toastid);
        reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
                                     &isNull);
        if (isNull)
            reloptions = (Datum) 0;

        NewHeapCreateToastTable(OIDNewHeap, reloptions, lockmode, toastid);

        ReleaseSysCache(tuple);
    }

    table_close(OldHeap, NoLock);

    return OIDNewHeap;
}
721
722 /*
723 * Do the physical copying of table data.
724 *
725 * There are three output parameters:
726 * *pSwapToastByContent is set true if toast tables must be swapped by content.
727 * *pFreezeXid receives the TransactionId used as freeze cutoff point.
728 * *pCutoffMulti receives the MultiXactId used as a cutoff point.
729 */
static void
copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
                bool *pSwapToastByContent, TransactionId *pFreezeXid,
                MultiXactId *pCutoffMulti)
{
    Relation    NewHeap,
                OldHeap,
                OldIndex;
    Relation    relRelation;
    HeapTuple   reltup;
    Form_pg_class relform;
    TupleDesc   oldTupDesc PG_USED_FOR_ASSERTS_ONLY;
    TupleDesc   newTupDesc PG_USED_FOR_ASSERTS_ONLY;
    TransactionId OldestXmin;
    TransactionId FreezeXid;
    MultiXactId MultiXactCutoff;
    bool        use_sort;
    double      num_tuples = 0,
                tups_vacuumed = 0,
                tups_recently_dead = 0;
    BlockNumber num_pages;
    int         elevel = verbose ? INFO : DEBUG2;
    PGRUsage    ru0;

    pg_rusage_init(&ru0);

    /*
     * Open the relations we need.  An invalid OIDOldIndex means we're doing
     * a physical-order rewrite (VACUUM FULL) rather than an index scan.
     */
    NewHeap = table_open(OIDNewHeap, AccessExclusiveLock);
    OldHeap = table_open(OIDOldHeap, AccessExclusiveLock);
    if (OidIsValid(OIDOldIndex))
        OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
    else
        OldIndex = NULL;

    /*
     * Their tuple descriptors should be exactly alike, but here we only need
     * assume that they have the same number of columns.
     */
    oldTupDesc = RelationGetDescr(OldHeap);
    newTupDesc = RelationGetDescr(NewHeap);
    Assert(newTupDesc->natts == oldTupDesc->natts);

    /*
     * If the OldHeap has a toast table, get lock on the toast table to keep
     * it from being vacuumed. This is needed because autovacuum processes
     * toast tables independently of their main tables, with no lock on the
     * latter. If an autovacuum were to start on the toast table after we
     * compute our OldestXmin below, it would use a later OldestXmin, and then
     * possibly remove as DEAD toast tuples belonging to main tuples we think
     * are only RECENTLY_DEAD. Then we'd fail while trying to copy those
     * tuples.
     *
     * We don't need to open the toast relation here, just lock it. The lock
     * will be held till end of transaction.
     */
    if (OldHeap->rd_rel->reltoastrelid)
        LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);

    /*
     * If both tables have TOAST tables, perform toast swap by content. It is
     * possible that the old table has a toast table but the new one doesn't,
     * if toastable columns have been dropped. In that case we have to do
     * swap by links. This is okay because swap by content is only essential
     * for system catalogs, and we don't support schema changes for them.
     */
    if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid)
    {
        *pSwapToastByContent = true;

        /*
         * When doing swap by content, any toast pointers written into NewHeap
         * must use the old toast table's OID, because that's where the toast
         * data will eventually be found. Set this up by setting rd_toastoid.
         * This also tells toast_save_datum() to preserve the toast value
         * OIDs, which we want so as not to invalidate toast pointers in
         * system catalog caches, and to avoid making multiple copies of a
         * single toast value.
         *
         * Note that we must hold NewHeap open until we are done writing data,
         * since the relcache will not guarantee to remember this setting once
         * the relation is closed. Also, this technique depends on the fact
         * that no one will try to read from the NewHeap until after we've
         * finished writing it and swapping the rels --- otherwise they could
         * follow the toast pointers to the wrong place. (It would actually
         * work for values copied over from the old toast table, but not for
         * any values that we toast which were previously not toasted.)
         */
        NewHeap->rd_toastoid = OldHeap->rd_rel->reltoastrelid;
    }
    else
        *pSwapToastByContent = false;

    /*
     * Compute xids used to freeze and weed out dead tuples and multixacts.
     * Since we're going to rewrite the whole table anyway, there's no reason
     * not to be aggressive about this.
     */
    vacuum_set_xid_limits(OldHeap, 0, 0, 0, 0,
                          &OldestXmin, &FreezeXid, NULL, &MultiXactCutoff,
                          NULL);

    /*
     * FreezeXid will become the table's new relfrozenxid, and that mustn't go
     * backwards, so take the max.
     */
    if (TransactionIdIsValid(OldHeap->rd_rel->relfrozenxid) &&
        TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid))
        FreezeXid = OldHeap->rd_rel->relfrozenxid;

    /*
     * MultiXactCutoff, similarly, shouldn't go backwards either.
     */
    if (MultiXactIdIsValid(OldHeap->rd_rel->relminmxid) &&
        MultiXactIdPrecedes(MultiXactCutoff, OldHeap->rd_rel->relminmxid))
        MultiXactCutoff = OldHeap->rd_rel->relminmxid;

    /*
     * Decide whether to use an indexscan or seqscan-and-optional-sort to scan
     * the OldHeap. We know how to use a sort to duplicate the ordering of a
     * btree index, and will use seqscan-and-sort for that case if the planner
     * tells us it's cheaper. Otherwise, always indexscan if an index is
     * provided, else plain seqscan.
     */
    if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID)
        use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex);
    else
        use_sort = false;

    /* Log what we're doing */
    if (OldIndex != NULL && !use_sort)
        ereport(elevel,
                (errmsg("clustering \"%s.%s\" using index scan on \"%s\"",
                        get_namespace_name(RelationGetNamespace(OldHeap)),
                        RelationGetRelationName(OldHeap),
                        RelationGetRelationName(OldIndex))));
    else if (use_sort)
        ereport(elevel,
                (errmsg("clustering \"%s.%s\" using sequential scan and sort",
                        get_namespace_name(RelationGetNamespace(OldHeap)),
                        RelationGetRelationName(OldHeap))));
    else
        ereport(elevel,
                (errmsg("vacuuming \"%s.%s\"",
                        get_namespace_name(RelationGetNamespace(OldHeap)),
                        RelationGetRelationName(OldHeap))));

    /*
     * Hand off the actual copying to AM specific function, the generic code
     * cannot know how to deal with visibility across AMs. Note that this
     * routine is allowed to set FreezeXid / MultiXactCutoff to different
     * values (e.g. because the AM doesn't use freezing).
     */
    table_relation_copy_for_cluster(OldHeap, NewHeap, OldIndex, use_sort,
                                    OldestXmin, &FreezeXid, &MultiXactCutoff,
                                    &num_tuples, &tups_vacuumed,
                                    &tups_recently_dead);

    /* return selected values to caller, get set as relfrozenxid/minmxid */
    *pFreezeXid = FreezeXid;
    *pCutoffMulti = MultiXactCutoff;

    /* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */
    NewHeap->rd_toastoid = InvalidOid;

    num_pages = RelationGetNumberOfBlocks(NewHeap);

    /* Log what we did */
    ereport(elevel,
            (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
                    RelationGetRelationName(OldHeap),
                    tups_vacuumed, num_tuples,
                    RelationGetNumberOfBlocks(OldHeap)),
             errdetail("%.0f dead row versions cannot be removed yet.\n"
                       "%s.",
                       tups_recently_dead,
                       pg_rusage_show(&ru0))));

    if (OldIndex != NULL)
        index_close(OldIndex, NoLock);
    table_close(OldHeap, NoLock);
    table_close(NewHeap, NoLock);

    /* Update pg_class to reflect the correct values of pages and tuples. */
    relRelation = table_open(RelationRelationId, RowExclusiveLock);

    reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDNewHeap));
    if (!HeapTupleIsValid(reltup))
        elog(ERROR, "cache lookup failed for relation %u", OIDNewHeap);
    relform = (Form_pg_class) GETSTRUCT(reltup);

    relform->relpages = num_pages;
    relform->reltuples = num_tuples;

    /* Don't update the stats for pg_class. See swap_relation_files. */
    if (OIDOldHeap != RelationRelationId)
        CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
    else
        CacheInvalidateRelcacheByTuple(reltup);

    /* Clean up. */
    heap_freetuple(reltup);
    table_close(relRelation, RowExclusiveLock);

    /* Make the update visible */
    CommandCounterIncrement();
}
938
939 /*
940 * Swap the physical files of two given relations.
941 *
942 * We swap the physical identity (reltablespace, relfilenode) while keeping the
943 * same logical identities of the two relations. relpersistence is also
944 * swapped, which is critical since it determines where buffers live for each
945 * relation.
946 *
947 * We can swap associated TOAST data in either of two ways: recursively swap
948 * the physical content of the toast tables (and their indexes), or swap the
949 * TOAST links in the given relations' pg_class entries. The former is needed
950 * to manage rewrites of shared catalogs (where we cannot change the pg_class
951 * links) while the latter is the only way to handle cases in which a toast
952 * table is added or removed altogether.
953 *
954 * Additionally, the first relation is marked with relfrozenxid set to
955 * frozenXid. It seems a bit ugly to have this here, but the caller would
956 * have to do it anyway, so having it here saves a heap_update. Note: in
957 * the swap-toast-links case, we assume we don't need to change the toast
958 * table's relfrozenxid: the new version of the toast table should already
959 * have relfrozenxid set to RecentXmin, which is good enough.
960 *
961 * Lastly, if r2 and its toast table and toast index (if any) are mapped,
962 * their OIDs are emitted into mapped_tables[]. This is hacky but beats
963 * having to look the information up again later in finish_heap_swap.
964 */
965 static void
swap_relation_files(Oid r1,Oid r2,bool target_is_pg_class,bool swap_toast_by_content,bool is_internal,TransactionId frozenXid,MultiXactId cutoffMulti,Oid * mapped_tables)966 swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
967 bool swap_toast_by_content,
968 bool is_internal,
969 TransactionId frozenXid,
970 MultiXactId cutoffMulti,
971 Oid *mapped_tables)
972 {
973 Relation relRelation;
974 HeapTuple reltup1,
975 reltup2;
976 Form_pg_class relform1,
977 relform2;
978 Oid relfilenode1,
979 relfilenode2;
980 Oid swaptemp;
981 char swptmpchr;
982
983 /* We need writable copies of both pg_class tuples. */
984 relRelation = table_open(RelationRelationId, RowExclusiveLock);
985
986 reltup1 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r1));
987 if (!HeapTupleIsValid(reltup1))
988 elog(ERROR, "cache lookup failed for relation %u", r1);
989 relform1 = (Form_pg_class) GETSTRUCT(reltup1);
990
991 reltup2 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r2));
992 if (!HeapTupleIsValid(reltup2))
993 elog(ERROR, "cache lookup failed for relation %u", r2);
994 relform2 = (Form_pg_class) GETSTRUCT(reltup2);
995
996 relfilenode1 = relform1->relfilenode;
997 relfilenode2 = relform2->relfilenode;
998
999 if (OidIsValid(relfilenode1) && OidIsValid(relfilenode2))
1000 {
1001 /*
1002 * Normal non-mapped relations: swap relfilenodes, reltablespaces,
1003 * relpersistence
1004 */
1005 Assert(!target_is_pg_class);
1006
1007 swaptemp = relform1->relfilenode;
1008 relform1->relfilenode = relform2->relfilenode;
1009 relform2->relfilenode = swaptemp;
1010
1011 swaptemp = relform1->reltablespace;
1012 relform1->reltablespace = relform2->reltablespace;
1013 relform2->reltablespace = swaptemp;
1014
1015 swptmpchr = relform1->relpersistence;
1016 relform1->relpersistence = relform2->relpersistence;
1017 relform2->relpersistence = swptmpchr;
1018
1019 /* Also swap toast links, if we're swapping by links */
1020 if (!swap_toast_by_content)
1021 {
1022 swaptemp = relform1->reltoastrelid;
1023 relform1->reltoastrelid = relform2->reltoastrelid;
1024 relform2->reltoastrelid = swaptemp;
1025 }
1026 }
1027 else
1028 {
1029 /*
1030 * Mapped-relation case. Here we have to swap the relation mappings
1031 * instead of modifying the pg_class columns. Both must be mapped.
1032 */
1033 if (OidIsValid(relfilenode1) || OidIsValid(relfilenode2))
1034 elog(ERROR, "cannot swap mapped relation \"%s\" with non-mapped relation",
1035 NameStr(relform1->relname));
1036
1037 /*
1038 * We can't change the tablespace nor persistence of a mapped rel, and
1039 * we can't handle toast link swapping for one either, because we must
1040 * not apply any critical changes to its pg_class row. These cases
1041 * should be prevented by upstream permissions tests, so these checks
1042 * are non-user-facing emergency backstop.
1043 */
1044 if (relform1->reltablespace != relform2->reltablespace)
1045 elog(ERROR, "cannot change tablespace of mapped relation \"%s\"",
1046 NameStr(relform1->relname));
1047 if (relform1->relpersistence != relform2->relpersistence)
1048 elog(ERROR, "cannot change persistence of mapped relation \"%s\"",
1049 NameStr(relform1->relname));
1050 if (!swap_toast_by_content &&
1051 (relform1->reltoastrelid || relform2->reltoastrelid))
1052 elog(ERROR, "cannot swap toast by links for mapped relation \"%s\"",
1053 NameStr(relform1->relname));
1054
1055 /*
1056 * Fetch the mappings --- shouldn't fail, but be paranoid
1057 */
1058 relfilenode1 = RelationMapOidToFilenode(r1, relform1->relisshared);
1059 if (!OidIsValid(relfilenode1))
1060 elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
1061 NameStr(relform1->relname), r1);
1062 relfilenode2 = RelationMapOidToFilenode(r2, relform2->relisshared);
1063 if (!OidIsValid(relfilenode2))
1064 elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
1065 NameStr(relform2->relname), r2);
1066
1067 /*
1068 * Send replacement mappings to relmapper. Note these won't actually
1069 * take effect until CommandCounterIncrement.
1070 */
1071 RelationMapUpdateMap(r1, relfilenode2, relform1->relisshared, false);
1072 RelationMapUpdateMap(r2, relfilenode1, relform2->relisshared, false);
1073
1074 /* Pass OIDs of mapped r2 tables back to caller */
1075 *mapped_tables++ = r2;
1076 }
1077
1078 /*
1079 * Recognize that rel1's relfilenode (swapped from rel2) is new in this
1080 * subtransaction. The rel2 storage (swapped from rel1) may or may not be
1081 * new.
1082 */
1083 {
1084 Relation rel1,
1085 rel2;
1086
1087 rel1 = relation_open(r1, NoLock);
1088 rel2 = relation_open(r2, NoLock);
1089 rel2->rd_createSubid = rel1->rd_createSubid;
1090 rel2->rd_newRelfilenodeSubid = rel1->rd_newRelfilenodeSubid;
1091 rel2->rd_firstRelfilenodeSubid = rel1->rd_firstRelfilenodeSubid;
1092 RelationAssumeNewRelfilenode(rel1);
1093 relation_close(rel1, NoLock);
1094 relation_close(rel2, NoLock);
1095 }
1096
1097 /*
1098 * In the case of a shared catalog, these next few steps will only affect
1099 * our own database's pg_class row; but that's okay, because they are all
1100 * noncritical updates. That's also an important fact for the case of a
1101 * mapped catalog, because it's possible that we'll commit the map change
1102 * and then fail to commit the pg_class update.
1103 */
1104
1105 /* set rel1's frozen Xid and minimum MultiXid */
1106 if (relform1->relkind != RELKIND_INDEX)
1107 {
1108 Assert(!TransactionIdIsValid(frozenXid) ||
1109 TransactionIdIsNormal(frozenXid));
1110 relform1->relfrozenxid = frozenXid;
1111 relform1->relminmxid = cutoffMulti;
1112 }
1113
1114 /* swap size statistics too, since new rel has freshly-updated stats */
1115 {
1116 int32 swap_pages;
1117 float4 swap_tuples;
1118 int32 swap_allvisible;
1119
1120 swap_pages = relform1->relpages;
1121 relform1->relpages = relform2->relpages;
1122 relform2->relpages = swap_pages;
1123
1124 swap_tuples = relform1->reltuples;
1125 relform1->reltuples = relform2->reltuples;
1126 relform2->reltuples = swap_tuples;
1127
1128 swap_allvisible = relform1->relallvisible;
1129 relform1->relallvisible = relform2->relallvisible;
1130 relform2->relallvisible = swap_allvisible;
1131 }
1132
1133 /*
1134 * Update the tuples in pg_class --- unless the target relation of the
1135 * swap is pg_class itself. In that case, there is zero point in making
1136 * changes because we'd be updating the old data that we're about to throw
1137 * away. Because the real work being done here for a mapped relation is
1138 * just to change the relation map settings, it's all right to not update
1139 * the pg_class rows in this case. The most important changes will instead
1140 * performed later, in finish_heap_swap() itself.
1141 */
1142 if (!target_is_pg_class)
1143 {
1144 CatalogIndexState indstate;
1145
1146 indstate = CatalogOpenIndexes(relRelation);
1147 CatalogTupleUpdateWithInfo(relRelation, &reltup1->t_self, reltup1,
1148 indstate);
1149 CatalogTupleUpdateWithInfo(relRelation, &reltup2->t_self, reltup2,
1150 indstate);
1151 CatalogCloseIndexes(indstate);
1152 }
1153 else
1154 {
1155 /* no update ... but we do still need relcache inval */
1156 CacheInvalidateRelcacheByTuple(reltup1);
1157 CacheInvalidateRelcacheByTuple(reltup2);
1158 }
1159
1160 /*
1161 * Post alter hook for modified relations. The change to r2 is always
1162 * internal, but r1 depends on the invocation context.
1163 */
1164 InvokeObjectPostAlterHookArg(RelationRelationId, r1, 0,
1165 InvalidOid, is_internal);
1166 InvokeObjectPostAlterHookArg(RelationRelationId, r2, 0,
1167 InvalidOid, true);
1168
1169 /*
1170 * If we have toast tables associated with the relations being swapped,
1171 * deal with them too.
1172 */
1173 if (relform1->reltoastrelid || relform2->reltoastrelid)
1174 {
1175 if (swap_toast_by_content)
1176 {
1177 if (relform1->reltoastrelid && relform2->reltoastrelid)
1178 {
1179 /* Recursively swap the contents of the toast tables */
1180 swap_relation_files(relform1->reltoastrelid,
1181 relform2->reltoastrelid,
1182 target_is_pg_class,
1183 swap_toast_by_content,
1184 is_internal,
1185 frozenXid,
1186 cutoffMulti,
1187 mapped_tables);
1188 }
1189 else
1190 {
1191 /* caller messed up */
1192 elog(ERROR, "cannot swap toast files by content when there's only one");
1193 }
1194 }
1195 else
1196 {
1197 /*
1198 * We swapped the ownership links, so we need to change dependency
1199 * data to match.
1200 *
1201 * NOTE: it is possible that only one table has a toast table.
1202 *
1203 * NOTE: at present, a TOAST table's only dependency is the one on
1204 * its owning table. If more are ever created, we'd need to use
1205 * something more selective than deleteDependencyRecordsFor() to
1206 * get rid of just the link we want.
1207 */
1208 ObjectAddress baseobject,
1209 toastobject;
1210 long count;
1211
1212 /*
1213 * We disallow this case for system catalogs, to avoid the
1214 * possibility that the catalog we're rebuilding is one of the
1215 * ones the dependency changes would change. It's too late to be
1216 * making any data changes to the target catalog.
1217 */
1218 if (IsSystemClass(r1, relform1))
1219 elog(ERROR, "cannot swap toast files by links for system catalogs");
1220
1221 /* Delete old dependencies */
1222 if (relform1->reltoastrelid)
1223 {
1224 count = deleteDependencyRecordsFor(RelationRelationId,
1225 relform1->reltoastrelid,
1226 false);
1227 if (count != 1)
1228 elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1229 count);
1230 }
1231 if (relform2->reltoastrelid)
1232 {
1233 count = deleteDependencyRecordsFor(RelationRelationId,
1234 relform2->reltoastrelid,
1235 false);
1236 if (count != 1)
1237 elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1238 count);
1239 }
1240
1241 /* Register new dependencies */
1242 baseobject.classId = RelationRelationId;
1243 baseobject.objectSubId = 0;
1244 toastobject.classId = RelationRelationId;
1245 toastobject.objectSubId = 0;
1246
1247 if (relform1->reltoastrelid)
1248 {
1249 baseobject.objectId = r1;
1250 toastobject.objectId = relform1->reltoastrelid;
1251 recordDependencyOn(&toastobject, &baseobject,
1252 DEPENDENCY_INTERNAL);
1253 }
1254
1255 if (relform2->reltoastrelid)
1256 {
1257 baseobject.objectId = r2;
1258 toastobject.objectId = relform2->reltoastrelid;
1259 recordDependencyOn(&toastobject, &baseobject,
1260 DEPENDENCY_INTERNAL);
1261 }
1262 }
1263 }
1264
1265 /*
1266 * If we're swapping two toast tables by content, do the same for their
1267 * valid index. The swap can actually be safely done only if the relations
1268 * have indexes.
1269 */
1270 if (swap_toast_by_content &&
1271 relform1->relkind == RELKIND_TOASTVALUE &&
1272 relform2->relkind == RELKIND_TOASTVALUE)
1273 {
1274 Oid toastIndex1,
1275 toastIndex2;
1276
1277 /* Get valid index for each relation */
1278 toastIndex1 = toast_get_valid_index(r1,
1279 AccessExclusiveLock);
1280 toastIndex2 = toast_get_valid_index(r2,
1281 AccessExclusiveLock);
1282
1283 swap_relation_files(toastIndex1,
1284 toastIndex2,
1285 target_is_pg_class,
1286 swap_toast_by_content,
1287 is_internal,
1288 InvalidTransactionId,
1289 InvalidMultiXactId,
1290 mapped_tables);
1291 }
1292
1293 /* Clean up. */
1294 heap_freetuple(reltup1);
1295 heap_freetuple(reltup2);
1296
1297 table_close(relRelation, RowExclusiveLock);
1298
1299 /*
1300 * Close both relcache entries' smgr links. We need this kluge because
1301 * both links will be invalidated during upcoming CommandCounterIncrement.
1302 * Whichever of the rels is the second to be cleared will have a dangling
1303 * reference to the other's smgr entry. Rather than trying to avoid this
1304 * by ordering operations just so, it's easiest to close the links first.
1305 * (Fortunately, since one of the entries is local in our transaction,
1306 * it's sufficient to clear out our own relcache this way; the problem
1307 * cannot arise for other backends when they see our update on the
1308 * non-transient relation.)
1309 *
1310 * Caution: the placement of this step interacts with the decision to
1311 * handle toast rels by recursion. When we are trying to rebuild pg_class
1312 * itself, the smgr close on pg_class must happen after all accesses in
1313 * this function.
1314 */
1315 RelationCloseSmgrByOid(r1);
1316 RelationCloseSmgrByOid(r2);
1317 }
1318
1319 /*
1320 * Remove the transient table that was built by make_new_heap, and finish
1321 * cleaning up (including rebuilding all indexes on the old heap).
1322 */
1323 void
finish_heap_swap(Oid OIDOldHeap,Oid OIDNewHeap,bool is_system_catalog,bool swap_toast_by_content,bool check_constraints,bool is_internal,TransactionId frozenXid,MultiXactId cutoffMulti,char newrelpersistence)1324 finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
1325 bool is_system_catalog,
1326 bool swap_toast_by_content,
1327 bool check_constraints,
1328 bool is_internal,
1329 TransactionId frozenXid,
1330 MultiXactId cutoffMulti,
1331 char newrelpersistence)
1332 {
1333 ObjectAddress object;
1334 Oid mapped_tables[4];
1335 int reindex_flags;
1336 int i;
1337
1338 /* Report that we are now swapping relation files */
1339 pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1340 PROGRESS_CLUSTER_PHASE_SWAP_REL_FILES);
1341
1342 /* Zero out possible results from swapped_relation_files */
1343 memset(mapped_tables, 0, sizeof(mapped_tables));
1344
1345 /*
1346 * Swap the contents of the heap relations (including any toast tables).
1347 * Also set old heap's relfrozenxid to frozenXid.
1348 */
1349 swap_relation_files(OIDOldHeap, OIDNewHeap,
1350 (OIDOldHeap == RelationRelationId),
1351 swap_toast_by_content, is_internal,
1352 frozenXid, cutoffMulti, mapped_tables);
1353
1354 /*
1355 * If it's a system catalog, queue a sinval message to flush all catcaches
1356 * on the catalog when we reach CommandCounterIncrement.
1357 */
1358 if (is_system_catalog)
1359 CacheInvalidateCatalog(OIDOldHeap);
1360
1361 /*
1362 * Rebuild each index on the relation (but not the toast table, which is
1363 * all-new at this point). It is important to do this before the DROP
1364 * step because if we are processing a system catalog that will be used
1365 * during DROP, we want to have its indexes available. There is no
1366 * advantage to the other order anyway because this is all transactional,
1367 * so no chance to reclaim disk space before commit. We do not need a
1368 * final CommandCounterIncrement() because reindex_relation does it.
1369 *
1370 * Note: because index_build is called via reindex_relation, it will never
1371 * set indcheckxmin true for the indexes. This is OK even though in some
1372 * sense we are building new indexes rather than rebuilding existing ones,
1373 * because the new heap won't contain any HOT chains at all, let alone
1374 * broken ones, so it can't be necessary to set indcheckxmin.
1375 */
1376 reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE;
1377 if (check_constraints)
1378 reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS;
1379
1380 /*
1381 * Ensure that the indexes have the same persistence as the parent
1382 * relation.
1383 */
1384 if (newrelpersistence == RELPERSISTENCE_UNLOGGED)
1385 reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED;
1386 else if (newrelpersistence == RELPERSISTENCE_PERMANENT)
1387 reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT;
1388
1389 /* Report that we are now reindexing relations */
1390 pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1391 PROGRESS_CLUSTER_PHASE_REBUILD_INDEX);
1392
1393 reindex_relation(OIDOldHeap, reindex_flags, 0);
1394
1395 /* Report that we are now doing clean up */
1396 pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1397 PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP);
1398
1399 /*
1400 * If the relation being rebuild is pg_class, swap_relation_files()
1401 * couldn't update pg_class's own pg_class entry (check comments in
1402 * swap_relation_files()), thus relfrozenxid was not updated. That's
1403 * annoying because a potential reason for doing a VACUUM FULL is a
1404 * imminent or actual anti-wraparound shutdown. So, now that we can
1405 * access the new relation using its indices, update relfrozenxid.
1406 * pg_class doesn't have a toast relation, so we don't need to update the
1407 * corresponding toast relation. Not that there's little point moving all
1408 * relfrozenxid updates here since swap_relation_files() needs to write to
1409 * pg_class for non-mapped relations anyway.
1410 */
1411 if (OIDOldHeap == RelationRelationId)
1412 {
1413 Relation relRelation;
1414 HeapTuple reltup;
1415 Form_pg_class relform;
1416
1417 relRelation = table_open(RelationRelationId, RowExclusiveLock);
1418
1419 reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDOldHeap));
1420 if (!HeapTupleIsValid(reltup))
1421 elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
1422 relform = (Form_pg_class) GETSTRUCT(reltup);
1423
1424 relform->relfrozenxid = frozenXid;
1425 relform->relminmxid = cutoffMulti;
1426
1427 CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
1428
1429 table_close(relRelation, RowExclusiveLock);
1430 }
1431
1432 /* Destroy new heap with old filenode */
1433 object.classId = RelationRelationId;
1434 object.objectId = OIDNewHeap;
1435 object.objectSubId = 0;
1436
1437 /*
1438 * The new relation is local to our transaction and we know nothing
1439 * depends on it, so DROP_RESTRICT should be OK.
1440 */
1441 performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
1442
1443 /* performDeletion does CommandCounterIncrement at end */
1444
1445 /*
1446 * Now we must remove any relation mapping entries that we set up for the
1447 * transient table, as well as its toast table and toast index if any. If
1448 * we fail to do this before commit, the relmapper will complain about new
1449 * permanent map entries being added post-bootstrap.
1450 */
1451 for (i = 0; OidIsValid(mapped_tables[i]); i++)
1452 RelationMapRemoveMapping(mapped_tables[i]);
1453
1454 /*
1455 * At this point, everything is kosher except that, if we did toast swap
1456 * by links, the toast table's name corresponds to the transient table.
1457 * The name is irrelevant to the backend because it's referenced by OID,
1458 * but users looking at the catalogs could be confused. Rename it to
1459 * prevent this problem.
1460 *
1461 * Note no lock required on the relation, because we already hold an
1462 * exclusive lock on it.
1463 */
1464 if (!swap_toast_by_content)
1465 {
1466 Relation newrel;
1467
1468 newrel = table_open(OIDOldHeap, NoLock);
1469 if (OidIsValid(newrel->rd_rel->reltoastrelid))
1470 {
1471 Oid toastidx;
1472 char NewToastName[NAMEDATALEN];
1473
1474 /* Get the associated valid index to be renamed */
1475 toastidx = toast_get_valid_index(newrel->rd_rel->reltoastrelid,
1476 NoLock);
1477
1478 /* rename the toast table ... */
1479 snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u",
1480 OIDOldHeap);
1481 RenameRelationInternal(newrel->rd_rel->reltoastrelid,
1482 NewToastName, true, false);
1483
1484 /* ... and its valid index too. */
1485 snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index",
1486 OIDOldHeap);
1487
1488 RenameRelationInternal(toastidx,
1489 NewToastName, true, true);
1490
1491 /*
1492 * Reset the relrewrite for the toast. The command-counter
1493 * increment is required here as we are about to update
1494 * the tuple that is updated as part of RenameRelationInternal.
1495 */
1496 CommandCounterIncrement();
1497 ResetRelRewrite(newrel->rd_rel->reltoastrelid);
1498 }
1499 relation_close(newrel, NoLock);
1500 }
1501
1502 /* if it's not a catalog table, clear any missing attribute settings */
1503 if (!is_system_catalog)
1504 {
1505 Relation newrel;
1506
1507 newrel = table_open(OIDOldHeap, NoLock);
1508 RelationClearMissing(newrel);
1509 relation_close(newrel, NoLock);
1510 }
1511 }
1512
1513
/*
 * Get a list of tables that the current user owns and that have
 * indisclustered set.  Return the list as a List * of RelToCluster structs
 * (allocated in the specified memory context), each one giving the tableOid
 * and the indexOid on which the table is already clustered.
 */
1520 static List *
get_tables_to_cluster(MemoryContext cluster_context)1521 get_tables_to_cluster(MemoryContext cluster_context)
1522 {
1523 Relation indRelation;
1524 TableScanDesc scan;
1525 ScanKeyData entry;
1526 HeapTuple indexTuple;
1527 Form_pg_index index;
1528 MemoryContext old_context;
1529 RelToCluster *rvtc;
1530 List *rvs = NIL;
1531
1532 /*
1533 * Get all indexes that have indisclustered set and are owned by
1534 * appropriate user.
1535 */
1536 indRelation = table_open(IndexRelationId, AccessShareLock);
1537 ScanKeyInit(&entry,
1538 Anum_pg_index_indisclustered,
1539 BTEqualStrategyNumber, F_BOOLEQ,
1540 BoolGetDatum(true));
1541 scan = table_beginscan_catalog(indRelation, 1, &entry);
1542 while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1543 {
1544 index = (Form_pg_index) GETSTRUCT(indexTuple);
1545
1546 if (!pg_class_ownercheck(index->indrelid, GetUserId()))
1547 continue;
1548
1549 /*
1550 * We have to build the list in a different memory context so it will
1551 * survive the cross-transaction processing
1552 */
1553 old_context = MemoryContextSwitchTo(cluster_context);
1554
1555 rvtc = (RelToCluster *) palloc(sizeof(RelToCluster));
1556 rvtc->tableOid = index->indrelid;
1557 rvtc->indexOid = index->indexrelid;
1558 rvs = lappend(rvs, rvtc);
1559
1560 MemoryContextSwitchTo(old_context);
1561 }
1562 table_endscan(scan);
1563
1564 relation_close(indRelation, AccessShareLock);
1565
1566 return rvs;
1567 }
1568