1 /*-------------------------------------------------------------------------
2 *
3 * cluster.c
4 * CLUSTER a table on an index. This is now also used for VACUUM FULL.
5 *
6 * There is hardly anything left of Paul Brown's original implementation...
7 *
8 *
9 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
10 * Portions Copyright (c) 1994-5, Regents of the University of California
11 *
12 *
13 * IDENTIFICATION
14 * src/backend/commands/cluster.c
15 *
16 *-------------------------------------------------------------------------
17 */
18 #include "postgres.h"
19
20 #include "access/amapi.h"
21 #include "access/multixact.h"
22 #include "access/relscan.h"
23 #include "access/rewriteheap.h"
24 #include "access/transam.h"
25 #include "access/tuptoaster.h"
26 #include "access/xact.h"
27 #include "access/xlog.h"
28 #include "catalog/pg_am.h"
29 #include "catalog/catalog.h"
30 #include "catalog/dependency.h"
31 #include "catalog/heap.h"
32 #include "catalog/index.h"
33 #include "catalog/namespace.h"
34 #include "catalog/objectaccess.h"
35 #include "catalog/toasting.h"
36 #include "commands/cluster.h"
37 #include "commands/tablecmds.h"
38 #include "commands/vacuum.h"
39 #include "miscadmin.h"
40 #include "optimizer/planner.h"
41 #include "storage/bufmgr.h"
42 #include "storage/lmgr.h"
43 #include "storage/predicate.h"
44 #include "storage/smgr.h"
45 #include "utils/acl.h"
46 #include "utils/fmgroids.h"
47 #include "utils/inval.h"
48 #include "utils/lsyscache.h"
49 #include "utils/memutils.h"
50 #include "utils/pg_rusage.h"
51 #include "utils/relmapper.h"
52 #include "utils/snapmgr.h"
53 #include "utils/syscache.h"
54 #include "utils/tqual.h"
55 #include "utils/tuplesort.h"
56
57
58 /*
59 * This struct is used to pass around the information on tables to be
60 * clustered. We need this so we can make a list of them when invoked without
61 * a specific table/index pair.
62 */
63 typedef struct
64 {
65 Oid tableOid;
66 Oid indexOid;
67 } RelToCluster;
68
69
70 static void rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose);
71 static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
72 bool verbose, bool *pSwapToastByContent,
73 TransactionId *pFreezeXid, MultiXactId *pCutoffMulti);
74 static List *get_tables_to_cluster(MemoryContext cluster_context);
75 static void reform_and_rewrite_tuple(HeapTuple tuple,
76 TupleDesc oldTupDesc, TupleDesc newTupDesc,
77 Datum *values, bool *isnull,
78 bool newRelHasOids, RewriteState rwstate);
79
80
81 /*---------------------------------------------------------------------------
82 * This cluster code allows for clustering multiple tables at once. Because
83 * of this, we cannot just run everything on a single transaction, or we
84 * would be forced to acquire exclusive locks on all the tables being
85 * clustered, simultaneously --- very likely leading to deadlock.
86 *
87 * To solve this we follow a similar strategy to VACUUM code,
88 * clustering each relation in a separate transaction. For this to work,
89 * we need to:
90 * - provide a separate memory context so that we can pass information in
91 * a way that survives across transactions
92 * - start a new transaction every time a new relation is clustered
93 * - check for validity of the information on to-be-clustered relations,
94 * as someone might have deleted a relation behind our back, or
95 * clustered one on a different index
96 * - end the transaction
97 *
98 * The single-relation case does not have any such overhead.
99 *
100 * We also allow a relation to be specified without index. In that case,
101 * the indisclustered bit will be looked up, and an ERROR will be thrown
102 * if there is no index with the bit set.
103 *---------------------------------------------------------------------------
104 */
105 void
cluster(ClusterStmt * stmt,bool isTopLevel)106 cluster(ClusterStmt *stmt, bool isTopLevel)
107 {
108 if (stmt->relation != NULL)
109 {
110 /* This is the single-relation case. */
111 Oid tableOid,
112 indexOid = InvalidOid;
113 Relation rel;
114
115 /* Find, lock, and check permissions on the table */
116 tableOid = RangeVarGetRelidExtended(stmt->relation,
117 AccessExclusiveLock,
118 false, false,
119 RangeVarCallbackOwnsTable, NULL);
120 rel = heap_open(tableOid, NoLock);
121
122 /*
123 * Reject clustering a remote temp table ... their local buffer
124 * manager is not going to cope.
125 */
126 if (RELATION_IS_OTHER_TEMP(rel))
127 ereport(ERROR,
128 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
129 errmsg("cannot cluster temporary tables of other sessions")));
130
131 if (stmt->indexname == NULL)
132 {
133 ListCell *index;
134
135 /* We need to find the index that has indisclustered set. */
136 foreach(index, RelationGetIndexList(rel))
137 {
138 HeapTuple idxtuple;
139 Form_pg_index indexForm;
140
141 indexOid = lfirst_oid(index);
142 idxtuple = SearchSysCache1(INDEXRELID,
143 ObjectIdGetDatum(indexOid));
144 if (!HeapTupleIsValid(idxtuple))
145 elog(ERROR, "cache lookup failed for index %u", indexOid);
146 indexForm = (Form_pg_index) GETSTRUCT(idxtuple);
147 if (indexForm->indisclustered)
148 {
149 ReleaseSysCache(idxtuple);
150 break;
151 }
152 ReleaseSysCache(idxtuple);
153 indexOid = InvalidOid;
154 }
155
156 if (!OidIsValid(indexOid))
157 ereport(ERROR,
158 (errcode(ERRCODE_UNDEFINED_OBJECT),
159 errmsg("there is no previously clustered index for table \"%s\"",
160 stmt->relation->relname)));
161 }
162 else
163 {
164 /*
165 * The index is expected to be in the same namespace as the
166 * relation.
167 */
168 indexOid = get_relname_relid(stmt->indexname,
169 rel->rd_rel->relnamespace);
170 if (!OidIsValid(indexOid))
171 ereport(ERROR,
172 (errcode(ERRCODE_UNDEFINED_OBJECT),
173 errmsg("index \"%s\" for table \"%s\" does not exist",
174 stmt->indexname, stmt->relation->relname)));
175 }
176
177 /* close relation, keep lock till commit */
178 heap_close(rel, NoLock);
179
180 /* Do the job. */
181 cluster_rel(tableOid, indexOid, false, stmt->verbose);
182 }
183 else
184 {
185 /*
186 * This is the "multi relation" case. We need to cluster all tables
187 * that have some index with indisclustered set.
188 */
189 MemoryContext cluster_context;
190 List *rvs;
191 ListCell *rv;
192
193 /*
194 * We cannot run this form of CLUSTER inside a user transaction block;
195 * we'd be holding locks way too long.
196 */
197 PreventTransactionChain(isTopLevel, "CLUSTER");
198
199 /*
200 * Create special memory context for cross-transaction storage.
201 *
202 * Since it is a child of PortalContext, it will go away even in case
203 * of error.
204 */
205 cluster_context = AllocSetContextCreate(PortalContext,
206 "Cluster",
207 ALLOCSET_DEFAULT_SIZES);
208
209 /*
210 * Build the list of relations to cluster. Note that this lives in
211 * cluster_context.
212 */
213 rvs = get_tables_to_cluster(cluster_context);
214
215 /* Commit to get out of starting transaction */
216 PopActiveSnapshot();
217 CommitTransactionCommand();
218
219 /* Ok, now that we've got them all, cluster them one by one */
220 foreach(rv, rvs)
221 {
222 RelToCluster *rvtc = (RelToCluster *) lfirst(rv);
223
224 /* Start a new transaction for each relation. */
225 StartTransactionCommand();
226 /* functions in indexes may want a snapshot set */
227 PushActiveSnapshot(GetTransactionSnapshot());
228 /* Do the job. */
229 cluster_rel(rvtc->tableOid, rvtc->indexOid, true, stmt->verbose);
230 PopActiveSnapshot();
231 CommitTransactionCommand();
232 }
233
234 /* Start a new transaction for the cleanup work. */
235 StartTransactionCommand();
236
237 /* Clean up working storage */
238 MemoryContextDelete(cluster_context);
239 }
240 }
241
242 /*
243 * cluster_rel
244 *
245 * This clusters the table by creating a new, clustered table and
246 * swapping the relfilenodes of the new table and the old table, so
247 * the OID of the original table is preserved. Thus we do not lose
248 * GRANT, inheritance nor references to this table (this was a bug
249 * in releases through 7.3).
250 *
251 * Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading
252 * the new table, it's better to create the indexes afterwards than to fill
253 * them incrementally while we load the table.
254 *
255 * If indexOid is InvalidOid, the table will be rewritten in physical order
256 * instead of index order. This is the new implementation of VACUUM FULL,
257 * and error messages should refer to the operation as VACUUM not CLUSTER.
258 */
259 void
cluster_rel(Oid tableOid,Oid indexOid,bool recheck,bool verbose)260 cluster_rel(Oid tableOid, Oid indexOid, bool recheck, bool verbose)
261 {
262 Relation OldHeap;
263
264 /* Check for user-requested abort. */
265 CHECK_FOR_INTERRUPTS();
266
267 /*
268 * We grab exclusive access to the target rel and index for the duration
269 * of the transaction. (This is redundant for the single-transaction
270 * case, since cluster() already did it.) The index lock is taken inside
271 * check_index_is_clusterable.
272 */
273 OldHeap = try_relation_open(tableOid, AccessExclusiveLock);
274
275 /* If the table has gone away, we can skip processing it */
276 if (!OldHeap)
277 return;
278
279 /*
280 * Since we may open a new transaction for each relation, we have to check
281 * that the relation still is what we think it is.
282 *
283 * If this is a single-transaction CLUSTER, we can skip these tests. We
284 * *must* skip the one on indisclustered since it would reject an attempt
285 * to cluster a not-previously-clustered index.
286 */
287 if (recheck)
288 {
289 HeapTuple tuple;
290 Form_pg_index indexForm;
291
292 /* Check that the user still owns the relation */
293 if (!pg_class_ownercheck(tableOid, GetUserId()))
294 {
295 relation_close(OldHeap, AccessExclusiveLock);
296 return;
297 }
298
299 /*
300 * Silently skip a temp table for a remote session. Only doing this
301 * check in the "recheck" case is appropriate (which currently means
302 * somebody is executing a database-wide CLUSTER), because there is
303 * another check in cluster() which will stop any attempt to cluster
304 * remote temp tables by name. There is another check in cluster_rel
305 * which is redundant, but we leave it for extra safety.
306 */
307 if (RELATION_IS_OTHER_TEMP(OldHeap))
308 {
309 relation_close(OldHeap, AccessExclusiveLock);
310 return;
311 }
312
313 if (OidIsValid(indexOid))
314 {
315 /*
316 * Check that the index still exists
317 */
318 if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid)))
319 {
320 relation_close(OldHeap, AccessExclusiveLock);
321 return;
322 }
323
324 /*
325 * Check that the index is still the one with indisclustered set.
326 */
327 tuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
328 if (!HeapTupleIsValid(tuple)) /* probably can't happen */
329 {
330 relation_close(OldHeap, AccessExclusiveLock);
331 return;
332 }
333 indexForm = (Form_pg_index) GETSTRUCT(tuple);
334 if (!indexForm->indisclustered)
335 {
336 ReleaseSysCache(tuple);
337 relation_close(OldHeap, AccessExclusiveLock);
338 return;
339 }
340 ReleaseSysCache(tuple);
341 }
342 }
343
344 /*
345 * We allow VACUUM FULL, but not CLUSTER, on shared catalogs. CLUSTER
346 * would work in most respects, but the index would only get marked as
347 * indisclustered in the current database, leading to unexpected behavior
348 * if CLUSTER were later invoked in another database.
349 */
350 if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared)
351 ereport(ERROR,
352 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
353 errmsg("cannot cluster a shared catalog")));
354
355 /*
356 * Don't process temp tables of other backends ... their local buffer
357 * manager is not going to cope.
358 */
359 if (RELATION_IS_OTHER_TEMP(OldHeap))
360 {
361 if (OidIsValid(indexOid))
362 ereport(ERROR,
363 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
364 errmsg("cannot cluster temporary tables of other sessions")));
365 else
366 ereport(ERROR,
367 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
368 errmsg("cannot vacuum temporary tables of other sessions")));
369 }
370
371 /*
372 * Also check for active uses of the relation in the current transaction,
373 * including open scans and pending AFTER trigger events.
374 */
375 CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM");
376
377 /* Check heap and index are valid to cluster on */
378 if (OidIsValid(indexOid))
379 check_index_is_clusterable(OldHeap, indexOid, recheck, AccessExclusiveLock);
380
381 /*
382 * Quietly ignore the request if this is a materialized view which has not
383 * been populated from its query. No harm is done because there is no data
384 * to deal with, and we don't want to throw an error if this is part of a
385 * multi-relation request -- for example, CLUSTER was run on the entire
386 * database.
387 */
388 if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW &&
389 !RelationIsPopulated(OldHeap))
390 {
391 relation_close(OldHeap, AccessExclusiveLock);
392 return;
393 }
394
395 /*
396 * All predicate locks on the tuples or pages are about to be made
397 * invalid, because we move tuples around. Promote them to relation
398 * locks. Predicate locks on indexes will be promoted when they are
399 * reindexed.
400 */
401 TransferPredicateLocksToHeapRelation(OldHeap);
402
403 /* rebuild_relation does all the dirty work */
404 rebuild_relation(OldHeap, indexOid, verbose);
405
406 /* NB: rebuild_relation does heap_close() on OldHeap */
407 }
408
409 /*
410 * Verify that the specified heap and index are valid to cluster on
411 *
412 * Side effect: obtains lock on the index. The caller may
413 * in some cases already have AccessExclusiveLock on the table, but
414 * not in all cases so we can't rely on the table-level lock for
415 * protection here.
416 */
417 void
check_index_is_clusterable(Relation OldHeap,Oid indexOid,bool recheck,LOCKMODE lockmode)418 check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck, LOCKMODE lockmode)
419 {
420 Relation OldIndex;
421
422 OldIndex = index_open(indexOid, lockmode);
423
424 /*
425 * Check that index is in fact an index on the given relation
426 */
427 if (OldIndex->rd_index == NULL ||
428 OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap))
429 ereport(ERROR,
430 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
431 errmsg("\"%s\" is not an index for table \"%s\"",
432 RelationGetRelationName(OldIndex),
433 RelationGetRelationName(OldHeap))));
434
435 /* Index AM must allow clustering */
436 if (!OldIndex->rd_amroutine->amclusterable)
437 ereport(ERROR,
438 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
439 errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
440 RelationGetRelationName(OldIndex))));
441
442 /*
443 * Disallow clustering on incomplete indexes (those that might not index
444 * every row of the relation). We could relax this by making a separate
445 * seqscan pass over the table to copy the missing rows, but that seems
446 * expensive and tedious.
447 */
448 if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred))
449 ereport(ERROR,
450 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
451 errmsg("cannot cluster on partial index \"%s\"",
452 RelationGetRelationName(OldIndex))));
453
454 /*
455 * Disallow if index is left over from a failed CREATE INDEX CONCURRENTLY;
456 * it might well not contain entries for every heap row, or might not even
457 * be internally consistent. (But note that we don't check indcheckxmin;
458 * the worst consequence of following broken HOT chains would be that we
459 * might put recently-dead tuples out-of-order in the new table, and there
460 * is little harm in that.)
461 */
462 if (!IndexIsValid(OldIndex->rd_index))
463 ereport(ERROR,
464 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
465 errmsg("cannot cluster on invalid index \"%s\"",
466 RelationGetRelationName(OldIndex))));
467
468 /* Drop relcache refcnt on OldIndex, but keep lock */
469 index_close(OldIndex, NoLock);
470 }
471
472 /*
473 * mark_index_clustered: mark the specified index as the one clustered on
474 *
475 * With indexOid == InvalidOid, will mark all indexes of rel not-clustered.
476 */
477 void
mark_index_clustered(Relation rel,Oid indexOid,bool is_internal)478 mark_index_clustered(Relation rel, Oid indexOid, bool is_internal)
479 {
480 HeapTuple indexTuple;
481 Form_pg_index indexForm;
482 Relation pg_index;
483 ListCell *index;
484
485 /*
486 * If the index is already marked clustered, no need to do anything.
487 */
488 if (OidIsValid(indexOid))
489 {
490 indexTuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
491 if (!HeapTupleIsValid(indexTuple))
492 elog(ERROR, "cache lookup failed for index %u", indexOid);
493 indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
494
495 if (indexForm->indisclustered)
496 {
497 ReleaseSysCache(indexTuple);
498 return;
499 }
500
501 ReleaseSysCache(indexTuple);
502 }
503
504 /*
505 * Check each index of the relation and set/clear the bit as needed.
506 */
507 pg_index = heap_open(IndexRelationId, RowExclusiveLock);
508
509 foreach(index, RelationGetIndexList(rel))
510 {
511 Oid thisIndexOid = lfirst_oid(index);
512
513 indexTuple = SearchSysCacheCopy1(INDEXRELID,
514 ObjectIdGetDatum(thisIndexOid));
515 if (!HeapTupleIsValid(indexTuple))
516 elog(ERROR, "cache lookup failed for index %u", thisIndexOid);
517 indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
518
519 /*
520 * Unset the bit if set. We know it's wrong because we checked this
521 * earlier.
522 */
523 if (indexForm->indisclustered)
524 {
525 indexForm->indisclustered = false;
526 simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
527 CatalogUpdateIndexes(pg_index, indexTuple);
528 }
529 else if (thisIndexOid == indexOid)
530 {
531 /* this was checked earlier, but let's be real sure */
532 if (!IndexIsValid(indexForm))
533 elog(ERROR, "cannot cluster on invalid index %u", indexOid);
534 indexForm->indisclustered = true;
535 simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
536 CatalogUpdateIndexes(pg_index, indexTuple);
537 }
538
539 InvokeObjectPostAlterHookArg(IndexRelationId, thisIndexOid, 0,
540 InvalidOid, is_internal);
541
542 heap_freetuple(indexTuple);
543 }
544
545 heap_close(pg_index, RowExclusiveLock);
546 }
547
548 /*
549 * rebuild_relation: rebuild an existing relation in index or physical order
550 *
551 * OldHeap: table to rebuild --- must be opened and exclusive-locked!
552 * indexOid: index to cluster by, or InvalidOid to rewrite in physical order.
553 *
554 * NB: this routine closes OldHeap at the right time; caller should not.
555 */
556 static void
rebuild_relation(Relation OldHeap,Oid indexOid,bool verbose)557 rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose)
558 {
559 Oid tableOid = RelationGetRelid(OldHeap);
560 Oid tableSpace = OldHeap->rd_rel->reltablespace;
561 Oid OIDNewHeap;
562 char relpersistence;
563 bool is_system_catalog;
564 bool swap_toast_by_content;
565 TransactionId frozenXid;
566 MultiXactId cutoffMulti;
567
568 /* Mark the correct index as clustered */
569 if (OidIsValid(indexOid))
570 mark_index_clustered(OldHeap, indexOid, true);
571
572 /* Remember info about rel before closing OldHeap */
573 relpersistence = OldHeap->rd_rel->relpersistence;
574 is_system_catalog = IsSystemRelation(OldHeap);
575
576 /* Close relcache entry, but keep lock until transaction commit */
577 heap_close(OldHeap, NoLock);
578
579 /* Create the transient table that will receive the re-ordered data */
580 OIDNewHeap = make_new_heap(tableOid, tableSpace,
581 relpersistence,
582 AccessExclusiveLock);
583
584 /* Copy the heap data into the new table in the desired order */
585 copy_heap_data(OIDNewHeap, tableOid, indexOid, verbose,
586 &swap_toast_by_content, &frozenXid, &cutoffMulti);
587
588 /*
589 * Swap the physical files of the target and transient tables, then
590 * rebuild the target's indexes and throw away the transient table.
591 */
592 finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog,
593 swap_toast_by_content, false, true,
594 frozenXid, cutoffMulti,
595 relpersistence);
596 }
597
598
599 /*
600 * Create the transient table that will be filled with new data during
601 * CLUSTER, ALTER TABLE, and similar operations. The transient table
602 * duplicates the logical structure of the OldHeap, but is placed in
603 * NewTableSpace which might be different from OldHeap's. Also, it's built
604 * with the specified persistence, which might differ from the original's.
605 *
606 * After this, the caller should load the new heap with transferred/modified
607 * data, then call finish_heap_swap to complete the operation.
608 */
609 Oid
make_new_heap(Oid OIDOldHeap,Oid NewTableSpace,char relpersistence,LOCKMODE lockmode)610 make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, char relpersistence,
611 LOCKMODE lockmode)
612 {
613 TupleDesc OldHeapDesc;
614 char NewHeapName[NAMEDATALEN];
615 Oid OIDNewHeap;
616 Oid toastid;
617 Relation OldHeap;
618 HeapTuple tuple;
619 Datum reloptions;
620 bool isNull;
621 Oid namespaceid;
622
623 OldHeap = heap_open(OIDOldHeap, lockmode);
624 OldHeapDesc = RelationGetDescr(OldHeap);
625
626 /*
627 * Note that the NewHeap will not receive any of the defaults or
628 * constraints associated with the OldHeap; we don't need 'em, and there's
629 * no reason to spend cycles inserting them into the catalogs only to
630 * delete them.
631 */
632
633 /*
634 * But we do want to use reloptions of the old heap for new heap.
635 */
636 tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(OIDOldHeap));
637 if (!HeapTupleIsValid(tuple))
638 elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
639 reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
640 &isNull);
641 if (isNull)
642 reloptions = (Datum) 0;
643
644 if (relpersistence == RELPERSISTENCE_TEMP)
645 namespaceid = LookupCreationNamespace("pg_temp");
646 else
647 namespaceid = RelationGetNamespace(OldHeap);
648
649 /*
650 * Create the new heap, using a temporary name in the same namespace as
651 * the existing table. NOTE: there is some risk of collision with user
652 * relnames. Working around this seems more trouble than it's worth; in
653 * particular, we can't create the new heap in a different namespace from
654 * the old, or we will have problems with the TEMP status of temp tables.
655 *
656 * Note: the new heap is not a shared relation, even if we are rebuilding
657 * a shared rel. However, we do make the new heap mapped if the source is
658 * mapped. This simplifies swap_relation_files, and is absolutely
659 * necessary for rebuilding pg_class, for reasons explained there.
660 */
661 snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap);
662
663 OIDNewHeap = heap_create_with_catalog(NewHeapName,
664 namespaceid,
665 NewTableSpace,
666 InvalidOid,
667 InvalidOid,
668 InvalidOid,
669 OldHeap->rd_rel->relowner,
670 OldHeapDesc,
671 NIL,
672 RELKIND_RELATION,
673 relpersistence,
674 false,
675 RelationIsMapped(OldHeap),
676 true,
677 0,
678 ONCOMMIT_NOOP,
679 reloptions,
680 false,
681 true,
682 true,
683 NULL);
684 Assert(OIDNewHeap != InvalidOid);
685
686 ReleaseSysCache(tuple);
687
688 /*
689 * Advance command counter so that the newly-created relation's catalog
690 * tuples will be visible to heap_open.
691 */
692 CommandCounterIncrement();
693
694 /*
695 * If necessary, create a TOAST table for the new relation.
696 *
697 * If the relation doesn't have a TOAST table already, we can't need one
698 * for the new relation. The other way around is possible though: if some
699 * wide columns have been dropped, NewHeapCreateToastTable can decide that
700 * no TOAST table is needed for the new table.
701 *
702 * Note that NewHeapCreateToastTable ends with CommandCounterIncrement, so
703 * that the TOAST table will be visible for insertion.
704 */
705 toastid = OldHeap->rd_rel->reltoastrelid;
706 if (OidIsValid(toastid))
707 {
708 /* keep the existing toast table's reloptions, if any */
709 tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(toastid));
710 if (!HeapTupleIsValid(tuple))
711 elog(ERROR, "cache lookup failed for relation %u", toastid);
712 reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
713 &isNull);
714 if (isNull)
715 reloptions = (Datum) 0;
716
717 NewHeapCreateToastTable(OIDNewHeap, reloptions, lockmode);
718
719 ReleaseSysCache(tuple);
720 }
721
722 heap_close(OldHeap, NoLock);
723
724 return OIDNewHeap;
725 }
726
727 /*
728 * Do the physical copying of heap data.
729 *
730 * There are three output parameters:
731 * *pSwapToastByContent is set true if toast tables must be swapped by content.
732 * *pFreezeXid receives the TransactionId used as freeze cutoff point.
733 * *pCutoffMulti receives the MultiXactId used as a cutoff point.
734 */
735 static void
copy_heap_data(Oid OIDNewHeap,Oid OIDOldHeap,Oid OIDOldIndex,bool verbose,bool * pSwapToastByContent,TransactionId * pFreezeXid,MultiXactId * pCutoffMulti)736 copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
737 bool *pSwapToastByContent, TransactionId *pFreezeXid,
738 MultiXactId *pCutoffMulti)
739 {
740 Relation NewHeap,
741 OldHeap,
742 OldIndex;
743 TupleDesc oldTupDesc;
744 TupleDesc newTupDesc;
745 int natts;
746 Datum *values;
747 bool *isnull;
748 IndexScanDesc indexScan;
749 HeapScanDesc heapScan;
750 bool use_wal;
751 bool is_system_catalog;
752 TransactionId OldestXmin;
753 TransactionId FreezeXid;
754 MultiXactId MultiXactCutoff;
755 RewriteState rwstate;
756 bool use_sort;
757 Tuplesortstate *tuplesort;
758 double num_tuples = 0,
759 tups_vacuumed = 0,
760 tups_recently_dead = 0;
761 int elevel = verbose ? INFO : DEBUG2;
762 PGRUsage ru0;
763
764 pg_rusage_init(&ru0);
765
766 /*
767 * Open the relations we need.
768 */
769 NewHeap = heap_open(OIDNewHeap, AccessExclusiveLock);
770 OldHeap = heap_open(OIDOldHeap, AccessExclusiveLock);
771 if (OidIsValid(OIDOldIndex))
772 OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
773 else
774 OldIndex = NULL;
775
776 /*
777 * Their tuple descriptors should be exactly alike, but here we only need
778 * assume that they have the same number of columns.
779 */
780 oldTupDesc = RelationGetDescr(OldHeap);
781 newTupDesc = RelationGetDescr(NewHeap);
782 Assert(newTupDesc->natts == oldTupDesc->natts);
783
784 /* Preallocate values/isnull arrays */
785 natts = newTupDesc->natts;
786 values = (Datum *) palloc(natts * sizeof(Datum));
787 isnull = (bool *) palloc(natts * sizeof(bool));
788
789 /*
790 * If the OldHeap has a toast table, get lock on the toast table to keep
791 * it from being vacuumed. This is needed because autovacuum processes
792 * toast tables independently of their main tables, with no lock on the
793 * latter. If an autovacuum were to start on the toast table after we
794 * compute our OldestXmin below, it would use a later OldestXmin, and then
795 * possibly remove as DEAD toast tuples belonging to main tuples we think
796 * are only RECENTLY_DEAD. Then we'd fail while trying to copy those
797 * tuples.
798 *
799 * We don't need to open the toast relation here, just lock it. The lock
800 * will be held till end of transaction.
801 */
802 if (OldHeap->rd_rel->reltoastrelid)
803 LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);
804
805 /*
806 * We need to log the copied data in WAL iff WAL archiving/streaming is
807 * enabled AND it's a WAL-logged rel.
808 */
809 use_wal = XLogIsNeeded() && RelationNeedsWAL(NewHeap);
810
811 /* use_wal off requires smgr_targblock be initially invalid */
812 Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);
813
814 /*
815 * If both tables have TOAST tables, perform toast swap by content. It is
816 * possible that the old table has a toast table but the new one doesn't,
817 * if toastable columns have been dropped. In that case we have to do
818 * swap by links. This is okay because swap by content is only essential
819 * for system catalogs, and we don't support schema changes for them.
820 */
821 if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid)
822 {
823 *pSwapToastByContent = true;
824
825 /*
826 * When doing swap by content, any toast pointers written into NewHeap
827 * must use the old toast table's OID, because that's where the toast
828 * data will eventually be found. Set this up by setting rd_toastoid.
829 * This also tells toast_save_datum() to preserve the toast value
830 * OIDs, which we want so as not to invalidate toast pointers in
831 * system catalog caches, and to avoid making multiple copies of a
832 * single toast value.
833 *
834 * Note that we must hold NewHeap open until we are done writing data,
835 * since the relcache will not guarantee to remember this setting once
836 * the relation is closed. Also, this technique depends on the fact
837 * that no one will try to read from the NewHeap until after we've
838 * finished writing it and swapping the rels --- otherwise they could
839 * follow the toast pointers to the wrong place. (It would actually
840 * work for values copied over from the old toast table, but not for
841 * any values that we toast which were previously not toasted.)
842 */
843 NewHeap->rd_toastoid = OldHeap->rd_rel->reltoastrelid;
844 }
845 else
846 *pSwapToastByContent = false;
847
848 /*
849 * Compute xids used to freeze and weed out dead tuples and multixacts.
850 * Since we're going to rewrite the whole table anyway, there's no reason
851 * not to be aggressive about this.
852 */
853 vacuum_set_xid_limits(OldHeap, 0, 0, 0, 0,
854 &OldestXmin, &FreezeXid, NULL, &MultiXactCutoff,
855 NULL);
856
857 /*
858 * FreezeXid will become the table's new relfrozenxid, and that mustn't go
859 * backwards, so take the max.
860 */
861 if (TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid))
862 FreezeXid = OldHeap->rd_rel->relfrozenxid;
863
864 /*
865 * MultiXactCutoff, similarly, shouldn't go backwards either.
866 */
867 if (MultiXactIdPrecedes(MultiXactCutoff, OldHeap->rd_rel->relminmxid))
868 MultiXactCutoff = OldHeap->rd_rel->relminmxid;
869
870 /* return selected values to caller */
871 *pFreezeXid = FreezeXid;
872 *pCutoffMulti = MultiXactCutoff;
873
874 /* Remember if it's a system catalog */
875 is_system_catalog = IsSystemRelation(OldHeap);
876
877 /* Initialize the rewrite operation */
878 rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, FreezeXid,
879 MultiXactCutoff, use_wal);
880
881 /*
882 * Decide whether to use an indexscan or seqscan-and-optional-sort to scan
883 * the OldHeap. We know how to use a sort to duplicate the ordering of a
884 * btree index, and will use seqscan-and-sort for that case if the planner
885 * tells us it's cheaper. Otherwise, always indexscan if an index is
886 * provided, else plain seqscan.
887 */
888 if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID)
889 use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex);
890 else
891 use_sort = false;
892
893 /* Set up sorting if wanted */
894 if (use_sort)
895 tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
896 maintenance_work_mem, false);
897 else
898 tuplesort = NULL;
899
900 /*
901 * Prepare to scan the OldHeap. To ensure we see recently-dead tuples
902 * that still need to be copied, we scan with SnapshotAny and use
903 * HeapTupleSatisfiesVacuum for the visibility test.
904 */
905 if (OldIndex != NULL && !use_sort)
906 {
907 heapScan = NULL;
908 indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0);
909 index_rescan(indexScan, NULL, 0, NULL, 0);
910 }
911 else
912 {
913 heapScan = heap_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL);
914 indexScan = NULL;
915 }
916
917 /* Log what we're doing */
918 if (indexScan != NULL)
919 ereport(elevel,
920 (errmsg("clustering \"%s.%s\" using index scan on \"%s\"",
921 get_namespace_name(RelationGetNamespace(OldHeap)),
922 RelationGetRelationName(OldHeap),
923 RelationGetRelationName(OldIndex))));
924 else if (tuplesort != NULL)
925 ereport(elevel,
926 (errmsg("clustering \"%s.%s\" using sequential scan and sort",
927 get_namespace_name(RelationGetNamespace(OldHeap)),
928 RelationGetRelationName(OldHeap))));
929 else
930 ereport(elevel,
931 (errmsg("vacuuming \"%s.%s\"",
932 get_namespace_name(RelationGetNamespace(OldHeap)),
933 RelationGetRelationName(OldHeap))));
934
935 /*
936 * Scan through the OldHeap, either in OldIndex order or sequentially;
937 * copy each tuple into the NewHeap, or transiently to the tuplesort
938 * module. Note that we don't bother sorting dead tuples (they won't get
939 * to the new table anyway).
940 */
941 for (;;)
942 {
943 HeapTuple tuple;
944 Buffer buf;
945 bool isdead;
946
947 CHECK_FOR_INTERRUPTS();
948
949 if (indexScan != NULL)
950 {
951 tuple = index_getnext(indexScan, ForwardScanDirection);
952 if (tuple == NULL)
953 break;
954
955 /* Since we used no scan keys, should never need to recheck */
956 if (indexScan->xs_recheck)
957 elog(ERROR, "CLUSTER does not support lossy index conditions");
958
959 buf = indexScan->xs_cbuf;
960 }
961 else
962 {
963 tuple = heap_getnext(heapScan, ForwardScanDirection);
964 if (tuple == NULL)
965 break;
966
967 buf = heapScan->rs_cbuf;
968 }
969
970 LockBuffer(buf, BUFFER_LOCK_SHARE);
971
972 switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf))
973 {
974 case HEAPTUPLE_DEAD:
975 /* Definitely dead */
976 isdead = true;
977 break;
978 case HEAPTUPLE_RECENTLY_DEAD:
979 tups_recently_dead += 1;
980 /* fall through */
981 case HEAPTUPLE_LIVE:
982 /* Live or recently dead, must copy it */
983 isdead = false;
984 break;
985 case HEAPTUPLE_INSERT_IN_PROGRESS:
986
987 /*
988 * Since we hold exclusive lock on the relation, normally the
989 * only way to see this is if it was inserted earlier in our
990 * own transaction. However, it can happen in system
991 * catalogs, since we tend to release write lock before commit
992 * there. Give a warning if neither case applies; but in any
993 * case we had better copy it.
994 */
995 if (!is_system_catalog &&
996 !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data)))
997 elog(WARNING, "concurrent insert in progress within table \"%s\"",
998 RelationGetRelationName(OldHeap));
999 /* treat as live */
1000 isdead = false;
1001 break;
1002 case HEAPTUPLE_DELETE_IN_PROGRESS:
1003
1004 /*
1005 * Similar situation to INSERT_IN_PROGRESS case.
1006 */
1007 if (!is_system_catalog &&
1008 !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data)))
1009 elog(WARNING, "concurrent delete in progress within table \"%s\"",
1010 RelationGetRelationName(OldHeap));
1011 /* treat as recently dead */
1012 tups_recently_dead += 1;
1013 isdead = false;
1014 break;
1015 default:
1016 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1017 isdead = false; /* keep compiler quiet */
1018 break;
1019 }
1020
1021 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1022
1023 if (isdead)
1024 {
1025 tups_vacuumed += 1;
1026 /* heap rewrite module still needs to see it... */
1027 if (rewrite_heap_dead_tuple(rwstate, tuple))
1028 {
1029 /* A previous recently-dead tuple is now known dead */
1030 tups_vacuumed += 1;
1031 tups_recently_dead -= 1;
1032 }
1033 continue;
1034 }
1035
1036 num_tuples += 1;
1037 if (tuplesort != NULL)
1038 tuplesort_putheaptuple(tuplesort, tuple);
1039 else
1040 reform_and_rewrite_tuple(tuple,
1041 oldTupDesc, newTupDesc,
1042 values, isnull,
1043 NewHeap->rd_rel->relhasoids, rwstate);
1044 }
1045
1046 if (indexScan != NULL)
1047 index_endscan(indexScan);
1048 if (heapScan != NULL)
1049 heap_endscan(heapScan);
1050
	/*
	 * In scan-and-sort mode, complete the sort, then read out all live tuples
	 * from the tuplesort and write them to the new relation.
	 */
1055 if (tuplesort != NULL)
1056 {
1057 tuplesort_performsort(tuplesort);
1058
1059 for (;;)
1060 {
1061 HeapTuple tuple;
1062 bool shouldfree;
1063
1064 CHECK_FOR_INTERRUPTS();
1065
1066 tuple = tuplesort_getheaptuple(tuplesort, true, &shouldfree);
1067 if (tuple == NULL)
1068 break;
1069
1070 reform_and_rewrite_tuple(tuple,
1071 oldTupDesc, newTupDesc,
1072 values, isnull,
1073 NewHeap->rd_rel->relhasoids, rwstate);
1074
1075 if (shouldfree)
1076 heap_freetuple(tuple);
1077 }
1078
1079 tuplesort_end(tuplesort);
1080 }
1081
1082 /* Write out any remaining tuples, and fsync if needed */
1083 end_heap_rewrite(rwstate);
1084
1085 /* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */
1086 NewHeap->rd_toastoid = InvalidOid;
1087
1088 /* Log what we did */
1089 ereport(elevel,
1090 (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
1091 RelationGetRelationName(OldHeap),
1092 tups_vacuumed, num_tuples,
1093 RelationGetNumberOfBlocks(OldHeap)),
1094 errdetail("%.0f dead row versions cannot be removed yet.\n"
1095 "%s.",
1096 tups_recently_dead,
1097 pg_rusage_show(&ru0))));
1098
1099 /* Clean up */
1100 pfree(values);
1101 pfree(isnull);
1102
1103 if (OldIndex != NULL)
1104 index_close(OldIndex, NoLock);
1105 heap_close(OldHeap, NoLock);
1106 heap_close(NewHeap, NoLock);
1107 }
1108
/*
 * Swap the physical files of two given relations.
 *
 * We swap the physical identity (reltablespace, relfilenode) while keeping the
 * same logical identities of the two relations.  relpersistence is also
 * swapped, which is critical since it determines where buffers live for each
 * relation.
 *
 * We can swap associated TOAST data in either of two ways: recursively swap
 * the physical content of the toast tables (and their indexes), or swap the
 * TOAST links in the given relations' pg_class entries.  The former is needed
 * to manage rewrites of shared catalogs (where we cannot change the pg_class
 * links) while the latter is the only way to handle cases in which a toast
 * table is added or removed altogether.
 *
 * Additionally, unless it is an index, the first relation is marked with
 * relfrozenxid set to frozenXid and relminmxid set to cutoffMulti.  It seems
 * a bit ugly to have this here, but the caller would have to do it anyway,
 * so having it here saves a heap_update.  Note: in the swap-toast-links
 * case, we assume we don't need to change the toast table's relfrozenxid:
 * the new version of the toast table should already have relfrozenxid set
 * to RecentXmin, which is good enough.
 *
 * Lastly, if r2 and its toast table and toast index (if any) are mapped,
 * their OIDs are emitted into mapped_tables[].  This is hacky but beats
 * having to look the information up again later in finish_heap_swap.
 *
 * Parameters:
 *	r1, r2: OIDs of the two relations whose storage is exchanged; r1 is the
 *		one that receives frozenXid/cutoffMulti.
 *	target_is_pg_class: if true, the pg_class rows are not rewritten; only
 *		relcache invalidations are issued for them.
 *	swap_toast_by_content: true to recurse into the toast tables (and their
 *		valid indexes) and swap their files; false to exchange the
 *		reltoastrelid links and fix up pg_depend instead.
 *	is_internal: passed to the post-alter object access hook for r1 (the
 *		hook call for r2 is always flagged as internal).
 *	frozenXid, cutoffMulti: new relfrozenxid/relminmxid for r1; must be
 *		normal/valid (asserted) whenever r1 is not an index.
 *	mapped_tables: output array, written only in the mapped-relation case.
 *		The caller must supply enough zero-initialized room for r2 plus its
 *		toast table and toast index.
 */
static void
swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
					bool swap_toast_by_content,
					bool is_internal,
					TransactionId frozenXid,
					MultiXactId cutoffMulti,
					Oid *mapped_tables)
{
	Relation	relRelation;
	HeapTuple	reltup1,
				reltup2;
	Form_pg_class relform1,
				relform2;
	Oid			relfilenode1,
				relfilenode2;
	Oid			swaptemp;
	char		swptmpchr;
	CatalogIndexState indstate;

	/* We need writable copies of both pg_class tuples. */
	relRelation = heap_open(RelationRelationId, RowExclusiveLock);

	/* The copies are modified in place below and freed before returning. */
	reltup1 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r1));
	if (!HeapTupleIsValid(reltup1))
		elog(ERROR, "cache lookup failed for relation %u", r1);
	relform1 = (Form_pg_class) GETSTRUCT(reltup1);

	reltup2 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r2));
	if (!HeapTupleIsValid(reltup2))
		elog(ERROR, "cache lookup failed for relation %u", r2);
	relform2 = (Form_pg_class) GETSTRUCT(reltup2);

	relfilenode1 = relform1->relfilenode;
	relfilenode2 = relform2->relfilenode;

	/*
	 * An invalid (zero) relfilenode in pg_class is how we recognize a mapped
	 * relation here; both valid means neither relation is mapped.
	 */
	if (OidIsValid(relfilenode1) && OidIsValid(relfilenode2))
	{
		/*
		 * Normal non-mapped relations: swap relfilenodes, reltablespaces,
		 * relpersistence
		 */
		Assert(!target_is_pg_class);

		swaptemp = relform1->relfilenode;
		relform1->relfilenode = relform2->relfilenode;
		relform2->relfilenode = swaptemp;

		swaptemp = relform1->reltablespace;
		relform1->reltablespace = relform2->reltablespace;
		relform2->reltablespace = swaptemp;

		swptmpchr = relform1->relpersistence;
		relform1->relpersistence = relform2->relpersistence;
		relform2->relpersistence = swptmpchr;

		/* Also swap toast links, if we're swapping by links */
		if (!swap_toast_by_content)
		{
			swaptemp = relform1->reltoastrelid;
			relform1->reltoastrelid = relform2->reltoastrelid;
			relform2->reltoastrelid = swaptemp;
		}
	}
	else
	{
		/*
		 * Mapped-relation case.  Here we have to swap the relation mappings
		 * instead of modifying the pg_class columns.  Both must be mapped.
		 */
		if (OidIsValid(relfilenode1) || OidIsValid(relfilenode2))
			elog(ERROR, "cannot swap mapped relation \"%s\" with non-mapped relation",
				 NameStr(relform1->relname));

		/*
		 * We can't change the tablespace nor persistence of a mapped rel, and
		 * we can't handle toast link swapping for one either, because we must
		 * not apply any critical changes to its pg_class row.  These cases
		 * should be prevented by upstream permissions tests, so these checks
		 * are non-user-facing emergency backstop.
		 */
		if (relform1->reltablespace != relform2->reltablespace)
			elog(ERROR, "cannot change tablespace of mapped relation \"%s\"",
				 NameStr(relform1->relname));
		if (relform1->relpersistence != relform2->relpersistence)
			elog(ERROR, "cannot change persistence of mapped relation \"%s\"",
				 NameStr(relform1->relname));
		if (!swap_toast_by_content &&
			(relform1->reltoastrelid || relform2->reltoastrelid))
			elog(ERROR, "cannot swap toast by links for mapped relation \"%s\"",
				 NameStr(relform1->relname));

		/*
		 * Fetch the mappings --- shouldn't fail, but be paranoid
		 */
		relfilenode1 = RelationMapOidToFilenode(r1, relform1->relisshared);
		if (!OidIsValid(relfilenode1))
			elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
				 NameStr(relform1->relname), r1);
		relfilenode2 = RelationMapOidToFilenode(r2, relform2->relisshared);
		if (!OidIsValid(relfilenode2))
			elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
				 NameStr(relform2->relname), r2);

		/*
		 * Send replacement mappings to relmapper.  Note these won't actually
		 * take effect until CommandCounterIncrement.
		 */
		RelationMapUpdateMap(r1, relfilenode2, relform1->relisshared, false);
		RelationMapUpdateMap(r2, relfilenode1, relform2->relisshared, false);

		/*
		 * Pass OIDs of mapped r2 tables back to caller.  The post-increment
		 * advances only our local copy of the pointer, but that copy is what
		 * we hand to the recursive calls below, so the toast table and toast
		 * index land in the following array slots.
		 */
		*mapped_tables++ = r2;
	}

	/*
	 * In the case of a shared catalog, these next few steps will only affect
	 * our own database's pg_class row; but that's okay, because they are all
	 * noncritical updates.  That's also an important fact for the case of a
	 * mapped catalog, because it's possible that we'll commit the map change
	 * and then fail to commit the pg_class update.
	 */

	/* set rel1's frozen Xid and minimum MultiXid */
	if (relform1->relkind != RELKIND_INDEX)
	{
		Assert(TransactionIdIsNormal(frozenXid));
		relform1->relfrozenxid = frozenXid;
		Assert(MultiXactIdIsValid(cutoffMulti));
		relform1->relminmxid = cutoffMulti;
	}

	/* swap size statistics too, since new rel has freshly-updated stats */
	{
		int32		swap_pages;
		float4		swap_tuples;
		int32		swap_allvisible;

		swap_pages = relform1->relpages;
		relform1->relpages = relform2->relpages;
		relform2->relpages = swap_pages;

		swap_tuples = relform1->reltuples;
		relform1->reltuples = relform2->reltuples;
		relform2->reltuples = swap_tuples;

		swap_allvisible = relform1->relallvisible;
		relform1->relallvisible = relform2->relallvisible;
		relform2->relallvisible = swap_allvisible;
	}

	/*
	 * Update the tuples in pg_class --- unless the target relation of the
	 * swap is pg_class itself.  In that case, there is zero point in making
	 * changes because we'd be updating the old data that we're about to throw
	 * away.  Because the real work being done here for a mapped relation is
	 * just to change the relation map settings, it's all right to not update
	 * the pg_class rows in this case.  The most important changes will instead
	 * be performed later, in finish_heap_swap() itself.
	 */
	if (!target_is_pg_class)
	{
		simple_heap_update(relRelation, &reltup1->t_self, reltup1);
		simple_heap_update(relRelation, &reltup2->t_self, reltup2);

		/* Keep system catalogs current */
		indstate = CatalogOpenIndexes(relRelation);
		CatalogIndexInsert(indstate, reltup1);
		CatalogIndexInsert(indstate, reltup2);
		CatalogCloseIndexes(indstate);
	}
	else
	{
		/* no update ... but we do still need relcache inval */
		CacheInvalidateRelcacheByTuple(reltup1);
		CacheInvalidateRelcacheByTuple(reltup2);
	}

	/*
	 * Post alter hook for modified relations.  The change to r2 is always
	 * internal, but r1 depends on the invocation context.
	 */
	InvokeObjectPostAlterHookArg(RelationRelationId, r1, 0,
								 InvalidOid, is_internal);
	InvokeObjectPostAlterHookArg(RelationRelationId, r2, 0,
								 InvalidOid, true);

	/*
	 * If we have toast tables associated with the relations being swapped,
	 * deal with them too.
	 *
	 * Note that relform1/relform2 are our modified local copies, so in the
	 * swap-by-links case reltoastrelid here already reflects the exchanged
	 * links.
	 */
	if (relform1->reltoastrelid || relform2->reltoastrelid)
	{
		if (swap_toast_by_content)
		{
			if (relform1->reltoastrelid && relform2->reltoastrelid)
			{
				/* Recursively swap the contents of the toast tables */
				swap_relation_files(relform1->reltoastrelid,
									relform2->reltoastrelid,
									target_is_pg_class,
									swap_toast_by_content,
									is_internal,
									frozenXid,
									cutoffMulti,
									mapped_tables);
			}
			else
			{
				/* caller messed up */
				elog(ERROR, "cannot swap toast files by content when there's only one");
			}
		}
		else
		{
			/*
			 * We swapped the ownership links, so we need to change dependency
			 * data to match.
			 *
			 * NOTE: it is possible that only one table has a toast table.
			 *
			 * NOTE: at present, a TOAST table's only dependency is the one on
			 * its owning table.  If more are ever created, we'd need to use
			 * something more selective than deleteDependencyRecordsFor() to
			 * get rid of just the link we want.
			 */
			ObjectAddress baseobject,
						toastobject;
			long		count;

			/*
			 * We disallow this case for system catalogs, to avoid the
			 * possibility that the catalog we're rebuilding is one of the
			 * ones the dependency changes would change.  It's too late to be
			 * making any data changes to the target catalog.
			 */
			if (IsSystemClass(r1, relform1))
				elog(ERROR, "cannot swap toast files by links for system catalogs");

			/* Delete old dependencies */
			if (relform1->reltoastrelid)
			{
				count = deleteDependencyRecordsFor(RelationRelationId,
												   relform1->reltoastrelid,
												   false);
				if (count != 1)
					elog(ERROR, "expected one dependency record for TOAST table, found %ld",
						 count);
			}
			if (relform2->reltoastrelid)
			{
				count = deleteDependencyRecordsFor(RelationRelationId,
												   relform2->reltoastrelid,
												   false);
				if (count != 1)
					elog(ERROR, "expected one dependency record for TOAST table, found %ld",
						 count);
			}

			/* Register new dependencies */
			baseobject.classId = RelationRelationId;
			baseobject.objectSubId = 0;
			toastobject.classId = RelationRelationId;
			toastobject.objectSubId = 0;

			if (relform1->reltoastrelid)
			{
				baseobject.objectId = r1;
				toastobject.objectId = relform1->reltoastrelid;
				recordDependencyOn(&toastobject, &baseobject,
								   DEPENDENCY_INTERNAL);
			}

			if (relform2->reltoastrelid)
			{
				baseobject.objectId = r2;
				toastobject.objectId = relform2->reltoastrelid;
				recordDependencyOn(&toastobject, &baseobject,
								   DEPENDENCY_INTERNAL);
			}
		}
	}

	/*
	 * If we're swapping two toast tables by content, do the same for their
	 * valid index.  The swap can actually be safely done only if the relations
	 * have indexes.
	 *
	 * This branch is reached in the recursive invocation made above for the
	 * toast tables (r1 and r2 are then the toast relations themselves).
	 */
	if (swap_toast_by_content &&
		relform1->relkind == RELKIND_TOASTVALUE &&
		relform2->relkind == RELKIND_TOASTVALUE)
	{
		Oid			toastIndex1,
					toastIndex2;

		/* Get valid index for each relation */
		toastIndex1 = toast_get_valid_index(r1,
											AccessExclusiveLock);
		toastIndex2 = toast_get_valid_index(r2,
											AccessExclusiveLock);

		/*
		 * Invalid xid/multixact values are passed here; presumably the toast
		 * indexes have relkind RELKIND_INDEX, in which case the
		 * relfrozenxid/relminmxid assignment above is skipped and the values
		 * are never stored.
		 */
		swap_relation_files(toastIndex1,
							toastIndex2,
							target_is_pg_class,
							swap_toast_by_content,
							is_internal,
							InvalidTransactionId,
							InvalidMultiXactId,
							mapped_tables);
	}

	/* Clean up. */
	heap_freetuple(reltup1);
	heap_freetuple(reltup2);

	heap_close(relRelation, RowExclusiveLock);

	/*
	 * Close both relcache entries' smgr links.  We need this kluge because
	 * both links will be invalidated during upcoming CommandCounterIncrement.
	 * Whichever of the rels is the second to be cleared will have a dangling
	 * reference to the other's smgr entry.  Rather than trying to avoid this
	 * by ordering operations just so, it's easiest to close the links first.
	 * (Fortunately, since one of the entries is local in our transaction,
	 * it's sufficient to clear out our own relcache this way; the problem
	 * cannot arise for other backends when they see our update on the
	 * non-transient relation.)
	 *
	 * Caution: the placement of this step interacts with the decision to
	 * handle toast rels by recursion.  When we are trying to rebuild pg_class
	 * itself, the smgr close on pg_class must happen after all accesses in
	 * this function.
	 */
	RelationCloseSmgrByOid(r1);
	RelationCloseSmgrByOid(r2);
}
1470
1471 /*
1472 * Remove the transient table that was built by make_new_heap, and finish
1473 * cleaning up (including rebuilding all indexes on the old heap).
1474 */
1475 void
finish_heap_swap(Oid OIDOldHeap,Oid OIDNewHeap,bool is_system_catalog,bool swap_toast_by_content,bool check_constraints,bool is_internal,TransactionId frozenXid,MultiXactId cutoffMulti,char newrelpersistence)1476 finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
1477 bool is_system_catalog,
1478 bool swap_toast_by_content,
1479 bool check_constraints,
1480 bool is_internal,
1481 TransactionId frozenXid,
1482 MultiXactId cutoffMulti,
1483 char newrelpersistence)
1484 {
1485 ObjectAddress object;
1486 Oid mapped_tables[4];
1487 int reindex_flags;
1488 int i;
1489
1490 /* Zero out possible results from swapped_relation_files */
1491 memset(mapped_tables, 0, sizeof(mapped_tables));
1492
1493 /*
1494 * Swap the contents of the heap relations (including any toast tables).
1495 * Also set old heap's relfrozenxid to frozenXid.
1496 */
1497 swap_relation_files(OIDOldHeap, OIDNewHeap,
1498 (OIDOldHeap == RelationRelationId),
1499 swap_toast_by_content, is_internal,
1500 frozenXid, cutoffMulti, mapped_tables);
1501
1502 /*
1503 * If it's a system catalog, queue an sinval message to flush all
1504 * catcaches on the catalog when we reach CommandCounterIncrement.
1505 */
1506 if (is_system_catalog)
1507 CacheInvalidateCatalog(OIDOldHeap);
1508
1509 /*
1510 * Rebuild each index on the relation (but not the toast table, which is
1511 * all-new at this point). It is important to do this before the DROP
1512 * step because if we are processing a system catalog that will be used
1513 * during DROP, we want to have its indexes available. There is no
1514 * advantage to the other order anyway because this is all transactional,
1515 * so no chance to reclaim disk space before commit. We do not need a
1516 * final CommandCounterIncrement() because reindex_relation does it.
1517 *
1518 * Note: because index_build is called via reindex_relation, it will never
1519 * set indcheckxmin true for the indexes. This is OK even though in some
1520 * sense we are building new indexes rather than rebuilding existing ones,
1521 * because the new heap won't contain any HOT chains at all, let alone
1522 * broken ones, so it can't be necessary to set indcheckxmin.
1523 */
1524 reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE;
1525 if (check_constraints)
1526 reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS;
1527
1528 /*
1529 * Ensure that the indexes have the same persistence as the parent
1530 * relation.
1531 */
1532 if (newrelpersistence == RELPERSISTENCE_UNLOGGED)
1533 reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED;
1534 else if (newrelpersistence == RELPERSISTENCE_PERMANENT)
1535 reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT;
1536
1537 reindex_relation(OIDOldHeap, reindex_flags, 0);
1538
1539 /*
1540 * If the relation being rebuild is pg_class, swap_relation_files()
1541 * couldn't update pg_class's own pg_class entry (check comments in
1542 * swap_relation_files()), thus relfrozenxid was not updated. That's
1543 * annoying because a potential reason for doing a VACUUM FULL is a
1544 * imminent or actual anti-wraparound shutdown. So, now that we can
1545 * access the new relation using its indices, update relfrozenxid.
1546 * pg_class doesn't have a toast relation, so we don't need to update the
1547 * corresponding toast relation. Not that there's little point moving all
1548 * relfrozenxid updates here since swap_relation_files() needs to write to
1549 * pg_class for non-mapped relations anyway.
1550 */
1551 if (OIDOldHeap == RelationRelationId)
1552 {
1553 Relation relRelation;
1554 HeapTuple reltup;
1555 Form_pg_class relform;
1556
1557 relRelation = heap_open(RelationRelationId, RowExclusiveLock);
1558
1559 reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDOldHeap));
1560 if (!HeapTupleIsValid(reltup))
1561 elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
1562 relform = (Form_pg_class) GETSTRUCT(reltup);
1563
1564 relform->relfrozenxid = frozenXid;
1565 relform->relminmxid = cutoffMulti;
1566
1567 simple_heap_update(relRelation, &reltup->t_self, reltup);
1568 CatalogUpdateIndexes(relRelation, reltup);
1569
1570 heap_close(relRelation, RowExclusiveLock);
1571 }
1572
1573 /* Destroy new heap with old filenode */
1574 object.classId = RelationRelationId;
1575 object.objectId = OIDNewHeap;
1576 object.objectSubId = 0;
1577
1578 /*
1579 * The new relation is local to our transaction and we know nothing
1580 * depends on it, so DROP_RESTRICT should be OK.
1581 */
1582 performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
1583
1584 /* performDeletion does CommandCounterIncrement at end */
1585
1586 /*
1587 * Now we must remove any relation mapping entries that we set up for the
1588 * transient table, as well as its toast table and toast index if any. If
1589 * we fail to do this before commit, the relmapper will complain about new
1590 * permanent map entries being added post-bootstrap.
1591 */
1592 for (i = 0; OidIsValid(mapped_tables[i]); i++)
1593 RelationMapRemoveMapping(mapped_tables[i]);
1594
1595 /*
1596 * At this point, everything is kosher except that, if we did toast swap
1597 * by links, the toast table's name corresponds to the transient table.
1598 * The name is irrelevant to the backend because it's referenced by OID,
1599 * but users looking at the catalogs could be confused. Rename it to
1600 * prevent this problem.
1601 *
1602 * Note no lock required on the relation, because we already hold an
1603 * exclusive lock on it.
1604 */
1605 if (!swap_toast_by_content)
1606 {
1607 Relation newrel;
1608
1609 newrel = heap_open(OIDOldHeap, NoLock);
1610 if (OidIsValid(newrel->rd_rel->reltoastrelid))
1611 {
1612 Oid toastidx;
1613 char NewToastName[NAMEDATALEN];
1614
1615 /* Get the associated valid index to be renamed */
1616 toastidx = toast_get_valid_index(newrel->rd_rel->reltoastrelid,
1617 NoLock);
1618
1619 /* rename the toast table ... */
1620 snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u",
1621 OIDOldHeap);
1622 RenameRelationInternal(newrel->rd_rel->reltoastrelid,
1623 NewToastName, true);
1624
1625 /* ... and its valid index too. */
1626 snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index",
1627 OIDOldHeap);
1628
1629 RenameRelationInternal(toastidx,
1630 NewToastName, true);
1631 }
1632 relation_close(newrel, NoLock);
1633 }
1634 }
1635
1636
1637 /*
1638 * Get a list of tables that the current user owns and
1639 * have indisclustered set. Return the list in a List * of rvsToCluster
1640 * with the tableOid and the indexOid on which the table is already
1641 * clustered.
1642 */
1643 static List *
get_tables_to_cluster(MemoryContext cluster_context)1644 get_tables_to_cluster(MemoryContext cluster_context)
1645 {
1646 Relation indRelation;
1647 HeapScanDesc scan;
1648 ScanKeyData entry;
1649 HeapTuple indexTuple;
1650 Form_pg_index index;
1651 MemoryContext old_context;
1652 RelToCluster *rvtc;
1653 List *rvs = NIL;
1654
1655 /*
1656 * Get all indexes that have indisclustered set and are owned by
1657 * appropriate user. System relations or nailed-in relations cannot ever
1658 * have indisclustered set, because CLUSTER will refuse to set it when
1659 * called with one of them as argument.
1660 */
1661 indRelation = heap_open(IndexRelationId, AccessShareLock);
1662 ScanKeyInit(&entry,
1663 Anum_pg_index_indisclustered,
1664 BTEqualStrategyNumber, F_BOOLEQ,
1665 BoolGetDatum(true));
1666 scan = heap_beginscan_catalog(indRelation, 1, &entry);
1667 while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1668 {
1669 index = (Form_pg_index) GETSTRUCT(indexTuple);
1670
1671 if (!pg_class_ownercheck(index->indrelid, GetUserId()))
1672 continue;
1673
1674 /*
1675 * We have to build the list in a different memory context so it will
1676 * survive the cross-transaction processing
1677 */
1678 old_context = MemoryContextSwitchTo(cluster_context);
1679
1680 rvtc = (RelToCluster *) palloc(sizeof(RelToCluster));
1681 rvtc->tableOid = index->indrelid;
1682 rvtc->indexOid = index->indexrelid;
1683 rvs = lcons(rvtc, rvs);
1684
1685 MemoryContextSwitchTo(old_context);
1686 }
1687 heap_endscan(scan);
1688
1689 relation_close(indRelation, AccessShareLock);
1690
1691 return rvs;
1692 }
1693
1694
1695 /*
1696 * Reconstruct and rewrite the given tuple
1697 *
1698 * We cannot simply copy the tuple as-is, for several reasons:
1699 *
1700 * 1. We'd like to squeeze out the values of any dropped columns, both
1701 * to save space and to ensure we have no corner-case failures. (It's
1702 * possible for example that the new table hasn't got a TOAST table
1703 * and so is unable to store any large values of dropped cols.)
1704 *
1705 * 2. The tuple might not even be legal for the new table; this is
1706 * currently only known to happen as an after-effect of ALTER TABLE
1707 * SET WITHOUT OIDS.
1708 *
1709 * So, we must reconstruct the tuple from component Datums.
1710 */
1711 static void
reform_and_rewrite_tuple(HeapTuple tuple,TupleDesc oldTupDesc,TupleDesc newTupDesc,Datum * values,bool * isnull,bool newRelHasOids,RewriteState rwstate)1712 reform_and_rewrite_tuple(HeapTuple tuple,
1713 TupleDesc oldTupDesc, TupleDesc newTupDesc,
1714 Datum *values, bool *isnull,
1715 bool newRelHasOids, RewriteState rwstate)
1716 {
1717 HeapTuple copiedTuple;
1718 int i;
1719
1720 heap_deform_tuple(tuple, oldTupDesc, values, isnull);
1721
1722 /* Be sure to null out any dropped columns */
1723 for (i = 0; i < newTupDesc->natts; i++)
1724 {
1725 if (newTupDesc->attrs[i]->attisdropped)
1726 isnull[i] = true;
1727 }
1728
1729 copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
1730
1731 /* Preserve OID, if any */
1732 if (newRelHasOids)
1733 HeapTupleSetOid(copiedTuple, HeapTupleGetOid(tuple));
1734
1735 /* The heap rewrite module does the rest */
1736 rewrite_heap_tuple(rwstate, tuple, copiedTuple);
1737
1738 heap_freetuple(copiedTuple);
1739 }
1740