/*-------------------------------------------------------------------------
 *
 * cluster.c
 *	  CLUSTER a table on an index.  This is now also used for VACUUM FULL.
 *
 * There is hardly anything left of Paul Brown's original implementation...
 *
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994-5, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/commands/cluster.c
 *
 *-------------------------------------------------------------------------
 */
18 #include "postgres.h"
19 
20 #include "access/amapi.h"
21 #include "access/heapam.h"
22 #include "access/multixact.h"
23 #include "access/relscan.h"
24 #include "access/tableam.h"
25 #include "access/toast_internals.h"
26 #include "access/transam.h"
27 #include "access/xact.h"
28 #include "access/xlog.h"
29 #include "catalog/catalog.h"
30 #include "catalog/dependency.h"
31 #include "catalog/heap.h"
32 #include "catalog/index.h"
33 #include "catalog/namespace.h"
34 #include "catalog/objectaccess.h"
35 #include "catalog/pg_am.h"
36 #include "catalog/toasting.h"
37 #include "commands/cluster.h"
38 #include "commands/progress.h"
39 #include "commands/tablecmds.h"
40 #include "commands/vacuum.h"
41 #include "miscadmin.h"
42 #include "optimizer/optimizer.h"
43 #include "pgstat.h"
44 #include "storage/bufmgr.h"
45 #include "storage/lmgr.h"
46 #include "storage/predicate.h"
47 #include "utils/acl.h"
48 #include "utils/fmgroids.h"
49 #include "utils/inval.h"
50 #include "utils/lsyscache.h"
51 #include "utils/memutils.h"
52 #include "utils/pg_rusage.h"
53 #include "utils/relmapper.h"
54 #include "utils/snapmgr.h"
55 #include "utils/syscache.h"
56 #include "utils/tuplesort.h"
57 
58 /*
59  * This struct is used to pass around the information on tables to be
60  * clustered. We need this so we can make a list of them when invoked without
61  * a specific table/index pair.
62  */
63 typedef struct
64 {
65 	Oid			tableOid;
66 	Oid			indexOid;
67 } RelToCluster;
68 
69 
70 static void rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose);
71 static void copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
72 							bool verbose, bool *pSwapToastByContent,
73 							TransactionId *pFreezeXid, MultiXactId *pCutoffMulti);
74 static List *get_tables_to_cluster(MemoryContext cluster_context);
75 
76 
77 /*---------------------------------------------------------------------------
78  * This cluster code allows for clustering multiple tables at once. Because
79  * of this, we cannot just run everything on a single transaction, or we
80  * would be forced to acquire exclusive locks on all the tables being
81  * clustered, simultaneously --- very likely leading to deadlock.
82  *
83  * To solve this we follow a similar strategy to VACUUM code,
84  * clustering each relation in a separate transaction. For this to work,
85  * we need to:
86  *	- provide a separate memory context so that we can pass information in
87  *	  a way that survives across transactions
88  *	- start a new transaction every time a new relation is clustered
89  *	- check for validity of the information on to-be-clustered relations,
90  *	  as someone might have deleted a relation behind our back, or
91  *	  clustered one on a different index
92  *	- end the transaction
93  *
94  * The single-relation case does not have any such overhead.
95  *
96  * We also allow a relation to be specified without index.  In that case,
97  * the indisclustered bit will be looked up, and an ERROR will be thrown
98  * if there is no index with the bit set.
99  *---------------------------------------------------------------------------
100  */
101 void
cluster(ClusterStmt * stmt,bool isTopLevel)102 cluster(ClusterStmt *stmt, bool isTopLevel)
103 {
104 	if (stmt->relation != NULL)
105 	{
106 		/* This is the single-relation case. */
107 		Oid			tableOid,
108 					indexOid = InvalidOid;
109 		Relation	rel;
110 
111 		/* Find, lock, and check permissions on the table */
112 		tableOid = RangeVarGetRelidExtended(stmt->relation,
113 											AccessExclusiveLock,
114 											0,
115 											RangeVarCallbackOwnsTable, NULL);
116 		rel = table_open(tableOid, NoLock);
117 
118 		/*
119 		 * Reject clustering a remote temp table ... their local buffer
120 		 * manager is not going to cope.
121 		 */
122 		if (RELATION_IS_OTHER_TEMP(rel))
123 			ereport(ERROR,
124 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
125 					 errmsg("cannot cluster temporary tables of other sessions")));
126 
127 		/*
128 		 * Reject clustering a partitioned table.
129 		 */
130 		if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
131 			ereport(ERROR,
132 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
133 					 errmsg("cannot cluster a partitioned table")));
134 
135 		if (stmt->indexname == NULL)
136 		{
137 			ListCell   *index;
138 
139 			/* We need to find the index that has indisclustered set. */
140 			foreach(index, RelationGetIndexList(rel))
141 			{
142 				indexOid = lfirst_oid(index);
143 				if (get_index_isclustered(indexOid))
144 					break;
145 				indexOid = InvalidOid;
146 			}
147 
148 			if (!OidIsValid(indexOid))
149 				ereport(ERROR,
150 						(errcode(ERRCODE_UNDEFINED_OBJECT),
151 						 errmsg("there is no previously clustered index for table \"%s\"",
152 								stmt->relation->relname)));
153 		}
154 		else
155 		{
156 			/*
157 			 * The index is expected to be in the same namespace as the
158 			 * relation.
159 			 */
160 			indexOid = get_relname_relid(stmt->indexname,
161 										 rel->rd_rel->relnamespace);
162 			if (!OidIsValid(indexOid))
163 				ereport(ERROR,
164 						(errcode(ERRCODE_UNDEFINED_OBJECT),
165 						 errmsg("index \"%s\" for table \"%s\" does not exist",
166 								stmt->indexname, stmt->relation->relname)));
167 		}
168 
169 		/* close relation, keep lock till commit */
170 		table_close(rel, NoLock);
171 
172 		/* Do the job. */
173 		cluster_rel(tableOid, indexOid, stmt->options);
174 	}
175 	else
176 	{
177 		/*
178 		 * This is the "multi relation" case. We need to cluster all tables
179 		 * that have some index with indisclustered set.
180 		 */
181 		MemoryContext cluster_context;
182 		List	   *rvs;
183 		ListCell   *rv;
184 
185 		/*
186 		 * We cannot run this form of CLUSTER inside a user transaction block;
187 		 * we'd be holding locks way too long.
188 		 */
189 		PreventInTransactionBlock(isTopLevel, "CLUSTER");
190 
191 		/*
192 		 * Create special memory context for cross-transaction storage.
193 		 *
194 		 * Since it is a child of PortalContext, it will go away even in case
195 		 * of error.
196 		 */
197 		cluster_context = AllocSetContextCreate(PortalContext,
198 												"Cluster",
199 												ALLOCSET_DEFAULT_SIZES);
200 
201 		/*
202 		 * Build the list of relations to cluster.  Note that this lives in
203 		 * cluster_context.
204 		 */
205 		rvs = get_tables_to_cluster(cluster_context);
206 
207 		/* Commit to get out of starting transaction */
208 		PopActiveSnapshot();
209 		CommitTransactionCommand();
210 
211 		/* Ok, now that we've got them all, cluster them one by one */
212 		foreach(rv, rvs)
213 		{
214 			RelToCluster *rvtc = (RelToCluster *) lfirst(rv);
215 
216 			/* Start a new transaction for each relation. */
217 			StartTransactionCommand();
218 			/* functions in indexes may want a snapshot set */
219 			PushActiveSnapshot(GetTransactionSnapshot());
220 			/* Do the job. */
221 			cluster_rel(rvtc->tableOid, rvtc->indexOid,
222 						stmt->options | CLUOPT_RECHECK);
223 			PopActiveSnapshot();
224 			CommitTransactionCommand();
225 		}
226 
227 		/* Start a new transaction for the cleanup work. */
228 		StartTransactionCommand();
229 
230 		/* Clean up working storage */
231 		MemoryContextDelete(cluster_context);
232 	}
233 }
234 
235 /*
236  * cluster_rel
237  *
238  * This clusters the table by creating a new, clustered table and
239  * swapping the relfilenodes of the new table and the old table, so
240  * the OID of the original table is preserved.  Thus we do not lose
241  * GRANT, inheritance nor references to this table (this was a bug
242  * in releases through 7.3).
243  *
244  * Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading
245  * the new table, it's better to create the indexes afterwards than to fill
246  * them incrementally while we load the table.
247  *
248  * If indexOid is InvalidOid, the table will be rewritten in physical order
249  * instead of index order.  This is the new implementation of VACUUM FULL,
250  * and error messages should refer to the operation as VACUUM not CLUSTER.
251  */
252 void
cluster_rel(Oid tableOid,Oid indexOid,int options)253 cluster_rel(Oid tableOid, Oid indexOid, int options)
254 {
255 	Relation	OldHeap;
256 	bool		verbose = ((options & CLUOPT_VERBOSE) != 0);
257 	bool		recheck = ((options & CLUOPT_RECHECK) != 0);
258 
259 	/* Check for user-requested abort. */
260 	CHECK_FOR_INTERRUPTS();
261 
262 	pgstat_progress_start_command(PROGRESS_COMMAND_CLUSTER, tableOid);
263 	if (OidIsValid(indexOid))
264 		pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
265 									 PROGRESS_CLUSTER_COMMAND_CLUSTER);
266 	else
267 		pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
268 									 PROGRESS_CLUSTER_COMMAND_VACUUM_FULL);
269 
270 	/*
271 	 * We grab exclusive access to the target rel and index for the duration
272 	 * of the transaction.  (This is redundant for the single-transaction
273 	 * case, since cluster() already did it.)  The index lock is taken inside
274 	 * check_index_is_clusterable.
275 	 */
276 	OldHeap = try_relation_open(tableOid, AccessExclusiveLock);
277 
278 	/* If the table has gone away, we can skip processing it */
279 	if (!OldHeap)
280 	{
281 		pgstat_progress_end_command();
282 		return;
283 	}
284 
285 	/*
286 	 * Since we may open a new transaction for each relation, we have to check
287 	 * that the relation still is what we think it is.
288 	 *
289 	 * If this is a single-transaction CLUSTER, we can skip these tests. We
290 	 * *must* skip the one on indisclustered since it would reject an attempt
291 	 * to cluster a not-previously-clustered index.
292 	 */
293 	if (recheck)
294 	{
295 		/* Check that the user still owns the relation */
296 		if (!pg_class_ownercheck(tableOid, GetUserId()))
297 		{
298 			relation_close(OldHeap, AccessExclusiveLock);
299 			pgstat_progress_end_command();
300 			return;
301 		}
302 
303 		/*
304 		 * Silently skip a temp table for a remote session.  Only doing this
305 		 * check in the "recheck" case is appropriate (which currently means
306 		 * somebody is executing a database-wide CLUSTER), because there is
307 		 * another check in cluster() which will stop any attempt to cluster
308 		 * remote temp tables by name.  There is another check in cluster_rel
309 		 * which is redundant, but we leave it for extra safety.
310 		 */
311 		if (RELATION_IS_OTHER_TEMP(OldHeap))
312 		{
313 			relation_close(OldHeap, AccessExclusiveLock);
314 			pgstat_progress_end_command();
315 			return;
316 		}
317 
318 		if (OidIsValid(indexOid))
319 		{
320 			/*
321 			 * Check that the index still exists
322 			 */
323 			if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid)))
324 			{
325 				relation_close(OldHeap, AccessExclusiveLock);
326 				pgstat_progress_end_command();
327 				return;
328 			}
329 
330 			/*
331 			 * Check that the index is still the one with indisclustered set.
332 			 */
333 			if (!get_index_isclustered(indexOid))
334 			{
335 				relation_close(OldHeap, AccessExclusiveLock);
336 				pgstat_progress_end_command();
337 				return;
338 			}
339 		}
340 	}
341 
342 	/*
343 	 * We allow VACUUM FULL, but not CLUSTER, on shared catalogs.  CLUSTER
344 	 * would work in most respects, but the index would only get marked as
345 	 * indisclustered in the current database, leading to unexpected behavior
346 	 * if CLUSTER were later invoked in another database.
347 	 */
348 	if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared)
349 		ereport(ERROR,
350 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
351 				 errmsg("cannot cluster a shared catalog")));
352 
353 	/*
354 	 * Don't process temp tables of other backends ... their local buffer
355 	 * manager is not going to cope.
356 	 */
357 	if (RELATION_IS_OTHER_TEMP(OldHeap))
358 	{
359 		if (OidIsValid(indexOid))
360 			ereport(ERROR,
361 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
362 					 errmsg("cannot cluster temporary tables of other sessions")));
363 		else
364 			ereport(ERROR,
365 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
366 					 errmsg("cannot vacuum temporary tables of other sessions")));
367 	}
368 
369 	/*
370 	 * Also check for active uses of the relation in the current transaction,
371 	 * including open scans and pending AFTER trigger events.
372 	 */
373 	CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM");
374 
375 	/* Check heap and index are valid to cluster on */
376 	if (OidIsValid(indexOid))
377 		check_index_is_clusterable(OldHeap, indexOid, recheck, AccessExclusiveLock);
378 
379 	/*
380 	 * Quietly ignore the request if this is a materialized view which has not
381 	 * been populated from its query. No harm is done because there is no data
382 	 * to deal with, and we don't want to throw an error if this is part of a
383 	 * multi-relation request -- for example, CLUSTER was run on the entire
384 	 * database.
385 	 */
386 	if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW &&
387 		!RelationIsPopulated(OldHeap))
388 	{
389 		relation_close(OldHeap, AccessExclusiveLock);
390 		pgstat_progress_end_command();
391 		return;
392 	}
393 
394 	/*
395 	 * All predicate locks on the tuples or pages are about to be made
396 	 * invalid, because we move tuples around.  Promote them to relation
397 	 * locks.  Predicate locks on indexes will be promoted when they are
398 	 * reindexed.
399 	 */
400 	TransferPredicateLocksToHeapRelation(OldHeap);
401 
402 	/* rebuild_relation does all the dirty work */
403 	rebuild_relation(OldHeap, indexOid, verbose);
404 
405 	/* NB: rebuild_relation does table_close() on OldHeap */
406 
407 	pgstat_progress_end_command();
408 }
409 
410 /*
411  * Verify that the specified heap and index are valid to cluster on
412  *
413  * Side effect: obtains lock on the index.  The caller may
414  * in some cases already have AccessExclusiveLock on the table, but
415  * not in all cases so we can't rely on the table-level lock for
416  * protection here.
417  */
418 void
check_index_is_clusterable(Relation OldHeap,Oid indexOid,bool recheck,LOCKMODE lockmode)419 check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck, LOCKMODE lockmode)
420 {
421 	Relation	OldIndex;
422 
423 	OldIndex = index_open(indexOid, lockmode);
424 
425 	/*
426 	 * Check that index is in fact an index on the given relation
427 	 */
428 	if (OldIndex->rd_index == NULL ||
429 		OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap))
430 		ereport(ERROR,
431 				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
432 				 errmsg("\"%s\" is not an index for table \"%s\"",
433 						RelationGetRelationName(OldIndex),
434 						RelationGetRelationName(OldHeap))));
435 
436 	/* Index AM must allow clustering */
437 	if (!OldIndex->rd_indam->amclusterable)
438 		ereport(ERROR,
439 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
440 				 errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
441 						RelationGetRelationName(OldIndex))));
442 
443 	/*
444 	 * Disallow clustering on incomplete indexes (those that might not index
445 	 * every row of the relation).  We could relax this by making a separate
446 	 * seqscan pass over the table to copy the missing rows, but that seems
447 	 * expensive and tedious.
448 	 */
449 	if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred, NULL))
450 		ereport(ERROR,
451 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
452 				 errmsg("cannot cluster on partial index \"%s\"",
453 						RelationGetRelationName(OldIndex))));
454 
455 	/*
456 	 * Disallow if index is left over from a failed CREATE INDEX CONCURRENTLY;
457 	 * it might well not contain entries for every heap row, or might not even
458 	 * be internally consistent.  (But note that we don't check indcheckxmin;
459 	 * the worst consequence of following broken HOT chains would be that we
460 	 * might put recently-dead tuples out-of-order in the new table, and there
461 	 * is little harm in that.)
462 	 */
463 	if (!OldIndex->rd_index->indisvalid)
464 		ereport(ERROR,
465 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
466 				 errmsg("cannot cluster on invalid index \"%s\"",
467 						RelationGetRelationName(OldIndex))));
468 
469 	/* Drop relcache refcnt on OldIndex, but keep lock */
470 	index_close(OldIndex, NoLock);
471 }
472 
473 /*
474  * mark_index_clustered: mark the specified index as the one clustered on
475  *
476  * With indexOid == InvalidOid, will mark all indexes of rel not-clustered.
477  */
478 void
mark_index_clustered(Relation rel,Oid indexOid,bool is_internal)479 mark_index_clustered(Relation rel, Oid indexOid, bool is_internal)
480 {
481 	HeapTuple	indexTuple;
482 	Form_pg_index indexForm;
483 	Relation	pg_index;
484 	ListCell   *index;
485 
486 	/* Disallow applying to a partitioned table */
487 	if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
488 		ereport(ERROR,
489 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
490 				 errmsg("cannot mark index clustered in partitioned table")));
491 
492 	/*
493 	 * If the index is already marked clustered, no need to do anything.
494 	 */
495 	if (OidIsValid(indexOid))
496 	{
497 		if (get_index_isclustered(indexOid))
498 			return;
499 	}
500 
501 	/*
502 	 * Check each index of the relation and set/clear the bit as needed.
503 	 */
504 	pg_index = table_open(IndexRelationId, RowExclusiveLock);
505 
506 	foreach(index, RelationGetIndexList(rel))
507 	{
508 		Oid			thisIndexOid = lfirst_oid(index);
509 
510 		indexTuple = SearchSysCacheCopy1(INDEXRELID,
511 										 ObjectIdGetDatum(thisIndexOid));
512 		if (!HeapTupleIsValid(indexTuple))
513 			elog(ERROR, "cache lookup failed for index %u", thisIndexOid);
514 		indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
515 
516 		/*
517 		 * Unset the bit if set.  We know it's wrong because we checked this
518 		 * earlier.
519 		 */
520 		if (indexForm->indisclustered)
521 		{
522 			indexForm->indisclustered = false;
523 			CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
524 		}
525 		else if (thisIndexOid == indexOid)
526 		{
527 			/* this was checked earlier, but let's be real sure */
528 			if (!indexForm->indisvalid)
529 				elog(ERROR, "cannot cluster on invalid index %u", indexOid);
530 			indexForm->indisclustered = true;
531 			CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
532 		}
533 
534 		InvokeObjectPostAlterHookArg(IndexRelationId, thisIndexOid, 0,
535 									 InvalidOid, is_internal);
536 
537 		heap_freetuple(indexTuple);
538 	}
539 
540 	table_close(pg_index, RowExclusiveLock);
541 }
542 
543 /*
544  * rebuild_relation: rebuild an existing relation in index or physical order
545  *
546  * OldHeap: table to rebuild --- must be opened and exclusive-locked!
547  * indexOid: index to cluster by, or InvalidOid to rewrite in physical order.
548  *
549  * NB: this routine closes OldHeap at the right time; caller should not.
550  */
551 static void
rebuild_relation(Relation OldHeap,Oid indexOid,bool verbose)552 rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose)
553 {
554 	Oid			tableOid = RelationGetRelid(OldHeap);
555 	Oid			tableSpace = OldHeap->rd_rel->reltablespace;
556 	Oid			OIDNewHeap;
557 	char		relpersistence;
558 	bool		is_system_catalog;
559 	bool		swap_toast_by_content;
560 	TransactionId frozenXid;
561 	MultiXactId cutoffMulti;
562 
563 	/* Mark the correct index as clustered */
564 	if (OidIsValid(indexOid))
565 		mark_index_clustered(OldHeap, indexOid, true);
566 
567 	/* Remember info about rel before closing OldHeap */
568 	relpersistence = OldHeap->rd_rel->relpersistence;
569 	is_system_catalog = IsSystemRelation(OldHeap);
570 
571 	/* Close relcache entry, but keep lock until transaction commit */
572 	table_close(OldHeap, NoLock);
573 
574 	/* Create the transient table that will receive the re-ordered data */
575 	OIDNewHeap = make_new_heap(tableOid, tableSpace,
576 							   relpersistence,
577 							   AccessExclusiveLock);
578 
579 	/* Copy the heap data into the new table in the desired order */
580 	copy_table_data(OIDNewHeap, tableOid, indexOid, verbose,
581 					&swap_toast_by_content, &frozenXid, &cutoffMulti);
582 
583 	/*
584 	 * Swap the physical files of the target and transient tables, then
585 	 * rebuild the target's indexes and throw away the transient table.
586 	 */
587 	finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog,
588 					 swap_toast_by_content, false, true,
589 					 frozenXid, cutoffMulti,
590 					 relpersistence);
591 }
592 
593 
594 /*
595  * Create the transient table that will be filled with new data during
596  * CLUSTER, ALTER TABLE, and similar operations.  The transient table
597  * duplicates the logical structure of the OldHeap, but is placed in
598  * NewTableSpace which might be different from OldHeap's.  Also, it's built
599  * with the specified persistence, which might differ from the original's.
600  *
601  * After this, the caller should load the new heap with transferred/modified
602  * data, then call finish_heap_swap to complete the operation.
603  */
604 Oid
make_new_heap(Oid OIDOldHeap,Oid NewTableSpace,char relpersistence,LOCKMODE lockmode)605 make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, char relpersistence,
606 			  LOCKMODE lockmode)
607 {
608 	TupleDesc	OldHeapDesc;
609 	char		NewHeapName[NAMEDATALEN];
610 	Oid			OIDNewHeap;
611 	Oid			toastid;
612 	Relation	OldHeap;
613 	HeapTuple	tuple;
614 	Datum		reloptions;
615 	bool		isNull;
616 	Oid			namespaceid;
617 
618 	OldHeap = table_open(OIDOldHeap, lockmode);
619 	OldHeapDesc = RelationGetDescr(OldHeap);
620 
621 	/*
622 	 * Note that the NewHeap will not receive any of the defaults or
623 	 * constraints associated with the OldHeap; we don't need 'em, and there's
624 	 * no reason to spend cycles inserting them into the catalogs only to
625 	 * delete them.
626 	 */
627 
628 	/*
629 	 * But we do want to use reloptions of the old heap for new heap.
630 	 */
631 	tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(OIDOldHeap));
632 	if (!HeapTupleIsValid(tuple))
633 		elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
634 	reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
635 								 &isNull);
636 	if (isNull)
637 		reloptions = (Datum) 0;
638 
639 	if (relpersistence == RELPERSISTENCE_TEMP)
640 		namespaceid = LookupCreationNamespace("pg_temp");
641 	else
642 		namespaceid = RelationGetNamespace(OldHeap);
643 
644 	/*
645 	 * Create the new heap, using a temporary name in the same namespace as
646 	 * the existing table.  NOTE: there is some risk of collision with user
647 	 * relnames.  Working around this seems more trouble than it's worth; in
648 	 * particular, we can't create the new heap in a different namespace from
649 	 * the old, or we will have problems with the TEMP status of temp tables.
650 	 *
651 	 * Note: the new heap is not a shared relation, even if we are rebuilding
652 	 * a shared rel.  However, we do make the new heap mapped if the source is
653 	 * mapped.  This simplifies swap_relation_files, and is absolutely
654 	 * necessary for rebuilding pg_class, for reasons explained there.
655 	 */
656 	snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap);
657 
658 	OIDNewHeap = heap_create_with_catalog(NewHeapName,
659 										  namespaceid,
660 										  NewTableSpace,
661 										  InvalidOid,
662 										  InvalidOid,
663 										  InvalidOid,
664 										  OldHeap->rd_rel->relowner,
665 										  OldHeap->rd_rel->relam,
666 										  OldHeapDesc,
667 										  NIL,
668 										  RELKIND_RELATION,
669 										  relpersistence,
670 										  false,
671 										  RelationIsMapped(OldHeap),
672 										  ONCOMMIT_NOOP,
673 										  reloptions,
674 										  false,
675 										  true,
676 										  true,
677 										  OIDOldHeap,
678 										  NULL);
679 	Assert(OIDNewHeap != InvalidOid);
680 
681 	ReleaseSysCache(tuple);
682 
683 	/*
684 	 * Advance command counter so that the newly-created relation's catalog
685 	 * tuples will be visible to table_open.
686 	 */
687 	CommandCounterIncrement();
688 
689 	/*
690 	 * If necessary, create a TOAST table for the new relation.
691 	 *
692 	 * If the relation doesn't have a TOAST table already, we can't need one
693 	 * for the new relation.  The other way around is possible though: if some
694 	 * wide columns have been dropped, NewHeapCreateToastTable can decide that
695 	 * no TOAST table is needed for the new table.
696 	 *
697 	 * Note that NewHeapCreateToastTable ends with CommandCounterIncrement, so
698 	 * that the TOAST table will be visible for insertion.
699 	 */
700 	toastid = OldHeap->rd_rel->reltoastrelid;
701 	if (OidIsValid(toastid))
702 	{
703 		/* keep the existing toast table's reloptions, if any */
704 		tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(toastid));
705 		if (!HeapTupleIsValid(tuple))
706 			elog(ERROR, "cache lookup failed for relation %u", toastid);
707 		reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
708 									 &isNull);
709 		if (isNull)
710 			reloptions = (Datum) 0;
711 
712 		NewHeapCreateToastTable(OIDNewHeap, reloptions, lockmode, toastid);
713 
714 		ReleaseSysCache(tuple);
715 	}
716 
717 	table_close(OldHeap, NoLock);
718 
719 	return OIDNewHeap;
720 }
721 
722 /*
723  * Do the physical copying of table data.
724  *
725  * There are three output parameters:
726  * *pSwapToastByContent is set true if toast tables must be swapped by content.
727  * *pFreezeXid receives the TransactionId used as freeze cutoff point.
728  * *pCutoffMulti receives the MultiXactId used as a cutoff point.
729  */
730 static void
copy_table_data(Oid OIDNewHeap,Oid OIDOldHeap,Oid OIDOldIndex,bool verbose,bool * pSwapToastByContent,TransactionId * pFreezeXid,MultiXactId * pCutoffMulti)731 copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
732 				bool *pSwapToastByContent, TransactionId *pFreezeXid,
733 				MultiXactId *pCutoffMulti)
734 {
735 	Relation	NewHeap,
736 				OldHeap,
737 				OldIndex;
738 	Relation	relRelation;
739 	HeapTuple	reltup;
740 	Form_pg_class relform;
741 	TupleDesc	oldTupDesc PG_USED_FOR_ASSERTS_ONLY;
742 	TupleDesc	newTupDesc PG_USED_FOR_ASSERTS_ONLY;
743 	TransactionId OldestXmin;
744 	TransactionId FreezeXid;
745 	MultiXactId MultiXactCutoff;
746 	bool		use_sort;
747 	double		num_tuples = 0,
748 				tups_vacuumed = 0,
749 				tups_recently_dead = 0;
750 	BlockNumber num_pages;
751 	int			elevel = verbose ? INFO : DEBUG2;
752 	PGRUsage	ru0;
753 
754 	pg_rusage_init(&ru0);
755 
756 	/*
757 	 * Open the relations we need.
758 	 */
759 	NewHeap = table_open(OIDNewHeap, AccessExclusiveLock);
760 	OldHeap = table_open(OIDOldHeap, AccessExclusiveLock);
761 	if (OidIsValid(OIDOldIndex))
762 		OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
763 	else
764 		OldIndex = NULL;
765 
766 	/*
767 	 * Their tuple descriptors should be exactly alike, but here we only need
768 	 * assume that they have the same number of columns.
769 	 */
770 	oldTupDesc = RelationGetDescr(OldHeap);
771 	newTupDesc = RelationGetDescr(NewHeap);
772 	Assert(newTupDesc->natts == oldTupDesc->natts);
773 
774 	/*
775 	 * If the OldHeap has a toast table, get lock on the toast table to keep
776 	 * it from being vacuumed.  This is needed because autovacuum processes
777 	 * toast tables independently of their main tables, with no lock on the
778 	 * latter.  If an autovacuum were to start on the toast table after we
779 	 * compute our OldestXmin below, it would use a later OldestXmin, and then
780 	 * possibly remove as DEAD toast tuples belonging to main tuples we think
781 	 * are only RECENTLY_DEAD.  Then we'd fail while trying to copy those
782 	 * tuples.
783 	 *
784 	 * We don't need to open the toast relation here, just lock it.  The lock
785 	 * will be held till end of transaction.
786 	 */
787 	if (OldHeap->rd_rel->reltoastrelid)
788 		LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);
789 
790 	/*
791 	 * If both tables have TOAST tables, perform toast swap by content.  It is
792 	 * possible that the old table has a toast table but the new one doesn't,
793 	 * if toastable columns have been dropped.  In that case we have to do
794 	 * swap by links.  This is okay because swap by content is only essential
795 	 * for system catalogs, and we don't support schema changes for them.
796 	 */
797 	if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid)
798 	{
799 		*pSwapToastByContent = true;
800 
801 		/*
802 		 * When doing swap by content, any toast pointers written into NewHeap
803 		 * must use the old toast table's OID, because that's where the toast
804 		 * data will eventually be found.  Set this up by setting rd_toastoid.
805 		 * This also tells toast_save_datum() to preserve the toast value
806 		 * OIDs, which we want so as not to invalidate toast pointers in
807 		 * system catalog caches, and to avoid making multiple copies of a
808 		 * single toast value.
809 		 *
810 		 * Note that we must hold NewHeap open until we are done writing data,
811 		 * since the relcache will not guarantee to remember this setting once
812 		 * the relation is closed.  Also, this technique depends on the fact
813 		 * that no one will try to read from the NewHeap until after we've
814 		 * finished writing it and swapping the rels --- otherwise they could
815 		 * follow the toast pointers to the wrong place.  (It would actually
816 		 * work for values copied over from the old toast table, but not for
817 		 * any values that we toast which were previously not toasted.)
818 		 */
819 		NewHeap->rd_toastoid = OldHeap->rd_rel->reltoastrelid;
820 	}
821 	else
822 		*pSwapToastByContent = false;
823 
824 	/*
825 	 * Compute xids used to freeze and weed out dead tuples and multixacts.
826 	 * Since we're going to rewrite the whole table anyway, there's no reason
827 	 * not to be aggressive about this.
828 	 */
829 	vacuum_set_xid_limits(OldHeap, 0, 0, 0, 0,
830 						  &OldestXmin, &FreezeXid, NULL, &MultiXactCutoff,
831 						  NULL);
832 
833 	/*
834 	 * FreezeXid will become the table's new relfrozenxid, and that mustn't go
835 	 * backwards, so take the max.
836 	 */
837 	if (TransactionIdIsValid(OldHeap->rd_rel->relfrozenxid) &&
838 		TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid))
839 		FreezeXid = OldHeap->rd_rel->relfrozenxid;
840 
841 	/*
842 	 * MultiXactCutoff, similarly, shouldn't go backwards either.
843 	 */
844 	if (MultiXactIdIsValid(OldHeap->rd_rel->relminmxid) &&
845 		MultiXactIdPrecedes(MultiXactCutoff, OldHeap->rd_rel->relminmxid))
846 		MultiXactCutoff = OldHeap->rd_rel->relminmxid;
847 
848 	/*
849 	 * Decide whether to use an indexscan or seqscan-and-optional-sort to scan
850 	 * the OldHeap.  We know how to use a sort to duplicate the ordering of a
851 	 * btree index, and will use seqscan-and-sort for that case if the planner
852 	 * tells us it's cheaper.  Otherwise, always indexscan if an index is
853 	 * provided, else plain seqscan.
854 	 */
855 	if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID)
856 		use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex);
857 	else
858 		use_sort = false;
859 
860 	/* Log what we're doing */
861 	if (OldIndex != NULL && !use_sort)
862 		ereport(elevel,
863 				(errmsg("clustering \"%s.%s\" using index scan on \"%s\"",
864 						get_namespace_name(RelationGetNamespace(OldHeap)),
865 						RelationGetRelationName(OldHeap),
866 						RelationGetRelationName(OldIndex))));
867 	else if (use_sort)
868 		ereport(elevel,
869 				(errmsg("clustering \"%s.%s\" using sequential scan and sort",
870 						get_namespace_name(RelationGetNamespace(OldHeap)),
871 						RelationGetRelationName(OldHeap))));
872 	else
873 		ereport(elevel,
874 				(errmsg("vacuuming \"%s.%s\"",
875 						get_namespace_name(RelationGetNamespace(OldHeap)),
876 						RelationGetRelationName(OldHeap))));
877 
	/*
	 * Hand off the actual copying to the AM-specific function; the generic
	 * code cannot know how to deal with visibility across AMs. Note that
	 * this routine is allowed to set FreezeXid / MultiXactCutoff to
	 * different values (e.g. because the AM doesn't use freezing).
	 */
884 	table_relation_copy_for_cluster(OldHeap, NewHeap, OldIndex, use_sort,
885 									OldestXmin, &FreezeXid, &MultiXactCutoff,
886 									&num_tuples, &tups_vacuumed,
887 									&tups_recently_dead);
888 
	/* Return selected values to the caller; they get set as relfrozenxid/relminmxid */
890 	*pFreezeXid = FreezeXid;
891 	*pCutoffMulti = MultiXactCutoff;
892 
893 	/* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */
894 	NewHeap->rd_toastoid = InvalidOid;
895 
896 	num_pages = RelationGetNumberOfBlocks(NewHeap);
897 
898 	/* Log what we did */
899 	ereport(elevel,
900 			(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
901 					RelationGetRelationName(OldHeap),
902 					tups_vacuumed, num_tuples,
903 					RelationGetNumberOfBlocks(OldHeap)),
904 			 errdetail("%.0f dead row versions cannot be removed yet.\n"
905 					   "%s.",
906 					   tups_recently_dead,
907 					   pg_rusage_show(&ru0))));
908 
909 	if (OldIndex != NULL)
910 		index_close(OldIndex, NoLock);
911 	table_close(OldHeap, NoLock);
912 	table_close(NewHeap, NoLock);
913 
914 	/* Update pg_class to reflect the correct values of pages and tuples. */
915 	relRelation = table_open(RelationRelationId, RowExclusiveLock);
916 
917 	reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDNewHeap));
918 	if (!HeapTupleIsValid(reltup))
919 		elog(ERROR, "cache lookup failed for relation %u", OIDNewHeap);
920 	relform = (Form_pg_class) GETSTRUCT(reltup);
921 
922 	relform->relpages = num_pages;
923 	relform->reltuples = num_tuples;
924 
925 	/* Don't update the stats for pg_class.  See swap_relation_files. */
926 	if (OIDOldHeap != RelationRelationId)
927 		CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
928 	else
929 		CacheInvalidateRelcacheByTuple(reltup);
930 
931 	/* Clean up. */
932 	heap_freetuple(reltup);
933 	table_close(relRelation, RowExclusiveLock);
934 
935 	/* Make the update visible */
936 	CommandCounterIncrement();
937 }
938 
939 /*
940  * Swap the physical files of two given relations.
941  *
942  * We swap the physical identity (reltablespace, relfilenode) while keeping the
943  * same logical identities of the two relations.  relpersistence is also
944  * swapped, which is critical since it determines where buffers live for each
945  * relation.
946  *
947  * We can swap associated TOAST data in either of two ways: recursively swap
948  * the physical content of the toast tables (and their indexes), or swap the
949  * TOAST links in the given relations' pg_class entries.  The former is needed
950  * to manage rewrites of shared catalogs (where we cannot change the pg_class
951  * links) while the latter is the only way to handle cases in which a toast
952  * table is added or removed altogether.
953  *
954  * Additionally, the first relation is marked with relfrozenxid set to
955  * frozenXid.  It seems a bit ugly to have this here, but the caller would
956  * have to do it anyway, so having it here saves a heap_update.  Note: in
957  * the swap-toast-links case, we assume we don't need to change the toast
958  * table's relfrozenxid: the new version of the toast table should already
959  * have relfrozenxid set to RecentXmin, which is good enough.
960  *
961  * Lastly, if r2 and its toast table and toast index (if any) are mapped,
962  * their OIDs are emitted into mapped_tables[].  This is hacky but beats
963  * having to look the information up again later in finish_heap_swap.
964  */
965 static void
swap_relation_files(Oid r1,Oid r2,bool target_is_pg_class,bool swap_toast_by_content,bool is_internal,TransactionId frozenXid,MultiXactId cutoffMulti,Oid * mapped_tables)966 swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
967 					bool swap_toast_by_content,
968 					bool is_internal,
969 					TransactionId frozenXid,
970 					MultiXactId cutoffMulti,
971 					Oid *mapped_tables)
972 {
973 	Relation	relRelation;
974 	HeapTuple	reltup1,
975 				reltup2;
976 	Form_pg_class relform1,
977 				relform2;
978 	Oid			relfilenode1,
979 				relfilenode2;
980 	Oid			swaptemp;
981 	char		swptmpchr;
982 
983 	/* We need writable copies of both pg_class tuples. */
984 	relRelation = table_open(RelationRelationId, RowExclusiveLock);
985 
986 	reltup1 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r1));
987 	if (!HeapTupleIsValid(reltup1))
988 		elog(ERROR, "cache lookup failed for relation %u", r1);
989 	relform1 = (Form_pg_class) GETSTRUCT(reltup1);
990 
991 	reltup2 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r2));
992 	if (!HeapTupleIsValid(reltup2))
993 		elog(ERROR, "cache lookup failed for relation %u", r2);
994 	relform2 = (Form_pg_class) GETSTRUCT(reltup2);
995 
996 	relfilenode1 = relform1->relfilenode;
997 	relfilenode2 = relform2->relfilenode;
998 
999 	if (OidIsValid(relfilenode1) && OidIsValid(relfilenode2))
1000 	{
1001 		/*
1002 		 * Normal non-mapped relations: swap relfilenodes, reltablespaces,
1003 		 * relpersistence
1004 		 */
1005 		Assert(!target_is_pg_class);
1006 
1007 		swaptemp = relform1->relfilenode;
1008 		relform1->relfilenode = relform2->relfilenode;
1009 		relform2->relfilenode = swaptemp;
1010 
1011 		swaptemp = relform1->reltablespace;
1012 		relform1->reltablespace = relform2->reltablespace;
1013 		relform2->reltablespace = swaptemp;
1014 
1015 		swptmpchr = relform1->relpersistence;
1016 		relform1->relpersistence = relform2->relpersistence;
1017 		relform2->relpersistence = swptmpchr;
1018 
1019 		/* Also swap toast links, if we're swapping by links */
1020 		if (!swap_toast_by_content)
1021 		{
1022 			swaptemp = relform1->reltoastrelid;
1023 			relform1->reltoastrelid = relform2->reltoastrelid;
1024 			relform2->reltoastrelid = swaptemp;
1025 		}
1026 	}
1027 	else
1028 	{
1029 		/*
1030 		 * Mapped-relation case.  Here we have to swap the relation mappings
1031 		 * instead of modifying the pg_class columns.  Both must be mapped.
1032 		 */
1033 		if (OidIsValid(relfilenode1) || OidIsValid(relfilenode2))
1034 			elog(ERROR, "cannot swap mapped relation \"%s\" with non-mapped relation",
1035 				 NameStr(relform1->relname));
1036 
1037 		/*
1038 		 * We can't change the tablespace nor persistence of a mapped rel, and
1039 		 * we can't handle toast link swapping for one either, because we must
1040 		 * not apply any critical changes to its pg_class row.  These cases
1041 		 * should be prevented by upstream permissions tests, so these checks
1042 		 * are non-user-facing emergency backstop.
1043 		 */
1044 		if (relform1->reltablespace != relform2->reltablespace)
1045 			elog(ERROR, "cannot change tablespace of mapped relation \"%s\"",
1046 				 NameStr(relform1->relname));
1047 		if (relform1->relpersistence != relform2->relpersistence)
1048 			elog(ERROR, "cannot change persistence of mapped relation \"%s\"",
1049 				 NameStr(relform1->relname));
1050 		if (!swap_toast_by_content &&
1051 			(relform1->reltoastrelid || relform2->reltoastrelid))
1052 			elog(ERROR, "cannot swap toast by links for mapped relation \"%s\"",
1053 				 NameStr(relform1->relname));
1054 
1055 		/*
1056 		 * Fetch the mappings --- shouldn't fail, but be paranoid
1057 		 */
1058 		relfilenode1 = RelationMapOidToFilenode(r1, relform1->relisshared);
1059 		if (!OidIsValid(relfilenode1))
1060 			elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
1061 				 NameStr(relform1->relname), r1);
1062 		relfilenode2 = RelationMapOidToFilenode(r2, relform2->relisshared);
1063 		if (!OidIsValid(relfilenode2))
1064 			elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
1065 				 NameStr(relform2->relname), r2);
1066 
1067 		/*
1068 		 * Send replacement mappings to relmapper.  Note these won't actually
1069 		 * take effect until CommandCounterIncrement.
1070 		 */
1071 		RelationMapUpdateMap(r1, relfilenode2, relform1->relisshared, false);
1072 		RelationMapUpdateMap(r2, relfilenode1, relform2->relisshared, false);
1073 
1074 		/* Pass OIDs of mapped r2 tables back to caller */
1075 		*mapped_tables++ = r2;
1076 	}
1077 
1078 	/*
1079 	 * Recognize that rel1's relfilenode (swapped from rel2) is new in this
1080 	 * subtransaction. The rel2 storage (swapped from rel1) may or may not be
1081 	 * new.
1082 	 */
1083 	{
1084 		Relation	rel1,
1085 					rel2;
1086 
1087 		rel1 = relation_open(r1, NoLock);
1088 		rel2 = relation_open(r2, NoLock);
1089 		rel2->rd_createSubid = rel1->rd_createSubid;
1090 		rel2->rd_newRelfilenodeSubid = rel1->rd_newRelfilenodeSubid;
1091 		rel2->rd_firstRelfilenodeSubid = rel1->rd_firstRelfilenodeSubid;
1092 		RelationAssumeNewRelfilenode(rel1);
1093 		relation_close(rel1, NoLock);
1094 		relation_close(rel2, NoLock);
1095 	}
1096 
1097 	/*
1098 	 * In the case of a shared catalog, these next few steps will only affect
1099 	 * our own database's pg_class row; but that's okay, because they are all
1100 	 * noncritical updates.  That's also an important fact for the case of a
1101 	 * mapped catalog, because it's possible that we'll commit the map change
1102 	 * and then fail to commit the pg_class update.
1103 	 */
1104 
1105 	/* set rel1's frozen Xid and minimum MultiXid */
1106 	if (relform1->relkind != RELKIND_INDEX)
1107 	{
1108 		Assert(!TransactionIdIsValid(frozenXid) ||
1109 			   TransactionIdIsNormal(frozenXid));
1110 		relform1->relfrozenxid = frozenXid;
1111 		relform1->relminmxid = cutoffMulti;
1112 	}
1113 
1114 	/* swap size statistics too, since new rel has freshly-updated stats */
1115 	{
1116 		int32		swap_pages;
1117 		float4		swap_tuples;
1118 		int32		swap_allvisible;
1119 
1120 		swap_pages = relform1->relpages;
1121 		relform1->relpages = relform2->relpages;
1122 		relform2->relpages = swap_pages;
1123 
1124 		swap_tuples = relform1->reltuples;
1125 		relform1->reltuples = relform2->reltuples;
1126 		relform2->reltuples = swap_tuples;
1127 
1128 		swap_allvisible = relform1->relallvisible;
1129 		relform1->relallvisible = relform2->relallvisible;
1130 		relform2->relallvisible = swap_allvisible;
1131 	}
1132 
1133 	/*
1134 	 * Update the tuples in pg_class --- unless the target relation of the
1135 	 * swap is pg_class itself.  In that case, there is zero point in making
1136 	 * changes because we'd be updating the old data that we're about to throw
1137 	 * away.  Because the real work being done here for a mapped relation is
1138 	 * just to change the relation map settings, it's all right to not update
1139 	 * the pg_class rows in this case. The most important changes will instead
1140 	 * performed later, in finish_heap_swap() itself.
1141 	 */
1142 	if (!target_is_pg_class)
1143 	{
1144 		CatalogIndexState indstate;
1145 
1146 		indstate = CatalogOpenIndexes(relRelation);
1147 		CatalogTupleUpdateWithInfo(relRelation, &reltup1->t_self, reltup1,
1148 								   indstate);
1149 		CatalogTupleUpdateWithInfo(relRelation, &reltup2->t_self, reltup2,
1150 								   indstate);
1151 		CatalogCloseIndexes(indstate);
1152 	}
1153 	else
1154 	{
1155 		/* no update ... but we do still need relcache inval */
1156 		CacheInvalidateRelcacheByTuple(reltup1);
1157 		CacheInvalidateRelcacheByTuple(reltup2);
1158 	}
1159 
1160 	/*
1161 	 * Post alter hook for modified relations. The change to r2 is always
1162 	 * internal, but r1 depends on the invocation context.
1163 	 */
1164 	InvokeObjectPostAlterHookArg(RelationRelationId, r1, 0,
1165 								 InvalidOid, is_internal);
1166 	InvokeObjectPostAlterHookArg(RelationRelationId, r2, 0,
1167 								 InvalidOid, true);
1168 
1169 	/*
1170 	 * If we have toast tables associated with the relations being swapped,
1171 	 * deal with them too.
1172 	 */
1173 	if (relform1->reltoastrelid || relform2->reltoastrelid)
1174 	{
1175 		if (swap_toast_by_content)
1176 		{
1177 			if (relform1->reltoastrelid && relform2->reltoastrelid)
1178 			{
1179 				/* Recursively swap the contents of the toast tables */
1180 				swap_relation_files(relform1->reltoastrelid,
1181 									relform2->reltoastrelid,
1182 									target_is_pg_class,
1183 									swap_toast_by_content,
1184 									is_internal,
1185 									frozenXid,
1186 									cutoffMulti,
1187 									mapped_tables);
1188 			}
1189 			else
1190 			{
1191 				/* caller messed up */
1192 				elog(ERROR, "cannot swap toast files by content when there's only one");
1193 			}
1194 		}
1195 		else
1196 		{
1197 			/*
1198 			 * We swapped the ownership links, so we need to change dependency
1199 			 * data to match.
1200 			 *
1201 			 * NOTE: it is possible that only one table has a toast table.
1202 			 *
1203 			 * NOTE: at present, a TOAST table's only dependency is the one on
1204 			 * its owning table.  If more are ever created, we'd need to use
1205 			 * something more selective than deleteDependencyRecordsFor() to
1206 			 * get rid of just the link we want.
1207 			 */
1208 			ObjectAddress baseobject,
1209 						toastobject;
1210 			long		count;
1211 
1212 			/*
1213 			 * We disallow this case for system catalogs, to avoid the
1214 			 * possibility that the catalog we're rebuilding is one of the
1215 			 * ones the dependency changes would change.  It's too late to be
1216 			 * making any data changes to the target catalog.
1217 			 */
1218 			if (IsSystemClass(r1, relform1))
1219 				elog(ERROR, "cannot swap toast files by links for system catalogs");
1220 
1221 			/* Delete old dependencies */
1222 			if (relform1->reltoastrelid)
1223 			{
1224 				count = deleteDependencyRecordsFor(RelationRelationId,
1225 												   relform1->reltoastrelid,
1226 												   false);
1227 				if (count != 1)
1228 					elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1229 						 count);
1230 			}
1231 			if (relform2->reltoastrelid)
1232 			{
1233 				count = deleteDependencyRecordsFor(RelationRelationId,
1234 												   relform2->reltoastrelid,
1235 												   false);
1236 				if (count != 1)
1237 					elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1238 						 count);
1239 			}
1240 
1241 			/* Register new dependencies */
1242 			baseobject.classId = RelationRelationId;
1243 			baseobject.objectSubId = 0;
1244 			toastobject.classId = RelationRelationId;
1245 			toastobject.objectSubId = 0;
1246 
1247 			if (relform1->reltoastrelid)
1248 			{
1249 				baseobject.objectId = r1;
1250 				toastobject.objectId = relform1->reltoastrelid;
1251 				recordDependencyOn(&toastobject, &baseobject,
1252 								   DEPENDENCY_INTERNAL);
1253 			}
1254 
1255 			if (relform2->reltoastrelid)
1256 			{
1257 				baseobject.objectId = r2;
1258 				toastobject.objectId = relform2->reltoastrelid;
1259 				recordDependencyOn(&toastobject, &baseobject,
1260 								   DEPENDENCY_INTERNAL);
1261 			}
1262 		}
1263 	}
1264 
1265 	/*
1266 	 * If we're swapping two toast tables by content, do the same for their
1267 	 * valid index. The swap can actually be safely done only if the relations
1268 	 * have indexes.
1269 	 */
1270 	if (swap_toast_by_content &&
1271 		relform1->relkind == RELKIND_TOASTVALUE &&
1272 		relform2->relkind == RELKIND_TOASTVALUE)
1273 	{
1274 		Oid			toastIndex1,
1275 					toastIndex2;
1276 
1277 		/* Get valid index for each relation */
1278 		toastIndex1 = toast_get_valid_index(r1,
1279 											AccessExclusiveLock);
1280 		toastIndex2 = toast_get_valid_index(r2,
1281 											AccessExclusiveLock);
1282 
1283 		swap_relation_files(toastIndex1,
1284 							toastIndex2,
1285 							target_is_pg_class,
1286 							swap_toast_by_content,
1287 							is_internal,
1288 							InvalidTransactionId,
1289 							InvalidMultiXactId,
1290 							mapped_tables);
1291 	}
1292 
1293 	/* Clean up. */
1294 	heap_freetuple(reltup1);
1295 	heap_freetuple(reltup2);
1296 
1297 	table_close(relRelation, RowExclusiveLock);
1298 
1299 	/*
1300 	 * Close both relcache entries' smgr links.  We need this kluge because
1301 	 * both links will be invalidated during upcoming CommandCounterIncrement.
1302 	 * Whichever of the rels is the second to be cleared will have a dangling
1303 	 * reference to the other's smgr entry.  Rather than trying to avoid this
1304 	 * by ordering operations just so, it's easiest to close the links first.
1305 	 * (Fortunately, since one of the entries is local in our transaction,
1306 	 * it's sufficient to clear out our own relcache this way; the problem
1307 	 * cannot arise for other backends when they see our update on the
1308 	 * non-transient relation.)
1309 	 *
1310 	 * Caution: the placement of this step interacts with the decision to
1311 	 * handle toast rels by recursion.  When we are trying to rebuild pg_class
1312 	 * itself, the smgr close on pg_class must happen after all accesses in
1313 	 * this function.
1314 	 */
1315 	RelationCloseSmgrByOid(r1);
1316 	RelationCloseSmgrByOid(r2);
1317 }
1318 
1319 /*
1320  * Remove the transient table that was built by make_new_heap, and finish
1321  * cleaning up (including rebuilding all indexes on the old heap).
1322  */
1323 void
finish_heap_swap(Oid OIDOldHeap,Oid OIDNewHeap,bool is_system_catalog,bool swap_toast_by_content,bool check_constraints,bool is_internal,TransactionId frozenXid,MultiXactId cutoffMulti,char newrelpersistence)1324 finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
1325 				 bool is_system_catalog,
1326 				 bool swap_toast_by_content,
1327 				 bool check_constraints,
1328 				 bool is_internal,
1329 				 TransactionId frozenXid,
1330 				 MultiXactId cutoffMulti,
1331 				 char newrelpersistence)
1332 {
1333 	ObjectAddress object;
1334 	Oid			mapped_tables[4];
1335 	int			reindex_flags;
1336 	int			i;
1337 
1338 	/* Report that we are now swapping relation files */
1339 	pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1340 								 PROGRESS_CLUSTER_PHASE_SWAP_REL_FILES);
1341 
1342 	/* Zero out possible results from swapped_relation_files */
1343 	memset(mapped_tables, 0, sizeof(mapped_tables));
1344 
1345 	/*
1346 	 * Swap the contents of the heap relations (including any toast tables).
1347 	 * Also set old heap's relfrozenxid to frozenXid.
1348 	 */
1349 	swap_relation_files(OIDOldHeap, OIDNewHeap,
1350 						(OIDOldHeap == RelationRelationId),
1351 						swap_toast_by_content, is_internal,
1352 						frozenXid, cutoffMulti, mapped_tables);
1353 
1354 	/*
1355 	 * If it's a system catalog, queue a sinval message to flush all catcaches
1356 	 * on the catalog when we reach CommandCounterIncrement.
1357 	 */
1358 	if (is_system_catalog)
1359 		CacheInvalidateCatalog(OIDOldHeap);
1360 
1361 	/*
1362 	 * Rebuild each index on the relation (but not the toast table, which is
1363 	 * all-new at this point).  It is important to do this before the DROP
1364 	 * step because if we are processing a system catalog that will be used
1365 	 * during DROP, we want to have its indexes available.  There is no
1366 	 * advantage to the other order anyway because this is all transactional,
1367 	 * so no chance to reclaim disk space before commit.  We do not need a
1368 	 * final CommandCounterIncrement() because reindex_relation does it.
1369 	 *
1370 	 * Note: because index_build is called via reindex_relation, it will never
1371 	 * set indcheckxmin true for the indexes.  This is OK even though in some
1372 	 * sense we are building new indexes rather than rebuilding existing ones,
1373 	 * because the new heap won't contain any HOT chains at all, let alone
1374 	 * broken ones, so it can't be necessary to set indcheckxmin.
1375 	 */
1376 	reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE;
1377 	if (check_constraints)
1378 		reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS;
1379 
1380 	/*
1381 	 * Ensure that the indexes have the same persistence as the parent
1382 	 * relation.
1383 	 */
1384 	if (newrelpersistence == RELPERSISTENCE_UNLOGGED)
1385 		reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED;
1386 	else if (newrelpersistence == RELPERSISTENCE_PERMANENT)
1387 		reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT;
1388 
1389 	/* Report that we are now reindexing relations */
1390 	pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1391 								 PROGRESS_CLUSTER_PHASE_REBUILD_INDEX);
1392 
1393 	reindex_relation(OIDOldHeap, reindex_flags, 0);
1394 
1395 	/* Report that we are now doing clean up */
1396 	pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1397 								 PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP);
1398 
1399 	/*
1400 	 * If the relation being rebuild is pg_class, swap_relation_files()
1401 	 * couldn't update pg_class's own pg_class entry (check comments in
1402 	 * swap_relation_files()), thus relfrozenxid was not updated. That's
1403 	 * annoying because a potential reason for doing a VACUUM FULL is a
1404 	 * imminent or actual anti-wraparound shutdown.  So, now that we can
1405 	 * access the new relation using its indices, update relfrozenxid.
1406 	 * pg_class doesn't have a toast relation, so we don't need to update the
1407 	 * corresponding toast relation. Not that there's little point moving all
1408 	 * relfrozenxid updates here since swap_relation_files() needs to write to
1409 	 * pg_class for non-mapped relations anyway.
1410 	 */
1411 	if (OIDOldHeap == RelationRelationId)
1412 	{
1413 		Relation	relRelation;
1414 		HeapTuple	reltup;
1415 		Form_pg_class relform;
1416 
1417 		relRelation = table_open(RelationRelationId, RowExclusiveLock);
1418 
1419 		reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDOldHeap));
1420 		if (!HeapTupleIsValid(reltup))
1421 			elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
1422 		relform = (Form_pg_class) GETSTRUCT(reltup);
1423 
1424 		relform->relfrozenxid = frozenXid;
1425 		relform->relminmxid = cutoffMulti;
1426 
1427 		CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
1428 
1429 		table_close(relRelation, RowExclusiveLock);
1430 	}
1431 
1432 	/* Destroy new heap with old filenode */
1433 	object.classId = RelationRelationId;
1434 	object.objectId = OIDNewHeap;
1435 	object.objectSubId = 0;
1436 
1437 	/*
1438 	 * The new relation is local to our transaction and we know nothing
1439 	 * depends on it, so DROP_RESTRICT should be OK.
1440 	 */
1441 	performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
1442 
1443 	/* performDeletion does CommandCounterIncrement at end */
1444 
1445 	/*
1446 	 * Now we must remove any relation mapping entries that we set up for the
1447 	 * transient table, as well as its toast table and toast index if any. If
1448 	 * we fail to do this before commit, the relmapper will complain about new
1449 	 * permanent map entries being added post-bootstrap.
1450 	 */
1451 	for (i = 0; OidIsValid(mapped_tables[i]); i++)
1452 		RelationMapRemoveMapping(mapped_tables[i]);
1453 
1454 	/*
1455 	 * At this point, everything is kosher except that, if we did toast swap
1456 	 * by links, the toast table's name corresponds to the transient table.
1457 	 * The name is irrelevant to the backend because it's referenced by OID,
1458 	 * but users looking at the catalogs could be confused.  Rename it to
1459 	 * prevent this problem.
1460 	 *
1461 	 * Note no lock required on the relation, because we already hold an
1462 	 * exclusive lock on it.
1463 	 */
1464 	if (!swap_toast_by_content)
1465 	{
1466 		Relation	newrel;
1467 
1468 		newrel = table_open(OIDOldHeap, NoLock);
1469 		if (OidIsValid(newrel->rd_rel->reltoastrelid))
1470 		{
1471 			Oid			toastidx;
1472 			char		NewToastName[NAMEDATALEN];
1473 
1474 			/* Get the associated valid index to be renamed */
1475 			toastidx = toast_get_valid_index(newrel->rd_rel->reltoastrelid,
1476 											 NoLock);
1477 
1478 			/* rename the toast table ... */
1479 			snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u",
1480 					 OIDOldHeap);
1481 			RenameRelationInternal(newrel->rd_rel->reltoastrelid,
1482 								   NewToastName, true, false);
1483 
1484 			/* ... and its valid index too. */
1485 			snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index",
1486 					 OIDOldHeap);
1487 
1488 			RenameRelationInternal(toastidx,
1489 								   NewToastName, true, true);
1490 
1491 			/*
1492 			 * Reset the relrewrite for the toast. The command-counter
1493 			 * increment is required here as we are about to update
1494 			 * the tuple that is updated as part of RenameRelationInternal.
1495 			 */
1496 			CommandCounterIncrement();
1497 			ResetRelRewrite(newrel->rd_rel->reltoastrelid);
1498 		}
1499 		relation_close(newrel, NoLock);
1500 	}
1501 
1502 	/* if it's not a catalog table, clear any missing attribute settings */
1503 	if (!is_system_catalog)
1504 	{
1505 		Relation	newrel;
1506 
1507 		newrel = table_open(OIDOldHeap, NoLock);
1508 		RelationClearMissing(newrel);
1509 		relation_close(newrel, NoLock);
1510 	}
1511 }
1512 
1513 
1514 /*
1515  * Get a list of tables that the current user owns and
1516  * have indisclustered set.  Return the list in a List * of RelToCluster
1517  * (stored in the specified memory context), each one giving the tableOid
1518  * and the indexOid on which the table is already clustered.
1519  */
1520 static List *
get_tables_to_cluster(MemoryContext cluster_context)1521 get_tables_to_cluster(MemoryContext cluster_context)
1522 {
1523 	Relation	indRelation;
1524 	TableScanDesc scan;
1525 	ScanKeyData entry;
1526 	HeapTuple	indexTuple;
1527 	Form_pg_index index;
1528 	MemoryContext old_context;
1529 	RelToCluster *rvtc;
1530 	List	   *rvs = NIL;
1531 
1532 	/*
1533 	 * Get all indexes that have indisclustered set and are owned by
1534 	 * appropriate user.
1535 	 */
1536 	indRelation = table_open(IndexRelationId, AccessShareLock);
1537 	ScanKeyInit(&entry,
1538 				Anum_pg_index_indisclustered,
1539 				BTEqualStrategyNumber, F_BOOLEQ,
1540 				BoolGetDatum(true));
1541 	scan = table_beginscan_catalog(indRelation, 1, &entry);
1542 	while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1543 	{
1544 		index = (Form_pg_index) GETSTRUCT(indexTuple);
1545 
1546 		if (!pg_class_ownercheck(index->indrelid, GetUserId()))
1547 			continue;
1548 
1549 		/*
1550 		 * We have to build the list in a different memory context so it will
1551 		 * survive the cross-transaction processing
1552 		 */
1553 		old_context = MemoryContextSwitchTo(cluster_context);
1554 
1555 		rvtc = (RelToCluster *) palloc(sizeof(RelToCluster));
1556 		rvtc->tableOid = index->indrelid;
1557 		rvtc->indexOid = index->indexrelid;
1558 		rvs = lappend(rvs, rvtc);
1559 
1560 		MemoryContextSwitchTo(old_context);
1561 	}
1562 	table_endscan(scan);
1563 
1564 	relation_close(indRelation, AccessShareLock);
1565 
1566 	return rvs;
1567 }
1568