1 /*-------------------------------------------------------------------------
2  *
3  * cluster.c
4  *	  CLUSTER a table on an index.  This is now also used for VACUUM FULL.
5  *
6  * There is hardly anything left of Paul Brown's original implementation...
7  *
8  *
9  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
10  * Portions Copyright (c) 1994-5, Regents of the University of California
11  *
12  *
13  * IDENTIFICATION
14  *	  src/backend/commands/cluster.c
15  *
16  *-------------------------------------------------------------------------
17  */
18 #include "postgres.h"
19 
20 #include "access/amapi.h"
21 #include "access/heapam.h"
22 #include "access/multixact.h"
23 #include "access/relscan.h"
24 #include "access/tableam.h"
25 #include "access/transam.h"
26 #include "access/tuptoaster.h"
27 #include "access/xact.h"
28 #include "access/xlog.h"
29 #include "catalog/pg_am.h"
30 #include "catalog/catalog.h"
31 #include "catalog/dependency.h"
32 #include "catalog/heap.h"
33 #include "catalog/index.h"
34 #include "catalog/namespace.h"
35 #include "catalog/objectaccess.h"
36 #include "catalog/toasting.h"
37 #include "commands/cluster.h"
38 #include "commands/progress.h"
39 #include "commands/tablecmds.h"
40 #include "commands/vacuum.h"
41 #include "miscadmin.h"
42 #include "optimizer/optimizer.h"
43 #include "pgstat.h"
44 #include "storage/bufmgr.h"
45 #include "storage/lmgr.h"
46 #include "storage/predicate.h"
47 #include "utils/acl.h"
48 #include "utils/fmgroids.h"
49 #include "utils/inval.h"
50 #include "utils/lsyscache.h"
51 #include "utils/memutils.h"
52 #include "utils/pg_rusage.h"
53 #include "utils/relmapper.h"
54 #include "utils/snapmgr.h"
55 #include "utils/syscache.h"
56 #include "utils/tuplesort.h"
57 
58 
59 /*
60  * This struct is used to pass around the information on tables to be
61  * clustered. We need this so we can make a list of them when invoked without
62  * a specific table/index pair.
63  */
typedef struct
{
	Oid			tableOid;		/* OID of the table to be clustered */
	Oid			indexOid;		/* OID of the index to cluster that table on */
} RelToCluster;
69 
70 
/* Prototypes for functions that are private to this file */
static void rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose);
static void copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
							bool verbose, bool *pSwapToastByContent,
							TransactionId *pFreezeXid, MultiXactId *pCutoffMulti);
static List *get_tables_to_cluster(MemoryContext cluster_context);
76 
77 
78 /*---------------------------------------------------------------------------
79  * This cluster code allows for clustering multiple tables at once. Because
80  * of this, we cannot just run everything on a single transaction, or we
81  * would be forced to acquire exclusive locks on all the tables being
82  * clustered, simultaneously --- very likely leading to deadlock.
83  *
84  * To solve this we follow a similar strategy to VACUUM code,
85  * clustering each relation in a separate transaction. For this to work,
86  * we need to:
87  *	- provide a separate memory context so that we can pass information in
88  *	  a way that survives across transactions
89  *	- start a new transaction every time a new relation is clustered
90  *	- check for validity of the information on to-be-clustered relations,
91  *	  as someone might have deleted a relation behind our back, or
92  *	  clustered one on a different index
93  *	- end the transaction
94  *
95  * The single-relation case does not have any such overhead.
96  *
97  * We also allow a relation to be specified without index.  In that case,
98  * the indisclustered bit will be looked up, and an ERROR will be thrown
99  * if there is no index with the bit set.
100  *---------------------------------------------------------------------------
101  */
102 void
cluster(ClusterStmt * stmt,bool isTopLevel)103 cluster(ClusterStmt *stmt, bool isTopLevel)
104 {
105 	if (stmt->relation != NULL)
106 	{
107 		/* This is the single-relation case. */
108 		Oid			tableOid,
109 					indexOid = InvalidOid;
110 		Relation	rel;
111 
112 		/* Find, lock, and check permissions on the table */
113 		tableOid = RangeVarGetRelidExtended(stmt->relation,
114 											AccessExclusiveLock,
115 											0,
116 											RangeVarCallbackOwnsTable, NULL);
117 		rel = table_open(tableOid, NoLock);
118 
119 		/*
120 		 * Reject clustering a remote temp table ... their local buffer
121 		 * manager is not going to cope.
122 		 */
123 		if (RELATION_IS_OTHER_TEMP(rel))
124 			ereport(ERROR,
125 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
126 					 errmsg("cannot cluster temporary tables of other sessions")));
127 
128 		/*
129 		 * Reject clustering a partitioned table.
130 		 */
131 		if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
132 			ereport(ERROR,
133 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
134 					 errmsg("cannot cluster a partitioned table")));
135 
136 		if (stmt->indexname == NULL)
137 		{
138 			ListCell   *index;
139 
140 			/* We need to find the index that has indisclustered set. */
141 			foreach(index, RelationGetIndexList(rel))
142 			{
143 				HeapTuple	idxtuple;
144 				Form_pg_index indexForm;
145 
146 				indexOid = lfirst_oid(index);
147 				idxtuple = SearchSysCache1(INDEXRELID,
148 										   ObjectIdGetDatum(indexOid));
149 				if (!HeapTupleIsValid(idxtuple))
150 					elog(ERROR, "cache lookup failed for index %u", indexOid);
151 				indexForm = (Form_pg_index) GETSTRUCT(idxtuple);
152 				if (indexForm->indisclustered)
153 				{
154 					ReleaseSysCache(idxtuple);
155 					break;
156 				}
157 				ReleaseSysCache(idxtuple);
158 				indexOid = InvalidOid;
159 			}
160 
161 			if (!OidIsValid(indexOid))
162 				ereport(ERROR,
163 						(errcode(ERRCODE_UNDEFINED_OBJECT),
164 						 errmsg("there is no previously clustered index for table \"%s\"",
165 								stmt->relation->relname)));
166 		}
167 		else
168 		{
169 			/*
170 			 * The index is expected to be in the same namespace as the
171 			 * relation.
172 			 */
173 			indexOid = get_relname_relid(stmt->indexname,
174 										 rel->rd_rel->relnamespace);
175 			if (!OidIsValid(indexOid))
176 				ereport(ERROR,
177 						(errcode(ERRCODE_UNDEFINED_OBJECT),
178 						 errmsg("index \"%s\" for table \"%s\" does not exist",
179 								stmt->indexname, stmt->relation->relname)));
180 		}
181 
182 		/* close relation, keep lock till commit */
183 		table_close(rel, NoLock);
184 
185 		/* Do the job. */
186 		cluster_rel(tableOid, indexOid, stmt->options);
187 	}
188 	else
189 	{
190 		/*
191 		 * This is the "multi relation" case. We need to cluster all tables
192 		 * that have some index with indisclustered set.
193 		 */
194 		MemoryContext cluster_context;
195 		List	   *rvs;
196 		ListCell   *rv;
197 
198 		/*
199 		 * We cannot run this form of CLUSTER inside a user transaction block;
200 		 * we'd be holding locks way too long.
201 		 */
202 		PreventInTransactionBlock(isTopLevel, "CLUSTER");
203 
204 		/*
205 		 * Create special memory context for cross-transaction storage.
206 		 *
207 		 * Since it is a child of PortalContext, it will go away even in case
208 		 * of error.
209 		 */
210 		cluster_context = AllocSetContextCreate(PortalContext,
211 												"Cluster",
212 												ALLOCSET_DEFAULT_SIZES);
213 
214 		/*
215 		 * Build the list of relations to cluster.  Note that this lives in
216 		 * cluster_context.
217 		 */
218 		rvs = get_tables_to_cluster(cluster_context);
219 
220 		/* Commit to get out of starting transaction */
221 		PopActiveSnapshot();
222 		CommitTransactionCommand();
223 
224 		/* Ok, now that we've got them all, cluster them one by one */
225 		foreach(rv, rvs)
226 		{
227 			RelToCluster *rvtc = (RelToCluster *) lfirst(rv);
228 
229 			/* Start a new transaction for each relation. */
230 			StartTransactionCommand();
231 			/* functions in indexes may want a snapshot set */
232 			PushActiveSnapshot(GetTransactionSnapshot());
233 			/* Do the job. */
234 			cluster_rel(rvtc->tableOid, rvtc->indexOid,
235 						stmt->options | CLUOPT_RECHECK);
236 			PopActiveSnapshot();
237 			CommitTransactionCommand();
238 		}
239 
240 		/* Start a new transaction for the cleanup work. */
241 		StartTransactionCommand();
242 
243 		/* Clean up working storage */
244 		MemoryContextDelete(cluster_context);
245 	}
246 }
247 
248 /*
249  * cluster_rel
250  *
251  * This clusters the table by creating a new, clustered table and
252  * swapping the relfilenodes of the new table and the old table, so
253  * the OID of the original table is preserved.  Thus we do not lose
254  * GRANT, inheritance nor references to this table (this was a bug
255  * in releases through 7.3).
256  *
257  * Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading
258  * the new table, it's better to create the indexes afterwards than to fill
259  * them incrementally while we load the table.
260  *
261  * If indexOid is InvalidOid, the table will be rewritten in physical order
262  * instead of index order.  This is the new implementation of VACUUM FULL,
263  * and error messages should refer to the operation as VACUUM not CLUSTER.
264  */
265 void
cluster_rel(Oid tableOid,Oid indexOid,int options)266 cluster_rel(Oid tableOid, Oid indexOid, int options)
267 {
268 	Relation	OldHeap;
269 	bool		verbose = ((options & CLUOPT_VERBOSE) != 0);
270 	bool		recheck = ((options & CLUOPT_RECHECK) != 0);
271 
272 	/* Check for user-requested abort. */
273 	CHECK_FOR_INTERRUPTS();
274 
275 	pgstat_progress_start_command(PROGRESS_COMMAND_CLUSTER, tableOid);
276 	if (OidIsValid(indexOid))
277 		pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
278 									 PROGRESS_CLUSTER_COMMAND_CLUSTER);
279 	else
280 		pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
281 									 PROGRESS_CLUSTER_COMMAND_VACUUM_FULL);
282 
283 	/*
284 	 * We grab exclusive access to the target rel and index for the duration
285 	 * of the transaction.  (This is redundant for the single-transaction
286 	 * case, since cluster() already did it.)  The index lock is taken inside
287 	 * check_index_is_clusterable.
288 	 */
289 	OldHeap = try_relation_open(tableOid, AccessExclusiveLock);
290 
291 	/* If the table has gone away, we can skip processing it */
292 	if (!OldHeap)
293 	{
294 		pgstat_progress_end_command();
295 		return;
296 	}
297 
298 	/*
299 	 * Since we may open a new transaction for each relation, we have to check
300 	 * that the relation still is what we think it is.
301 	 *
302 	 * If this is a single-transaction CLUSTER, we can skip these tests. We
303 	 * *must* skip the one on indisclustered since it would reject an attempt
304 	 * to cluster a not-previously-clustered index.
305 	 */
306 	if (recheck)
307 	{
308 		HeapTuple	tuple;
309 		Form_pg_index indexForm;
310 
311 		/* Check that the user still owns the relation */
312 		if (!pg_class_ownercheck(tableOid, GetUserId()))
313 		{
314 			relation_close(OldHeap, AccessExclusiveLock);
315 			pgstat_progress_end_command();
316 			return;
317 		}
318 
319 		/*
320 		 * Silently skip a temp table for a remote session.  Only doing this
321 		 * check in the "recheck" case is appropriate (which currently means
322 		 * somebody is executing a database-wide CLUSTER), because there is
323 		 * another check in cluster() which will stop any attempt to cluster
324 		 * remote temp tables by name.  There is another check in cluster_rel
325 		 * which is redundant, but we leave it for extra safety.
326 		 */
327 		if (RELATION_IS_OTHER_TEMP(OldHeap))
328 		{
329 			relation_close(OldHeap, AccessExclusiveLock);
330 			pgstat_progress_end_command();
331 			return;
332 		}
333 
334 		if (OidIsValid(indexOid))
335 		{
336 			/*
337 			 * Check that the index still exists
338 			 */
339 			if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid)))
340 			{
341 				relation_close(OldHeap, AccessExclusiveLock);
342 				pgstat_progress_end_command();
343 				return;
344 			}
345 
346 			/*
347 			 * Check that the index is still the one with indisclustered set.
348 			 */
349 			tuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
350 			if (!HeapTupleIsValid(tuple))	/* probably can't happen */
351 			{
352 				relation_close(OldHeap, AccessExclusiveLock);
353 				pgstat_progress_end_command();
354 				return;
355 			}
356 			indexForm = (Form_pg_index) GETSTRUCT(tuple);
357 			if (!indexForm->indisclustered)
358 			{
359 				ReleaseSysCache(tuple);
360 				relation_close(OldHeap, AccessExclusiveLock);
361 				pgstat_progress_end_command();
362 				return;
363 			}
364 			ReleaseSysCache(tuple);
365 		}
366 	}
367 
368 	/*
369 	 * We allow VACUUM FULL, but not CLUSTER, on shared catalogs.  CLUSTER
370 	 * would work in most respects, but the index would only get marked as
371 	 * indisclustered in the current database, leading to unexpected behavior
372 	 * if CLUSTER were later invoked in another database.
373 	 */
374 	if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared)
375 		ereport(ERROR,
376 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
377 				 errmsg("cannot cluster a shared catalog")));
378 
379 	/*
380 	 * Don't process temp tables of other backends ... their local buffer
381 	 * manager is not going to cope.
382 	 */
383 	if (RELATION_IS_OTHER_TEMP(OldHeap))
384 	{
385 		if (OidIsValid(indexOid))
386 			ereport(ERROR,
387 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
388 					 errmsg("cannot cluster temporary tables of other sessions")));
389 		else
390 			ereport(ERROR,
391 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
392 					 errmsg("cannot vacuum temporary tables of other sessions")));
393 	}
394 
395 	/*
396 	 * Also check for active uses of the relation in the current transaction,
397 	 * including open scans and pending AFTER trigger events.
398 	 */
399 	CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM");
400 
401 	/* Check heap and index are valid to cluster on */
402 	if (OidIsValid(indexOid))
403 		check_index_is_clusterable(OldHeap, indexOid, recheck, AccessExclusiveLock);
404 
405 	/*
406 	 * Quietly ignore the request if this is a materialized view which has not
407 	 * been populated from its query. No harm is done because there is no data
408 	 * to deal with, and we don't want to throw an error if this is part of a
409 	 * multi-relation request -- for example, CLUSTER was run on the entire
410 	 * database.
411 	 */
412 	if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW &&
413 		!RelationIsPopulated(OldHeap))
414 	{
415 		relation_close(OldHeap, AccessExclusiveLock);
416 		pgstat_progress_end_command();
417 		return;
418 	}
419 
420 	/*
421 	 * All predicate locks on the tuples or pages are about to be made
422 	 * invalid, because we move tuples around.  Promote them to relation
423 	 * locks.  Predicate locks on indexes will be promoted when they are
424 	 * reindexed.
425 	 */
426 	TransferPredicateLocksToHeapRelation(OldHeap);
427 
428 	/* rebuild_relation does all the dirty work */
429 	rebuild_relation(OldHeap, indexOid, verbose);
430 
431 	/* NB: rebuild_relation does table_close() on OldHeap */
432 
433 	pgstat_progress_end_command();
434 }
435 
436 /*
437  * Verify that the specified heap and index are valid to cluster on
438  *
439  * Side effect: obtains lock on the index.  The caller may
440  * in some cases already have AccessExclusiveLock on the table, but
441  * not in all cases so we can't rely on the table-level lock for
442  * protection here.
443  */
444 void
check_index_is_clusterable(Relation OldHeap,Oid indexOid,bool recheck,LOCKMODE lockmode)445 check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck, LOCKMODE lockmode)
446 {
447 	Relation	OldIndex;
448 
449 	OldIndex = index_open(indexOid, lockmode);
450 
451 	/*
452 	 * Check that index is in fact an index on the given relation
453 	 */
454 	if (OldIndex->rd_index == NULL ||
455 		OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap))
456 		ereport(ERROR,
457 				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
458 				 errmsg("\"%s\" is not an index for table \"%s\"",
459 						RelationGetRelationName(OldIndex),
460 						RelationGetRelationName(OldHeap))));
461 
462 	/* Index AM must allow clustering */
463 	if (!OldIndex->rd_indam->amclusterable)
464 		ereport(ERROR,
465 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
466 				 errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
467 						RelationGetRelationName(OldIndex))));
468 
469 	/*
470 	 * Disallow clustering on incomplete indexes (those that might not index
471 	 * every row of the relation).  We could relax this by making a separate
472 	 * seqscan pass over the table to copy the missing rows, but that seems
473 	 * expensive and tedious.
474 	 */
475 	if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred, NULL))
476 		ereport(ERROR,
477 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
478 				 errmsg("cannot cluster on partial index \"%s\"",
479 						RelationGetRelationName(OldIndex))));
480 
481 	/*
482 	 * Disallow if index is left over from a failed CREATE INDEX CONCURRENTLY;
483 	 * it might well not contain entries for every heap row, or might not even
484 	 * be internally consistent.  (But note that we don't check indcheckxmin;
485 	 * the worst consequence of following broken HOT chains would be that we
486 	 * might put recently-dead tuples out-of-order in the new table, and there
487 	 * is little harm in that.)
488 	 */
489 	if (!OldIndex->rd_index->indisvalid)
490 		ereport(ERROR,
491 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
492 				 errmsg("cannot cluster on invalid index \"%s\"",
493 						RelationGetRelationName(OldIndex))));
494 
495 	/* Drop relcache refcnt on OldIndex, but keep lock */
496 	index_close(OldIndex, NoLock);
497 }
498 
499 /*
500  * mark_index_clustered: mark the specified index as the one clustered on
501  *
502  * With indexOid == InvalidOid, will mark all indexes of rel not-clustered.
503  */
504 void
mark_index_clustered(Relation rel,Oid indexOid,bool is_internal)505 mark_index_clustered(Relation rel, Oid indexOid, bool is_internal)
506 {
507 	HeapTuple	indexTuple;
508 	Form_pg_index indexForm;
509 	Relation	pg_index;
510 	ListCell   *index;
511 
512 	/* Disallow applying to a partitioned table */
513 	if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
514 		ereport(ERROR,
515 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
516 				 errmsg("cannot mark index clustered in partitioned table")));
517 
518 	/*
519 	 * If the index is already marked clustered, no need to do anything.
520 	 */
521 	if (OidIsValid(indexOid))
522 	{
523 		indexTuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
524 		if (!HeapTupleIsValid(indexTuple))
525 			elog(ERROR, "cache lookup failed for index %u", indexOid);
526 		indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
527 
528 		if (indexForm->indisclustered)
529 		{
530 			ReleaseSysCache(indexTuple);
531 			return;
532 		}
533 
534 		ReleaseSysCache(indexTuple);
535 	}
536 
537 	/*
538 	 * Check each index of the relation and set/clear the bit as needed.
539 	 */
540 	pg_index = table_open(IndexRelationId, RowExclusiveLock);
541 
542 	foreach(index, RelationGetIndexList(rel))
543 	{
544 		Oid			thisIndexOid = lfirst_oid(index);
545 
546 		indexTuple = SearchSysCacheCopy1(INDEXRELID,
547 										 ObjectIdGetDatum(thisIndexOid));
548 		if (!HeapTupleIsValid(indexTuple))
549 			elog(ERROR, "cache lookup failed for index %u", thisIndexOid);
550 		indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
551 
552 		/*
553 		 * Unset the bit if set.  We know it's wrong because we checked this
554 		 * earlier.
555 		 */
556 		if (indexForm->indisclustered)
557 		{
558 			indexForm->indisclustered = false;
559 			CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
560 		}
561 		else if (thisIndexOid == indexOid)
562 		{
563 			/* this was checked earlier, but let's be real sure */
564 			if (!indexForm->indisvalid)
565 				elog(ERROR, "cannot cluster on invalid index %u", indexOid);
566 			indexForm->indisclustered = true;
567 			CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
568 		}
569 
570 		InvokeObjectPostAlterHookArg(IndexRelationId, thisIndexOid, 0,
571 									 InvalidOid, is_internal);
572 
573 		heap_freetuple(indexTuple);
574 	}
575 
576 	table_close(pg_index, RowExclusiveLock);
577 }
578 
579 /*
580  * rebuild_relation: rebuild an existing relation in index or physical order
581  *
582  * OldHeap: table to rebuild --- must be opened and exclusive-locked!
583  * indexOid: index to cluster by, or InvalidOid to rewrite in physical order.
584  *
585  * NB: this routine closes OldHeap at the right time; caller should not.
586  */
587 static void
rebuild_relation(Relation OldHeap,Oid indexOid,bool verbose)588 rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose)
589 {
590 	Oid			tableOid = RelationGetRelid(OldHeap);
591 	Oid			tableSpace = OldHeap->rd_rel->reltablespace;
592 	Oid			OIDNewHeap;
593 	char		relpersistence;
594 	bool		is_system_catalog;
595 	bool		swap_toast_by_content;
596 	TransactionId frozenXid;
597 	MultiXactId cutoffMulti;
598 
599 	/* Mark the correct index as clustered */
600 	if (OidIsValid(indexOid))
601 		mark_index_clustered(OldHeap, indexOid, true);
602 
603 	/* Remember info about rel before closing OldHeap */
604 	relpersistence = OldHeap->rd_rel->relpersistence;
605 	is_system_catalog = IsSystemRelation(OldHeap);
606 
607 	/* Close relcache entry, but keep lock until transaction commit */
608 	table_close(OldHeap, NoLock);
609 
610 	/* Create the transient table that will receive the re-ordered data */
611 	OIDNewHeap = make_new_heap(tableOid, tableSpace,
612 							   relpersistence,
613 							   AccessExclusiveLock);
614 
615 	/* Copy the heap data into the new table in the desired order */
616 	copy_table_data(OIDNewHeap, tableOid, indexOid, verbose,
617 					&swap_toast_by_content, &frozenXid, &cutoffMulti);
618 
619 	/*
620 	 * Swap the physical files of the target and transient tables, then
621 	 * rebuild the target's indexes and throw away the transient table.
622 	 */
623 	finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog,
624 					 swap_toast_by_content, false, true,
625 					 frozenXid, cutoffMulti,
626 					 relpersistence);
627 }
628 
629 
630 /*
631  * Create the transient table that will be filled with new data during
632  * CLUSTER, ALTER TABLE, and similar operations.  The transient table
633  * duplicates the logical structure of the OldHeap, but is placed in
634  * NewTableSpace which might be different from OldHeap's.  Also, it's built
635  * with the specified persistence, which might differ from the original's.
636  *
637  * After this, the caller should load the new heap with transferred/modified
638  * data, then call finish_heap_swap to complete the operation.
639  */
640 Oid
make_new_heap(Oid OIDOldHeap,Oid NewTableSpace,char relpersistence,LOCKMODE lockmode)641 make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, char relpersistence,
642 			  LOCKMODE lockmode)
643 {
644 	TupleDesc	OldHeapDesc;
645 	char		NewHeapName[NAMEDATALEN];
646 	Oid			OIDNewHeap;
647 	Oid			toastid;
648 	Relation	OldHeap;
649 	HeapTuple	tuple;
650 	Datum		reloptions;
651 	bool		isNull;
652 	Oid			namespaceid;
653 
654 	OldHeap = table_open(OIDOldHeap, lockmode);
655 	OldHeapDesc = RelationGetDescr(OldHeap);
656 
657 	/*
658 	 * Note that the NewHeap will not receive any of the defaults or
659 	 * constraints associated with the OldHeap; we don't need 'em, and there's
660 	 * no reason to spend cycles inserting them into the catalogs only to
661 	 * delete them.
662 	 */
663 
664 	/*
665 	 * But we do want to use reloptions of the old heap for new heap.
666 	 */
667 	tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(OIDOldHeap));
668 	if (!HeapTupleIsValid(tuple))
669 		elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
670 	reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
671 								 &isNull);
672 	if (isNull)
673 		reloptions = (Datum) 0;
674 
675 	if (relpersistence == RELPERSISTENCE_TEMP)
676 		namespaceid = LookupCreationNamespace("pg_temp");
677 	else
678 		namespaceid = RelationGetNamespace(OldHeap);
679 
680 	/*
681 	 * Create the new heap, using a temporary name in the same namespace as
682 	 * the existing table.  NOTE: there is some risk of collision with user
683 	 * relnames.  Working around this seems more trouble than it's worth; in
684 	 * particular, we can't create the new heap in a different namespace from
685 	 * the old, or we will have problems with the TEMP status of temp tables.
686 	 *
687 	 * Note: the new heap is not a shared relation, even if we are rebuilding
688 	 * a shared rel.  However, we do make the new heap mapped if the source is
689 	 * mapped.  This simplifies swap_relation_files, and is absolutely
690 	 * necessary for rebuilding pg_class, for reasons explained there.
691 	 */
692 	snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap);
693 
694 	OIDNewHeap = heap_create_with_catalog(NewHeapName,
695 										  namespaceid,
696 										  NewTableSpace,
697 										  InvalidOid,
698 										  InvalidOid,
699 										  InvalidOid,
700 										  OldHeap->rd_rel->relowner,
701 										  OldHeap->rd_rel->relam,
702 										  OldHeapDesc,
703 										  NIL,
704 										  RELKIND_RELATION,
705 										  relpersistence,
706 										  false,
707 										  RelationIsMapped(OldHeap),
708 										  ONCOMMIT_NOOP,
709 										  reloptions,
710 										  false,
711 										  true,
712 										  true,
713 										  OIDOldHeap,
714 										  NULL);
715 	Assert(OIDNewHeap != InvalidOid);
716 
717 	ReleaseSysCache(tuple);
718 
719 	/*
720 	 * Advance command counter so that the newly-created relation's catalog
721 	 * tuples will be visible to table_open.
722 	 */
723 	CommandCounterIncrement();
724 
725 	/*
726 	 * If necessary, create a TOAST table for the new relation.
727 	 *
728 	 * If the relation doesn't have a TOAST table already, we can't need one
729 	 * for the new relation.  The other way around is possible though: if some
730 	 * wide columns have been dropped, NewHeapCreateToastTable can decide that
731 	 * no TOAST table is needed for the new table.
732 	 *
733 	 * Note that NewHeapCreateToastTable ends with CommandCounterIncrement, so
734 	 * that the TOAST table will be visible for insertion.
735 	 */
736 	toastid = OldHeap->rd_rel->reltoastrelid;
737 	if (OidIsValid(toastid))
738 	{
739 		/* keep the existing toast table's reloptions, if any */
740 		tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(toastid));
741 		if (!HeapTupleIsValid(tuple))
742 			elog(ERROR, "cache lookup failed for relation %u", toastid);
743 		reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
744 									 &isNull);
745 		if (isNull)
746 			reloptions = (Datum) 0;
747 
748 		NewHeapCreateToastTable(OIDNewHeap, reloptions, lockmode, toastid);
749 
750 		ReleaseSysCache(tuple);
751 	}
752 
753 	table_close(OldHeap, NoLock);
754 
755 	return OIDNewHeap;
756 }
757 
758 /*
759  * Do the physical copying of table data.
760  *
761  * There are three output parameters:
762  * *pSwapToastByContent is set true if toast tables must be swapped by content.
763  * *pFreezeXid receives the TransactionId used as freeze cutoff point.
764  * *pCutoffMulti receives the MultiXactId used as a cutoff point.
765  */
766 static void
copy_table_data(Oid OIDNewHeap,Oid OIDOldHeap,Oid OIDOldIndex,bool verbose,bool * pSwapToastByContent,TransactionId * pFreezeXid,MultiXactId * pCutoffMulti)767 copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
768 				bool *pSwapToastByContent, TransactionId *pFreezeXid,
769 				MultiXactId *pCutoffMulti)
770 {
771 	Relation	NewHeap,
772 				OldHeap,
773 				OldIndex;
774 	Relation	relRelation;
775 	HeapTuple	reltup;
776 	Form_pg_class relform;
777 	TupleDesc	oldTupDesc PG_USED_FOR_ASSERTS_ONLY;
778 	TupleDesc	newTupDesc PG_USED_FOR_ASSERTS_ONLY;
779 	TransactionId OldestXmin;
780 	TransactionId FreezeXid;
781 	MultiXactId MultiXactCutoff;
782 	bool		use_sort;
783 	double		num_tuples = 0,
784 				tups_vacuumed = 0,
785 				tups_recently_dead = 0;
786 	BlockNumber num_pages;
787 	int			elevel = verbose ? INFO : DEBUG2;
788 	PGRUsage	ru0;
789 
790 	pg_rusage_init(&ru0);
791 
792 	/*
793 	 * Open the relations we need.
794 	 */
795 	NewHeap = table_open(OIDNewHeap, AccessExclusiveLock);
796 	OldHeap = table_open(OIDOldHeap, AccessExclusiveLock);
797 	if (OidIsValid(OIDOldIndex))
798 		OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
799 	else
800 		OldIndex = NULL;
801 
802 	/*
803 	 * Their tuple descriptors should be exactly alike, but here we only need
804 	 * assume that they have the same number of columns.
805 	 */
806 	oldTupDesc = RelationGetDescr(OldHeap);
807 	newTupDesc = RelationGetDescr(NewHeap);
808 	Assert(newTupDesc->natts == oldTupDesc->natts);
809 
810 	/*
811 	 * If the OldHeap has a toast table, get lock on the toast table to keep
812 	 * it from being vacuumed.  This is needed because autovacuum processes
813 	 * toast tables independently of their main tables, with no lock on the
814 	 * latter.  If an autovacuum were to start on the toast table after we
815 	 * compute our OldestXmin below, it would use a later OldestXmin, and then
816 	 * possibly remove as DEAD toast tuples belonging to main tuples we think
817 	 * are only RECENTLY_DEAD.  Then we'd fail while trying to copy those
818 	 * tuples.
819 	 *
820 	 * We don't need to open the toast relation here, just lock it.  The lock
821 	 * will be held till end of transaction.
822 	 */
823 	if (OldHeap->rd_rel->reltoastrelid)
824 		LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);
825 
826 	/*
827 	 * If both tables have TOAST tables, perform toast swap by content.  It is
828 	 * possible that the old table has a toast table but the new one doesn't,
829 	 * if toastable columns have been dropped.  In that case we have to do
830 	 * swap by links.  This is okay because swap by content is only essential
831 	 * for system catalogs, and we don't support schema changes for them.
832 	 */
833 	if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid)
834 	{
835 		*pSwapToastByContent = true;
836 
837 		/*
838 		 * When doing swap by content, any toast pointers written into NewHeap
839 		 * must use the old toast table's OID, because that's where the toast
840 		 * data will eventually be found.  Set this up by setting rd_toastoid.
841 		 * This also tells toast_save_datum() to preserve the toast value
842 		 * OIDs, which we want so as not to invalidate toast pointers in
843 		 * system catalog caches, and to avoid making multiple copies of a
844 		 * single toast value.
845 		 *
846 		 * Note that we must hold NewHeap open until we are done writing data,
847 		 * since the relcache will not guarantee to remember this setting once
848 		 * the relation is closed.  Also, this technique depends on the fact
849 		 * that no one will try to read from the NewHeap until after we've
850 		 * finished writing it and swapping the rels --- otherwise they could
851 		 * follow the toast pointers to the wrong place.  (It would actually
852 		 * work for values copied over from the old toast table, but not for
853 		 * any values that we toast which were previously not toasted.)
854 		 */
855 		NewHeap->rd_toastoid = OldHeap->rd_rel->reltoastrelid;
856 	}
857 	else
858 		*pSwapToastByContent = false;
859 
860 	/*
861 	 * Compute xids used to freeze and weed out dead tuples and multixacts.
862 	 * Since we're going to rewrite the whole table anyway, there's no reason
863 	 * not to be aggressive about this.
864 	 */
865 	vacuum_set_xid_limits(OldHeap, 0, 0, 0, 0,
866 						  &OldestXmin, &FreezeXid, NULL, &MultiXactCutoff,
867 						  NULL);
868 
869 	/*
870 	 * FreezeXid will become the table's new relfrozenxid, and that mustn't go
871 	 * backwards, so take the max.
872 	 */
873 	if (TransactionIdIsValid(OldHeap->rd_rel->relfrozenxid) &&
874 		TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid))
875 		FreezeXid = OldHeap->rd_rel->relfrozenxid;
876 
877 	/*
878 	 * MultiXactCutoff, similarly, shouldn't go backwards either.
879 	 */
880 	if (MultiXactIdIsValid(OldHeap->rd_rel->relminmxid) &&
881 		MultiXactIdPrecedes(MultiXactCutoff, OldHeap->rd_rel->relminmxid))
882 		MultiXactCutoff = OldHeap->rd_rel->relminmxid;
883 
884 	/*
885 	 * Decide whether to use an indexscan or seqscan-and-optional-sort to scan
886 	 * the OldHeap.  We know how to use a sort to duplicate the ordering of a
887 	 * btree index, and will use seqscan-and-sort for that case if the planner
888 	 * tells us it's cheaper.  Otherwise, always indexscan if an index is
889 	 * provided, else plain seqscan.
890 	 */
891 	if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID)
892 		use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex);
893 	else
894 		use_sort = false;
895 
896 	/* Log what we're doing */
897 	if (OldIndex != NULL && !use_sort)
898 		ereport(elevel,
899 				(errmsg("clustering \"%s.%s\" using index scan on \"%s\"",
900 						get_namespace_name(RelationGetNamespace(OldHeap)),
901 						RelationGetRelationName(OldHeap),
902 						RelationGetRelationName(OldIndex))));
903 	else if (use_sort)
904 		ereport(elevel,
905 				(errmsg("clustering \"%s.%s\" using sequential scan and sort",
906 						get_namespace_name(RelationGetNamespace(OldHeap)),
907 						RelationGetRelationName(OldHeap))));
908 	else
909 		ereport(elevel,
910 				(errmsg("vacuuming \"%s.%s\"",
911 						get_namespace_name(RelationGetNamespace(OldHeap)),
912 						RelationGetRelationName(OldHeap))));
913 
914 	/*
	 * Hand off the actual copying to AM specific function, the generic code
916 	 * cannot know how to deal with visibility across AMs. Note that this
917 	 * routine is allowed to set FreezeXid / MultiXactCutoff to different
918 	 * values (e.g. because the AM doesn't use freezing).
919 	 */
920 	table_relation_copy_for_cluster(OldHeap, NewHeap, OldIndex, use_sort,
921 									OldestXmin, &FreezeXid, &MultiXactCutoff,
922 									&num_tuples, &tups_vacuumed,
923 									&tups_recently_dead);
924 
925 	/* return selected values to caller, get set as relfrozenxid/minmxid */
926 	*pFreezeXid = FreezeXid;
927 	*pCutoffMulti = MultiXactCutoff;
928 
929 	/* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */
930 	NewHeap->rd_toastoid = InvalidOid;
931 
932 	num_pages = RelationGetNumberOfBlocks(NewHeap);
933 
934 	/* Log what we did */
935 	ereport(elevel,
936 			(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
937 					RelationGetRelationName(OldHeap),
938 					tups_vacuumed, num_tuples,
939 					RelationGetNumberOfBlocks(OldHeap)),
940 			 errdetail("%.0f dead row versions cannot be removed yet.\n"
941 					   "%s.",
942 					   tups_recently_dead,
943 					   pg_rusage_show(&ru0))));
944 
945 	if (OldIndex != NULL)
946 		index_close(OldIndex, NoLock);
947 	table_close(OldHeap, NoLock);
948 	table_close(NewHeap, NoLock);
949 
950 	/* Update pg_class to reflect the correct values of pages and tuples. */
951 	relRelation = table_open(RelationRelationId, RowExclusiveLock);
952 
953 	reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDNewHeap));
954 	if (!HeapTupleIsValid(reltup))
955 		elog(ERROR, "cache lookup failed for relation %u", OIDNewHeap);
956 	relform = (Form_pg_class) GETSTRUCT(reltup);
957 
958 	relform->relpages = num_pages;
959 	relform->reltuples = num_tuples;
960 
961 	/* Don't update the stats for pg_class.  See swap_relation_files. */
962 	if (OIDOldHeap != RelationRelationId)
963 		CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
964 	else
965 		CacheInvalidateRelcacheByTuple(reltup);
966 
967 	/* Clean up. */
968 	heap_freetuple(reltup);
969 	table_close(relRelation, RowExclusiveLock);
970 
971 	/* Make the update visible */
972 	CommandCounterIncrement();
973 }
974 
/*
 * Swap the physical files of two given relations.
 *
 * We swap the physical identity (reltablespace, relfilenode) while keeping the
 * same logical identities of the two relations.  relpersistence is also
 * swapped, which is critical since it determines where buffers live for each
 * relation.
 *
 * We can swap associated TOAST data in either of two ways: recursively swap
 * the physical content of the toast tables (and their indexes), or swap the
 * TOAST links in the given relations' pg_class entries.  The former is needed
 * to manage rewrites of shared catalogs (where we cannot change the pg_class
 * links) while the latter is the only way to handle cases in which a toast
 * table is added or removed altogether.
 *
 * Additionally, the first relation is marked with relfrozenxid set to
 * frozenXid and relminmxid set to cutoffMulti (this step is skipped when the
 * first relation is an index).  It seems a bit ugly to have this here, but
 * the caller would have to do it anyway, so having it here saves a
 * heap_update.  Note: in the swap-toast-links case, we assume we don't need
 * to change the toast table's relfrozenxid: the new version of the toast
 * table should already have relfrozenxid set to RecentXmin, which is good
 * enough.
 *
 * Lastly, if r2 and its toast table and toast index (if any) are mapped,
 * their OIDs are emitted into mapped_tables[].  This is hacky but beats
 * having to look the information up again later in finish_heap_swap.
 *
 * Parameters:
 *	r1, r2: OIDs of the two relations whose storage is exchanged.
 *	target_is_pg_class: if true, the pg_class rows are not rewritten; only
 *		relcache invalidations are queued for them.
 *	swap_toast_by_content: if true, recurse to swap the toast tables' (and
 *		toast indexes') files; if false, swap the reltoastrelid links and
 *		fix up the dependency records instead.
 *	is_internal: passed to the post-alter hook invoked for r1 (the hook for
 *		r2 is always invoked as internal).
 *	frozenXid, cutoffMulti: new relfrozenxid/relminmxid for r1.
 *	mapped_tables: output array receiving the OIDs of mapped "r2" relations.
 *		Nothing here bounds-checks or terminates it; finish_heap_swap passes
 *		a zero-filled array and scans it up to the first invalid OID.
 *
 * Inconsistent inputs (missing pg_class rows, mapped/non-mapped mixes,
 * unsupported changes to mapped relations, unexpected dependency counts) are
 * reported with elog(ERROR).
 */
static void
swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
					bool swap_toast_by_content,
					bool is_internal,
					TransactionId frozenXid,
					MultiXactId cutoffMulti,
					Oid *mapped_tables)
{
	Relation	relRelation;
	HeapTuple	reltup1,
				reltup2;
	Form_pg_class relform1,
				relform2;
	Oid			relfilenode1,
				relfilenode2;
	Oid			swaptemp;
	char		swptmpchr;

	/* We need writable copies of both pg_class tuples. */
	relRelation = table_open(RelationRelationId, RowExclusiveLock);

	reltup1 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r1));
	if (!HeapTupleIsValid(reltup1))
		elog(ERROR, "cache lookup failed for relation %u", r1);
	relform1 = (Form_pg_class) GETSTRUCT(reltup1);

	reltup2 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r2));
	if (!HeapTupleIsValid(reltup2))
		elog(ERROR, "cache lookup failed for relation %u", r2);
	relform2 = (Form_pg_class) GETSTRUCT(reltup2);

	relfilenode1 = relform1->relfilenode;
	relfilenode2 = relform2->relfilenode;

	/*
	 * A valid relfilenode in pg_class is treated as "not mapped"; both
	 * relations must agree on which case applies.
	 */
	if (OidIsValid(relfilenode1) && OidIsValid(relfilenode2))
	{
		/*
		 * Normal non-mapped relations: swap relfilenodes, reltablespaces,
		 * relpersistence
		 */
		Assert(!target_is_pg_class);

		swaptemp = relform1->relfilenode;
		relform1->relfilenode = relform2->relfilenode;
		relform2->relfilenode = swaptemp;

		swaptemp = relform1->reltablespace;
		relform1->reltablespace = relform2->reltablespace;
		relform2->reltablespace = swaptemp;

		swptmpchr = relform1->relpersistence;
		relform1->relpersistence = relform2->relpersistence;
		relform2->relpersistence = swptmpchr;

		/* Also swap toast links, if we're swapping by links */
		if (!swap_toast_by_content)
		{
			swaptemp = relform1->reltoastrelid;
			relform1->reltoastrelid = relform2->reltoastrelid;
			relform2->reltoastrelid = swaptemp;
		}
	}
	else
	{
		/*
		 * Mapped-relation case.  Here we have to swap the relation mappings
		 * instead of modifying the pg_class columns.  Both must be mapped.
		 */
		if (OidIsValid(relfilenode1) || OidIsValid(relfilenode2))
			elog(ERROR, "cannot swap mapped relation \"%s\" with non-mapped relation",
				 NameStr(relform1->relname));

		/*
		 * We can't change the tablespace nor persistence of a mapped rel, and
		 * we can't handle toast link swapping for one either, because we must
		 * not apply any critical changes to its pg_class row.  These cases
		 * should be prevented by upstream permissions tests, so these checks
		 * are non-user-facing emergency backstop.
		 */
		if (relform1->reltablespace != relform2->reltablespace)
			elog(ERROR, "cannot change tablespace of mapped relation \"%s\"",
				 NameStr(relform1->relname));
		if (relform1->relpersistence != relform2->relpersistence)
			elog(ERROR, "cannot change persistence of mapped relation \"%s\"",
				 NameStr(relform1->relname));
		if (!swap_toast_by_content &&
			(relform1->reltoastrelid || relform2->reltoastrelid))
			elog(ERROR, "cannot swap toast by links for mapped relation \"%s\"",
				 NameStr(relform1->relname));

		/*
		 * Fetch the mappings --- shouldn't fail, but be paranoid
		 */
		relfilenode1 = RelationMapOidToFilenode(r1, relform1->relisshared);
		if (!OidIsValid(relfilenode1))
			elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
				 NameStr(relform1->relname), r1);
		relfilenode2 = RelationMapOidToFilenode(r2, relform2->relisshared);
		if (!OidIsValid(relfilenode2))
			elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
				 NameStr(relform2->relname), r2);

		/*
		 * Send replacement mappings to relmapper.  Note these won't actually
		 * take effect until CommandCounterIncrement.
		 */
		RelationMapUpdateMap(r1, relfilenode2, relform1->relisshared, false);
		RelationMapUpdateMap(r2, relfilenode1, relform2->relisshared, false);

		/*
		 * Pass OIDs of mapped r2 tables back to caller.  The post-increment
		 * advances only our local copy of the pointer, so the recursive calls
		 * below (which receive the advanced pointer) store into the following
		 * array slots.
		 */
		*mapped_tables++ = r2;
	}

	/*
	 * In the case of a shared catalog, these next few steps will only affect
	 * our own database's pg_class row; but that's okay, because they are all
	 * noncritical updates.  That's also an important fact for the case of a
	 * mapped catalog, because it's possible that we'll commit the map change
	 * and then fail to commit the pg_class update.
	 */

	/* set rel1's frozen Xid and minimum MultiXid */
	if (relform1->relkind != RELKIND_INDEX)
	{
		Assert(!TransactionIdIsValid(frozenXid) ||
			   TransactionIdIsNormal(frozenXid));
		relform1->relfrozenxid = frozenXid;
		relform1->relminmxid = cutoffMulti;
	}

	/* swap size statistics too, since new rel has freshly-updated stats */
	{
		int32		swap_pages;
		float4		swap_tuples;
		int32		swap_allvisible;

		swap_pages = relform1->relpages;
		relform1->relpages = relform2->relpages;
		relform2->relpages = swap_pages;

		swap_tuples = relform1->reltuples;
		relform1->reltuples = relform2->reltuples;
		relform2->reltuples = swap_tuples;

		swap_allvisible = relform1->relallvisible;
		relform1->relallvisible = relform2->relallvisible;
		relform2->relallvisible = swap_allvisible;
	}

	/*
	 * Update the tuples in pg_class --- unless the target relation of the
	 * swap is pg_class itself.  In that case, there is zero point in making
	 * changes because we'd be updating the old data that we're about to throw
	 * away.  Because the real work being done here for a mapped relation is
	 * just to change the relation map settings, it's all right to not update
	 * the pg_class rows in this case. The most important changes will instead
	 * be performed later, in finish_heap_swap() itself.
	 */
	if (!target_is_pg_class)
	{
		CatalogIndexState indstate;

		/* Open pg_class's indexes once and reuse them for both updates */
		indstate = CatalogOpenIndexes(relRelation);
		CatalogTupleUpdateWithInfo(relRelation, &reltup1->t_self, reltup1,
								   indstate);
		CatalogTupleUpdateWithInfo(relRelation, &reltup2->t_self, reltup2,
								   indstate);
		CatalogCloseIndexes(indstate);
	}
	else
	{
		/* no update ... but we do still need relcache inval */
		CacheInvalidateRelcacheByTuple(reltup1);
		CacheInvalidateRelcacheByTuple(reltup2);
	}

	/*
	 * Post alter hook for modified relations. The change to r2 is always
	 * internal, but r1 depends on the invocation context.
	 */
	InvokeObjectPostAlterHookArg(RelationRelationId, r1, 0,
								 InvalidOid, is_internal);
	InvokeObjectPostAlterHookArg(RelationRelationId, r2, 0,
								 InvalidOid, true);

	/*
	 * If we have toast tables associated with the relations being swapped,
	 * deal with them too.
	 *
	 * Note that relform1/relform2 are our modified copies, so in the
	 * swap-by-links case the reltoastrelid values read below are the
	 * already-swapped ones.
	 */
	if (relform1->reltoastrelid || relform2->reltoastrelid)
	{
		if (swap_toast_by_content)
		{
			if (relform1->reltoastrelid && relform2->reltoastrelid)
			{
				/* Recursively swap the contents of the toast tables */
				swap_relation_files(relform1->reltoastrelid,
									relform2->reltoastrelid,
									target_is_pg_class,
									swap_toast_by_content,
									is_internal,
									frozenXid,
									cutoffMulti,
									mapped_tables);
			}
			else
			{
				/* caller messed up */
				elog(ERROR, "cannot swap toast files by content when there's only one");
			}
		}
		else
		{
			/*
			 * We swapped the ownership links, so we need to change dependency
			 * data to match.
			 *
			 * NOTE: it is possible that only one table has a toast table.
			 *
			 * NOTE: at present, a TOAST table's only dependency is the one on
			 * its owning table.  If more are ever created, we'd need to use
			 * something more selective than deleteDependencyRecordsFor() to
			 * get rid of just the link we want.
			 */
			ObjectAddress baseobject,
						toastobject;
			long		count;

			/*
			 * We disallow this case for system catalogs, to avoid the
			 * possibility that the catalog we're rebuilding is one of the
			 * ones the dependency changes would change.  It's too late to be
			 * making any data changes to the target catalog.
			 */
			if (IsSystemClass(r1, relform1))
				elog(ERROR, "cannot swap toast files by links for system catalogs");

			/* Delete old dependencies */
			if (relform1->reltoastrelid)
			{
				count = deleteDependencyRecordsFor(RelationRelationId,
												   relform1->reltoastrelid,
												   false);
				if (count != 1)
					elog(ERROR, "expected one dependency record for TOAST table, found %ld",
						 count);
			}
			if (relform2->reltoastrelid)
			{
				count = deleteDependencyRecordsFor(RelationRelationId,
												   relform2->reltoastrelid,
												   false);
				if (count != 1)
					elog(ERROR, "expected one dependency record for TOAST table, found %ld",
						 count);
			}

			/* Register new dependencies */
			baseobject.classId = RelationRelationId;
			baseobject.objectSubId = 0;
			toastobject.classId = RelationRelationId;
			toastobject.objectSubId = 0;

			if (relform1->reltoastrelid)
			{
				baseobject.objectId = r1;
				toastobject.objectId = relform1->reltoastrelid;
				recordDependencyOn(&toastobject, &baseobject,
								   DEPENDENCY_INTERNAL);
			}

			if (relform2->reltoastrelid)
			{
				baseobject.objectId = r2;
				toastobject.objectId = relform2->reltoastrelid;
				recordDependencyOn(&toastobject, &baseobject,
								   DEPENDENCY_INTERNAL);
			}
		}
	}

	/*
	 * If we're swapping two toast tables by content, do the same for their
	 * valid index. The swap can actually be safely done only if the relations
	 * have indexes.
	 */
	if (swap_toast_by_content &&
		relform1->relkind == RELKIND_TOASTVALUE &&
		relform2->relkind == RELKIND_TOASTVALUE)
	{
		Oid			toastIndex1,
					toastIndex2;

		/* Get valid index for each relation */
		toastIndex1 = toast_get_valid_index(r1,
											AccessExclusiveLock);
		toastIndex2 = toast_get_valid_index(r2,
											AccessExclusiveLock);

		/*
		 * Invalid xid/multixact values are passed here; the callee does not
		 * store them for relations of kind RELKIND_INDEX, which the toast
		 * indexes are expected to be.
		 */
		swap_relation_files(toastIndex1,
							toastIndex2,
							target_is_pg_class,
							swap_toast_by_content,
							is_internal,
							InvalidTransactionId,
							InvalidMultiXactId,
							mapped_tables);
	}

	/* Clean up. */
	heap_freetuple(reltup1);
	heap_freetuple(reltup2);

	table_close(relRelation, RowExclusiveLock);

	/*
	 * Close both relcache entries' smgr links.  We need this kluge because
	 * both links will be invalidated during upcoming CommandCounterIncrement.
	 * Whichever of the rels is the second to be cleared will have a dangling
	 * reference to the other's smgr entry.  Rather than trying to avoid this
	 * by ordering operations just so, it's easiest to close the links first.
	 * (Fortunately, since one of the entries is local in our transaction,
	 * it's sufficient to clear out our own relcache this way; the problem
	 * cannot arise for other backends when they see our update on the
	 * non-transient relation.)
	 *
	 * Caution: the placement of this step interacts with the decision to
	 * handle toast rels by recursion.  When we are trying to rebuild pg_class
	 * itself, the smgr close on pg_class must happen after all accesses in
	 * this function.
	 */
	RelationCloseSmgrByOid(r1);
	RelationCloseSmgrByOid(r2);
}
1335 
1336 /*
1337  * Remove the transient table that was built by make_new_heap, and finish
1338  * cleaning up (including rebuilding all indexes on the old heap).
1339  */
1340 void
finish_heap_swap(Oid OIDOldHeap,Oid OIDNewHeap,bool is_system_catalog,bool swap_toast_by_content,bool check_constraints,bool is_internal,TransactionId frozenXid,MultiXactId cutoffMulti,char newrelpersistence)1341 finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
1342 				 bool is_system_catalog,
1343 				 bool swap_toast_by_content,
1344 				 bool check_constraints,
1345 				 bool is_internal,
1346 				 TransactionId frozenXid,
1347 				 MultiXactId cutoffMulti,
1348 				 char newrelpersistence)
1349 {
1350 	ObjectAddress object;
1351 	Oid			mapped_tables[4];
1352 	int			reindex_flags;
1353 	int			i;
1354 
1355 	/* Report that we are now swapping relation files */
1356 	pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1357 								 PROGRESS_CLUSTER_PHASE_SWAP_REL_FILES);
1358 
1359 	/* Zero out possible results from swapped_relation_files */
1360 	memset(mapped_tables, 0, sizeof(mapped_tables));
1361 
1362 	/*
1363 	 * Swap the contents of the heap relations (including any toast tables).
1364 	 * Also set old heap's relfrozenxid to frozenXid.
1365 	 */
1366 	swap_relation_files(OIDOldHeap, OIDNewHeap,
1367 						(OIDOldHeap == RelationRelationId),
1368 						swap_toast_by_content, is_internal,
1369 						frozenXid, cutoffMulti, mapped_tables);
1370 
1371 	/*
1372 	 * If it's a system catalog, queue a sinval message to flush all catcaches
1373 	 * on the catalog when we reach CommandCounterIncrement.
1374 	 */
1375 	if (is_system_catalog)
1376 		CacheInvalidateCatalog(OIDOldHeap);
1377 
1378 	/*
1379 	 * Rebuild each index on the relation (but not the toast table, which is
1380 	 * all-new at this point).  It is important to do this before the DROP
1381 	 * step because if we are processing a system catalog that will be used
1382 	 * during DROP, we want to have its indexes available.  There is no
1383 	 * advantage to the other order anyway because this is all transactional,
1384 	 * so no chance to reclaim disk space before commit.  We do not need a
1385 	 * final CommandCounterIncrement() because reindex_relation does it.
1386 	 *
1387 	 * Note: because index_build is called via reindex_relation, it will never
1388 	 * set indcheckxmin true for the indexes.  This is OK even though in some
1389 	 * sense we are building new indexes rather than rebuilding existing ones,
1390 	 * because the new heap won't contain any HOT chains at all, let alone
1391 	 * broken ones, so it can't be necessary to set indcheckxmin.
1392 	 */
1393 	reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE;
1394 	if (check_constraints)
1395 		reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS;
1396 
1397 	/*
1398 	 * Ensure that the indexes have the same persistence as the parent
1399 	 * relation.
1400 	 */
1401 	if (newrelpersistence == RELPERSISTENCE_UNLOGGED)
1402 		reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED;
1403 	else if (newrelpersistence == RELPERSISTENCE_PERMANENT)
1404 		reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT;
1405 
1406 	/* Report that we are now reindexing relations */
1407 	pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1408 								 PROGRESS_CLUSTER_PHASE_REBUILD_INDEX);
1409 
1410 	reindex_relation(OIDOldHeap, reindex_flags, 0);
1411 
1412 	/* Report that we are now doing clean up */
1413 	pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
1414 								 PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP);
1415 
1416 	/*
1417 	 * If the relation being rebuild is pg_class, swap_relation_files()
1418 	 * couldn't update pg_class's own pg_class entry (check comments in
1419 	 * swap_relation_files()), thus relfrozenxid was not updated. That's
1420 	 * annoying because a potential reason for doing a VACUUM FULL is a
1421 	 * imminent or actual anti-wraparound shutdown.  So, now that we can
1422 	 * access the new relation using its indices, update relfrozenxid.
1423 	 * pg_class doesn't have a toast relation, so we don't need to update the
1424 	 * corresponding toast relation. Not that there's little point moving all
1425 	 * relfrozenxid updates here since swap_relation_files() needs to write to
1426 	 * pg_class for non-mapped relations anyway.
1427 	 */
1428 	if (OIDOldHeap == RelationRelationId)
1429 	{
1430 		Relation	relRelation;
1431 		HeapTuple	reltup;
1432 		Form_pg_class relform;
1433 
1434 		relRelation = table_open(RelationRelationId, RowExclusiveLock);
1435 
1436 		reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDOldHeap));
1437 		if (!HeapTupleIsValid(reltup))
1438 			elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
1439 		relform = (Form_pg_class) GETSTRUCT(reltup);
1440 
1441 		relform->relfrozenxid = frozenXid;
1442 		relform->relminmxid = cutoffMulti;
1443 
1444 		CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
1445 
1446 		table_close(relRelation, RowExclusiveLock);
1447 	}
1448 
1449 	/* Destroy new heap with old filenode */
1450 	object.classId = RelationRelationId;
1451 	object.objectId = OIDNewHeap;
1452 	object.objectSubId = 0;
1453 
1454 	/*
1455 	 * The new relation is local to our transaction and we know nothing
1456 	 * depends on it, so DROP_RESTRICT should be OK.
1457 	 */
1458 	performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
1459 
1460 	/* performDeletion does CommandCounterIncrement at end */
1461 
1462 	/*
1463 	 * Now we must remove any relation mapping entries that we set up for the
1464 	 * transient table, as well as its toast table and toast index if any. If
1465 	 * we fail to do this before commit, the relmapper will complain about new
1466 	 * permanent map entries being added post-bootstrap.
1467 	 */
1468 	for (i = 0; OidIsValid(mapped_tables[i]); i++)
1469 		RelationMapRemoveMapping(mapped_tables[i]);
1470 
1471 	/*
1472 	 * At this point, everything is kosher except that, if we did toast swap
1473 	 * by links, the toast table's name corresponds to the transient table.
1474 	 * The name is irrelevant to the backend because it's referenced by OID,
1475 	 * but users looking at the catalogs could be confused.  Rename it to
1476 	 * prevent this problem.
1477 	 *
1478 	 * Note no lock required on the relation, because we already hold an
1479 	 * exclusive lock on it.
1480 	 */
1481 	if (!swap_toast_by_content)
1482 	{
1483 		Relation	newrel;
1484 
1485 		newrel = table_open(OIDOldHeap, NoLock);
1486 		if (OidIsValid(newrel->rd_rel->reltoastrelid))
1487 		{
1488 			Oid			toastidx;
1489 			char		NewToastName[NAMEDATALEN];
1490 
1491 			/* Get the associated valid index to be renamed */
1492 			toastidx = toast_get_valid_index(newrel->rd_rel->reltoastrelid,
1493 											 NoLock);
1494 
1495 			/* rename the toast table ... */
1496 			snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u",
1497 					 OIDOldHeap);
1498 			RenameRelationInternal(newrel->rd_rel->reltoastrelid,
1499 								   NewToastName, true, false);
1500 
1501 			/* ... and its valid index too. */
1502 			snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index",
1503 					 OIDOldHeap);
1504 
1505 			RenameRelationInternal(toastidx,
1506 								   NewToastName, true, true);
1507 
1508 			/*
1509 			 * Reset the relrewrite for the toast. The command-counter
1510 			 * increment is required here as we are about to update
1511 			 * the tuple that is updated as part of RenameRelationInternal.
1512 			 */
1513 			CommandCounterIncrement();
1514 			ResetRelRewrite(newrel->rd_rel->reltoastrelid);
1515 		}
1516 		relation_close(newrel, NoLock);
1517 	}
1518 
1519 	/* if it's not a catalog table, clear any missing attribute settings */
1520 	if (!is_system_catalog)
1521 	{
1522 		Relation	newrel;
1523 
1524 		newrel = table_open(OIDOldHeap, NoLock);
1525 		RelationClearMissing(newrel);
1526 		relation_close(newrel, NoLock);
1527 	}
1528 }
1529 
1530 
1531 /*
1532  * Get a list of tables that the current user owns and
1533  * have indisclustered set.  Return the list in a List * of rvsToCluster
1534  * with the tableOid and the indexOid on which the table is already
1535  * clustered.
1536  */
1537 static List *
get_tables_to_cluster(MemoryContext cluster_context)1538 get_tables_to_cluster(MemoryContext cluster_context)
1539 {
1540 	Relation	indRelation;
1541 	TableScanDesc scan;
1542 	ScanKeyData entry;
1543 	HeapTuple	indexTuple;
1544 	Form_pg_index index;
1545 	MemoryContext old_context;
1546 	RelToCluster *rvtc;
1547 	List	   *rvs = NIL;
1548 
1549 	/*
1550 	 * Get all indexes that have indisclustered set and are owned by
1551 	 * appropriate user. System relations or nailed-in relations cannot ever
1552 	 * have indisclustered set, because CLUSTER will refuse to set it when
1553 	 * called with one of them as argument.
1554 	 */
1555 	indRelation = table_open(IndexRelationId, AccessShareLock);
1556 	ScanKeyInit(&entry,
1557 				Anum_pg_index_indisclustered,
1558 				BTEqualStrategyNumber, F_BOOLEQ,
1559 				BoolGetDatum(true));
1560 	scan = table_beginscan_catalog(indRelation, 1, &entry);
1561 	while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1562 	{
1563 		index = (Form_pg_index) GETSTRUCT(indexTuple);
1564 
1565 		if (!pg_class_ownercheck(index->indrelid, GetUserId()))
1566 			continue;
1567 
1568 		/*
1569 		 * We have to build the list in a different memory context so it will
1570 		 * survive the cross-transaction processing
1571 		 */
1572 		old_context = MemoryContextSwitchTo(cluster_context);
1573 
1574 		rvtc = (RelToCluster *) palloc(sizeof(RelToCluster));
1575 		rvtc->tableOid = index->indrelid;
1576 		rvtc->indexOid = index->indexrelid;
1577 		rvs = lcons(rvtc, rvs);
1578 
1579 		MemoryContextSwitchTo(old_context);
1580 	}
1581 	table_endscan(scan);
1582 
1583 	relation_close(indRelation, AccessShareLock);
1584 
1585 	return rvs;
1586 }
1587