1 /*-------------------------------------------------------------------------
2  *
3  * cluster.c
4  *	  CLUSTER a table on an index.  This is now also used for VACUUM FULL.
5  *
6  * There is hardly anything left of Paul Brown's original implementation...
7  *
8  *
9  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
10  * Portions Copyright (c) 1994-5, Regents of the University of California
11  *
12  *
13  * IDENTIFICATION
14  *	  src/backend/commands/cluster.c
15  *
16  *-------------------------------------------------------------------------
17  */
18 #include "postgres.h"
19 
20 #include "access/amapi.h"
21 #include "access/multixact.h"
22 #include "access/relscan.h"
23 #include "access/rewriteheap.h"
24 #include "access/transam.h"
25 #include "access/tuptoaster.h"
26 #include "access/xact.h"
27 #include "access/xlog.h"
28 #include "catalog/pg_am.h"
29 #include "catalog/catalog.h"
30 #include "catalog/dependency.h"
31 #include "catalog/heap.h"
32 #include "catalog/index.h"
33 #include "catalog/namespace.h"
34 #include "catalog/objectaccess.h"
35 #include "catalog/toasting.h"
36 #include "commands/cluster.h"
37 #include "commands/tablecmds.h"
38 #include "commands/vacuum.h"
39 #include "miscadmin.h"
40 #include "optimizer/planner.h"
41 #include "storage/bufmgr.h"
42 #include "storage/lmgr.h"
43 #include "storage/predicate.h"
44 #include "storage/smgr.h"
45 #include "utils/acl.h"
46 #include "utils/fmgroids.h"
47 #include "utils/inval.h"
48 #include "utils/lsyscache.h"
49 #include "utils/memutils.h"
50 #include "utils/pg_rusage.h"
51 #include "utils/relmapper.h"
52 #include "utils/snapmgr.h"
53 #include "utils/syscache.h"
54 #include "utils/tqual.h"
55 #include "utils/tuplesort.h"
56 
57 
58 /*
59  * This struct is used to pass around the information on tables to be
60  * clustered. We need this so we can make a list of them when invoked without
61  * a specific table/index pair.
62  */
63 typedef struct
64 {
65 	Oid			tableOid;
66 	Oid			indexOid;
67 } RelToCluster;
68 
69 
70 static void rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose);
71 static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
72 			   bool verbose, bool *pSwapToastByContent,
73 			   TransactionId *pFreezeXid, MultiXactId *pCutoffMulti);
74 static List *get_tables_to_cluster(MemoryContext cluster_context);
75 static void reform_and_rewrite_tuple(HeapTuple tuple,
76 						 TupleDesc oldTupDesc, TupleDesc newTupDesc,
77 						 Datum *values, bool *isnull,
78 						 bool newRelHasOids, RewriteState rwstate);
79 
80 
81 /*---------------------------------------------------------------------------
82  * This cluster code allows for clustering multiple tables at once. Because
83  * of this, we cannot just run everything on a single transaction, or we
84  * would be forced to acquire exclusive locks on all the tables being
85  * clustered, simultaneously --- very likely leading to deadlock.
86  *
87  * To solve this we follow a similar strategy to VACUUM code,
88  * clustering each relation in a separate transaction. For this to work,
89  * we need to:
90  *	- provide a separate memory context so that we can pass information in
91  *	  a way that survives across transactions
92  *	- start a new transaction every time a new relation is clustered
93  *	- check for validity of the information on to-be-clustered relations,
94  *	  as someone might have deleted a relation behind our back, or
95  *	  clustered one on a different index
96  *	- end the transaction
97  *
98  * The single-relation case does not have any such overhead.
99  *
100  * We also allow a relation to be specified without index.  In that case,
101  * the indisclustered bit will be looked up, and an ERROR will be thrown
102  * if there is no index with the bit set.
103  *---------------------------------------------------------------------------
104  */
105 void
cluster(ClusterStmt * stmt,bool isTopLevel)106 cluster(ClusterStmt *stmt, bool isTopLevel)
107 {
108 	if (stmt->relation != NULL)
109 	{
110 		/* This is the single-relation case. */
111 		Oid			tableOid,
112 					indexOid = InvalidOid;
113 		Relation	rel;
114 
115 		/* Find, lock, and check permissions on the table */
116 		tableOid = RangeVarGetRelidExtended(stmt->relation,
117 											AccessExclusiveLock,
118 											false, false,
119 											RangeVarCallbackOwnsTable, NULL);
120 		rel = heap_open(tableOid, NoLock);
121 
122 		/*
123 		 * Reject clustering a remote temp table ... their local buffer
124 		 * manager is not going to cope.
125 		 */
126 		if (RELATION_IS_OTHER_TEMP(rel))
127 			ereport(ERROR,
128 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
129 			   errmsg("cannot cluster temporary tables of other sessions")));
130 
131 		if (stmt->indexname == NULL)
132 		{
133 			ListCell   *index;
134 
135 			/* We need to find the index that has indisclustered set. */
136 			foreach(index, RelationGetIndexList(rel))
137 			{
138 				HeapTuple	idxtuple;
139 				Form_pg_index indexForm;
140 
141 				indexOid = lfirst_oid(index);
142 				idxtuple = SearchSysCache1(INDEXRELID,
143 										   ObjectIdGetDatum(indexOid));
144 				if (!HeapTupleIsValid(idxtuple))
145 					elog(ERROR, "cache lookup failed for index %u", indexOid);
146 				indexForm = (Form_pg_index) GETSTRUCT(idxtuple);
147 				if (indexForm->indisclustered)
148 				{
149 					ReleaseSysCache(idxtuple);
150 					break;
151 				}
152 				ReleaseSysCache(idxtuple);
153 				indexOid = InvalidOid;
154 			}
155 
156 			if (!OidIsValid(indexOid))
157 				ereport(ERROR,
158 						(errcode(ERRCODE_UNDEFINED_OBJECT),
159 						 errmsg("there is no previously clustered index for table \"%s\"",
160 								stmt->relation->relname)));
161 		}
162 		else
163 		{
164 			/*
165 			 * The index is expected to be in the same namespace as the
166 			 * relation.
167 			 */
168 			indexOid = get_relname_relid(stmt->indexname,
169 										 rel->rd_rel->relnamespace);
170 			if (!OidIsValid(indexOid))
171 				ereport(ERROR,
172 						(errcode(ERRCODE_UNDEFINED_OBJECT),
173 					   errmsg("index \"%s\" for table \"%s\" does not exist",
174 							  stmt->indexname, stmt->relation->relname)));
175 		}
176 
177 		/* close relation, keep lock till commit */
178 		heap_close(rel, NoLock);
179 
180 		/* Do the job. */
181 		cluster_rel(tableOid, indexOid, false, stmt->verbose);
182 	}
183 	else
184 	{
185 		/*
186 		 * This is the "multi relation" case. We need to cluster all tables
187 		 * that have some index with indisclustered set.
188 		 */
189 		MemoryContext cluster_context;
190 		List	   *rvs;
191 		ListCell   *rv;
192 
193 		/*
194 		 * We cannot run this form of CLUSTER inside a user transaction block;
195 		 * we'd be holding locks way too long.
196 		 */
197 		PreventTransactionChain(isTopLevel, "CLUSTER");
198 
199 		/*
200 		 * Create special memory context for cross-transaction storage.
201 		 *
202 		 * Since it is a child of PortalContext, it will go away even in case
203 		 * of error.
204 		 */
205 		cluster_context = AllocSetContextCreate(PortalContext,
206 												"Cluster",
207 												ALLOCSET_DEFAULT_SIZES);
208 
209 		/*
210 		 * Build the list of relations to cluster.  Note that this lives in
211 		 * cluster_context.
212 		 */
213 		rvs = get_tables_to_cluster(cluster_context);
214 
215 		/* Commit to get out of starting transaction */
216 		PopActiveSnapshot();
217 		CommitTransactionCommand();
218 
219 		/* Ok, now that we've got them all, cluster them one by one */
220 		foreach(rv, rvs)
221 		{
222 			RelToCluster *rvtc = (RelToCluster *) lfirst(rv);
223 
224 			/* Start a new transaction for each relation. */
225 			StartTransactionCommand();
226 			/* functions in indexes may want a snapshot set */
227 			PushActiveSnapshot(GetTransactionSnapshot());
228 			/* Do the job. */
229 			cluster_rel(rvtc->tableOid, rvtc->indexOid, true, stmt->verbose);
230 			PopActiveSnapshot();
231 			CommitTransactionCommand();
232 		}
233 
234 		/* Start a new transaction for the cleanup work. */
235 		StartTransactionCommand();
236 
237 		/* Clean up working storage */
238 		MemoryContextDelete(cluster_context);
239 	}
240 }
241 
242 /*
243  * cluster_rel
244  *
245  * This clusters the table by creating a new, clustered table and
246  * swapping the relfilenodes of the new table and the old table, so
247  * the OID of the original table is preserved.  Thus we do not lose
248  * GRANT, inheritance nor references to this table (this was a bug
249  * in releases through 7.3).
250  *
251  * Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading
252  * the new table, it's better to create the indexes afterwards than to fill
253  * them incrementally while we load the table.
254  *
255  * If indexOid is InvalidOid, the table will be rewritten in physical order
256  * instead of index order.  This is the new implementation of VACUUM FULL,
257  * and error messages should refer to the operation as VACUUM not CLUSTER.
258  */
259 void
cluster_rel(Oid tableOid,Oid indexOid,bool recheck,bool verbose)260 cluster_rel(Oid tableOid, Oid indexOid, bool recheck, bool verbose)
261 {
262 	Relation	OldHeap;
263 
264 	/* Check for user-requested abort. */
265 	CHECK_FOR_INTERRUPTS();
266 
267 	/*
268 	 * We grab exclusive access to the target rel and index for the duration
269 	 * of the transaction.  (This is redundant for the single-transaction
270 	 * case, since cluster() already did it.)  The index lock is taken inside
271 	 * check_index_is_clusterable.
272 	 */
273 	OldHeap = try_relation_open(tableOid, AccessExclusiveLock);
274 
275 	/* If the table has gone away, we can skip processing it */
276 	if (!OldHeap)
277 		return;
278 
279 	/*
280 	 * Since we may open a new transaction for each relation, we have to check
281 	 * that the relation still is what we think it is.
282 	 *
283 	 * If this is a single-transaction CLUSTER, we can skip these tests. We
284 	 * *must* skip the one on indisclustered since it would reject an attempt
285 	 * to cluster a not-previously-clustered index.
286 	 */
287 	if (recheck)
288 	{
289 		HeapTuple	tuple;
290 		Form_pg_index indexForm;
291 
292 		/* Check that the user still owns the relation */
293 		if (!pg_class_ownercheck(tableOid, GetUserId()))
294 		{
295 			relation_close(OldHeap, AccessExclusiveLock);
296 			return;
297 		}
298 
299 		/*
300 		 * Silently skip a temp table for a remote session.  Only doing this
301 		 * check in the "recheck" case is appropriate (which currently means
302 		 * somebody is executing a database-wide CLUSTER), because there is
303 		 * another check in cluster() which will stop any attempt to cluster
304 		 * remote temp tables by name.  There is another check in cluster_rel
305 		 * which is redundant, but we leave it for extra safety.
306 		 */
307 		if (RELATION_IS_OTHER_TEMP(OldHeap))
308 		{
309 			relation_close(OldHeap, AccessExclusiveLock);
310 			return;
311 		}
312 
313 		if (OidIsValid(indexOid))
314 		{
315 			/*
316 			 * Check that the index still exists
317 			 */
318 			if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid)))
319 			{
320 				relation_close(OldHeap, AccessExclusiveLock);
321 				return;
322 			}
323 
324 			/*
325 			 * Check that the index is still the one with indisclustered set.
326 			 */
327 			tuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
328 			if (!HeapTupleIsValid(tuple))		/* probably can't happen */
329 			{
330 				relation_close(OldHeap, AccessExclusiveLock);
331 				return;
332 			}
333 			indexForm = (Form_pg_index) GETSTRUCT(tuple);
334 			if (!indexForm->indisclustered)
335 			{
336 				ReleaseSysCache(tuple);
337 				relation_close(OldHeap, AccessExclusiveLock);
338 				return;
339 			}
340 			ReleaseSysCache(tuple);
341 		}
342 	}
343 
344 	/*
345 	 * We allow VACUUM FULL, but not CLUSTER, on shared catalogs.  CLUSTER
346 	 * would work in most respects, but the index would only get marked as
347 	 * indisclustered in the current database, leading to unexpected behavior
348 	 * if CLUSTER were later invoked in another database.
349 	 */
350 	if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared)
351 		ereport(ERROR,
352 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
353 				 errmsg("cannot cluster a shared catalog")));
354 
355 	/*
356 	 * Don't process temp tables of other backends ... their local buffer
357 	 * manager is not going to cope.
358 	 */
359 	if (RELATION_IS_OTHER_TEMP(OldHeap))
360 	{
361 		if (OidIsValid(indexOid))
362 			ereport(ERROR,
363 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
364 			   errmsg("cannot cluster temporary tables of other sessions")));
365 		else
366 			ereport(ERROR,
367 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
368 				errmsg("cannot vacuum temporary tables of other sessions")));
369 	}
370 
371 	/*
372 	 * Also check for active uses of the relation in the current transaction,
373 	 * including open scans and pending AFTER trigger events.
374 	 */
375 	CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM");
376 
377 	/* Check heap and index are valid to cluster on */
378 	if (OidIsValid(indexOid))
379 		check_index_is_clusterable(OldHeap, indexOid, recheck, AccessExclusiveLock);
380 
381 	/*
382 	 * Quietly ignore the request if this is a materialized view which has not
383 	 * been populated from its query. No harm is done because there is no data
384 	 * to deal with, and we don't want to throw an error if this is part of a
385 	 * multi-relation request -- for example, CLUSTER was run on the entire
386 	 * database.
387 	 */
388 	if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW &&
389 		!RelationIsPopulated(OldHeap))
390 	{
391 		relation_close(OldHeap, AccessExclusiveLock);
392 		return;
393 	}
394 
395 	/*
396 	 * All predicate locks on the tuples or pages are about to be made
397 	 * invalid, because we move tuples around.  Promote them to relation
398 	 * locks.  Predicate locks on indexes will be promoted when they are
399 	 * reindexed.
400 	 */
401 	TransferPredicateLocksToHeapRelation(OldHeap);
402 
403 	/* rebuild_relation does all the dirty work */
404 	rebuild_relation(OldHeap, indexOid, verbose);
405 
406 	/* NB: rebuild_relation does heap_close() on OldHeap */
407 }
408 
409 /*
410  * Verify that the specified heap and index are valid to cluster on
411  *
412  * Side effect: obtains lock on the index.  The caller may
413  * in some cases already have AccessExclusiveLock on the table, but
414  * not in all cases so we can't rely on the table-level lock for
415  * protection here.
416  */
417 void
check_index_is_clusterable(Relation OldHeap,Oid indexOid,bool recheck,LOCKMODE lockmode)418 check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck, LOCKMODE lockmode)
419 {
420 	Relation	OldIndex;
421 
422 	OldIndex = index_open(indexOid, lockmode);
423 
424 	/*
425 	 * Check that index is in fact an index on the given relation
426 	 */
427 	if (OldIndex->rd_index == NULL ||
428 		OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap))
429 		ereport(ERROR,
430 				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
431 				 errmsg("\"%s\" is not an index for table \"%s\"",
432 						RelationGetRelationName(OldIndex),
433 						RelationGetRelationName(OldHeap))));
434 
435 	/* Index AM must allow clustering */
436 	if (!OldIndex->rd_amroutine->amclusterable)
437 		ereport(ERROR,
438 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
439 				 errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
440 						RelationGetRelationName(OldIndex))));
441 
442 	/*
443 	 * Disallow clustering on incomplete indexes (those that might not index
444 	 * every row of the relation).  We could relax this by making a separate
445 	 * seqscan pass over the table to copy the missing rows, but that seems
446 	 * expensive and tedious.
447 	 */
448 	if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred))
449 		ereport(ERROR,
450 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
451 				 errmsg("cannot cluster on partial index \"%s\"",
452 						RelationGetRelationName(OldIndex))));
453 
454 	/*
455 	 * Disallow if index is left over from a failed CREATE INDEX CONCURRENTLY;
456 	 * it might well not contain entries for every heap row, or might not even
457 	 * be internally consistent.  (But note that we don't check indcheckxmin;
458 	 * the worst consequence of following broken HOT chains would be that we
459 	 * might put recently-dead tuples out-of-order in the new table, and there
460 	 * is little harm in that.)
461 	 */
462 	if (!IndexIsValid(OldIndex->rd_index))
463 		ereport(ERROR,
464 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
465 				 errmsg("cannot cluster on invalid index \"%s\"",
466 						RelationGetRelationName(OldIndex))));
467 
468 	/* Drop relcache refcnt on OldIndex, but keep lock */
469 	index_close(OldIndex, NoLock);
470 }
471 
472 /*
473  * mark_index_clustered: mark the specified index as the one clustered on
474  *
475  * With indexOid == InvalidOid, will mark all indexes of rel not-clustered.
476  */
477 void
mark_index_clustered(Relation rel,Oid indexOid,bool is_internal)478 mark_index_clustered(Relation rel, Oid indexOid, bool is_internal)
479 {
480 	HeapTuple	indexTuple;
481 	Form_pg_index indexForm;
482 	Relation	pg_index;
483 	ListCell   *index;
484 
485 	/*
486 	 * If the index is already marked clustered, no need to do anything.
487 	 */
488 	if (OidIsValid(indexOid))
489 	{
490 		indexTuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
491 		if (!HeapTupleIsValid(indexTuple))
492 			elog(ERROR, "cache lookup failed for index %u", indexOid);
493 		indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
494 
495 		if (indexForm->indisclustered)
496 		{
497 			ReleaseSysCache(indexTuple);
498 			return;
499 		}
500 
501 		ReleaseSysCache(indexTuple);
502 	}
503 
504 	/*
505 	 * Check each index of the relation and set/clear the bit as needed.
506 	 */
507 	pg_index = heap_open(IndexRelationId, RowExclusiveLock);
508 
509 	foreach(index, RelationGetIndexList(rel))
510 	{
511 		Oid			thisIndexOid = lfirst_oid(index);
512 
513 		indexTuple = SearchSysCacheCopy1(INDEXRELID,
514 										 ObjectIdGetDatum(thisIndexOid));
515 		if (!HeapTupleIsValid(indexTuple))
516 			elog(ERROR, "cache lookup failed for index %u", thisIndexOid);
517 		indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
518 
519 		/*
520 		 * Unset the bit if set.  We know it's wrong because we checked this
521 		 * earlier.
522 		 */
523 		if (indexForm->indisclustered)
524 		{
525 			indexForm->indisclustered = false;
526 			simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
527 			CatalogUpdateIndexes(pg_index, indexTuple);
528 		}
529 		else if (thisIndexOid == indexOid)
530 		{
531 			/* this was checked earlier, but let's be real sure */
532 			if (!IndexIsValid(indexForm))
533 				elog(ERROR, "cannot cluster on invalid index %u", indexOid);
534 			indexForm->indisclustered = true;
535 			simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
536 			CatalogUpdateIndexes(pg_index, indexTuple);
537 		}
538 
539 		InvokeObjectPostAlterHookArg(IndexRelationId, thisIndexOid, 0,
540 									 InvalidOid, is_internal);
541 
542 		heap_freetuple(indexTuple);
543 	}
544 
545 	heap_close(pg_index, RowExclusiveLock);
546 }
547 
548 /*
549  * rebuild_relation: rebuild an existing relation in index or physical order
550  *
551  * OldHeap: table to rebuild --- must be opened and exclusive-locked!
552  * indexOid: index to cluster by, or InvalidOid to rewrite in physical order.
553  *
554  * NB: this routine closes OldHeap at the right time; caller should not.
555  */
556 static void
rebuild_relation(Relation OldHeap,Oid indexOid,bool verbose)557 rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose)
558 {
559 	Oid			tableOid = RelationGetRelid(OldHeap);
560 	Oid			tableSpace = OldHeap->rd_rel->reltablespace;
561 	Oid			OIDNewHeap;
562 	char		relpersistence;
563 	bool		is_system_catalog;
564 	bool		swap_toast_by_content;
565 	TransactionId frozenXid;
566 	MultiXactId cutoffMulti;
567 
568 	/* Mark the correct index as clustered */
569 	if (OidIsValid(indexOid))
570 		mark_index_clustered(OldHeap, indexOid, true);
571 
572 	/* Remember info about rel before closing OldHeap */
573 	relpersistence = OldHeap->rd_rel->relpersistence;
574 	is_system_catalog = IsSystemRelation(OldHeap);
575 
576 	/* Close relcache entry, but keep lock until transaction commit */
577 	heap_close(OldHeap, NoLock);
578 
579 	/* Create the transient table that will receive the re-ordered data */
580 	OIDNewHeap = make_new_heap(tableOid, tableSpace,
581 							   relpersistence,
582 							   AccessExclusiveLock);
583 
584 	/* Copy the heap data into the new table in the desired order */
585 	copy_heap_data(OIDNewHeap, tableOid, indexOid, verbose,
586 				   &swap_toast_by_content, &frozenXid, &cutoffMulti);
587 
588 	/*
589 	 * Swap the physical files of the target and transient tables, then
590 	 * rebuild the target's indexes and throw away the transient table.
591 	 */
592 	finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog,
593 					 swap_toast_by_content, false, true,
594 					 frozenXid, cutoffMulti,
595 					 relpersistence);
596 }
597 
598 
599 /*
600  * Create the transient table that will be filled with new data during
601  * CLUSTER, ALTER TABLE, and similar operations.  The transient table
602  * duplicates the logical structure of the OldHeap, but is placed in
603  * NewTableSpace which might be different from OldHeap's.  Also, it's built
604  * with the specified persistence, which might differ from the original's.
605  *
606  * After this, the caller should load the new heap with transferred/modified
607  * data, then call finish_heap_swap to complete the operation.
608  */
609 Oid
make_new_heap(Oid OIDOldHeap,Oid NewTableSpace,char relpersistence,LOCKMODE lockmode)610 make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, char relpersistence,
611 			  LOCKMODE lockmode)
612 {
613 	TupleDesc	OldHeapDesc;
614 	char		NewHeapName[NAMEDATALEN];
615 	Oid			OIDNewHeap;
616 	Oid			toastid;
617 	Relation	OldHeap;
618 	HeapTuple	tuple;
619 	Datum		reloptions;
620 	bool		isNull;
621 	Oid			namespaceid;
622 
623 	OldHeap = heap_open(OIDOldHeap, lockmode);
624 	OldHeapDesc = RelationGetDescr(OldHeap);
625 
626 	/*
627 	 * Note that the NewHeap will not receive any of the defaults or
628 	 * constraints associated with the OldHeap; we don't need 'em, and there's
629 	 * no reason to spend cycles inserting them into the catalogs only to
630 	 * delete them.
631 	 */
632 
633 	/*
634 	 * But we do want to use reloptions of the old heap for new heap.
635 	 */
636 	tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(OIDOldHeap));
637 	if (!HeapTupleIsValid(tuple))
638 		elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
639 	reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
640 								 &isNull);
641 	if (isNull)
642 		reloptions = (Datum) 0;
643 
644 	if (relpersistence == RELPERSISTENCE_TEMP)
645 		namespaceid = LookupCreationNamespace("pg_temp");
646 	else
647 		namespaceid = RelationGetNamespace(OldHeap);
648 
649 	/*
650 	 * Create the new heap, using a temporary name in the same namespace as
651 	 * the existing table.  NOTE: there is some risk of collision with user
652 	 * relnames.  Working around this seems more trouble than it's worth; in
653 	 * particular, we can't create the new heap in a different namespace from
654 	 * the old, or we will have problems with the TEMP status of temp tables.
655 	 *
656 	 * Note: the new heap is not a shared relation, even if we are rebuilding
657 	 * a shared rel.  However, we do make the new heap mapped if the source is
658 	 * mapped.  This simplifies swap_relation_files, and is absolutely
659 	 * necessary for rebuilding pg_class, for reasons explained there.
660 	 */
661 	snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap);
662 
663 	OIDNewHeap = heap_create_with_catalog(NewHeapName,
664 										  namespaceid,
665 										  NewTableSpace,
666 										  InvalidOid,
667 										  InvalidOid,
668 										  InvalidOid,
669 										  OldHeap->rd_rel->relowner,
670 										  OldHeapDesc,
671 										  NIL,
672 										  RELKIND_RELATION,
673 										  relpersistence,
674 										  false,
675 										  RelationIsMapped(OldHeap),
676 										  true,
677 										  0,
678 										  ONCOMMIT_NOOP,
679 										  reloptions,
680 										  false,
681 										  true,
682 										  true,
683 										  NULL);
684 	Assert(OIDNewHeap != InvalidOid);
685 
686 	ReleaseSysCache(tuple);
687 
688 	/*
689 	 * Advance command counter so that the newly-created relation's catalog
690 	 * tuples will be visible to heap_open.
691 	 */
692 	CommandCounterIncrement();
693 
694 	/*
695 	 * If necessary, create a TOAST table for the new relation.
696 	 *
697 	 * If the relation doesn't have a TOAST table already, we can't need one
698 	 * for the new relation.  The other way around is possible though: if some
699 	 * wide columns have been dropped, NewHeapCreateToastTable can decide that
700 	 * no TOAST table is needed for the new table.
701 	 *
702 	 * Note that NewHeapCreateToastTable ends with CommandCounterIncrement, so
703 	 * that the TOAST table will be visible for insertion.
704 	 */
705 	toastid = OldHeap->rd_rel->reltoastrelid;
706 	if (OidIsValid(toastid))
707 	{
708 		/* keep the existing toast table's reloptions, if any */
709 		tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(toastid));
710 		if (!HeapTupleIsValid(tuple))
711 			elog(ERROR, "cache lookup failed for relation %u", toastid);
712 		reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
713 									 &isNull);
714 		if (isNull)
715 			reloptions = (Datum) 0;
716 
717 		NewHeapCreateToastTable(OIDNewHeap, reloptions, lockmode);
718 
719 		ReleaseSysCache(tuple);
720 	}
721 
722 	heap_close(OldHeap, NoLock);
723 
724 	return OIDNewHeap;
725 }
726 
727 /*
728  * Do the physical copying of heap data.
729  *
730  * There are three output parameters:
731  * *pSwapToastByContent is set true if toast tables must be swapped by content.
732  * *pFreezeXid receives the TransactionId used as freeze cutoff point.
733  * *pCutoffMulti receives the MultiXactId used as a cutoff point.
734  */
735 static void
copy_heap_data(Oid OIDNewHeap,Oid OIDOldHeap,Oid OIDOldIndex,bool verbose,bool * pSwapToastByContent,TransactionId * pFreezeXid,MultiXactId * pCutoffMulti)736 copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
737 			   bool *pSwapToastByContent, TransactionId *pFreezeXid,
738 			   MultiXactId *pCutoffMulti)
739 {
740 	Relation	NewHeap,
741 				OldHeap,
742 				OldIndex;
743 	TupleDesc	oldTupDesc;
744 	TupleDesc	newTupDesc;
745 	int			natts;
746 	Datum	   *values;
747 	bool	   *isnull;
748 	IndexScanDesc indexScan;
749 	HeapScanDesc heapScan;
750 	bool		use_wal;
751 	bool		is_system_catalog;
752 	TransactionId OldestXmin;
753 	TransactionId FreezeXid;
754 	MultiXactId MultiXactCutoff;
755 	RewriteState rwstate;
756 	bool		use_sort;
757 	Tuplesortstate *tuplesort;
758 	double		num_tuples = 0,
759 				tups_vacuumed = 0,
760 				tups_recently_dead = 0;
761 	int			elevel = verbose ? INFO : DEBUG2;
762 	PGRUsage	ru0;
763 
764 	pg_rusage_init(&ru0);
765 
766 	/*
767 	 * Open the relations we need.
768 	 */
769 	NewHeap = heap_open(OIDNewHeap, AccessExclusiveLock);
770 	OldHeap = heap_open(OIDOldHeap, AccessExclusiveLock);
771 	if (OidIsValid(OIDOldIndex))
772 		OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
773 	else
774 		OldIndex = NULL;
775 
776 	/*
777 	 * Their tuple descriptors should be exactly alike, but here we only need
778 	 * assume that they have the same number of columns.
779 	 */
780 	oldTupDesc = RelationGetDescr(OldHeap);
781 	newTupDesc = RelationGetDescr(NewHeap);
782 	Assert(newTupDesc->natts == oldTupDesc->natts);
783 
784 	/* Preallocate values/isnull arrays */
785 	natts = newTupDesc->natts;
786 	values = (Datum *) palloc(natts * sizeof(Datum));
787 	isnull = (bool *) palloc(natts * sizeof(bool));
788 
789 	/*
790 	 * If the OldHeap has a toast table, get lock on the toast table to keep
791 	 * it from being vacuumed.  This is needed because autovacuum processes
792 	 * toast tables independently of their main tables, with no lock on the
793 	 * latter.  If an autovacuum were to start on the toast table after we
794 	 * compute our OldestXmin below, it would use a later OldestXmin, and then
795 	 * possibly remove as DEAD toast tuples belonging to main tuples we think
796 	 * are only RECENTLY_DEAD.  Then we'd fail while trying to copy those
797 	 * tuples.
798 	 *
799 	 * We don't need to open the toast relation here, just lock it.  The lock
800 	 * will be held till end of transaction.
801 	 */
802 	if (OldHeap->rd_rel->reltoastrelid)
803 		LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);
804 
805 	/*
806 	 * We need to log the copied data in WAL iff WAL archiving/streaming is
807 	 * enabled AND it's a WAL-logged rel.
808 	 */
809 	use_wal = XLogIsNeeded() && RelationNeedsWAL(NewHeap);
810 
811 	/* use_wal off requires smgr_targblock be initially invalid */
812 	Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);
813 
814 	/*
815 	 * If both tables have TOAST tables, perform toast swap by content.  It is
816 	 * possible that the old table has a toast table but the new one doesn't,
817 	 * if toastable columns have been dropped.  In that case we have to do
818 	 * swap by links.  This is okay because swap by content is only essential
819 	 * for system catalogs, and we don't support schema changes for them.
820 	 */
821 	if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid)
822 	{
823 		*pSwapToastByContent = true;
824 
825 		/*
826 		 * When doing swap by content, any toast pointers written into NewHeap
827 		 * must use the old toast table's OID, because that's where the toast
828 		 * data will eventually be found.  Set this up by setting rd_toastoid.
829 		 * This also tells toast_save_datum() to preserve the toast value
830 		 * OIDs, which we want so as not to invalidate toast pointers in
831 		 * system catalog caches, and to avoid making multiple copies of a
832 		 * single toast value.
833 		 *
834 		 * Note that we must hold NewHeap open until we are done writing data,
835 		 * since the relcache will not guarantee to remember this setting once
836 		 * the relation is closed.  Also, this technique depends on the fact
837 		 * that no one will try to read from the NewHeap until after we've
838 		 * finished writing it and swapping the rels --- otherwise they could
839 		 * follow the toast pointers to the wrong place.  (It would actually
840 		 * work for values copied over from the old toast table, but not for
841 		 * any values that we toast which were previously not toasted.)
842 		 */
843 		NewHeap->rd_toastoid = OldHeap->rd_rel->reltoastrelid;
844 	}
845 	else
846 		*pSwapToastByContent = false;
847 
848 	/*
849 	 * Compute xids used to freeze and weed out dead tuples and multixacts.
850 	 * Since we're going to rewrite the whole table anyway, there's no reason
851 	 * not to be aggressive about this.
852 	 */
853 	vacuum_set_xid_limits(OldHeap, 0, 0, 0, 0,
854 						  &OldestXmin, &FreezeXid, NULL, &MultiXactCutoff,
855 						  NULL);
856 
857 	/*
858 	 * FreezeXid will become the table's new relfrozenxid, and that mustn't go
859 	 * backwards, so take the max.
860 	 */
861 	if (TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid))
862 		FreezeXid = OldHeap->rd_rel->relfrozenxid;
863 
864 	/*
865 	 * MultiXactCutoff, similarly, shouldn't go backwards either.
866 	 */
867 	if (MultiXactIdPrecedes(MultiXactCutoff, OldHeap->rd_rel->relminmxid))
868 		MultiXactCutoff = OldHeap->rd_rel->relminmxid;
869 
870 	/* return selected values to caller */
871 	*pFreezeXid = FreezeXid;
872 	*pCutoffMulti = MultiXactCutoff;
873 
874 	/* Remember if it's a system catalog */
875 	is_system_catalog = IsSystemRelation(OldHeap);
876 
877 	/* Initialize the rewrite operation */
878 	rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, FreezeXid,
879 								 MultiXactCutoff, use_wal);
880 
881 	/*
882 	 * Decide whether to use an indexscan or seqscan-and-optional-sort to scan
883 	 * the OldHeap.  We know how to use a sort to duplicate the ordering of a
884 	 * btree index, and will use seqscan-and-sort for that case if the planner
885 	 * tells us it's cheaper.  Otherwise, always indexscan if an index is
886 	 * provided, else plain seqscan.
887 	 */
888 	if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID)
889 		use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex);
890 	else
891 		use_sort = false;
892 
893 	/* Set up sorting if wanted */
894 	if (use_sort)
895 		tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
896 											maintenance_work_mem, false);
897 	else
898 		tuplesort = NULL;
899 
900 	/*
901 	 * Prepare to scan the OldHeap.  To ensure we see recently-dead tuples
902 	 * that still need to be copied, we scan with SnapshotAny and use
903 	 * HeapTupleSatisfiesVacuum for the visibility test.
904 	 */
905 	if (OldIndex != NULL && !use_sort)
906 	{
907 		heapScan = NULL;
908 		indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0);
909 		index_rescan(indexScan, NULL, 0, NULL, 0);
910 	}
911 	else
912 	{
913 		heapScan = heap_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL);
914 		indexScan = NULL;
915 	}
916 
917 	/* Log what we're doing */
918 	if (indexScan != NULL)
919 		ereport(elevel,
920 				(errmsg("clustering \"%s.%s\" using index scan on \"%s\"",
921 						get_namespace_name(RelationGetNamespace(OldHeap)),
922 						RelationGetRelationName(OldHeap),
923 						RelationGetRelationName(OldIndex))));
924 	else if (tuplesort != NULL)
925 		ereport(elevel,
926 				(errmsg("clustering \"%s.%s\" using sequential scan and sort",
927 						get_namespace_name(RelationGetNamespace(OldHeap)),
928 						RelationGetRelationName(OldHeap))));
929 	else
930 		ereport(elevel,
931 				(errmsg("vacuuming \"%s.%s\"",
932 						get_namespace_name(RelationGetNamespace(OldHeap)),
933 						RelationGetRelationName(OldHeap))));
934 
935 	/*
936 	 * Scan through the OldHeap, either in OldIndex order or sequentially;
937 	 * copy each tuple into the NewHeap, or transiently to the tuplesort
938 	 * module.  Note that we don't bother sorting dead tuples (they won't get
939 	 * to the new table anyway).
940 	 */
941 	for (;;)
942 	{
943 		HeapTuple	tuple;
944 		Buffer		buf;
945 		bool		isdead;
946 
947 		CHECK_FOR_INTERRUPTS();
948 
949 		if (indexScan != NULL)
950 		{
951 			tuple = index_getnext(indexScan, ForwardScanDirection);
952 			if (tuple == NULL)
953 				break;
954 
955 			/* Since we used no scan keys, should never need to recheck */
956 			if (indexScan->xs_recheck)
957 				elog(ERROR, "CLUSTER does not support lossy index conditions");
958 
959 			buf = indexScan->xs_cbuf;
960 		}
961 		else
962 		{
963 			tuple = heap_getnext(heapScan, ForwardScanDirection);
964 			if (tuple == NULL)
965 				break;
966 
967 			buf = heapScan->rs_cbuf;
968 		}
969 
970 		LockBuffer(buf, BUFFER_LOCK_SHARE);
971 
972 		switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf))
973 		{
974 			case HEAPTUPLE_DEAD:
975 				/* Definitely dead */
976 				isdead = true;
977 				break;
978 			case HEAPTUPLE_RECENTLY_DEAD:
979 				tups_recently_dead += 1;
980 				/* fall through */
981 			case HEAPTUPLE_LIVE:
982 				/* Live or recently dead, must copy it */
983 				isdead = false;
984 				break;
985 			case HEAPTUPLE_INSERT_IN_PROGRESS:
986 
987 				/*
988 				 * Since we hold exclusive lock on the relation, normally the
989 				 * only way to see this is if it was inserted earlier in our
990 				 * own transaction.  However, it can happen in system
991 				 * catalogs, since we tend to release write lock before commit
992 				 * there.  Give a warning if neither case applies; but in any
993 				 * case we had better copy it.
994 				 */
995 				if (!is_system_catalog &&
996 					!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data)))
997 					elog(WARNING, "concurrent insert in progress within table \"%s\"",
998 						 RelationGetRelationName(OldHeap));
999 				/* treat as live */
1000 				isdead = false;
1001 				break;
1002 			case HEAPTUPLE_DELETE_IN_PROGRESS:
1003 
1004 				/*
1005 				 * Similar situation to INSERT_IN_PROGRESS case.
1006 				 */
1007 				if (!is_system_catalog &&
1008 					!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data)))
1009 					elog(WARNING, "concurrent delete in progress within table \"%s\"",
1010 						 RelationGetRelationName(OldHeap));
1011 				/* treat as recently dead */
1012 				tups_recently_dead += 1;
1013 				isdead = false;
1014 				break;
1015 			default:
1016 				elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1017 				isdead = false; /* keep compiler quiet */
1018 				break;
1019 		}
1020 
1021 		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1022 
1023 		if (isdead)
1024 		{
1025 			tups_vacuumed += 1;
1026 			/* heap rewrite module still needs to see it... */
1027 			if (rewrite_heap_dead_tuple(rwstate, tuple))
1028 			{
1029 				/* A previous recently-dead tuple is now known dead */
1030 				tups_vacuumed += 1;
1031 				tups_recently_dead -= 1;
1032 			}
1033 			continue;
1034 		}
1035 
1036 		num_tuples += 1;
1037 		if (tuplesort != NULL)
1038 			tuplesort_putheaptuple(tuplesort, tuple);
1039 		else
1040 			reform_and_rewrite_tuple(tuple,
1041 									 oldTupDesc, newTupDesc,
1042 									 values, isnull,
1043 									 NewHeap->rd_rel->relhasoids, rwstate);
1044 	}
1045 
1046 	if (indexScan != NULL)
1047 		index_endscan(indexScan);
1048 	if (heapScan != NULL)
1049 		heap_endscan(heapScan);
1050 
1051 	/*
1052 	 * In scan-and-sort mode, complete the sort, then read out all live tuples
1053 	 * from the tuplestore and write them to the new relation.
1054 	 */
1055 	if (tuplesort != NULL)
1056 	{
1057 		tuplesort_performsort(tuplesort);
1058 
1059 		for (;;)
1060 		{
1061 			HeapTuple	tuple;
1062 			bool		shouldfree;
1063 
1064 			CHECK_FOR_INTERRUPTS();
1065 
1066 			tuple = tuplesort_getheaptuple(tuplesort, true, &shouldfree);
1067 			if (tuple == NULL)
1068 				break;
1069 
1070 			reform_and_rewrite_tuple(tuple,
1071 									 oldTupDesc, newTupDesc,
1072 									 values, isnull,
1073 									 NewHeap->rd_rel->relhasoids, rwstate);
1074 
1075 			if (shouldfree)
1076 				heap_freetuple(tuple);
1077 		}
1078 
1079 		tuplesort_end(tuplesort);
1080 	}
1081 
1082 	/* Write out any remaining tuples, and fsync if needed */
1083 	end_heap_rewrite(rwstate);
1084 
1085 	/* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */
1086 	NewHeap->rd_toastoid = InvalidOid;
1087 
1088 	/* Log what we did */
1089 	ereport(elevel,
1090 			(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
1091 					RelationGetRelationName(OldHeap),
1092 					tups_vacuumed, num_tuples,
1093 					RelationGetNumberOfBlocks(OldHeap)),
1094 			 errdetail("%.0f dead row versions cannot be removed yet.\n"
1095 					   "%s.",
1096 					   tups_recently_dead,
1097 					   pg_rusage_show(&ru0))));
1098 
1099 	/* Clean up */
1100 	pfree(values);
1101 	pfree(isnull);
1102 
1103 	if (OldIndex != NULL)
1104 		index_close(OldIndex, NoLock);
1105 	heap_close(OldHeap, NoLock);
1106 	heap_close(NewHeap, NoLock);
1107 }
1108 
/*
 * Swap the physical files of two given relations.
 *
 * We swap the physical identity (reltablespace, relfilenode) while keeping the
 * same logical identities of the two relations.  relpersistence is also
 * swapped, which is critical since it determines where buffers live for each
 * relation.
 *
 * We can swap associated TOAST data in either of two ways: recursively swap
 * the physical content of the toast tables (and their indexes), or swap the
 * TOAST links in the given relations' pg_class entries.  The former is needed
 * to manage rewrites of shared catalogs (where we cannot change the pg_class
 * links) while the latter is the only way to handle cases in which a toast
 * table is added or removed altogether.
 *
 * Additionally, the first relation is marked with relfrozenxid set to
 * frozenXid.  It seems a bit ugly to have this here, but the caller would
 * have to do it anyway, so having it here saves a heap_update.  Note: in
 * the swap-toast-links case, we assume we don't need to change the toast
 * table's relfrozenxid: the new version of the toast table should already
 * have relfrozenxid set to RecentXmin, which is good enough.
 *
 * Lastly, if r2 and its toast table and toast index (if any) are mapped,
 * their OIDs are emitted into mapped_tables[].  This is hacky but beats
 * having to look the information up again later in finish_heap_swap.
 *
 * Parameters:
 *	r1, r2: OIDs of the two relations whose pg_class rows are fetched and
 *		whose physical identities are exchanged.
 *	target_is_pg_class: if true, the pg_class rows are not written back;
 *		only relcache invalidations are issued for them.
 *	swap_toast_by_content: if true, recurse to swap the toast tables (and
 *		their valid indexes); if false, swap the reltoastrelid links and
 *		rewrite the toast tables' dependency records.
 *	is_internal: passed to the post-alter hook invoked for r1 (the hook for
 *		r2 is always invoked with "true").
 *	frozenXid, cutoffMulti: stored as r1's relfrozenxid and relminmxid,
 *		unless r1 is an index.
 *	mapped_tables: output cursor; in the mapped-relation case r2 is stored
 *		through it and the cursor is advanced.
 *
 * Failures are reported with elog(ERROR).
 */
static void
swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
					bool swap_toast_by_content,
					bool is_internal,
					TransactionId frozenXid,
					MultiXactId cutoffMulti,
					Oid *mapped_tables)
{
	Relation	relRelation;
	HeapTuple	reltup1,
				reltup2;
	Form_pg_class relform1,
				relform2;
	Oid			relfilenode1,
				relfilenode2;
	Oid			swaptemp;
	char		swptmpchr;
	CatalogIndexState indstate;

	/* We need writable copies of both pg_class tuples. */
	relRelation = heap_open(RelationRelationId, RowExclusiveLock);

	reltup1 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r1));
	if (!HeapTupleIsValid(reltup1))
		elog(ERROR, "cache lookup failed for relation %u", r1);
	relform1 = (Form_pg_class) GETSTRUCT(reltup1);

	reltup2 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r2));
	if (!HeapTupleIsValid(reltup2))
		elog(ERROR, "cache lookup failed for relation %u", r2);
	relform2 = (Form_pg_class) GETSTRUCT(reltup2);

	/*
	 * The relfilenode values recorded in pg_class select the code path below:
	 * both valid means the ordinary case, anything else is handled as the
	 * mapped-relation case.
	 */
	relfilenode1 = relform1->relfilenode;
	relfilenode2 = relform2->relfilenode;

	if (OidIsValid(relfilenode1) && OidIsValid(relfilenode2))
	{
		/*
		 * Normal non-mapped relations: swap relfilenodes, reltablespaces,
		 * relpersistence
		 */
		Assert(!target_is_pg_class);

		swaptemp = relform1->relfilenode;
		relform1->relfilenode = relform2->relfilenode;
		relform2->relfilenode = swaptemp;

		swaptemp = relform1->reltablespace;
		relform1->reltablespace = relform2->reltablespace;
		relform2->reltablespace = swaptemp;

		swptmpchr = relform1->relpersistence;
		relform1->relpersistence = relform2->relpersistence;
		relform2->relpersistence = swptmpchr;

		/* Also swap toast links, if we're swapping by links */
		if (!swap_toast_by_content)
		{
			swaptemp = relform1->reltoastrelid;
			relform1->reltoastrelid = relform2->reltoastrelid;
			relform2->reltoastrelid = swaptemp;
		}
	}
	else
	{
		/*
		 * Mapped-relation case.  Here we have to swap the relation mappings
		 * instead of modifying the pg_class columns.  Both must be mapped.
		 */
		if (OidIsValid(relfilenode1) || OidIsValid(relfilenode2))
			elog(ERROR, "cannot swap mapped relation \"%s\" with non-mapped relation",
				 NameStr(relform1->relname));

		/*
		 * We can't change the tablespace nor persistence of a mapped rel, and
		 * we can't handle toast link swapping for one either, because we must
		 * not apply any critical changes to its pg_class row.  These cases
		 * should be prevented by upstream permissions tests, so these checks
		 * are non-user-facing emergency backstop.
		 */
		if (relform1->reltablespace != relform2->reltablespace)
			elog(ERROR, "cannot change tablespace of mapped relation \"%s\"",
				 NameStr(relform1->relname));
		if (relform1->relpersistence != relform2->relpersistence)
			elog(ERROR, "cannot change persistence of mapped relation \"%s\"",
				 NameStr(relform1->relname));
		if (!swap_toast_by_content &&
			(relform1->reltoastrelid || relform2->reltoastrelid))
			elog(ERROR, "cannot swap toast by links for mapped relation \"%s\"",
				 NameStr(relform1->relname));

		/*
		 * Fetch the mappings --- shouldn't fail, but be paranoid
		 */
		relfilenode1 = RelationMapOidToFilenode(r1, relform1->relisshared);
		if (!OidIsValid(relfilenode1))
			elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
				 NameStr(relform1->relname), r1);
		relfilenode2 = RelationMapOidToFilenode(r2, relform2->relisshared);
		if (!OidIsValid(relfilenode2))
			elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
				 NameStr(relform2->relname), r2);

		/*
		 * Send replacement mappings to relmapper.  Note these won't actually
		 * take effect until CommandCounterIncrement.
		 */
		RelationMapUpdateMap(r1, relfilenode2, relform1->relisshared, false);
		RelationMapUpdateMap(r2, relfilenode1, relform2->relisshared, false);

		/*
		 * Pass OIDs of mapped r2 tables back to caller.  The local pointer
		 * is advanced, so the recursive calls further down store their
		 * entries in the slots following this one.
		 */
		*mapped_tables++ = r2;
	}

	/*
	 * In the case of a shared catalog, these next few steps will only affect
	 * our own database's pg_class row; but that's okay, because they are all
	 * noncritical updates.  That's also an important fact for the case of a
	 * mapped catalog, because it's possible that we'll commit the map change
	 * and then fail to commit the pg_class update.
	 */

	/* set rel1's frozen Xid and minimum MultiXid (not done for indexes) */
	if (relform1->relkind != RELKIND_INDEX)
	{
		Assert(TransactionIdIsNormal(frozenXid));
		relform1->relfrozenxid = frozenXid;
		Assert(MultiXactIdIsValid(cutoffMulti));
		relform1->relminmxid = cutoffMulti;
	}

	/* swap size statistics too, since new rel has freshly-updated stats */
	{
		int32		swap_pages;
		float4		swap_tuples;
		int32		swap_allvisible;

		swap_pages = relform1->relpages;
		relform1->relpages = relform2->relpages;
		relform2->relpages = swap_pages;

		swap_tuples = relform1->reltuples;
		relform1->reltuples = relform2->reltuples;
		relform2->reltuples = swap_tuples;

		swap_allvisible = relform1->relallvisible;
		relform1->relallvisible = relform2->relallvisible;
		relform2->relallvisible = swap_allvisible;
	}

	/*
	 * Update the tuples in pg_class --- unless the target relation of the
	 * swap is pg_class itself.  In that case, there is zero point in making
	 * changes because we'd be updating the old data that we're about to throw
	 * away.  Because the real work being done here for a mapped relation is
	 * just to change the relation map settings, it's all right to not update
	 * the pg_class rows in this case. The most important changes will instead
	 * be performed later, in finish_heap_swap() itself.
	 */
	if (!target_is_pg_class)
	{
		simple_heap_update(relRelation, &reltup1->t_self, reltup1);
		simple_heap_update(relRelation, &reltup2->t_self, reltup2);

		/* Keep system catalogs current */
		indstate = CatalogOpenIndexes(relRelation);
		CatalogIndexInsert(indstate, reltup1);
		CatalogIndexInsert(indstate, reltup2);
		CatalogCloseIndexes(indstate);
	}
	else
	{
		/* no update ... but we do still need relcache inval */
		CacheInvalidateRelcacheByTuple(reltup1);
		CacheInvalidateRelcacheByTuple(reltup2);
	}

	/*
	 * Post alter hook for modified relations. The change to r2 is always
	 * internal, but r1 depends on the invocation context.
	 */
	InvokeObjectPostAlterHookArg(RelationRelationId, r1, 0,
								 InvalidOid, is_internal);
	InvokeObjectPostAlterHookArg(RelationRelationId, r2, 0,
								 InvalidOid, true);

	/*
	 * If we have toast tables associated with the relations being swapped,
	 * deal with them too.  Note that in the swap-by-links case the
	 * reltoastrelid fields examined here have already been exchanged above.
	 */
	if (relform1->reltoastrelid || relform2->reltoastrelid)
	{
		if (swap_toast_by_content)
		{
			if (relform1->reltoastrelid && relform2->reltoastrelid)
			{
				/* Recursively swap the contents of the toast tables */
				swap_relation_files(relform1->reltoastrelid,
									relform2->reltoastrelid,
									target_is_pg_class,
									swap_toast_by_content,
									is_internal,
									frozenXid,
									cutoffMulti,
									mapped_tables);
			}
			else
			{
				/* caller messed up */
				elog(ERROR, "cannot swap toast files by content when there's only one");
			}
		}
		else
		{
			/*
			 * We swapped the ownership links, so we need to change dependency
			 * data to match.
			 *
			 * NOTE: it is possible that only one table has a toast table.
			 *
			 * NOTE: at present, a TOAST table's only dependency is the one on
			 * its owning table.  If more are ever created, we'd need to use
			 * something more selective than deleteDependencyRecordsFor() to
			 * get rid of just the link we want.
			 */
			ObjectAddress baseobject,
						toastobject;
			long		count;

			/*
			 * We disallow this case for system catalogs, to avoid the
			 * possibility that the catalog we're rebuilding is one of the
			 * ones the dependency changes would change.  It's too late to be
			 * making any data changes to the target catalog.
			 */
			if (IsSystemClass(r1, relform1))
				elog(ERROR, "cannot swap toast files by links for system catalogs");

			/* Delete old dependencies */
			if (relform1->reltoastrelid)
			{
				count = deleteDependencyRecordsFor(RelationRelationId,
												   relform1->reltoastrelid,
												   false);
				if (count != 1)
					elog(ERROR, "expected one dependency record for TOAST table, found %ld",
						 count);
			}
			if (relform2->reltoastrelid)
			{
				count = deleteDependencyRecordsFor(RelationRelationId,
												   relform2->reltoastrelid,
												   false);
				if (count != 1)
					elog(ERROR, "expected one dependency record for TOAST table, found %ld",
						 count);
			}

			/* Register new dependencies */
			baseobject.classId = RelationRelationId;
			baseobject.objectSubId = 0;
			toastobject.classId = RelationRelationId;
			toastobject.objectSubId = 0;

			if (relform1->reltoastrelid)
			{
				baseobject.objectId = r1;
				toastobject.objectId = relform1->reltoastrelid;
				recordDependencyOn(&toastobject, &baseobject,
								   DEPENDENCY_INTERNAL);
			}

			if (relform2->reltoastrelid)
			{
				baseobject.objectId = r2;
				toastobject.objectId = relform2->reltoastrelid;
				recordDependencyOn(&toastobject, &baseobject,
								   DEPENDENCY_INTERNAL);
			}
		}
	}

	/*
	 * If we're swapping two toast tables by content, do the same for their
	 * valid index. The swap can actually be safely done only if the relations
	 * have indexes.
	 */
	if (swap_toast_by_content &&
		relform1->relkind == RELKIND_TOASTVALUE &&
		relform2->relkind == RELKIND_TOASTVALUE)
	{
		Oid			toastIndex1,
					toastIndex2;

		/* Get valid index for each relation */
		toastIndex1 = toast_get_valid_index(r1,
											AccessExclusiveLock);
		toastIndex2 = toast_get_valid_index(r2,
											AccessExclusiveLock);

		/*
		 * Invalid cutoffs are passed here; the relfrozenxid/relminmxid
		 * update above is skipped for RELKIND_INDEX relations, so they are
		 * presumably never looked at for these indexes.
		 */
		swap_relation_files(toastIndex1,
							toastIndex2,
							target_is_pg_class,
							swap_toast_by_content,
							is_internal,
							InvalidTransactionId,
							InvalidMultiXactId,
							mapped_tables);
	}

	/* Clean up. */
	heap_freetuple(reltup1);
	heap_freetuple(reltup2);

	heap_close(relRelation, RowExclusiveLock);

	/*
	 * Close both relcache entries' smgr links.  We need this kluge because
	 * both links will be invalidated during upcoming CommandCounterIncrement.
	 * Whichever of the rels is the second to be cleared will have a dangling
	 * reference to the other's smgr entry.  Rather than trying to avoid this
	 * by ordering operations just so, it's easiest to close the links first.
	 * (Fortunately, since one of the entries is local in our transaction,
	 * it's sufficient to clear out our own relcache this way; the problem
	 * cannot arise for other backends when they see our update on the
	 * non-transient relation.)
	 *
	 * Caution: the placement of this step interacts with the decision to
	 * handle toast rels by recursion.  When we are trying to rebuild pg_class
	 * itself, the smgr close on pg_class must happen after all accesses in
	 * this function.
	 */
	RelationCloseSmgrByOid(r1);
	RelationCloseSmgrByOid(r2);
}
1470 
1471 /*
1472  * Remove the transient table that was built by make_new_heap, and finish
1473  * cleaning up (including rebuilding all indexes on the old heap).
1474  */
1475 void
finish_heap_swap(Oid OIDOldHeap,Oid OIDNewHeap,bool is_system_catalog,bool swap_toast_by_content,bool check_constraints,bool is_internal,TransactionId frozenXid,MultiXactId cutoffMulti,char newrelpersistence)1476 finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
1477 				 bool is_system_catalog,
1478 				 bool swap_toast_by_content,
1479 				 bool check_constraints,
1480 				 bool is_internal,
1481 				 TransactionId frozenXid,
1482 				 MultiXactId cutoffMulti,
1483 				 char newrelpersistence)
1484 {
1485 	ObjectAddress object;
1486 	Oid			mapped_tables[4];
1487 	int			reindex_flags;
1488 	int			i;
1489 
1490 	/* Zero out possible results from swapped_relation_files */
1491 	memset(mapped_tables, 0, sizeof(mapped_tables));
1492 
1493 	/*
1494 	 * Swap the contents of the heap relations (including any toast tables).
1495 	 * Also set old heap's relfrozenxid to frozenXid.
1496 	 */
1497 	swap_relation_files(OIDOldHeap, OIDNewHeap,
1498 						(OIDOldHeap == RelationRelationId),
1499 						swap_toast_by_content, is_internal,
1500 						frozenXid, cutoffMulti, mapped_tables);
1501 
1502 	/*
1503 	 * If it's a system catalog, queue an sinval message to flush all
1504 	 * catcaches on the catalog when we reach CommandCounterIncrement.
1505 	 */
1506 	if (is_system_catalog)
1507 		CacheInvalidateCatalog(OIDOldHeap);
1508 
1509 	/*
1510 	 * Rebuild each index on the relation (but not the toast table, which is
1511 	 * all-new at this point).  It is important to do this before the DROP
1512 	 * step because if we are processing a system catalog that will be used
1513 	 * during DROP, we want to have its indexes available.  There is no
1514 	 * advantage to the other order anyway because this is all transactional,
1515 	 * so no chance to reclaim disk space before commit.  We do not need a
1516 	 * final CommandCounterIncrement() because reindex_relation does it.
1517 	 *
1518 	 * Note: because index_build is called via reindex_relation, it will never
1519 	 * set indcheckxmin true for the indexes.  This is OK even though in some
1520 	 * sense we are building new indexes rather than rebuilding existing ones,
1521 	 * because the new heap won't contain any HOT chains at all, let alone
1522 	 * broken ones, so it can't be necessary to set indcheckxmin.
1523 	 */
1524 	reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE;
1525 	if (check_constraints)
1526 		reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS;
1527 
1528 	/*
1529 	 * Ensure that the indexes have the same persistence as the parent
1530 	 * relation.
1531 	 */
1532 	if (newrelpersistence == RELPERSISTENCE_UNLOGGED)
1533 		reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED;
1534 	else if (newrelpersistence == RELPERSISTENCE_PERMANENT)
1535 		reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT;
1536 
1537 	reindex_relation(OIDOldHeap, reindex_flags, 0);
1538 
1539 	/*
1540 	 * If the relation being rebuild is pg_class, swap_relation_files()
1541 	 * couldn't update pg_class's own pg_class entry (check comments in
1542 	 * swap_relation_files()), thus relfrozenxid was not updated. That's
1543 	 * annoying because a potential reason for doing a VACUUM FULL is a
1544 	 * imminent or actual anti-wraparound shutdown.  So, now that we can
1545 	 * access the new relation using its indices, update relfrozenxid.
1546 	 * pg_class doesn't have a toast relation, so we don't need to update the
1547 	 * corresponding toast relation. Not that there's little point moving all
1548 	 * relfrozenxid updates here since swap_relation_files() needs to write to
1549 	 * pg_class for non-mapped relations anyway.
1550 	 */
1551 	if (OIDOldHeap == RelationRelationId)
1552 	{
1553 		Relation	relRelation;
1554 		HeapTuple	reltup;
1555 		Form_pg_class relform;
1556 
1557 		relRelation = heap_open(RelationRelationId, RowExclusiveLock);
1558 
1559 		reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDOldHeap));
1560 		if (!HeapTupleIsValid(reltup))
1561 			elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
1562 		relform = (Form_pg_class) GETSTRUCT(reltup);
1563 
1564 		relform->relfrozenxid = frozenXid;
1565 		relform->relminmxid = cutoffMulti;
1566 
1567 		simple_heap_update(relRelation, &reltup->t_self, reltup);
1568 		CatalogUpdateIndexes(relRelation, reltup);
1569 
1570 		heap_close(relRelation, RowExclusiveLock);
1571 	}
1572 
1573 	/* Destroy new heap with old filenode */
1574 	object.classId = RelationRelationId;
1575 	object.objectId = OIDNewHeap;
1576 	object.objectSubId = 0;
1577 
1578 	/*
1579 	 * The new relation is local to our transaction and we know nothing
1580 	 * depends on it, so DROP_RESTRICT should be OK.
1581 	 */
1582 	performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
1583 
1584 	/* performDeletion does CommandCounterIncrement at end */
1585 
1586 	/*
1587 	 * Now we must remove any relation mapping entries that we set up for the
1588 	 * transient table, as well as its toast table and toast index if any. If
1589 	 * we fail to do this before commit, the relmapper will complain about new
1590 	 * permanent map entries being added post-bootstrap.
1591 	 */
1592 	for (i = 0; OidIsValid(mapped_tables[i]); i++)
1593 		RelationMapRemoveMapping(mapped_tables[i]);
1594 
1595 	/*
1596 	 * At this point, everything is kosher except that, if we did toast swap
1597 	 * by links, the toast table's name corresponds to the transient table.
1598 	 * The name is irrelevant to the backend because it's referenced by OID,
1599 	 * but users looking at the catalogs could be confused.  Rename it to
1600 	 * prevent this problem.
1601 	 *
1602 	 * Note no lock required on the relation, because we already hold an
1603 	 * exclusive lock on it.
1604 	 */
1605 	if (!swap_toast_by_content)
1606 	{
1607 		Relation	newrel;
1608 
1609 		newrel = heap_open(OIDOldHeap, NoLock);
1610 		if (OidIsValid(newrel->rd_rel->reltoastrelid))
1611 		{
1612 			Oid			toastidx;
1613 			char		NewToastName[NAMEDATALEN];
1614 
1615 			/* Get the associated valid index to be renamed */
1616 			toastidx = toast_get_valid_index(newrel->rd_rel->reltoastrelid,
1617 											 NoLock);
1618 
1619 			/* rename the toast table ... */
1620 			snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u",
1621 					 OIDOldHeap);
1622 			RenameRelationInternal(newrel->rd_rel->reltoastrelid,
1623 								   NewToastName, true);
1624 
1625 			/* ... and its valid index too. */
1626 			snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index",
1627 					 OIDOldHeap);
1628 
1629 			RenameRelationInternal(toastidx,
1630 								   NewToastName, true);
1631 		}
1632 		relation_close(newrel, NoLock);
1633 	}
1634 }
1635 
1636 
1637 /*
1638  * Get a list of tables that the current user owns and
1639  * have indisclustered set.  Return the list in a List * of rvsToCluster
1640  * with the tableOid and the indexOid on which the table is already
1641  * clustered.
1642  */
1643 static List *
get_tables_to_cluster(MemoryContext cluster_context)1644 get_tables_to_cluster(MemoryContext cluster_context)
1645 {
1646 	Relation	indRelation;
1647 	HeapScanDesc scan;
1648 	ScanKeyData entry;
1649 	HeapTuple	indexTuple;
1650 	Form_pg_index index;
1651 	MemoryContext old_context;
1652 	RelToCluster *rvtc;
1653 	List	   *rvs = NIL;
1654 
1655 	/*
1656 	 * Get all indexes that have indisclustered set and are owned by
1657 	 * appropriate user. System relations or nailed-in relations cannot ever
1658 	 * have indisclustered set, because CLUSTER will refuse to set it when
1659 	 * called with one of them as argument.
1660 	 */
1661 	indRelation = heap_open(IndexRelationId, AccessShareLock);
1662 	ScanKeyInit(&entry,
1663 				Anum_pg_index_indisclustered,
1664 				BTEqualStrategyNumber, F_BOOLEQ,
1665 				BoolGetDatum(true));
1666 	scan = heap_beginscan_catalog(indRelation, 1, &entry);
1667 	while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1668 	{
1669 		index = (Form_pg_index) GETSTRUCT(indexTuple);
1670 
1671 		if (!pg_class_ownercheck(index->indrelid, GetUserId()))
1672 			continue;
1673 
1674 		/*
1675 		 * We have to build the list in a different memory context so it will
1676 		 * survive the cross-transaction processing
1677 		 */
1678 		old_context = MemoryContextSwitchTo(cluster_context);
1679 
1680 		rvtc = (RelToCluster *) palloc(sizeof(RelToCluster));
1681 		rvtc->tableOid = index->indrelid;
1682 		rvtc->indexOid = index->indexrelid;
1683 		rvs = lcons(rvtc, rvs);
1684 
1685 		MemoryContextSwitchTo(old_context);
1686 	}
1687 	heap_endscan(scan);
1688 
1689 	relation_close(indRelation, AccessShareLock);
1690 
1691 	return rvs;
1692 }
1693 
1694 
1695 /*
1696  * Reconstruct and rewrite the given tuple
1697  *
1698  * We cannot simply copy the tuple as-is, for several reasons:
1699  *
1700  * 1. We'd like to squeeze out the values of any dropped columns, both
1701  * to save space and to ensure we have no corner-case failures. (It's
1702  * possible for example that the new table hasn't got a TOAST table
1703  * and so is unable to store any large values of dropped cols.)
1704  *
1705  * 2. The tuple might not even be legal for the new table; this is
1706  * currently only known to happen as an after-effect of ALTER TABLE
1707  * SET WITHOUT OIDS.
1708  *
1709  * So, we must reconstruct the tuple from component Datums.
1710  */
1711 static void
reform_and_rewrite_tuple(HeapTuple tuple,TupleDesc oldTupDesc,TupleDesc newTupDesc,Datum * values,bool * isnull,bool newRelHasOids,RewriteState rwstate)1712 reform_and_rewrite_tuple(HeapTuple tuple,
1713 						 TupleDesc oldTupDesc, TupleDesc newTupDesc,
1714 						 Datum *values, bool *isnull,
1715 						 bool newRelHasOids, RewriteState rwstate)
1716 {
1717 	HeapTuple	copiedTuple;
1718 	int			i;
1719 
1720 	heap_deform_tuple(tuple, oldTupDesc, values, isnull);
1721 
1722 	/* Be sure to null out any dropped columns */
1723 	for (i = 0; i < newTupDesc->natts; i++)
1724 	{
1725 		if (newTupDesc->attrs[i]->attisdropped)
1726 			isnull[i] = true;
1727 	}
1728 
1729 	copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
1730 
1731 	/* Preserve OID, if any */
1732 	if (newRelHasOids)
1733 		HeapTupleSetOid(copiedTuple, HeapTupleGetOid(tuple));
1734 
1735 	/* The heap rewrite module does the rest */
1736 	rewrite_heap_tuple(rwstate, tuple, copiedTuple);
1737 
1738 	heap_freetuple(copiedTuple);
1739 }
1740