1 /*
2  * This file and its contents are licensed under the Timescale License.
3  * Please see the included NOTICE for copyright information and
4  * LICENSE-TIMESCALE for a copy of the license.
5  */
6 
7 /*
8  * This file contains source code that was copied and/or modified from the
9  * PostgreSQL database, which is licensed under the open-source PostgreSQL
10  * License. Please see the NOTICE at the top level directory for a copy of
11  * the PostgreSQL License.
12  */
13 
14 /* see postgres commit ab5e9caa4a3ec4765348a0482e88edcf3f6aab4a */
15 
16 #include <postgres.h>
17 #include <access/amapi.h>
18 #include <access/multixact.h>
19 #include <access/relscan.h>
20 #include <access/rewriteheap.h>
21 #include <access/transam.h>
22 #include <access/xact.h>
23 #include <access/xlog.h>
24 #include <catalog/catalog.h>
25 #include <catalog/dependency.h>
26 #include <catalog/heap.h>
27 #include <catalog/index.h>
28 #include <catalog/namespace.h>
29 #include <catalog/objectaccess.h>
30 #include <catalog/pg_am.h>
31 #include <catalog/toasting.h>
32 #include <commands/cluster.h>
33 #include <commands/tablecmds.h>
34 #include <commands/tablespace.h>
35 #include <commands/vacuum.h>
36 #include <miscadmin.h>
37 #include <nodes/pg_list.h>
38 #include <optimizer/planner.h>
39 #include <storage/bufmgr.h>
40 #include <storage/lmgr.h>
41 #include <storage/predicate.h>
42 #include <storage/smgr.h>
43 #include <utils/acl.h>
44 #include <utils/fmgroids.h>
45 #include <utils/guc.h>
46 #include <utils/inval.h>
47 #include <utils/lsyscache.h>
48 #include <utils/memutils.h>
49 #include <utils/pg_rusage.h>
50 #include <utils/relmapper.h>
51 #include <utils/snapmgr.h>
52 #include <utils/syscache.h>
53 #include <utils/tuplesort.h>
54 #include <executor/spi.h>
55 #include <utils/snapmgr.h>
56 
57 #include "compat/compat.h"
58 #if PG13_LT
59 #include <access/tuptoaster.h>
60 #else
61 #include <access/toast_internals.h>
62 #endif
63 
64 #include "annotations.h"
65 #include "chunk.h"
66 #include "chunk_copy.h"
67 #include "chunk_index.h"
68 #include "hypertable_cache.h"
69 #include "indexing.h"
70 #include "reorder.h"
71 
72 extern void timescale_reorder_rel(Oid tableOid, Oid indexOid, bool verbose, Oid wait_id,
73 								  Oid destination_tablespace, Oid index_tablespace);
74 
75 #define REORDER_ACCESS_EXCLUSIVE_DEADLOCK_TIMEOUT "101000"
76 
77 static void timescale_rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose, Oid wait_id,
78 									   Oid destination_tablespace, Oid index_tablespace);
79 static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
80 						   bool *pSwapToastByContent, TransactionId *pFreezeXid,
81 						   MultiXactId *pCutoffMulti);
82 
83 static void finish_heap_swaps(Oid OIDOldHeap, Oid OIDNewHeap, List *old_index_oids,
84 							  List *new_index_oids, bool swap_toast_by_content, bool is_internal,
85 							  TransactionId frozenXid, MultiXactId cutoffMulti, Oid wait_id);
86 
87 static void swap_relation_files(Oid r1, Oid r2, bool swap_toast_by_content, bool is_internal,
88 								TransactionId frozenXid, MultiXactId cutoffMulti);
89 
90 static bool chunk_get_reorder_index(Hypertable *ht, Chunk *chunk, Oid index_relid,
91 									ChunkIndexMapping *cim_out);
92 
93 Datum
tsl_reorder_chunk(PG_FUNCTION_ARGS)94 tsl_reorder_chunk(PG_FUNCTION_ARGS)
95 {
96 	Oid chunk_id = PG_ARGISNULL(0) ? InvalidOid : PG_GETARG_OID(0);
97 	Oid index_id = PG_ARGISNULL(1) ? InvalidOid : PG_GETARG_OID(1);
98 	bool verbose = PG_ARGISNULL(2) ? false : PG_GETARG_BOOL(2);
99 
100 	/* used for debugging purposes only see finish_heap_swaps */
101 	Oid wait_id = PG_NARGS() < 4 || PG_ARGISNULL(3) ? InvalidOid : PG_GETARG_OID(3);
102 
103 	/*
104 	 * Allow reorder in transactions for testing purposes only
105 	 */
106 	if (!OidIsValid(wait_id))
107 		PreventInTransactionBlock(true, "reorder");
108 
109 	reorder_chunk(chunk_id, index_id, verbose, wait_id, InvalidOid, InvalidOid);
110 	PG_RETURN_VOID();
111 }
112 
/*
 * SQL-callable entry point for moving a chunk to a new tablespace.
 *
 * Args: 0 = chunk relid, 1 = destination tablespace name,
 *       2 = index destination tablespace name, 3 = optional index relid
 *       (reorder while moving), 4 = verbose flag,
 *       5 = optional debug-only wait relid (see finish_heap_swaps).
 *
 * A chunk that has a compressed companion chunk is moved by ALTER TABLE SET
 * TABLESPACE on both relations; otherwise the move is implemented as a
 * reorder into the destination tablespace.
 */
Datum
tsl_move_chunk(PG_FUNCTION_ARGS)
{
	Oid chunk_id = PG_ARGISNULL(0) ? InvalidOid : PG_GETARG_OID(0);
	/* Resolve tablespace names right away; missing_ok=false errors on bad names */
	Oid destination_tablespace =
		PG_ARGISNULL(1) ? InvalidOid : get_tablespace_oid(PG_GETARG_NAME(1)->data, false);
	Oid index_destination_tablespace =
		PG_ARGISNULL(2) ? InvalidOid : get_tablespace_oid(PG_GETARG_NAME(2)->data, false);
	Oid index_id = PG_ARGISNULL(3) ? InvalidOid : PG_GETARG_OID(3);
	bool verbose = PG_ARGISNULL(4) ? false : PG_GETARG_BOOL(4);
	Chunk *chunk;

	/* used for debugging purposes only see finish_heap_swaps */
	Oid wait_id = PG_NARGS() < 6 || PG_ARGISNULL(5) ? InvalidOid : PG_GETARG_OID(5);

	/*
	 * Allow move in transactions for testing purposes only
	 */
	if (!OidIsValid(wait_id))
		PreventInTransactionBlock(true, "move");

	/*
	 * Index_destination_tablespace is currently a required parameter in order
	 * to avoid situations where there is ambiguity about where indexes should
	 * be placed based on where the index was created and the new tablespace
	 * (and avoid interactions with multi-tablespace hypertable functionality).
	 * Eventually we may want to offer an option to keep indexes in the
	 * tablespace of their parent if it is specified.
	 */
	if (!OidIsValid(chunk_id) || !OidIsValid(destination_tablespace) ||
		!OidIsValid(index_destination_tablespace))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("valid chunk, destination_tablespace, and index_destination_tablespaces "
						"are required")));

	chunk = ts_chunk_get_by_relid(chunk_id, false);

	if (NULL == chunk)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("\"%s\" is not a chunk", get_rel_name(chunk_id))));

	/* Internal compressed chunks cannot be targeted directly; move the parent */
	if (ts_chunk_contains_compressed_data(chunk))
	{
		Chunk *chunk_parent = ts_chunk_get_compressed_chunk_parent(chunk);

		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot directly move internal compression data"),
				 errdetail("Chunk \"%s\" contains compressed data for chunk \"%s\" and cannot be "
						   "moved directly.",
						   get_rel_name(chunk_id),
						   get_rel_name(chunk_parent->table_id)),
				 errhint("Moving chunk \"%s\" will also move the compressed data.",
						 get_rel_name(chunk_parent->table_id))));
	}

	/* If chunk is compressed move it by altering tablespace on both chunks */
	if (OidIsValid(chunk->fd.compressed_chunk_id))
	{
		Chunk *compressed_chunk = ts_chunk_get_by_id(chunk->fd.compressed_chunk_id, true);
		/* Same ALTER TABLE command is reused for both the chunk and its companion */
		AlterTableCmd cmd = { .type = T_AlterTableCmd,
							  .subtype = AT_SetTableSpace,
							  .name = get_tablespace_name(destination_tablespace) };

		/* Reordering compressed chunks is not supported, so the index is ignored */
		if (OidIsValid(index_id))
			ereport(NOTICE,
					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
					 errmsg("ignoring index parameter"),
					 errdetail("Chunk will not be reordered as it has compressed data.")));

		AlterTableInternal(chunk_id, list_make1(&cmd), false);
		AlterTableInternal(compressed_chunk->table_id, list_make1(&cmd), false);
	}
	else
	{
		/* Uncompressed chunk: move by rewriting it into the new tablespace */
		reorder_chunk(chunk_id,
					  index_id,
					  verbose,
					  wait_id,
					  destination_tablespace,
					  index_destination_tablespace);
	}

	PG_RETURN_VOID();
}
200 
201 /*
202  * Implement a distributed chunk copy/move operation.
203  *
204  * We use a procedure because multiple steps need to be performed via multiple
205  * transactions across the access node and the two datanodes that are involved.
206  * The progress of the various stages/steps are tracked in the
207  * CHUNK_COPY_OPERATION catalog table
208  */
209 static void
tsl_copy_or_move_chunk_proc(FunctionCallInfo fcinfo,bool delete_on_src_node)210 tsl_copy_or_move_chunk_proc(FunctionCallInfo fcinfo, bool delete_on_src_node)
211 {
212 	Oid chunk_id = PG_ARGISNULL(0) ? InvalidOid : PG_GETARG_OID(0);
213 	const char *src_node_name = PG_ARGISNULL(1) ? NULL : NameStr(*PG_GETARG_NAME(1));
214 	const char *dst_node_name = PG_ARGISNULL(2) ? NULL : NameStr(*PG_GETARG_NAME(2));
215 	int rc;
216 	bool nonatomic = fcinfo->context && IsA(fcinfo->context, CallContext) &&
217 					 !castNode(CallContext, fcinfo->context)->atomic;
218 
219 	TS_PREVENT_FUNC_IF_READ_ONLY();
220 
221 	PreventInTransactionBlock(true, get_func_name(FC_FN_OID(fcinfo)));
222 
223 	/* src_node and dst_node both have to be non-NULL */
224 	if (src_node_name == NULL || dst_node_name == NULL)
225 		ereport(ERROR,
226 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
227 				 errmsg("invalid source or destination node")));
228 
229 	if (!OidIsValid(chunk_id))
230 		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid chunk")));
231 
232 	if ((rc = SPI_connect_ext(nonatomic ? SPI_OPT_NONATOMIC : 0)) != SPI_OK_CONNECT)
233 		elog(ERROR, "SPI_connect failed: %s", SPI_result_code_string(rc));
234 
235 	/* perform the actual distributed chunk move after a few sanity checks */
236 	chunk_copy(chunk_id, src_node_name, dst_node_name, delete_on_src_node);
237 
238 	if ((rc = SPI_finish()) != SPI_OK_FINISH)
239 		elog(ERROR, "SPI_finish failed: %s", SPI_result_code_string(rc));
240 }
241 
242 Datum
tsl_move_chunk_proc(PG_FUNCTION_ARGS)243 tsl_move_chunk_proc(PG_FUNCTION_ARGS)
244 {
245 	tsl_copy_or_move_chunk_proc(fcinfo, true);
246 
247 	PG_RETURN_VOID();
248 }
249 
250 Datum
tsl_copy_chunk_proc(PG_FUNCTION_ARGS)251 tsl_copy_chunk_proc(PG_FUNCTION_ARGS)
252 {
253 	tsl_copy_or_move_chunk_proc(fcinfo, false);
254 
255 	PG_RETURN_VOID();
256 }
257 
258 Datum
tsl_copy_chunk_cleanup_proc(PG_FUNCTION_ARGS)259 tsl_copy_chunk_cleanup_proc(PG_FUNCTION_ARGS)
260 {
261 	const char *operation_id = PG_ARGISNULL(0) ? NULL : NameStr(*PG_GETARG_NAME(0));
262 	int rc;
263 	bool nonatomic = fcinfo->context && IsA(fcinfo->context, CallContext) &&
264 					 !castNode(CallContext, fcinfo->context)->atomic;
265 
266 	TS_PREVENT_FUNC_IF_READ_ONLY();
267 
268 	PreventInTransactionBlock(true, get_func_name(FC_FN_OID(fcinfo)));
269 
270 	/* valid input has to be provided */
271 	if (operation_id == NULL)
272 		ereport(ERROR,
273 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
274 				 errmsg("invalid chunk copy operation id")));
275 
276 	if ((rc = SPI_connect_ext(nonatomic ? SPI_OPT_NONATOMIC : 0)) != SPI_OK_CONNECT)
277 		elog(ERROR, "SPI_connect failed: %s", SPI_result_code_string(rc));
278 
279 	/* perform the cleanup/repair depending on the stage */
280 	chunk_copy_cleanup(operation_id);
281 
282 	if ((rc = SPI_finish()) != SPI_OK_FINISH)
283 		elog(ERROR, "SPI_finish failed: %s", SPI_result_code_string(rc));
284 
285 	PG_RETURN_VOID();
286 }
287 
288 void
reorder_chunk(Oid chunk_id,Oid index_id,bool verbose,Oid wait_id,Oid destination_tablespace,Oid index_tablespace)289 reorder_chunk(Oid chunk_id, Oid index_id, bool verbose, Oid wait_id, Oid destination_tablespace,
290 			  Oid index_tablespace)
291 {
292 	Chunk *chunk;
293 	Cache *hcache;
294 	Hypertable *ht;
295 	ChunkIndexMapping cim;
296 
297 	if (!OidIsValid(chunk_id))
298 		ereport(ERROR,
299 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
300 				 errmsg("must provide a valid chunk to cluster")));
301 
302 	chunk = ts_chunk_get_by_relid(chunk_id, false);
303 
304 	if (NULL == chunk)
305 		ereport(ERROR,
306 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
307 				 errmsg("\"%s\" is not a chunk", get_rel_name(chunk_id))));
308 
309 	ht = ts_hypertable_cache_get_cache_and_entry(chunk->hypertable_relid, CACHE_FLAG_NONE, &hcache);
310 
311 	/* Our check gives better error messages, but keep the original one too. */
312 	ts_hypertable_permissions_check(ht->main_table_relid, GetUserId());
313 
314 	if (!pg_class_ownercheck(ht->main_table_relid, GetUserId()))
315 	{
316 		Oid main_table_relid = ht->main_table_relid;
317 
318 		ts_cache_release(hcache);
319 		aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_TABLE, get_rel_name(main_table_relid));
320 	}
321 
322 	if (hypertable_is_distributed(ht))
323 		ereport(ERROR,
324 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
325 				 errmsg("move_chunk() and reorder_chunk() cannot be used "
326 						"with distributed hypertables")));
327 
328 	if (!chunk_get_reorder_index(ht, chunk, index_id, &cim))
329 	{
330 		ts_cache_release(hcache);
331 		if (OidIsValid(index_id))
332 			ereport(ERROR,
333 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
334 					 errmsg("\"%s\" is not a valid clustering index for table \"%s\"",
335 							get_rel_name(index_id),
336 							get_rel_name(chunk_id))));
337 		else
338 			ereport(ERROR,
339 					(errcode(ERRCODE_UNDEFINED_OBJECT),
340 					 errmsg("there is no previously clustered index for table \"%s\"",
341 							get_rel_name(chunk_id))));
342 	}
343 
344 	if (OidIsValid(destination_tablespace) && destination_tablespace != MyDatabaseTableSpace)
345 	{
346 		AclResult aclresult;
347 
348 		aclresult = pg_tablespace_aclcheck(destination_tablespace, GetUserId(), ACL_CREATE);
349 		if (aclresult != ACLCHECK_OK)
350 			ereport(ERROR,
351 					(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
352 					 errmsg("permission denied for tablespace \"%s\"",
353 							get_tablespace_name(destination_tablespace))));
354 		;
355 	}
356 
357 	if (OidIsValid(index_tablespace) && index_tablespace != MyDatabaseTableSpace)
358 	{
359 		AclResult aclresult;
360 
361 		aclresult = pg_tablespace_aclcheck(index_tablespace, GetUserId(), ACL_CREATE);
362 		if (aclresult != ACLCHECK_OK)
363 			ereport(ERROR,
364 					(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
365 					 errmsg("permission denied for tablespace \"%s\"",
366 							get_tablespace_name(index_tablespace))));
367 	}
368 
369 	Assert(cim.chunkoid == chunk_id);
370 
371 	/*
372 	 * We must mark each chunk index as clustered before calling reorder_rel()
373 	 * because it expects indexes that need to be rechecked (due to new
374 	 * transaction) to already have that mark set
375 	 */
376 	ts_chunk_index_mark_clustered(cim.chunkoid, cim.indexoid);
377 	timescale_reorder_rel(cim.chunkoid,
378 						  cim.indexoid,
379 						  verbose,
380 						  wait_id,
381 						  destination_tablespace,
382 						  index_tablespace);
383 	ts_cache_release(hcache);
384 }
385 
386 /*
387  * Find the index to reorder a chunk on based on a possibly NULL indexname
388  * returns NULL if no such index is found
389  */
390 static bool
chunk_get_reorder_index(Hypertable * ht,Chunk * chunk,Oid index_relid,ChunkIndexMapping * cim_out)391 chunk_get_reorder_index(Hypertable *ht, Chunk *chunk, Oid index_relid, ChunkIndexMapping *cim_out)
392 {
393 	/*
394 	 * Index search order: 1. Explicitly named index 2. Chunk cluster index 3.
395 	 * Hypertable cluster index
396 	 */
397 	if (OidIsValid(index_relid))
398 	{
399 		if (ts_chunk_index_get_by_indexrelid(chunk, index_relid, cim_out))
400 			return true;
401 
402 		return ts_chunk_index_get_by_hypertable_indexrelid(chunk, index_relid, cim_out);
403 	}
404 
405 	index_relid = ts_indexing_find_clustered_index(chunk->table_id);
406 	if (OidIsValid(index_relid))
407 		return ts_chunk_index_get_by_indexrelid(chunk, index_relid, cim_out);
408 
409 	index_relid = ts_indexing_find_clustered_index(ht->main_table_relid);
410 	if (OidIsValid(index_relid))
411 		return ts_chunk_index_get_by_hypertable_indexrelid(chunk, index_relid, cim_out);
412 
413 	return false;
414 }
415 
416 /* The following functions are based on their equivalents in postgres's cluster.c */
417 
418 /*
419  * timescale_reorder_rel
420  *
421  * This clusters the table by creating a new, clustered table and
422  * swapping the relfilenodes of the new table and the old table, so
423  * the OID of the original table is preserved.
424  *
425  * Indexes are rebuilt in the same manner.
426  */
427 void
timescale_reorder_rel(Oid tableOid,Oid indexOid,bool verbose,Oid wait_id,Oid destination_tablespace,Oid index_tablespace)428 timescale_reorder_rel(Oid tableOid, Oid indexOid, bool verbose, Oid wait_id,
429 					  Oid destination_tablespace, Oid index_tablespace)
430 {
431 	Relation OldHeap;
432 	HeapTuple tuple;
433 	Form_pg_index indexForm;
434 
435 	if (!OidIsValid(indexOid))
436 		elog(ERROR, "Reorder must specify an index.");
437 
438 	/* Check for user-requested abort. */
439 	CHECK_FOR_INTERRUPTS();
440 
441 	/*
442 	 * We grab exclusive access to the target rel and index for the duration
443 	 * of the transaction.  (This is redundant for the single-transaction
444 	 * case, since cluster() already did it.)  The index lock is taken inside
445 	 * check_index_is_clusterable.
446 	 */
447 	OldHeap = try_relation_open(tableOid, ExclusiveLock);
448 
449 	/* If the table has gone away, we can skip processing it */
450 	if (!OldHeap)
451 	{
452 		ereport(WARNING, (errcode(ERRCODE_WARNING), errmsg("table disappeared during reorder")));
453 		return;
454 	}
455 
456 	/*
457 	 * Since we may open a new transaction for each relation, we have to check
458 	 * that the relation still is what we think it is.
459 	 */
460 	/* Check that the user still owns the relation */
461 	if (!pg_class_ownercheck(tableOid, GetUserId()))
462 	{
463 		relation_close(OldHeap, ExclusiveLock);
464 		ereport(WARNING, (errcode(ERRCODE_WARNING), errmsg("ownership changed during reorder")));
465 		return;
466 	}
467 
468 	if (IsSystemRelation(OldHeap))
469 		ereport(ERROR,
470 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
471 				 errmsg("cannot reorder a system relation")));
472 
473 	if (OldHeap->rd_rel->relpersistence != RELPERSISTENCE_PERMANENT)
474 		ereport(ERROR,
475 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
476 				 errmsg("can only reorder a permanent table")));
477 
478 	/* We do not allow reordering on shared catalogs. */
479 	if (OldHeap->rd_rel->relisshared)
480 		ereport(ERROR,
481 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
482 				 errmsg("cannot reorder a shared catalog")));
483 
484 	if (OldHeap->rd_rel->relkind != RELKIND_RELATION)
485 		ereport(ERROR,
486 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("can only reorder a relation")));
487 
488 	/*
489 	 * Check that the index still exists
490 	 */
491 	if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid)))
492 	{
493 		ereport(WARNING, (errcode(ERRCODE_WARNING), errmsg("index disappeared during reorder")));
494 		relation_close(OldHeap, ExclusiveLock);
495 		return;
496 	}
497 
498 	/*
499 	 * Check that the index is still the one with indisclustered set.
500 	 */
501 	tuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
502 	if (!HeapTupleIsValid(tuple)) /* probably can't happen */
503 	{
504 		ereport(WARNING, (errcode(ERRCODE_WARNING), errmsg("invalid index heap during reorder")));
505 		relation_close(OldHeap, ExclusiveLock);
506 		return;
507 	}
508 	indexForm = (Form_pg_index) GETSTRUCT(tuple);
509 
510 	/*
511 	 * We always mark indexes as clustered when we intercept a cluster
512 	 * command, if it's not marked as such here, something has gone wrong
513 	 */
514 	if (!indexForm->indisclustered)
515 		ereport(ERROR,
516 				(errcode(ERRCODE_ASSERT_FAILURE), errmsg("invalid index heap during reorder")));
517 	ReleaseSysCache(tuple);
518 
519 	/*
520 	 * Also check for active uses of the relation in the current transaction,
521 	 * including open scans and pending AFTER trigger events.
522 	 */
523 	CheckTableNotInUse(OldHeap, "CLUSTER");
524 
525 	/* Check heap and index are valid to cluster on */
526 	check_index_is_clusterable(OldHeap, indexOid, true, ExclusiveLock);
527 
528 	/* timescale_rebuild_relation does all the dirty work */
529 	timescale_rebuild_relation(OldHeap,
530 							   indexOid,
531 							   verbose,
532 							   wait_id,
533 							   destination_tablespace,
534 							   index_tablespace);
535 
536 	/* NB: timescale_rebuild_relation does table_close() on OldHeap */
537 }
538 
539 /*
540  * timescale_rebuild_relation: rebuild an existing relation in index or physical order
541  *
542  * OldHeap: table to rebuild --- must be opened and exclusive-locked!
543  * indexOid: index to cluster by, or InvalidOid to rewrite in physical order.
544  *
545  * NB: this routine closes OldHeap at the right time; caller should not.
546  */
547 static void
timescale_rebuild_relation(Relation OldHeap,Oid indexOid,bool verbose,Oid wait_id,Oid destination_tablespace,Oid index_tablespace)548 timescale_rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose, Oid wait_id,
549 						   Oid destination_tablespace, Oid index_tablespace)
550 {
551 	Oid tableOid = RelationGetRelid(OldHeap);
552 	Oid tableSpace = OidIsValid(destination_tablespace) ? destination_tablespace :
553 														  OldHeap->rd_rel->reltablespace;
554 	Oid OIDNewHeap;
555 	List *old_index_oids;
556 	List *new_index_oids;
557 	char relpersistence;
558 	bool swap_toast_by_content;
559 	TransactionId frozenXid;
560 	MultiXactId cutoffMulti;
561 
562 	/* Mark the correct index as clustered */
563 	mark_index_clustered(OldHeap, indexOid, true);
564 
565 	/* Remember info about rel before closing OldHeap */
566 	relpersistence = OldHeap->rd_rel->relpersistence;
567 
568 	/* Close relcache entry, but keep lock until transaction commit */
569 	table_close(OldHeap, NoLock);
570 
571 	/* Create the transient table that will receive the re-ordered data */
572 	OIDNewHeap = make_new_heap(tableOid, tableSpace, relpersistence, ExclusiveLock);
573 
574 	/* Copy the heap data into the new table in the desired order */
575 	copy_heap_data(OIDNewHeap,
576 				   tableOid,
577 				   indexOid,
578 				   verbose,
579 				   &swap_toast_by_content,
580 				   &frozenXid,
581 				   &cutoffMulti);
582 
583 	/* Create versions of the tables indexes for the new table */
584 	new_index_oids =
585 		ts_chunk_index_duplicate(tableOid, OIDNewHeap, &old_index_oids, index_tablespace);
586 
587 	/*
588 	 * Swap the physical files of the target and transient tables, then
589 	 * rebuild the target's indexes and throw away the transient table.
590 	 */
591 	finish_heap_swaps(tableOid,
592 					  OIDNewHeap,
593 					  old_index_oids,
594 					  new_index_oids,
595 					  swap_toast_by_content,
596 					  true,
597 					  frozenXid,
598 					  cutoffMulti,
599 					  wait_id);
600 }
601 
602 /*
603  * Do the physical copying of heap data.
604  *
605  * There are three output parameters:
606  * *pSwapToastByContent is set true if toast tables must be swapped by content.
607  * *pFreezeXid receives the TransactionId used as freeze cutoff point.
608  * *pCutoffMulti receives the MultiXactId used as a cutoff point.
609  */
610 static void
copy_heap_data(Oid OIDNewHeap,Oid OIDOldHeap,Oid OIDOldIndex,bool verbose,bool * pSwapToastByContent,TransactionId * pFreezeXid,MultiXactId * pCutoffMulti)611 copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
612 			   bool *pSwapToastByContent, TransactionId *pFreezeXid, MultiXactId *pCutoffMulti)
613 {
614 	Relation NewHeap, OldHeap, OldIndex;
615 	Relation relRelation;
616 	HeapTuple reltup;
617 	Form_pg_class relform;
618 	TupleDesc PG_USED_FOR_ASSERTS_ONLY oldTupDesc;
619 	TupleDesc newTupDesc;
620 	int natts;
621 	Datum *values;
622 	bool *isnull;
623 	TransactionId OldestXmin;
624 	TransactionId FreezeXid;
625 	MultiXactId MultiXactCutoff;
626 	bool use_sort;
627 	double num_tuples = 0, tups_vacuumed = 0, tups_recently_dead = 0;
628 	BlockNumber num_pages;
629 	int elevel = verbose ? INFO : DEBUG2;
630 	PGRUsage ru0;
631 	pg_rusage_init(&ru0);
632 
633 	/*
634 	 * Open the relations we need.
635 	 */
636 	NewHeap = table_open(OIDNewHeap, AccessExclusiveLock);
637 	OldHeap = table_open(OIDOldHeap, ExclusiveLock);
638 
639 	if (OidIsValid(OIDOldIndex))
640 		OldIndex = index_open(OIDOldIndex, ExclusiveLock);
641 	else
642 		OldIndex = NULL;
643 
644 	/*
645 	 * Their tuple descriptors should be exactly alike, but here we only need
646 	 * assume that they have the same number of columns.
647 	 */
648 	oldTupDesc = RelationGetDescr(OldHeap);
649 	newTupDesc = RelationGetDescr(NewHeap);
650 	Assert(newTupDesc->natts == oldTupDesc->natts);
651 
652 	/* Preallocate values/isnull arrays */
653 	natts = newTupDesc->natts;
654 	values = (Datum *) palloc(natts * sizeof(Datum));
655 	isnull = (bool *) palloc(natts * sizeof(bool));
656 
657 	/*
658 	 * If the OldHeap has a toast table, get lock on the toast table to keep
659 	 * it from being vacuumed.  This is needed because autovacuum processes
660 	 * toast tables independently of their main tables, with no lock on the
661 	 * latter.  If an autovacuum were to start on the toast table after we
662 	 * compute our OldestXmin below, it would use a later OldestXmin, and then
663 	 * possibly remove as DEAD toast tuples belonging to main tuples we think
664 	 * are only RECENTLY_DEAD.  Then we'd fail while trying to copy those
665 	 * tuples.
666 	 *
667 	 * We don't need to open the toast relation here, just lock it.  The lock
668 	 * will be held till end of transaction.
669 	 */
670 	if (OldHeap->rd_rel->reltoastrelid)
671 		LockRelationOid(OldHeap->rd_rel->reltoastrelid, ExclusiveLock);
672 
673 	/* use_wal off requires smgr_targblock be initially invalid */
674 	Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);
675 
676 	/*
677 	 * If both tables have TOAST tables, perform toast swap by content.  It is
678 	 * possible that the old table has a toast table but the new one doesn't,
679 	 * if toastable columns have been dropped.  In that case we have to do
680 	 * swap by links.  This is okay because swap by content is only essential
681 	 * for system catalogs, and we don't support schema changes for them.
682 	 */
683 	if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid)
684 	{
685 		*pSwapToastByContent = true;
686 
687 		/*
688 		 * When doing swap by content, any toast pointers written into NewHeap
689 		 * must use the old toast table's OID, because that's where the toast
690 		 * data will eventually be found.  Set this up by setting rd_toastoid.
691 		 * This also tells toast_save_datum() to preserve the toast value
692 		 * OIDs, which we want so as not to invalidate toast pointers in
693 		 * system catalog caches, and to avoid making multiple copies of a
694 		 * single toast value.
695 		 *
696 		 * Note that we must hold NewHeap open until we are done writing data,
697 		 * since the relcache will not guarantee to remember this setting once
698 		 * the relation is closed.  Also, this technique depends on the fact
699 		 * that no one will try to read from the NewHeap until after we've
700 		 * finished writing it and swapping the rels --- otherwise they could
701 		 * follow the toast pointers to the wrong place.  (It would actually
702 		 * work for values copied over from the old toast table, but not for
703 		 * any values that we toast which were previously not toasted.)
704 		 */
705 		NewHeap->rd_toastoid = OldHeap->rd_rel->reltoastrelid;
706 	}
707 	else
708 		*pSwapToastByContent = false;
709 
710 	/*
711 	 * Compute xids used to freeze and weed out dead tuples and multixacts.
712 	 * Since we're going to rewrite the whole table anyway, there's no reason
713 	 * not to be aggressive about this.
714 	 */
715 	vacuum_set_xid_limits(OldHeap,
716 						  0,
717 						  0,
718 						  0,
719 						  0,
720 						  &OldestXmin,
721 						  &FreezeXid,
722 						  NULL,
723 						  &MultiXactCutoff,
724 						  NULL);
725 
726 	/*
727 	 * FreezeXid will become the table's new relfrozenxid, and that mustn't go
728 	 * backwards, so take the max.
729 	 */
730 	if (TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid))
731 		FreezeXid = OldHeap->rd_rel->relfrozenxid;
732 
733 	/*
734 	 * MultiXactCutoff, similarly, shouldn't go backwards either.
735 	 */
736 	if (MultiXactIdPrecedes(MultiXactCutoff, OldHeap->rd_rel->relminmxid))
737 		MultiXactCutoff = OldHeap->rd_rel->relminmxid;
738 
739 	/* return selected values to caller */
740 	*pFreezeXid = FreezeXid;
741 	*pCutoffMulti = MultiXactCutoff;
742 
743 	/*
744 	 * We know how to use a sort to duplicate the ordering of a btree index,
745 	 * and will use seqscan-and-sort for that.  Otherwise, always use an
746 	 * indexscan for other indexes or plain seqscan if no index is supplied.
747 	 */
748 	if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID)
749 		use_sort = true;
750 	else
751 		use_sort = false;
752 
753 	/* Log what we're doing */
754 	if (OldIndex != NULL && !use_sort)
755 		ereport(elevel,
756 				(errmsg("reordering \"%s.%s\" using index scan on \"%s\"",
757 						get_namespace_name(RelationGetNamespace(OldHeap)),
758 						RelationGetRelationName(OldHeap),
759 						RelationGetRelationName(OldIndex))));
760 	else if (use_sort)
761 		ereport(elevel,
762 				(errmsg("reordering \"%s.%s\" using sequential scan and sort",
763 						get_namespace_name(RelationGetNamespace(OldHeap)),
764 						RelationGetRelationName(OldHeap))));
765 	else
766 		ereport(ERROR,
767 				(errmsg("tried to use a reorder without an index \"%s.%s\"",
768 						get_namespace_name(RelationGetNamespace(OldHeap)),
769 						RelationGetRelationName(OldHeap))));
770 
771 	table_relation_copy_for_cluster(OldHeap,
772 									NewHeap,
773 									OldIndex,
774 									use_sort,
775 									OldestXmin,
776 									&FreezeXid,
777 									&MultiXactCutoff,
778 									&num_tuples,
779 									&tups_vacuumed,
780 									&tups_recently_dead);
781 
782 	/* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */
783 	NewHeap->rd_toastoid = InvalidOid;
784 
785 	num_pages = RelationGetNumberOfBlocks(NewHeap);
786 
787 	/* Log what we did */
788 	ereport(elevel,
789 			(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
790 					RelationGetRelationName(OldHeap),
791 					tups_vacuumed,
792 					num_tuples,
793 					RelationGetNumberOfBlocks(OldHeap)),
794 			 errdetail("%.0f dead row versions cannot be removed yet.\n"
795 					   "%s.",
796 					   tups_recently_dead,
797 					   pg_rusage_show(&ru0))));
798 
799 	/* Clean up */
800 	pfree(values);
801 	pfree(isnull);
802 
803 	if (OldIndex != NULL)
804 		index_close(OldIndex, NoLock);
805 	table_close(OldHeap, NoLock);
806 	table_close(NewHeap, NoLock);
807 
808 	/* Update pg_class to reflect the correct values of pages and tuples. */
809 	relRelation = table_open(RelationRelationId, RowExclusiveLock);
810 
811 	reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDNewHeap));
812 	if (!HeapTupleIsValid(reltup))
813 		elog(ERROR, "cache lookup failed for relation %u", OIDNewHeap);
814 	relform = (Form_pg_class) GETSTRUCT(reltup);
815 
816 	relform->relpages = num_pages;
817 	relform->reltuples = num_tuples;
818 
819 	/* Don't update the stats for pg_class.  See swap_relation_files. */
820 	Assert(OIDOldHeap != RelationRelationId);
821 	CacheInvalidateRelcacheByTuple(reltup);
822 
823 	/* Clean up. */
824 	heap_freetuple(reltup);
825 	table_close(relRelation, RowExclusiveLock);
826 
827 	/* Make the update visible */
828 	CommandCounterIncrement();
829 }
830 
831 /*
832  * Remove the transient table that was built by make_new_heap, and finish
833  * cleaning up (including rebuilding all indexes on the old heap).
834  *
835  * NB: new_index_oids must be in the same order as RelationGetIndexList
836  *
837  */
838 static void
finish_heap_swaps(Oid OIDOldHeap,Oid OIDNewHeap,List * old_index_oids,List * new_index_oids,bool swap_toast_by_content,bool is_internal,TransactionId frozenXid,MultiXactId cutoffMulti,Oid wait_id)839 finish_heap_swaps(Oid OIDOldHeap, Oid OIDNewHeap, List *old_index_oids, List *new_index_oids,
840 				  bool swap_toast_by_content, bool is_internal, TransactionId frozenXid,
841 				  MultiXactId cutoffMulti, Oid wait_id)
842 {
843 	ObjectAddress object;
844 	Relation oldHeapRel;
845 	ListCell *old_index_cell;
846 	ListCell *new_index_cell;
847 	int config_change;
848 
849 #ifdef DEBUG
850 
851 	/*
852 	 * For debug purposes we serialize against wait_id if it exists, this
853 	 * allows us to "pause" reorder immediately before swapping in the new
854 	 * table
855 	 */
856 	if (OidIsValid(wait_id))
857 	{
858 		Relation waiter = table_open(wait_id, AccessExclusiveLock);
859 
860 		table_close(waiter, AccessExclusiveLock);
861 	}
862 #endif
863 
864 	/*
865 	 * There's a risk of deadlock if some other process is also trying to
866 	 * upgrade their lock in the same manner as us, at this time. Since our
867 	 * transaction has performed a large amount of work, and only needs to be
868 	 * run once per chunk, we do not want to abort it due to this deadlock. To
869 	 * prevent abort we set our `deadlock_timeout` to a large value in the
870 	 * expectation that the other process will timeout and abort first.
871 	 * Currently we set `deadlock_timeout` to 1 hour, as this should be longer
872 	 * than any other normal process, while still allowing the system to make
873 	 * progress in the event of a real deadlock. As this is the last lock we
874 	 * grab, and the setting is local to our transaction we do not bother
875 	 * changing the guc back.
876 	 */
877 	config_change = set_config_option("deadlock_timeout",
878 									  REORDER_ACCESS_EXCLUSIVE_DEADLOCK_TIMEOUT,
879 									  PGC_SUSET,
880 									  PGC_S_SESSION,
881 									  GUC_ACTION_LOCAL,
882 									  true,
883 									  0,
884 									  false);
885 
886 	if (config_change == 0)
887 		ereport(ERROR,
888 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
889 				 errmsg("deadlock_timeout guc does not exist.")));
890 	else if (config_change < 0)
891 		ereport(ERROR,
892 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
893 				 errmsg("could not set deadlock_timeout guc.")));
894 
895 	oldHeapRel = table_open(OIDOldHeap, AccessExclusiveLock);
896 
897 	/*
898 	 * All predicate locks on the tuples or pages are about to be made
899 	 * invalid, because we move tuples around.  Promote them to relation
900 	 * locks.  Predicate locks on indexes will be promoted when they are
901 	 * reindexed.
902 	 */
903 	TransferPredicateLocksToHeapRelation(oldHeapRel);
904 
905 	/*
906 	 * Swap the contents of the heap relations (including any toast tables).
907 	 * Also set old heap's relfrozenxid to frozenXid.
908 	 */
909 	swap_relation_files(OIDOldHeap,
910 						OIDNewHeap,
911 						swap_toast_by_content,
912 						is_internal,
913 						frozenXid,
914 						cutoffMulti);
915 
916 	/* Swap the contents of the indexes */
917 	Assert(list_length(old_index_oids) == list_length(new_index_oids));
918 	forboth (old_index_cell, old_index_oids, new_index_cell, new_index_oids)
919 	{
920 		Oid old_index_oid = lfirst_oid(old_index_cell);
921 		Oid new_index_oid = lfirst_oid(new_index_cell);
922 
923 		swap_relation_files(old_index_oid,
924 							new_index_oid,
925 							swap_toast_by_content,
926 							true,
927 							frozenXid,
928 							cutoffMulti);
929 	}
930 	table_close(oldHeapRel, NoLock);
931 
932 	CommandCounterIncrement();
933 
934 	/* Destroy new heap with old filenode */
935 	object.classId = RelationRelationId;
936 	object.objectId = OIDNewHeap;
937 	object.objectSubId = 0;
938 
939 	/*
940 	 * The new relation is local to our transaction and we know nothing
941 	 * depends on it, so DROP_RESTRICT should be OK.
942 	 */
943 	performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
944 
945 	/* performDeletion does CommandCounterIncrement at end */
946 
947 	/*
948 	 * At this point, everything is kosher except that, if we did toast swap
949 	 * by links, the toast table's name corresponds to the transient table.
950 	 * The name is irrelevant to the backend because it's referenced by OID,
951 	 * but users looking at the catalogs could be confused.  Rename it to
952 	 * prevent this problem.
953 	 *
954 	 * Note no lock required on the relation, because we already hold an
955 	 * exclusive lock on it.
956 	 */
957 	if (!swap_toast_by_content)
958 	{
959 		Relation newrel;
960 
961 		newrel = table_open(OIDOldHeap, NoLock);
962 		if (OidIsValid(newrel->rd_rel->reltoastrelid))
963 		{
964 			Oid toastidx;
965 			char NewToastName[NAMEDATALEN];
966 
967 			/* Get the associated valid index to be renamed */
968 			toastidx = toast_get_valid_index(newrel->rd_rel->reltoastrelid, AccessShareLock);
969 
970 			/* rename the toast table ... */
971 			snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u", OIDOldHeap);
972 			RenameRelationInternal(newrel->rd_rel->reltoastrelid, NewToastName, true, false);
973 
974 			/* ... and its valid index too. */
975 			snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index", OIDOldHeap);
976 
977 			RenameRelationInternal(toastidx, NewToastName, true, true);
978 		}
979 		table_close(newrel, NoLock);
980 	}
981 
982 	/* it's not a catalog table, clear any missing attribute settings */
983 	{
984 		Relation newrel;
985 
986 		newrel = table_open(OIDOldHeap, NoLock);
987 		RelationClearMissing(newrel);
988 		table_close(newrel, NoLock);
989 	}
990 }
991 
992 /*
993  * Swap the physical files of two given relations.
994  *
995  * We swap the physical identity (reltablespace, relfilenode) while keeping the
996  * same logical identities of the two relations.  relpersistence is also
997  * swapped, which is critical since it determines where buffers live for each
998  * relation.
999  *
1000  * We can swap associated TOAST data in either of two ways: recursively swap
1001  * the physical content of the toast tables (and their indexes), or swap the
1002  * TOAST links in the given relations' pg_class entries. The latter is the only
1003  * way to handle cases in which a toast table is added or removed altogether.
1004  *
1005  * Additionally, the first relation is marked with relfrozenxid set to
1006  * frozenXid.  It seems a bit ugly to have this here, but the caller would
1007  * have to do it anyway, so having it here saves a heap_update.  Note: in
1008  * the swap-toast-links case, we assume we don't need to change the toast
1009  * table's relfrozenxid: the new version of the toast table should already
1010  * have relfrozenxid set to RecentXmin, which is good enough.
1011  *
1012  */
1013 static void
swap_relation_files(Oid r1,Oid r2,bool swap_toast_by_content,bool is_internal,TransactionId frozenXid,MultiXactId cutoffMulti)1014 swap_relation_files(Oid r1, Oid r2, bool swap_toast_by_content, bool is_internal,
1015 					TransactionId frozenXid, MultiXactId cutoffMulti)
1016 {
1017 	Relation relRelation;
1018 	HeapTuple reltup1, reltup2;
1019 	Form_pg_class relform1, relform2;
1020 	Oid relfilenode1, relfilenode2;
1021 	Oid swaptemp;
1022 	char swptmpchr;
1023 
1024 	/* We need writable copies of both pg_class tuples. */
1025 	relRelation = table_open(RelationRelationId, RowExclusiveLock);
1026 
1027 	reltup1 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r1));
1028 	if (!HeapTupleIsValid(reltup1))
1029 		elog(ERROR, "cache lookup failed for relation %u", r1);
1030 	relform1 = (Form_pg_class) GETSTRUCT(reltup1);
1031 
1032 	reltup2 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r2));
1033 	if (!HeapTupleIsValid(reltup2))
1034 		elog(ERROR, "cache lookup failed for relation %u", r2);
1035 	relform2 = (Form_pg_class) GETSTRUCT(reltup2);
1036 
1037 	relfilenode1 = relform1->relfilenode;
1038 	relfilenode2 = relform2->relfilenode;
1039 
1040 	if (!OidIsValid(relfilenode1) || !OidIsValid(relfilenode2))
1041 		elog(ERROR, "cannot reorder mapped relation \"%s\".", NameStr(relform1->relname));
1042 
1043 	/* swap relfilenodes, reltablespaces, relpersistence */
1044 
1045 	swaptemp = relform1->relfilenode;
1046 	relform1->relfilenode = relform2->relfilenode;
1047 	relform2->relfilenode = swaptemp;
1048 
1049 	swaptemp = relform1->reltablespace;
1050 	relform1->reltablespace = relform2->reltablespace;
1051 	relform2->reltablespace = swaptemp;
1052 
1053 	swptmpchr = relform1->relpersistence;
1054 	relform1->relpersistence = relform2->relpersistence;
1055 	relform2->relpersistence = swptmpchr;
1056 
1057 	/* Also swap toast links, if we're swapping by links */
1058 	if (!swap_toast_by_content)
1059 	{
1060 		swaptemp = relform1->reltoastrelid;
1061 		relform1->reltoastrelid = relform2->reltoastrelid;
1062 		relform2->reltoastrelid = swaptemp;
1063 	}
1064 
1065 	/* set rel1's frozen Xid and minimum MultiXid */
1066 	if (relform1->relkind != RELKIND_INDEX)
1067 	{
1068 		Assert(TransactionIdIsNormal(frozenXid));
1069 		relform1->relfrozenxid = frozenXid;
1070 		Assert(MultiXactIdIsValid(cutoffMulti));
1071 		relform1->relminmxid = cutoffMulti;
1072 	}
1073 
1074 	/* swap size statistics too, since new rel has freshly-updated stats */
1075 	{
1076 		int32 swap_pages;
1077 		float4 swap_tuples;
1078 		int32 swap_allvisible;
1079 
1080 		swap_pages = relform1->relpages;
1081 		relform1->relpages = relform2->relpages;
1082 		relform2->relpages = swap_pages;
1083 
1084 		swap_tuples = relform1->reltuples;
1085 		relform1->reltuples = relform2->reltuples;
1086 		relform2->reltuples = swap_tuples;
1087 
1088 		swap_allvisible = relform1->relallvisible;
1089 		relform1->relallvisible = relform2->relallvisible;
1090 		relform2->relallvisible = swap_allvisible;
1091 	}
1092 
1093 	/* Update the tuples in pg_class. */
1094 	{
1095 		CatalogIndexState indstate;
1096 		indstate = CatalogOpenIndexes(relRelation);
1097 		CatalogTupleUpdateWithInfo(relRelation, &reltup1->t_self, reltup1, indstate);
1098 		CatalogTupleUpdateWithInfo(relRelation, &reltup2->t_self, reltup2, indstate);
1099 		CatalogCloseIndexes(indstate);
1100 	}
1101 
1102 	/*
1103 	 * Post alter hook for modified relations. The change to r2 is always
1104 	 * internal, but r1 depends on the invocation context.
1105 	 */
1106 	InvokeObjectPostAlterHookArg(RelationRelationId, r1, 0, InvalidOid, is_internal);
1107 	InvokeObjectPostAlterHookArg(RelationRelationId, r2, 0, InvalidOid, true);
1108 
1109 	/*
1110 	 * If we have toast tables associated with the relations being swapped,
1111 	 * deal with them too.
1112 	 */
1113 	if (relform1->reltoastrelid || relform2->reltoastrelid)
1114 	{
1115 		if (swap_toast_by_content)
1116 		{
1117 			if (relform1->reltoastrelid && relform2->reltoastrelid)
1118 			{
1119 				/* Recursively swap the contents of the toast tables */
1120 				swap_relation_files(relform1->reltoastrelid,
1121 									relform2->reltoastrelid,
1122 									swap_toast_by_content,
1123 									is_internal,
1124 									frozenXid,
1125 									cutoffMulti);
1126 			}
1127 			else
1128 			{
1129 				/* caller messed up */
1130 				elog(ERROR, "cannot swap toast files by content when there's only one");
1131 			}
1132 		}
1133 		else
1134 		{
1135 			/*
1136 			 * We swapped the ownership links, so we need to change dependency
1137 			 * data to match.
1138 			 *
1139 			 * NOTE: it is possible that only one table has a toast table.
1140 			 *
1141 			 * NOTE: at present, a TOAST table's only dependency is the one on
1142 			 * its owning table.  If more are ever created, we'd need to use
1143 			 * something more selective than deleteDependencyRecordsFor() to
1144 			 * get rid of just the link we want.
1145 			 */
1146 			ObjectAddress baseobject, toastobject;
1147 			long count;
1148 
1149 			/*
1150 			 * The original code disallowed this case for system catalogs. We
1151 			 * don't allow reordering system catalogs, but Assert anyway
1152 			 */
1153 			Assert(!IsSystemClass(r1, relform1));
1154 
1155 			/* Delete old dependencies */
1156 			if (relform1->reltoastrelid)
1157 			{
1158 				count =
1159 					deleteDependencyRecordsFor(RelationRelationId, relform1->reltoastrelid, false);
1160 				if (count != 1)
1161 					elog(ERROR, "expected one dependency record for TOAST table, found %ld", count);
1162 			}
1163 			if (relform2->reltoastrelid)
1164 			{
1165 				count =
1166 					deleteDependencyRecordsFor(RelationRelationId, relform2->reltoastrelid, false);
1167 				if (count != 1)
1168 					elog(ERROR, "expected one dependency record for TOAST table, found %ld", count);
1169 			}
1170 
1171 			/* Register new dependencies */
1172 			baseobject.classId = RelationRelationId;
1173 			baseobject.objectSubId = 0;
1174 			toastobject.classId = RelationRelationId;
1175 			toastobject.objectSubId = 0;
1176 
1177 			if (relform1->reltoastrelid)
1178 			{
1179 				baseobject.objectId = r1;
1180 				toastobject.objectId = relform1->reltoastrelid;
1181 				recordDependencyOn(&toastobject, &baseobject, DEPENDENCY_INTERNAL);
1182 			}
1183 
1184 			if (relform2->reltoastrelid)
1185 			{
1186 				baseobject.objectId = r2;
1187 				toastobject.objectId = relform2->reltoastrelid;
1188 				recordDependencyOn(&toastobject, &baseobject, DEPENDENCY_INTERNAL);
1189 			}
1190 		}
1191 	}
1192 
1193 	/*
1194 	 * If we're swapping two toast tables by content, do the same for their
1195 	 * valid index. The swap can actually be safely done only if the relations
1196 	 * have indexes.
1197 	 */
1198 	if (swap_toast_by_content && relform1->relkind == RELKIND_TOASTVALUE &&
1199 		relform2->relkind == RELKIND_TOASTVALUE)
1200 	{
1201 		Oid toastIndex1, toastIndex2;
1202 
1203 		/* Get valid index for each relation */
1204 		toastIndex1 = toast_get_valid_index(r1, AccessExclusiveLock);
1205 		toastIndex2 = toast_get_valid_index(r2, AccessExclusiveLock);
1206 
1207 		swap_relation_files(toastIndex1,
1208 							toastIndex2,
1209 							swap_toast_by_content,
1210 							is_internal,
1211 							InvalidTransactionId,
1212 							InvalidMultiXactId);
1213 	}
1214 
1215 	/* Clean up. */
1216 	heap_freetuple(reltup1);
1217 	heap_freetuple(reltup2);
1218 
1219 	table_close(relRelation, RowExclusiveLock);
1220 
1221 	/*
1222 	 * Close both relcache entries' smgr links.  We need this kludge because
1223 	 * both links will be invalidated during upcoming CommandCounterIncrement.
1224 	 * Whichever of the rels is the second to be cleared will have a dangling
1225 	 * reference to the other's smgr entry.  Rather than trying to avoid this
1226 	 * by ordering operations just so, it's easiest to close the links first.
1227 	 * (Fortunately, since one of the entries is local in our transaction,
1228 	 * it's sufficient to clear out our own relcache this way; the problem
1229 	 * cannot arise for other backends when they see our update on the
1230 	 * non-transient relation.)
1231 	 *
1232 	 * Caution: the placement of this step interacts with the decision to
1233 	 * handle toast rels by recursion.  When we are trying to rebuild pg_class
1234 	 * itself, the smgr close on pg_class must happen after all accesses in
1235 	 * this function.
1236 	 */
1237 	RelationCloseSmgrByOid(r1);
1238 	RelationCloseSmgrByOid(r2);
1239 }
1240