1 /*-------------------------------------------------------------------------
2  *
3  * heap_surgery.c
4  *	  Functions to perform surgery on the damaged heap table.
5  *
6  * Copyright (c) 2020-2021, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  *	  contrib/pg_surgery/heap_surgery.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14 
15 #include "access/heapam.h"
16 #include "access/visibilitymap.h"
17 #include "catalog/pg_am_d.h"
18 #include "catalog/pg_proc_d.h"
19 #include "miscadmin.h"
20 #include "storage/bufmgr.h"
21 #include "utils/acl.h"
22 #include "utils/rel.h"
23 
24 PG_MODULE_MAGIC;
25 
/*
 * Action that heap_force_common() applies to every heap tuple it accepts
 * for processing.
 */
typedef enum HeapTupleForceOption
{
	HEAP_FORCE_KILL = 0,		/* mark the tuple's line pointer dead */
	HEAP_FORCE_FREEZE = 1		/* rewrite the tuple header as frozen */
} HeapTupleForceOption;
32 
33 PG_FUNCTION_INFO_V1(heap_force_kill);
34 PG_FUNCTION_INFO_V1(heap_force_freeze);
35 
36 static int32 tidcmp(const void *a, const void *b);
37 static Datum heap_force_common(FunctionCallInfo fcinfo,
38 							   HeapTupleForceOption heap_force_opt);
39 static void sanity_check_tid_array(ArrayType *ta, int *ntids);
40 static void sanity_check_relation(Relation rel);
41 static BlockNumber find_tids_one_page(ItemPointer tids, int ntids,
42 									  OffsetNumber *next_start_ptr);
43 
44 /*-------------------------------------------------------------------------
45  * heap_force_kill()
46  *
47  * Force kill the tuple(s) pointed to by the item pointer(s) stored in the
48  * given TID array.
49  *
50  * Usage: SELECT heap_force_kill(regclass, tid[]);
51  *-------------------------------------------------------------------------
52  */
53 Datum
heap_force_kill(PG_FUNCTION_ARGS)54 heap_force_kill(PG_FUNCTION_ARGS)
55 {
56 	PG_RETURN_DATUM(heap_force_common(fcinfo, HEAP_FORCE_KILL));
57 }
58 
59 /*-------------------------------------------------------------------------
60  * heap_force_freeze()
61  *
62  * Force freeze the tuple(s) pointed to by the item pointer(s) stored in the
63  * given TID array.
64  *
65  * Usage: SELECT heap_force_freeze(regclass, tid[]);
66  *-------------------------------------------------------------------------
67  */
68 Datum
heap_force_freeze(PG_FUNCTION_ARGS)69 heap_force_freeze(PG_FUNCTION_ARGS)
70 {
71 	PG_RETURN_DATUM(heap_force_common(fcinfo, HEAP_FORCE_FREEZE));
72 }
73 
74 /*-------------------------------------------------------------------------
75  * heap_force_common()
76  *
77  * Common code for heap_force_kill and heap_force_freeze
78  *-------------------------------------------------------------------------
79  */
80 static Datum
heap_force_common(FunctionCallInfo fcinfo,HeapTupleForceOption heap_force_opt)81 heap_force_common(FunctionCallInfo fcinfo, HeapTupleForceOption heap_force_opt)
82 {
83 	Oid			relid = PG_GETARG_OID(0);
84 	ArrayType  *ta = PG_GETARG_ARRAYTYPE_P_COPY(1);
85 	ItemPointer tids;
86 	int			ntids,
87 				nblocks;
88 	Relation	rel;
89 	OffsetNumber curr_start_ptr,
90 				next_start_ptr;
91 	bool		include_this_tid[MaxHeapTuplesPerPage];
92 
93 	if (RecoveryInProgress())
94 		ereport(ERROR,
95 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
96 				 errmsg("recovery is in progress"),
97 				 errhint("heap surgery functions cannot be executed during recovery.")));
98 
99 	/* Check inputs. */
100 	sanity_check_tid_array(ta, &ntids);
101 
102 	rel = relation_open(relid, RowExclusiveLock);
103 
104 	/* Check target relation. */
105 	sanity_check_relation(rel);
106 
107 	tids = ((ItemPointer) ARR_DATA_PTR(ta));
108 
109 	/*
110 	 * If there is more than one TID in the array, sort them so that we can
111 	 * easily fetch all the TIDs belonging to one particular page from the
112 	 * array.
113 	 */
114 	if (ntids > 1)
115 		qsort((void *) tids, ntids, sizeof(ItemPointerData), tidcmp);
116 
117 	curr_start_ptr = next_start_ptr = 0;
118 	nblocks = RelationGetNumberOfBlocks(rel);
119 
120 	/*
121 	 * Loop, performing the necessary actions for each block.
122 	 */
123 	while (next_start_ptr != ntids)
124 	{
125 		Buffer		buf;
126 		Buffer		vmbuf = InvalidBuffer;
127 		Page		page;
128 		BlockNumber blkno;
129 		OffsetNumber curoff;
130 		OffsetNumber maxoffset;
131 		int			i;
132 		bool		did_modify_page = false;
133 		bool		did_modify_vm = false;
134 
135 		CHECK_FOR_INTERRUPTS();
136 
137 		/*
138 		 * Find all the TIDs belonging to one particular page starting from
139 		 * next_start_ptr and process them one by one.
140 		 */
141 		blkno = find_tids_one_page(tids, ntids, &next_start_ptr);
142 
143 		/* Check whether the block number is valid. */
144 		if (blkno >= nblocks)
145 		{
146 			/* Update the current_start_ptr before moving to the next page. */
147 			curr_start_ptr = next_start_ptr;
148 
149 			ereport(NOTICE,
150 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
151 					 errmsg("skipping block %u for relation \"%s\" because the block number is out of range",
152 							blkno, RelationGetRelationName(rel))));
153 			continue;
154 		}
155 
156 		buf = ReadBuffer(rel, blkno);
157 		LockBufferForCleanup(buf);
158 
159 		page = BufferGetPage(buf);
160 
161 		maxoffset = PageGetMaxOffsetNumber(page);
162 
163 		/*
164 		 * Figure out which TIDs we are going to process and which ones we are
165 		 * going to skip.
166 		 */
167 		memset(include_this_tid, 0, sizeof(include_this_tid));
168 		for (i = curr_start_ptr; i < next_start_ptr; i++)
169 		{
170 			OffsetNumber offno = ItemPointerGetOffsetNumberNoCheck(&tids[i]);
171 			ItemId		itemid;
172 
173 			/* Check whether the offset number is valid. */
174 			if (offno == InvalidOffsetNumber || offno > maxoffset)
175 			{
176 				ereport(NOTICE,
177 						errmsg("skipping tid (%u, %u) for relation \"%s\" because the item number is out of range",
178 							   blkno, offno, RelationGetRelationName(rel)));
179 				continue;
180 			}
181 
182 			itemid = PageGetItemId(page, offno);
183 
184 			/* Only accept an item ID that is used. */
185 			if (ItemIdIsRedirected(itemid))
186 			{
187 				ereport(NOTICE,
188 						errmsg("skipping tid (%u, %u) for relation \"%s\" because it redirects to item %u",
189 							   blkno, offno, RelationGetRelationName(rel),
190 							   ItemIdGetRedirect(itemid)));
191 				continue;
192 			}
193 			else if (ItemIdIsDead(itemid))
194 			{
195 				ereport(NOTICE,
196 						(errmsg("skipping tid (%u, %u) for relation \"%s\" because it is marked dead",
197 								blkno, offno, RelationGetRelationName(rel))));
198 				continue;
199 			}
200 			else if (!ItemIdIsUsed(itemid))
201 			{
202 				ereport(NOTICE,
203 						(errmsg("skipping tid (%u, %u) for relation \"%s\" because it is marked unused",
204 								blkno, offno, RelationGetRelationName(rel))));
205 				continue;
206 			}
207 
208 			/* Mark it for processing. */
209 			Assert(offno < MaxHeapTuplesPerPage);
210 			include_this_tid[offno] = true;
211 		}
212 
213 		/*
214 		 * Before entering the critical section, pin the visibility map page
215 		 * if it appears to be necessary.
216 		 */
217 		if (heap_force_opt == HEAP_FORCE_KILL && PageIsAllVisible(page))
218 			visibilitymap_pin(rel, blkno, &vmbuf);
219 
220 		/* No ereport(ERROR) from here until all the changes are logged. */
221 		START_CRIT_SECTION();
222 
223 		for (curoff = FirstOffsetNumber; curoff <= maxoffset;
224 			 curoff = OffsetNumberNext(curoff))
225 		{
226 			ItemId		itemid;
227 
228 			if (!include_this_tid[curoff])
229 				continue;
230 
231 			itemid = PageGetItemId(page, curoff);
232 			Assert(ItemIdIsNormal(itemid));
233 
234 			did_modify_page = true;
235 
236 			if (heap_force_opt == HEAP_FORCE_KILL)
237 			{
238 				ItemIdSetDead(itemid);
239 
240 				/*
241 				 * If the page is marked all-visible, we must clear
242 				 * PD_ALL_VISIBLE flag on the page header and an all-visible
243 				 * bit on the visibility map corresponding to the page.
244 				 */
245 				if (PageIsAllVisible(page))
246 				{
247 					PageClearAllVisible(page);
248 					visibilitymap_clear(rel, blkno, vmbuf,
249 										VISIBILITYMAP_VALID_BITS);
250 					did_modify_vm = true;
251 				}
252 			}
253 			else
254 			{
255 				HeapTupleHeader htup;
256 
257 				Assert(heap_force_opt == HEAP_FORCE_FREEZE);
258 
259 				htup = (HeapTupleHeader) PageGetItem(page, itemid);
260 
261 				/*
262 				 * Reset all visibility-related fields of the tuple. This
263 				 * logic should mimic heap_execute_freeze_tuple(), but we
264 				 * choose to reset xmin and ctid just to be sure that no
265 				 * potentially-garbled data is left behind.
266 				 */
267 				ItemPointerSet(&htup->t_ctid, blkno, curoff);
268 				HeapTupleHeaderSetXmin(htup, FrozenTransactionId);
269 				HeapTupleHeaderSetXmax(htup, InvalidTransactionId);
270 				if (htup->t_infomask & HEAP_MOVED)
271 				{
272 					if (htup->t_infomask & HEAP_MOVED_OFF)
273 						HeapTupleHeaderSetXvac(htup, InvalidTransactionId);
274 					else
275 						HeapTupleHeaderSetXvac(htup, FrozenTransactionId);
276 				}
277 
278 				/*
279 				 * Clear all the visibility-related bits of this tuple and
280 				 * mark it as frozen. Also, get rid of HOT_UPDATED and
281 				 * KEYS_UPDATES bits.
282 				 */
283 				htup->t_infomask &= ~HEAP_XACT_MASK;
284 				htup->t_infomask |= (HEAP_XMIN_FROZEN | HEAP_XMAX_INVALID);
285 				htup->t_infomask2 &= ~HEAP_HOT_UPDATED;
286 				htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
287 			}
288 		}
289 
290 		/*
291 		 * If the page was modified, only then, we mark the buffer dirty or do
292 		 * the WAL logging.
293 		 */
294 		if (did_modify_page)
295 		{
296 			/* Mark buffer dirty before we write WAL. */
297 			MarkBufferDirty(buf);
298 
299 			/* XLOG stuff */
300 			if (RelationNeedsWAL(rel))
301 				log_newpage_buffer(buf, true);
302 		}
303 
304 		/* WAL log the VM page if it was modified. */
305 		if (did_modify_vm && RelationNeedsWAL(rel))
306 			log_newpage_buffer(vmbuf, false);
307 
308 		END_CRIT_SECTION();
309 
310 		UnlockReleaseBuffer(buf);
311 
312 		if (vmbuf != InvalidBuffer)
313 			ReleaseBuffer(vmbuf);
314 
315 		/* Update the current_start_ptr before moving to the next page. */
316 		curr_start_ptr = next_start_ptr;
317 	}
318 
319 	relation_close(rel, RowExclusiveLock);
320 
321 	pfree(ta);
322 
323 	PG_RETURN_VOID();
324 }
325 
326 /*-------------------------------------------------------------------------
327  * tidcmp()
328  *
329  * Compare two item pointers, return -1, 0, or +1.
330  *
331  * See ItemPointerCompare for details.
332  * ------------------------------------------------------------------------
333  */
334 static int32
tidcmp(const void * a,const void * b)335 tidcmp(const void *a, const void *b)
336 {
337 	ItemPointer iptr1 = ((const ItemPointer) a);
338 	ItemPointer iptr2 = ((const ItemPointer) b);
339 
340 	return ItemPointerCompare(iptr1, iptr2);
341 }
342 
343 /*-------------------------------------------------------------------------
344  * sanity_check_tid_array()
345  *
346  * Perform sanity checks on the given tid array, and set *ntids to the
347  * number of items in the array.
348  * ------------------------------------------------------------------------
349  */
350 static void
sanity_check_tid_array(ArrayType * ta,int * ntids)351 sanity_check_tid_array(ArrayType *ta, int *ntids)
352 {
353 	if (ARR_HASNULL(ta) && array_contains_nulls(ta))
354 		ereport(ERROR,
355 				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
356 				 errmsg("array must not contain nulls")));
357 
358 	if (ARR_NDIM(ta) > 1)
359 		ereport(ERROR,
360 				(errcode(ERRCODE_DATA_EXCEPTION),
361 				 errmsg("argument must be empty or one-dimensional array")));
362 
363 	*ntids = ArrayGetNItems(ARR_NDIM(ta), ARR_DIMS(ta));
364 }
365 
366 /*-------------------------------------------------------------------------
367  * sanity_check_relation()
368  *
369  * Perform sanity checks on the given relation.
370  * ------------------------------------------------------------------------
371  */
372 static void
sanity_check_relation(Relation rel)373 sanity_check_relation(Relation rel)
374 {
375 	if (rel->rd_rel->relkind != RELKIND_RELATION &&
376 		rel->rd_rel->relkind != RELKIND_MATVIEW &&
377 		rel->rd_rel->relkind != RELKIND_TOASTVALUE)
378 		ereport(ERROR,
379 				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
380 				 errmsg("\"%s\" is not a table, materialized view, or TOAST table",
381 						RelationGetRelationName(rel))));
382 
383 	if (rel->rd_rel->relam != HEAP_TABLE_AM_OID)
384 		ereport(ERROR,
385 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
386 				 errmsg("only heap AM is supported")));
387 
388 	/* Must be owner of the table or superuser. */
389 	if (!pg_class_ownercheck(RelationGetRelid(rel), GetUserId()))
390 		aclcheck_error(ACLCHECK_NOT_OWNER,
391 					   get_relkind_objtype(rel->rd_rel->relkind),
392 					   RelationGetRelationName(rel));
393 }
394 
395 /*-------------------------------------------------------------------------
396  * find_tids_one_page()
397  *
398  * Find all the tids residing in the same page as tids[next_start_ptr], and
399  * update next_start_ptr so that it points to the first tid in the next page.
400  *
401  * NOTE: The input tids[] array must be sorted.
402  * ------------------------------------------------------------------------
403  */
404 static BlockNumber
find_tids_one_page(ItemPointer tids,int ntids,OffsetNumber * next_start_ptr)405 find_tids_one_page(ItemPointer tids, int ntids, OffsetNumber *next_start_ptr)
406 {
407 	int			i;
408 	BlockNumber prev_blkno,
409 				blkno;
410 
411 	prev_blkno = blkno = InvalidBlockNumber;
412 
413 	for (i = *next_start_ptr; i < ntids; i++)
414 	{
415 		ItemPointerData tid = tids[i];
416 
417 		blkno = ItemPointerGetBlockNumberNoCheck(&tid);
418 
419 		if (i == *next_start_ptr)
420 			prev_blkno = blkno;
421 
422 		if (prev_blkno != blkno)
423 			break;
424 	}
425 
426 	*next_start_ptr = i;
427 	return prev_blkno;
428 }
429