1 /*-------------------------------------------------------------------------
2  *
3  * heapam.c
4  *	  heap access method code
5  *
6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *	  src/backend/access/heap/heapam.c
12  *
13  *
14  * INTERFACE ROUTINES
15  *		heap_beginscan	- begin relation scan
16  *		heap_rescan		- restart a relation scan
17  *		heap_endscan	- end relation scan
18  *		heap_getnext	- retrieve next tuple in scan
19  *		heap_fetch		- retrieve tuple with given tid
20  *		heap_insert		- insert tuple into a relation
21  *		heap_multi_insert - insert multiple tuples into a relation
22  *		heap_delete		- delete a tuple from a relation
23  *		heap_update		- replace a tuple in a relation with another tuple
24  *
25  * NOTES
26  *	  This file contains the heap_ routines which implement
27  *	  the POSTGRES heap access method used for all POSTGRES
28  *	  relations.
29  *
30  *-------------------------------------------------------------------------
31  */
32 #include "postgres.h"
33 
34 #include "access/bufmask.h"
35 #include "access/genam.h"
36 #include "access/heapam.h"
37 #include "access/heapam_xlog.h"
38 #include "access/heaptoast.h"
39 #include "access/hio.h"
40 #include "access/multixact.h"
41 #include "access/parallel.h"
42 #include "access/relscan.h"
43 #include "access/subtrans.h"
44 #include "access/syncscan.h"
45 #include "access/sysattr.h"
46 #include "access/tableam.h"
47 #include "access/transam.h"
48 #include "access/valid.h"
49 #include "access/visibilitymap.h"
50 #include "access/xact.h"
51 #include "access/xlog.h"
52 #include "access/xloginsert.h"
53 #include "access/xlogutils.h"
54 #include "catalog/catalog.h"
55 #include "miscadmin.h"
56 #include "pgstat.h"
57 #include "port/atomics.h"
58 #include "port/pg_bitutils.h"
59 #include "storage/bufmgr.h"
60 #include "storage/freespace.h"
61 #include "storage/lmgr.h"
62 #include "storage/predicate.h"
63 #include "storage/procarray.h"
64 #include "storage/smgr.h"
65 #include "storage/spin.h"
66 #include "storage/standby.h"
67 #include "utils/datum.h"
68 #include "utils/inval.h"
69 #include "utils/lsyscache.h"
70 #include "utils/relcache.h"
71 #include "utils/snapmgr.h"
72 #include "utils/spccache.h"
73 
74 
75 static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
76 									 TransactionId xid, CommandId cid, int options);
77 static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
78 								  Buffer newbuf, HeapTuple oldtup,
79 								  HeapTuple newtup, HeapTuple old_key_tuple,
80 								  bool all_visible_cleared, bool new_all_visible_cleared);
81 static Bitmapset *HeapDetermineModifiedColumns(Relation relation,
82 											   Bitmapset *interesting_cols,
83 											   HeapTuple oldtup, HeapTuple newtup);
84 static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
85 								 LockTupleMode mode, LockWaitPolicy wait_policy,
86 								 bool *have_tuple_lock);
87 static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
88 									  uint16 old_infomask2, TransactionId add_to_xmax,
89 									  LockTupleMode mode, bool is_update,
90 									  TransactionId *result_xmax, uint16 *result_infomask,
91 									  uint16 *result_infomask2);
92 static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
93 										 ItemPointer ctid, TransactionId xid,
94 										 LockTupleMode mode);
95 static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
96 								   uint16 *new_infomask2);
97 static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
98 											 uint16 t_infomask);
99 static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
100 									LockTupleMode lockmode, bool *current_is_member);
101 static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
102 							Relation rel, ItemPointer ctid, XLTW_Oper oper,
103 							int *remaining);
104 static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
105 									   uint16 infomask, Relation rel, int *remaining);
106 static void index_delete_sort(TM_IndexDeleteOp *delstate);
107 static int	bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate);
108 static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
109 static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_changed,
110 										bool *copy);
111 
112 
113 /*
114  * Each tuple lock mode has a corresponding heavyweight lock, and one or two
115  * corresponding MultiXactStatuses (one to merely lock tuples, another one to
116  * update them).  This table (and the macros below) helps us determine the
117  * heavyweight lock mode and MultiXactStatus values to use for any particular
118  * tuple lock strength.
119  *
120  * Don't look at lockstatus/updstatus directly!  Use get_mxact_status_for_lock
121  * instead.
122  */
123 static const struct
124 {
125 	LOCKMODE	hwlock;
126 	int			lockstatus;
127 	int			updstatus;
128 }
129 
130 			tupleLockExtraInfo[MaxLockTupleMode + 1] =
131 {
132 	{							/* LockTupleKeyShare */
133 		AccessShareLock,
134 		MultiXactStatusForKeyShare,
135 		-1						/* KeyShare does not allow updating tuples */
136 	},
137 	{							/* LockTupleShare */
138 		RowShareLock,
139 		MultiXactStatusForShare,
140 		-1						/* Share does not allow updating tuples */
141 	},
142 	{							/* LockTupleNoKeyExclusive */
143 		ExclusiveLock,
144 		MultiXactStatusForNoKeyUpdate,
145 		MultiXactStatusNoKeyUpdate
146 	},
147 	{							/* LockTupleExclusive */
148 		AccessExclusiveLock,
149 		MultiXactStatusForUpdate,
150 		MultiXactStatusUpdate
151 	}
152 };
153 
154 /* Get the LOCKMODE for a given MultiXactStatus */
155 #define LOCKMODE_from_mxstatus(status) \
156 			(tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
157 
158 /*
159  * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
160  * This is more readable than having every caller translate it to lock.h's
161  * LOCKMODE.
162  */
163 #define LockTupleTuplock(rel, tup, mode) \
164 	LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
165 #define UnlockTupleTuplock(rel, tup, mode) \
166 	UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
167 #define ConditionalLockTupleTuplock(rel, tup, mode) \
168 	ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
169 
170 #ifdef USE_PREFETCH
171 /*
172  * heap_index_delete_tuples and index_delete_prefetch_buffer use this
173  * structure to coordinate prefetching activity
174  */
175 typedef struct
176 {
177 	BlockNumber cur_hblkno;
178 	int			next_item;
179 	int			ndeltids;
180 	TM_IndexDelete *deltids;
181 } IndexDeletePrefetchState;
182 #endif
183 
184 /* heap_index_delete_tuples bottom-up index deletion costing constants */
185 #define BOTTOMUP_MAX_NBLOCKS			6
186 #define BOTTOMUP_TOLERANCE_NBLOCKS		3
187 
188 /*
189  * heap_index_delete_tuples uses this when determining which heap blocks it
190  * must visit to help its bottom-up index deletion caller
191  */
192 typedef struct IndexDeleteCounts
193 {
194 	int16		npromisingtids; /* Number of "promising" TIDs in group */
195 	int16		ntids;			/* Number of TIDs in group */
196 	int16		ifirsttid;		/* Offset to group's first deltid */
197 } IndexDeleteCounts;
198 
199 /*
200  * This table maps tuple lock strength values for each particular
201  * MultiXactStatus value.
202  */
203 static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
204 {
205 	LockTupleKeyShare,			/* ForKeyShare */
206 	LockTupleShare,				/* ForShare */
207 	LockTupleNoKeyExclusive,	/* ForNoKeyUpdate */
208 	LockTupleExclusive,			/* ForUpdate */
209 	LockTupleNoKeyExclusive,	/* NoKeyUpdate */
210 	LockTupleExclusive			/* Update */
211 };
212 
213 /* Get the LockTupleMode for a given MultiXactStatus */
214 #define TUPLOCK_from_mxstatus(status) \
215 			(MultiXactStatusLock[(status)])
216 
217 /* ----------------------------------------------------------------
218  *						 heap support routines
219  * ----------------------------------------------------------------
220  */
221 
222 /* ----------------
223  *		initscan - scan code common to heap_beginscan and heap_rescan
224  * ----------------
225  */
226 static void
initscan(HeapScanDesc scan,ScanKey key,bool keep_startblock)227 initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
228 {
229 	ParallelBlockTableScanDesc bpscan = NULL;
230 	bool		allow_strat;
231 	bool		allow_sync;
232 
233 	/*
234 	 * Determine the number of blocks we have to scan.
235 	 *
236 	 * It is sufficient to do this once at scan start, since any tuples added
237 	 * while the scan is in progress will be invisible to my snapshot anyway.
238 	 * (That is not true when using a non-MVCC snapshot.  However, we couldn't
239 	 * guarantee to return tuples added after scan start anyway, since they
240 	 * might go into pages we already scanned.  To guarantee consistent
241 	 * results for a non-MVCC snapshot, the caller must hold some higher-level
242 	 * lock that ensures the interesting tuple(s) won't change.)
243 	 */
244 	if (scan->rs_base.rs_parallel != NULL)
245 	{
246 		bpscan = (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
247 		scan->rs_nblocks = bpscan->phs_nblocks;
248 	}
249 	else
250 		scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_base.rs_rd);
251 
252 	/*
253 	 * If the table is large relative to NBuffers, use a bulk-read access
254 	 * strategy and enable synchronized scanning (see syncscan.c).  Although
255 	 * the thresholds for these features could be different, we make them the
256 	 * same so that there are only two behaviors to tune rather than four.
257 	 * (However, some callers need to be able to disable one or both of these
258 	 * behaviors, independently of the size of the table; also there is a GUC
259 	 * variable that can disable synchronized scanning.)
260 	 *
261 	 * Note that table_block_parallelscan_initialize has a very similar test;
262 	 * if you change this, consider changing that one, too.
263 	 */
264 	if (!RelationUsesLocalBuffers(scan->rs_base.rs_rd) &&
265 		scan->rs_nblocks > NBuffers / 4)
266 	{
267 		allow_strat = (scan->rs_base.rs_flags & SO_ALLOW_STRAT) != 0;
268 		allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
269 	}
270 	else
271 		allow_strat = allow_sync = false;
272 
273 	if (allow_strat)
274 	{
275 		/* During a rescan, keep the previous strategy object. */
276 		if (scan->rs_strategy == NULL)
277 			scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
278 	}
279 	else
280 	{
281 		if (scan->rs_strategy != NULL)
282 			FreeAccessStrategy(scan->rs_strategy);
283 		scan->rs_strategy = NULL;
284 	}
285 
286 	if (scan->rs_base.rs_parallel != NULL)
287 	{
288 		/* For parallel scan, believe whatever ParallelTableScanDesc says. */
289 		if (scan->rs_base.rs_parallel->phs_syncscan)
290 			scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
291 		else
292 			scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
293 	}
294 	else if (keep_startblock)
295 	{
296 		/*
297 		 * When rescanning, we want to keep the previous startblock setting,
298 		 * so that rewinding a cursor doesn't generate surprising results.
299 		 * Reset the active syncscan setting, though.
300 		 */
301 		if (allow_sync && synchronize_seqscans)
302 			scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
303 		else
304 			scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
305 	}
306 	else if (allow_sync && synchronize_seqscans)
307 	{
308 		scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
309 		scan->rs_startblock = ss_get_location(scan->rs_base.rs_rd, scan->rs_nblocks);
310 	}
311 	else
312 	{
313 		scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
314 		scan->rs_startblock = 0;
315 	}
316 
317 	scan->rs_numblocks = InvalidBlockNumber;
318 	scan->rs_inited = false;
319 	scan->rs_ctup.t_data = NULL;
320 	ItemPointerSetInvalid(&scan->rs_ctup.t_self);
321 	scan->rs_cbuf = InvalidBuffer;
322 	scan->rs_cblock = InvalidBlockNumber;
323 
324 	/* page-at-a-time fields are always invalid when not rs_inited */
325 
326 	/*
327 	 * copy the scan key, if appropriate
328 	 */
329 	if (key != NULL)
330 		memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
331 
332 	/*
333 	 * Currently, we only have a stats counter for sequential heap scans (but
334 	 * e.g for bitmap scans the underlying bitmap index scans will be counted,
335 	 * and for sample scans we update stats for tuple fetches).
336 	 */
337 	if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
338 		pgstat_count_heap_scan(scan->rs_base.rs_rd);
339 }
340 
341 /*
342  * heap_setscanlimits - restrict range of a heapscan
343  *
344  * startBlk is the page to start at
345  * numBlks is number of pages to scan (InvalidBlockNumber means "all")
346  */
347 void
heap_setscanlimits(TableScanDesc sscan,BlockNumber startBlk,BlockNumber numBlks)348 heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
349 {
350 	HeapScanDesc scan = (HeapScanDesc) sscan;
351 
352 	Assert(!scan->rs_inited);	/* else too late to change */
353 	/* else rs_startblock is significant */
354 	Assert(!(scan->rs_base.rs_flags & SO_ALLOW_SYNC));
355 
356 	/* Check startBlk is valid (but allow case of zero blocks...) */
357 	Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
358 
359 	scan->rs_startblock = startBlk;
360 	scan->rs_numblocks = numBlks;
361 }
362 
363 /*
364  * heapgetpage - subroutine for heapgettup()
365  *
366  * This routine reads and pins the specified page of the relation.
367  * In page-at-a-time mode it performs additional work, namely determining
368  * which tuples on the page are visible.
369  */
370 void
heapgetpage(TableScanDesc sscan,BlockNumber page)371 heapgetpage(TableScanDesc sscan, BlockNumber page)
372 {
373 	HeapScanDesc scan = (HeapScanDesc) sscan;
374 	Buffer		buffer;
375 	Snapshot	snapshot;
376 	Page		dp;
377 	int			lines;
378 	int			ntup;
379 	OffsetNumber lineoff;
380 	ItemId		lpp;
381 	bool		all_visible;
382 
383 	Assert(page < scan->rs_nblocks);
384 
385 	/* release previous scan buffer, if any */
386 	if (BufferIsValid(scan->rs_cbuf))
387 	{
388 		ReleaseBuffer(scan->rs_cbuf);
389 		scan->rs_cbuf = InvalidBuffer;
390 	}
391 
392 	/*
393 	 * Be sure to check for interrupts at least once per page.  Checks at
394 	 * higher code levels won't be able to stop a seqscan that encounters many
395 	 * pages' worth of consecutive dead tuples.
396 	 */
397 	CHECK_FOR_INTERRUPTS();
398 
399 	/* read page using selected strategy */
400 	scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM, page,
401 									   RBM_NORMAL, scan->rs_strategy);
402 	scan->rs_cblock = page;
403 
404 	if (!(scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE))
405 		return;
406 
407 	buffer = scan->rs_cbuf;
408 	snapshot = scan->rs_base.rs_snapshot;
409 
410 	/*
411 	 * Prune and repair fragmentation for the whole page, if possible.
412 	 */
413 	heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
414 
415 	/*
416 	 * We must hold share lock on the buffer content while examining tuple
417 	 * visibility.  Afterwards, however, the tuples we have found to be
418 	 * visible are guaranteed good as long as we hold the buffer pin.
419 	 */
420 	LockBuffer(buffer, BUFFER_LOCK_SHARE);
421 
422 	dp = BufferGetPage(buffer);
423 	TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
424 	lines = PageGetMaxOffsetNumber(dp);
425 	ntup = 0;
426 
427 	/*
428 	 * If the all-visible flag indicates that all tuples on the page are
429 	 * visible to everyone, we can skip the per-tuple visibility tests.
430 	 *
431 	 * Note: In hot standby, a tuple that's already visible to all
432 	 * transactions on the primary might still be invisible to a read-only
433 	 * transaction in the standby. We partly handle this problem by tracking
434 	 * the minimum xmin of visible tuples as the cut-off XID while marking a
435 	 * page all-visible on the primary and WAL log that along with the
436 	 * visibility map SET operation. In hot standby, we wait for (or abort)
437 	 * all transactions that can potentially may not see one or more tuples on
438 	 * the page. That's how index-only scans work fine in hot standby. A
439 	 * crucial difference between index-only scans and heap scans is that the
440 	 * index-only scan completely relies on the visibility map where as heap
441 	 * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
442 	 * the page-level flag can be trusted in the same way, because it might
443 	 * get propagated somehow without being explicitly WAL-logged, e.g. via a
444 	 * full page write. Until we can prove that beyond doubt, let's check each
445 	 * tuple for visibility the hard way.
446 	 */
447 	all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;
448 
449 	for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
450 		 lineoff <= lines;
451 		 lineoff++, lpp++)
452 	{
453 		if (ItemIdIsNormal(lpp))
454 		{
455 			HeapTupleData loctup;
456 			bool		valid;
457 
458 			loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd);
459 			loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
460 			loctup.t_len = ItemIdGetLength(lpp);
461 			ItemPointerSet(&(loctup.t_self), page, lineoff);
462 
463 			if (all_visible)
464 				valid = true;
465 			else
466 				valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
467 
468 			HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
469 												&loctup, buffer, snapshot);
470 
471 			if (valid)
472 				scan->rs_vistuples[ntup++] = lineoff;
473 		}
474 	}
475 
476 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
477 
478 	Assert(ntup <= MaxHeapTuplesPerPage);
479 	scan->rs_ntuples = ntup;
480 }
481 
482 /* ----------------
483  *		heapgettup - fetch next heap tuple
484  *
485  *		Initialize the scan if not already done; then advance to the next
486  *		tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
487  *		or set scan->rs_ctup.t_data = NULL if no more tuples.
488  *
489  * dir == NoMovementScanDirection means "re-fetch the tuple indicated
490  * by scan->rs_ctup".
491  *
492  * Note: the reason nkeys/key are passed separately, even though they are
493  * kept in the scan descriptor, is that the caller may not want us to check
494  * the scankeys.
495  *
496  * Note: when we fall off the end of the scan in either direction, we
497  * reset rs_inited.  This means that a further request with the same
498  * scan direction will restart the scan, which is a bit odd, but a
499  * request with the opposite scan direction will start a fresh scan
500  * in the proper direction.  The latter is required behavior for cursors,
501  * while the former case is generally undefined behavior in Postgres
502  * so we don't care too much.
503  * ----------------
504  */
505 static void
heapgettup(HeapScanDesc scan,ScanDirection dir,int nkeys,ScanKey key)506 heapgettup(HeapScanDesc scan,
507 		   ScanDirection dir,
508 		   int nkeys,
509 		   ScanKey key)
510 {
511 	HeapTuple	tuple = &(scan->rs_ctup);
512 	Snapshot	snapshot = scan->rs_base.rs_snapshot;
513 	bool		backward = ScanDirectionIsBackward(dir);
514 	BlockNumber page;
515 	bool		finished;
516 	Page		dp;
517 	int			lines;
518 	OffsetNumber lineoff;
519 	int			linesleft;
520 	ItemId		lpp;
521 
522 	/*
523 	 * calculate next starting lineoff, given scan direction
524 	 */
525 	if (ScanDirectionIsForward(dir))
526 	{
527 		if (!scan->rs_inited)
528 		{
529 			/*
530 			 * return null immediately if relation is empty
531 			 */
532 			if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
533 			{
534 				Assert(!BufferIsValid(scan->rs_cbuf));
535 				tuple->t_data = NULL;
536 				return;
537 			}
538 			if (scan->rs_base.rs_parallel != NULL)
539 			{
540 				ParallelBlockTableScanDesc pbscan =
541 				(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
542 				ParallelBlockTableScanWorker pbscanwork =
543 				scan->rs_parallelworkerdata;
544 
545 				table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
546 														 pbscanwork, pbscan);
547 
548 				page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
549 														 pbscanwork, pbscan);
550 
551 				/* Other processes might have already finished the scan. */
552 				if (page == InvalidBlockNumber)
553 				{
554 					Assert(!BufferIsValid(scan->rs_cbuf));
555 					tuple->t_data = NULL;
556 					return;
557 				}
558 			}
559 			else
560 				page = scan->rs_startblock; /* first page */
561 			heapgetpage((TableScanDesc) scan, page);
562 			lineoff = FirstOffsetNumber;	/* first offnum */
563 			scan->rs_inited = true;
564 		}
565 		else
566 		{
567 			/* continue from previously returned page/tuple */
568 			page = scan->rs_cblock; /* current page */
569 			lineoff =			/* next offnum */
570 				OffsetNumberNext(ItemPointerGetOffsetNumber(&(tuple->t_self)));
571 		}
572 
573 		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
574 
575 		dp = BufferGetPage(scan->rs_cbuf);
576 		TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
577 		lines = PageGetMaxOffsetNumber(dp);
578 		/* page and lineoff now reference the physically next tid */
579 
580 		linesleft = lines - lineoff + 1;
581 	}
582 	else if (backward)
583 	{
584 		/* backward parallel scan not supported */
585 		Assert(scan->rs_base.rs_parallel == NULL);
586 
587 		if (!scan->rs_inited)
588 		{
589 			/*
590 			 * return null immediately if relation is empty
591 			 */
592 			if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
593 			{
594 				Assert(!BufferIsValid(scan->rs_cbuf));
595 				tuple->t_data = NULL;
596 				return;
597 			}
598 
599 			/*
600 			 * Disable reporting to syncscan logic in a backwards scan; it's
601 			 * not very likely anyone else is doing the same thing at the same
602 			 * time, and much more likely that we'll just bollix things for
603 			 * forward scanners.
604 			 */
605 			scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
606 
607 			/*
608 			 * Start from last page of the scan.  Ensure we take into account
609 			 * rs_numblocks if it's been adjusted by heap_setscanlimits().
610 			 */
611 			if (scan->rs_numblocks != InvalidBlockNumber)
612 				page = (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
613 			else if (scan->rs_startblock > 0)
614 				page = scan->rs_startblock - 1;
615 			else
616 				page = scan->rs_nblocks - 1;
617 			heapgetpage((TableScanDesc) scan, page);
618 		}
619 		else
620 		{
621 			/* continue from previously returned page/tuple */
622 			page = scan->rs_cblock; /* current page */
623 		}
624 
625 		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
626 
627 		dp = BufferGetPage(scan->rs_cbuf);
628 		TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
629 		lines = PageGetMaxOffsetNumber(dp);
630 
631 		if (!scan->rs_inited)
632 		{
633 			lineoff = lines;	/* final offnum */
634 			scan->rs_inited = true;
635 		}
636 		else
637 		{
638 			/*
639 			 * The previous returned tuple may have been vacuumed since the
640 			 * previous scan when we use a non-MVCC snapshot, so we must
641 			 * re-establish the lineoff <= PageGetMaxOffsetNumber(dp)
642 			 * invariant
643 			 */
644 			lineoff =			/* previous offnum */
645 				Min(lines,
646 					OffsetNumberPrev(ItemPointerGetOffsetNumber(&(tuple->t_self))));
647 		}
648 		/* page and lineoff now reference the physically previous tid */
649 
650 		linesleft = lineoff;
651 	}
652 	else
653 	{
654 		/*
655 		 * ``no movement'' scan direction: refetch prior tuple
656 		 */
657 		if (!scan->rs_inited)
658 		{
659 			Assert(!BufferIsValid(scan->rs_cbuf));
660 			tuple->t_data = NULL;
661 			return;
662 		}
663 
664 		page = ItemPointerGetBlockNumber(&(tuple->t_self));
665 		if (page != scan->rs_cblock)
666 			heapgetpage((TableScanDesc) scan, page);
667 
668 		/* Since the tuple was previously fetched, needn't lock page here */
669 		dp = BufferGetPage(scan->rs_cbuf);
670 		TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
671 		lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
672 		lpp = PageGetItemId(dp, lineoff);
673 		Assert(ItemIdIsNormal(lpp));
674 
675 		tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
676 		tuple->t_len = ItemIdGetLength(lpp);
677 
678 		return;
679 	}
680 
681 	/*
682 	 * advance the scan until we find a qualifying tuple or run out of stuff
683 	 * to scan
684 	 */
685 	lpp = PageGetItemId(dp, lineoff);
686 	for (;;)
687 	{
688 		/*
689 		 * Only continue scanning the page while we have lines left.
690 		 *
691 		 * Note that this protects us from accessing line pointers past
692 		 * PageGetMaxOffsetNumber(); both for forward scans when we resume the
693 		 * table scan, and for when we start scanning a new page.
694 		 */
695 		while (linesleft > 0)
696 		{
697 			if (ItemIdIsNormal(lpp))
698 			{
699 				bool		valid;
700 
701 				tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
702 				tuple->t_len = ItemIdGetLength(lpp);
703 				ItemPointerSet(&(tuple->t_self), page, lineoff);
704 
705 				/*
706 				 * if current tuple qualifies, return it.
707 				 */
708 				valid = HeapTupleSatisfiesVisibility(tuple,
709 													 snapshot,
710 													 scan->rs_cbuf);
711 
712 				HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
713 													tuple, scan->rs_cbuf,
714 													snapshot);
715 
716 				if (valid && key != NULL)
717 					HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
718 								nkeys, key, valid);
719 
720 				if (valid)
721 				{
722 					LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
723 					return;
724 				}
725 			}
726 
727 			/*
728 			 * otherwise move to the next item on the page
729 			 */
730 			--linesleft;
731 			if (backward)
732 			{
733 				--lpp;			/* move back in this page's ItemId array */
734 				--lineoff;
735 			}
736 			else
737 			{
738 				++lpp;			/* move forward in this page's ItemId array */
739 				++lineoff;
740 			}
741 		}
742 
743 		/*
744 		 * if we get here, it means we've exhausted the items on this page and
745 		 * it's time to move to the next.
746 		 */
747 		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
748 
749 		/*
750 		 * advance to next/prior page and detect end of scan
751 		 */
752 		if (backward)
753 		{
754 			finished = (page == scan->rs_startblock) ||
755 				(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
756 			if (page == 0)
757 				page = scan->rs_nblocks;
758 			page--;
759 		}
760 		else if (scan->rs_base.rs_parallel != NULL)
761 		{
762 			ParallelBlockTableScanDesc pbscan =
763 			(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
764 			ParallelBlockTableScanWorker pbscanwork =
765 			scan->rs_parallelworkerdata;
766 
767 			page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
768 													 pbscanwork, pbscan);
769 			finished = (page == InvalidBlockNumber);
770 		}
771 		else
772 		{
773 			page++;
774 			if (page >= scan->rs_nblocks)
775 				page = 0;
776 			finished = (page == scan->rs_startblock) ||
777 				(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
778 
779 			/*
780 			 * Report our new scan position for synchronization purposes. We
781 			 * don't do that when moving backwards, however. That would just
782 			 * mess up any other forward-moving scanners.
783 			 *
784 			 * Note: we do this before checking for end of scan so that the
785 			 * final state of the position hint is back at the start of the
786 			 * rel.  That's not strictly necessary, but otherwise when you run
787 			 * the same query multiple times the starting position would shift
788 			 * a little bit backwards on every invocation, which is confusing.
789 			 * We don't guarantee any specific ordering in general, though.
790 			 */
791 			if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
792 				ss_report_location(scan->rs_base.rs_rd, page);
793 		}
794 
795 		/*
796 		 * return NULL if we've exhausted all the pages
797 		 */
798 		if (finished)
799 		{
800 			if (BufferIsValid(scan->rs_cbuf))
801 				ReleaseBuffer(scan->rs_cbuf);
802 			scan->rs_cbuf = InvalidBuffer;
803 			scan->rs_cblock = InvalidBlockNumber;
804 			tuple->t_data = NULL;
805 			scan->rs_inited = false;
806 			return;
807 		}
808 
809 		heapgetpage((TableScanDesc) scan, page);
810 
811 		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
812 
813 		dp = BufferGetPage(scan->rs_cbuf);
814 		TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
815 		lines = PageGetMaxOffsetNumber((Page) dp);
816 		linesleft = lines;
817 		if (backward)
818 		{
819 			lineoff = lines;
820 			lpp = PageGetItemId(dp, lines);
821 		}
822 		else
823 		{
824 			lineoff = FirstOffsetNumber;
825 			lpp = PageGetItemId(dp, FirstOffsetNumber);
826 		}
827 	}
828 }
829 
830 /* ----------------
831  *		heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
832  *
833  *		Same API as heapgettup, but used in page-at-a-time mode
834  *
835  * The internal logic is much the same as heapgettup's too, but there are some
836  * differences: we do not take the buffer content lock (that only needs to
837  * happen inside heapgetpage), and we iterate through just the tuples listed
838  * in rs_vistuples[] rather than all tuples on the page.  Notice that
839  * lineindex is 0-based, where the corresponding loop variable lineoff in
840  * heapgettup is 1-based.
841  * ----------------
842  */
843 static void
heapgettup_pagemode(HeapScanDesc scan,ScanDirection dir,int nkeys,ScanKey key)844 heapgettup_pagemode(HeapScanDesc scan,
845 					ScanDirection dir,
846 					int nkeys,
847 					ScanKey key)
848 {
849 	HeapTuple	tuple = &(scan->rs_ctup);
850 	bool		backward = ScanDirectionIsBackward(dir);
851 	BlockNumber page;
852 	bool		finished;
853 	Page		dp;
854 	int			lines;
855 	int			lineindex;
856 	OffsetNumber lineoff;
857 	int			linesleft;
858 	ItemId		lpp;
859 
860 	/*
861 	 * calculate next starting lineindex, given scan direction
862 	 */
863 	if (ScanDirectionIsForward(dir))
864 	{
865 		if (!scan->rs_inited)
866 		{
867 			/*
868 			 * return null immediately if relation is empty
869 			 */
870 			if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
871 			{
872 				Assert(!BufferIsValid(scan->rs_cbuf));
873 				tuple->t_data = NULL;
874 				return;
875 			}
876 			if (scan->rs_base.rs_parallel != NULL)
877 			{
878 				ParallelBlockTableScanDesc pbscan =
879 				(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
880 				ParallelBlockTableScanWorker pbscanwork =
881 				scan->rs_parallelworkerdata;
882 
883 				table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
884 														 pbscanwork, pbscan);
885 
886 				page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
887 														 pbscanwork, pbscan);
888 
889 				/* Other processes might have already finished the scan. */
890 				if (page == InvalidBlockNumber)
891 				{
892 					Assert(!BufferIsValid(scan->rs_cbuf));
893 					tuple->t_data = NULL;
894 					return;
895 				}
896 			}
897 			else
898 				page = scan->rs_startblock; /* first page */
899 			heapgetpage((TableScanDesc) scan, page);
900 			lineindex = 0;
901 			scan->rs_inited = true;
902 		}
903 		else
904 		{
905 			/* continue from previously returned page/tuple */
906 			page = scan->rs_cblock; /* current page */
907 			lineindex = scan->rs_cindex + 1;
908 		}
909 
910 		dp = BufferGetPage(scan->rs_cbuf);
911 		TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
912 		lines = scan->rs_ntuples;
913 		/* page and lineindex now reference the next visible tid */
914 
915 		linesleft = lines - lineindex;
916 	}
917 	else if (backward)
918 	{
919 		/* backward parallel scan not supported */
920 		Assert(scan->rs_base.rs_parallel == NULL);
921 
922 		if (!scan->rs_inited)
923 		{
924 			/*
925 			 * return null immediately if relation is empty
926 			 */
927 			if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
928 			{
929 				Assert(!BufferIsValid(scan->rs_cbuf));
930 				tuple->t_data = NULL;
931 				return;
932 			}
933 
934 			/*
935 			 * Disable reporting to syncscan logic in a backwards scan; it's
936 			 * not very likely anyone else is doing the same thing at the same
937 			 * time, and much more likely that we'll just bollix things for
938 			 * forward scanners.
939 			 */
940 			scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
941 
942 			/*
943 			 * Start from last page of the scan.  Ensure we take into account
944 			 * rs_numblocks if it's been adjusted by heap_setscanlimits().
945 			 */
946 			if (scan->rs_numblocks != InvalidBlockNumber)
947 				page = (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
948 			else if (scan->rs_startblock > 0)
949 				page = scan->rs_startblock - 1;
950 			else
951 				page = scan->rs_nblocks - 1;
952 			heapgetpage((TableScanDesc) scan, page);
953 		}
954 		else
955 		{
956 			/* continue from previously returned page/tuple */
957 			page = scan->rs_cblock; /* current page */
958 		}
959 
960 		dp = BufferGetPage(scan->rs_cbuf);
961 		TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
962 		lines = scan->rs_ntuples;
963 
964 		if (!scan->rs_inited)
965 		{
966 			lineindex = lines - 1;
967 			scan->rs_inited = true;
968 		}
969 		else
970 		{
971 			lineindex = scan->rs_cindex - 1;
972 		}
973 		/* page and lineindex now reference the previous visible tid */
974 
975 		linesleft = lineindex + 1;
976 	}
977 	else
978 	{
979 		/*
980 		 * ``no movement'' scan direction: refetch prior tuple
981 		 */
982 		if (!scan->rs_inited)
983 		{
984 			Assert(!BufferIsValid(scan->rs_cbuf));
985 			tuple->t_data = NULL;
986 			return;
987 		}
988 
989 		page = ItemPointerGetBlockNumber(&(tuple->t_self));
990 		if (page != scan->rs_cblock)
991 			heapgetpage((TableScanDesc) scan, page);
992 
993 		/* Since the tuple was previously fetched, needn't lock page here */
994 		dp = BufferGetPage(scan->rs_cbuf);
995 		TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
996 		lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
997 		lpp = PageGetItemId(dp, lineoff);
998 		Assert(ItemIdIsNormal(lpp));
999 
1000 		tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
1001 		tuple->t_len = ItemIdGetLength(lpp);
1002 
1003 		/* check that rs_cindex is in sync */
1004 		Assert(scan->rs_cindex < scan->rs_ntuples);
1005 		Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);
1006 
1007 		return;
1008 	}
1009 
1010 	/*
1011 	 * advance the scan until we find a qualifying tuple or run out of stuff
1012 	 * to scan
1013 	 */
1014 	for (;;)
1015 	{
1016 		while (linesleft > 0)
1017 		{
1018 			lineoff = scan->rs_vistuples[lineindex];
1019 			lpp = PageGetItemId(dp, lineoff);
1020 			Assert(ItemIdIsNormal(lpp));
1021 
1022 			tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
1023 			tuple->t_len = ItemIdGetLength(lpp);
1024 			ItemPointerSet(&(tuple->t_self), page, lineoff);
1025 
1026 			/*
1027 			 * if current tuple qualifies, return it.
1028 			 */
1029 			if (key != NULL)
1030 			{
1031 				bool		valid;
1032 
1033 				HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
1034 							nkeys, key, valid);
1035 				if (valid)
1036 				{
1037 					scan->rs_cindex = lineindex;
1038 					return;
1039 				}
1040 			}
1041 			else
1042 			{
1043 				scan->rs_cindex = lineindex;
1044 				return;
1045 			}
1046 
1047 			/*
1048 			 * otherwise move to the next item on the page
1049 			 */
1050 			--linesleft;
1051 			if (backward)
1052 				--lineindex;
1053 			else
1054 				++lineindex;
1055 		}
1056 
1057 		/*
1058 		 * if we get here, it means we've exhausted the items on this page and
1059 		 * it's time to move to the next.
1060 		 */
1061 		if (backward)
1062 		{
1063 			finished = (page == scan->rs_startblock) ||
1064 				(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1065 			if (page == 0)
1066 				page = scan->rs_nblocks;
1067 			page--;
1068 		}
1069 		else if (scan->rs_base.rs_parallel != NULL)
1070 		{
1071 			ParallelBlockTableScanDesc pbscan =
1072 			(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
1073 			ParallelBlockTableScanWorker pbscanwork =
1074 			scan->rs_parallelworkerdata;
1075 
1076 			page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
1077 													 pbscanwork, pbscan);
1078 			finished = (page == InvalidBlockNumber);
1079 		}
1080 		else
1081 		{
1082 			page++;
1083 			if (page >= scan->rs_nblocks)
1084 				page = 0;
1085 			finished = (page == scan->rs_startblock) ||
1086 				(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1087 
1088 			/*
1089 			 * Report our new scan position for synchronization purposes. We
1090 			 * don't do that when moving backwards, however. That would just
1091 			 * mess up any other forward-moving scanners.
1092 			 *
1093 			 * Note: we do this before checking for end of scan so that the
1094 			 * final state of the position hint is back at the start of the
1095 			 * rel.  That's not strictly necessary, but otherwise when you run
1096 			 * the same query multiple times the starting position would shift
1097 			 * a little bit backwards on every invocation, which is confusing.
1098 			 * We don't guarantee any specific ordering in general, though.
1099 			 */
1100 			if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
1101 				ss_report_location(scan->rs_base.rs_rd, page);
1102 		}
1103 
1104 		/*
1105 		 * return NULL if we've exhausted all the pages
1106 		 */
1107 		if (finished)
1108 		{
1109 			if (BufferIsValid(scan->rs_cbuf))
1110 				ReleaseBuffer(scan->rs_cbuf);
1111 			scan->rs_cbuf = InvalidBuffer;
1112 			scan->rs_cblock = InvalidBlockNumber;
1113 			tuple->t_data = NULL;
1114 			scan->rs_inited = false;
1115 			return;
1116 		}
1117 
1118 		heapgetpage((TableScanDesc) scan, page);
1119 
1120 		dp = BufferGetPage(scan->rs_cbuf);
1121 		TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
1122 		lines = scan->rs_ntuples;
1123 		linesleft = lines;
1124 		if (backward)
1125 			lineindex = lines - 1;
1126 		else
1127 			lineindex = 0;
1128 	}
1129 }
1130 
1131 
1132 #if defined(DISABLE_COMPLEX_MACRO)
1133 /*
1134  * This is formatted so oddly so that the correspondence to the macro
1135  * definition in access/htup_details.h is maintained.
1136  */
1137 Datum
fastgetattr(HeapTuple tup,int attnum,TupleDesc tupleDesc,bool * isnull)1138 fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
1139 			bool *isnull)
1140 {
1141 	return (
1142 			(attnum) > 0 ?
1143 			(
1144 			 (*(isnull) = false),
1145 			 HeapTupleNoNulls(tup) ?
1146 			 (
1147 			  TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff >= 0 ?
1148 			  (
1149 			   fetchatt(TupleDescAttr((tupleDesc), (attnum) - 1),
1150 						(char *) (tup)->t_data + (tup)->t_data->t_hoff +
1151 						TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff)
1152 			   )
1153 			  :
1154 			  nocachegetattr((tup), (attnum), (tupleDesc))
1155 			  )
1156 			 :
1157 			 (
1158 			  att_isnull((attnum) - 1, (tup)->t_data->t_bits) ?
1159 			  (
1160 			   (*(isnull) = true),
1161 			   (Datum) NULL
1162 			   )
1163 			  :
1164 			  (
1165 			   nocachegetattr((tup), (attnum), (tupleDesc))
1166 			   )
1167 			  )
1168 			 )
1169 			:
1170 			(
1171 			 (Datum) NULL
1172 			 )
1173 		);
1174 }
1175 #endif							/* defined(DISABLE_COMPLEX_MACRO) */
1176 
1177 
1178 /* ----------------------------------------------------------------
1179  *					 heap access method interface
1180  * ----------------------------------------------------------------
1181  */
1182 
1183 
1184 TableScanDesc
heap_beginscan(Relation relation,Snapshot snapshot,int nkeys,ScanKey key,ParallelTableScanDesc parallel_scan,uint32 flags)1185 heap_beginscan(Relation relation, Snapshot snapshot,
1186 			   int nkeys, ScanKey key,
1187 			   ParallelTableScanDesc parallel_scan,
1188 			   uint32 flags)
1189 {
1190 	HeapScanDesc scan;
1191 
1192 	/*
1193 	 * increment relation ref count while scanning relation
1194 	 *
1195 	 * This is just to make really sure the relcache entry won't go away while
1196 	 * the scan has a pointer to it.  Caller should be holding the rel open
1197 	 * anyway, so this is redundant in all normal scenarios...
1198 	 */
1199 	RelationIncrementReferenceCount(relation);
1200 
1201 	/*
1202 	 * allocate and initialize scan descriptor
1203 	 */
1204 	scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
1205 
1206 	scan->rs_base.rs_rd = relation;
1207 	scan->rs_base.rs_snapshot = snapshot;
1208 	scan->rs_base.rs_nkeys = nkeys;
1209 	scan->rs_base.rs_flags = flags;
1210 	scan->rs_base.rs_parallel = parallel_scan;
1211 	scan->rs_strategy = NULL;	/* set in initscan */
1212 
1213 	/*
1214 	 * Disable page-at-a-time mode if it's not a MVCC-safe snapshot.
1215 	 */
1216 	if (!(snapshot && IsMVCCSnapshot(snapshot)))
1217 		scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
1218 
1219 	/*
1220 	 * For seqscan and sample scans in a serializable transaction, acquire a
1221 	 * predicate lock on the entire relation. This is required not only to
1222 	 * lock all the matching tuples, but also to conflict with new insertions
1223 	 * into the table. In an indexscan, we take page locks on the index pages
1224 	 * covering the range specified in the scan qual, but in a heap scan there
1225 	 * is nothing more fine-grained to lock. A bitmap scan is a different
1226 	 * story, there we have already scanned the index and locked the index
1227 	 * pages covering the predicate. But in that case we still have to lock
1228 	 * any matching heap tuples. For sample scan we could optimize the locking
1229 	 * to be at least page-level granularity, but we'd need to add per-tuple
1230 	 * locking for that.
1231 	 */
1232 	if (scan->rs_base.rs_flags & (SO_TYPE_SEQSCAN | SO_TYPE_SAMPLESCAN))
1233 	{
1234 		/*
1235 		 * Ensure a missing snapshot is noticed reliably, even if the
1236 		 * isolation mode means predicate locking isn't performed (and
1237 		 * therefore the snapshot isn't used here).
1238 		 */
1239 		Assert(snapshot);
1240 		PredicateLockRelation(relation, snapshot);
1241 	}
1242 
1243 	/* we only need to set this up once */
1244 	scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1245 
1246 	/*
1247 	 * Allocate memory to keep track of page allocation for parallel workers
1248 	 * when doing a parallel scan.
1249 	 */
1250 	if (parallel_scan != NULL)
1251 		scan->rs_parallelworkerdata = palloc(sizeof(ParallelBlockTableScanWorkerData));
1252 	else
1253 		scan->rs_parallelworkerdata = NULL;
1254 
1255 	/*
1256 	 * we do this here instead of in initscan() because heap_rescan also calls
1257 	 * initscan() and we don't want to allocate memory again
1258 	 */
1259 	if (nkeys > 0)
1260 		scan->rs_base.rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
1261 	else
1262 		scan->rs_base.rs_key = NULL;
1263 
1264 	initscan(scan, key, false);
1265 
1266 	return (TableScanDesc) scan;
1267 }
1268 
1269 void
heap_rescan(TableScanDesc sscan,ScanKey key,bool set_params,bool allow_strat,bool allow_sync,bool allow_pagemode)1270 heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params,
1271 			bool allow_strat, bool allow_sync, bool allow_pagemode)
1272 {
1273 	HeapScanDesc scan = (HeapScanDesc) sscan;
1274 
1275 	if (set_params)
1276 	{
1277 		if (allow_strat)
1278 			scan->rs_base.rs_flags |= SO_ALLOW_STRAT;
1279 		else
1280 			scan->rs_base.rs_flags &= ~SO_ALLOW_STRAT;
1281 
1282 		if (allow_sync)
1283 			scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
1284 		else
1285 			scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
1286 
1287 		if (allow_pagemode && scan->rs_base.rs_snapshot &&
1288 			IsMVCCSnapshot(scan->rs_base.rs_snapshot))
1289 			scan->rs_base.rs_flags |= SO_ALLOW_PAGEMODE;
1290 		else
1291 			scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
1292 	}
1293 
1294 	/*
1295 	 * unpin scan buffers
1296 	 */
1297 	if (BufferIsValid(scan->rs_cbuf))
1298 		ReleaseBuffer(scan->rs_cbuf);
1299 
1300 	/*
1301 	 * reinitialize scan descriptor
1302 	 */
1303 	initscan(scan, key, true);
1304 }
1305 
1306 void
heap_endscan(TableScanDesc sscan)1307 heap_endscan(TableScanDesc sscan)
1308 {
1309 	HeapScanDesc scan = (HeapScanDesc) sscan;
1310 
1311 	/* Note: no locking manipulations needed */
1312 
1313 	/*
1314 	 * unpin scan buffers
1315 	 */
1316 	if (BufferIsValid(scan->rs_cbuf))
1317 		ReleaseBuffer(scan->rs_cbuf);
1318 
1319 	/*
1320 	 * decrement relation reference count and free scan descriptor storage
1321 	 */
1322 	RelationDecrementReferenceCount(scan->rs_base.rs_rd);
1323 
1324 	if (scan->rs_base.rs_key)
1325 		pfree(scan->rs_base.rs_key);
1326 
1327 	if (scan->rs_strategy != NULL)
1328 		FreeAccessStrategy(scan->rs_strategy);
1329 
1330 	if (scan->rs_parallelworkerdata != NULL)
1331 		pfree(scan->rs_parallelworkerdata);
1332 
1333 	if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
1334 		UnregisterSnapshot(scan->rs_base.rs_snapshot);
1335 
1336 	pfree(scan);
1337 }
1338 
1339 HeapTuple
heap_getnext(TableScanDesc sscan,ScanDirection direction)1340 heap_getnext(TableScanDesc sscan, ScanDirection direction)
1341 {
1342 	HeapScanDesc scan = (HeapScanDesc) sscan;
1343 
1344 	/*
1345 	 * This is still widely used directly, without going through table AM, so
1346 	 * add a safety check.  It's possible we should, at a later point,
1347 	 * downgrade this to an assert. The reason for checking the AM routine,
1348 	 * rather than the AM oid, is that this allows to write regression tests
1349 	 * that create another AM reusing the heap handler.
1350 	 */
1351 	if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine()))
1352 		ereport(ERROR,
1353 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1354 				 errmsg_internal("only heap AM is supported")));
1355 
1356 	/*
1357 	 * We don't expect direct calls to heap_getnext with valid CheckXidAlive
1358 	 * for catalog or regular tables.  See detailed comments in xact.c where
1359 	 * these variables are declared.  Normally we have such a check at tableam
1360 	 * level API but this is called from many places so we need to ensure it
1361 	 * here.
1362 	 */
1363 	if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
1364 		elog(ERROR, "unexpected heap_getnext call during logical decoding");
1365 
1366 	/* Note: no locking manipulations needed */
1367 
1368 	if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)
1369 		heapgettup_pagemode(scan, direction,
1370 							scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1371 	else
1372 		heapgettup(scan, direction,
1373 				   scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1374 
1375 	if (scan->rs_ctup.t_data == NULL)
1376 		return NULL;
1377 
1378 	/*
1379 	 * if we get here it means we have a new current scan tuple, so point to
1380 	 * the proper return buffer and return the tuple.
1381 	 */
1382 
1383 	pgstat_count_heap_getnext(scan->rs_base.rs_rd);
1384 
1385 	return &scan->rs_ctup;
1386 }
1387 
1388 bool
heap_getnextslot(TableScanDesc sscan,ScanDirection direction,TupleTableSlot * slot)1389 heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
1390 {
1391 	HeapScanDesc scan = (HeapScanDesc) sscan;
1392 
1393 	/* Note: no locking manipulations needed */
1394 
1395 	if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1396 		heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1397 	else
1398 		heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1399 
1400 	if (scan->rs_ctup.t_data == NULL)
1401 	{
1402 		ExecClearTuple(slot);
1403 		return false;
1404 	}
1405 
1406 	/*
1407 	 * if we get here it means we have a new current scan tuple, so point to
1408 	 * the proper return buffer and return the tuple.
1409 	 */
1410 
1411 	pgstat_count_heap_getnext(scan->rs_base.rs_rd);
1412 
1413 	ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
1414 							 scan->rs_cbuf);
1415 	return true;
1416 }
1417 
1418 void
heap_set_tidrange(TableScanDesc sscan,ItemPointer mintid,ItemPointer maxtid)1419 heap_set_tidrange(TableScanDesc sscan, ItemPointer mintid,
1420 				  ItemPointer maxtid)
1421 {
1422 	HeapScanDesc scan = (HeapScanDesc) sscan;
1423 	BlockNumber startBlk;
1424 	BlockNumber numBlks;
1425 	ItemPointerData highestItem;
1426 	ItemPointerData lowestItem;
1427 
1428 	/*
1429 	 * For relations without any pages, we can simply leave the TID range
1430 	 * unset.  There will be no tuples to scan, therefore no tuples outside
1431 	 * the given TID range.
1432 	 */
1433 	if (scan->rs_nblocks == 0)
1434 		return;
1435 
1436 	/*
1437 	 * Set up some ItemPointers which point to the first and last possible
1438 	 * tuples in the heap.
1439 	 */
1440 	ItemPointerSet(&highestItem, scan->rs_nblocks - 1, MaxOffsetNumber);
1441 	ItemPointerSet(&lowestItem, 0, FirstOffsetNumber);
1442 
1443 	/*
1444 	 * If the given maximum TID is below the highest possible TID in the
1445 	 * relation, then restrict the range to that, otherwise we scan to the end
1446 	 * of the relation.
1447 	 */
1448 	if (ItemPointerCompare(maxtid, &highestItem) < 0)
1449 		ItemPointerCopy(maxtid, &highestItem);
1450 
1451 	/*
1452 	 * If the given minimum TID is above the lowest possible TID in the
1453 	 * relation, then restrict the range to only scan for TIDs above that.
1454 	 */
1455 	if (ItemPointerCompare(mintid, &lowestItem) > 0)
1456 		ItemPointerCopy(mintid, &lowestItem);
1457 
1458 	/*
1459 	 * Check for an empty range and protect from would be negative results
1460 	 * from the numBlks calculation below.
1461 	 */
1462 	if (ItemPointerCompare(&highestItem, &lowestItem) < 0)
1463 	{
1464 		/* Set an empty range of blocks to scan */
1465 		heap_setscanlimits(sscan, 0, 0);
1466 		return;
1467 	}
1468 
1469 	/*
1470 	 * Calculate the first block and the number of blocks we must scan. We
1471 	 * could be more aggressive here and perform some more validation to try
1472 	 * and further narrow the scope of blocks to scan by checking if the
1473 	 * lowerItem has an offset above MaxOffsetNumber.  In this case, we could
1474 	 * advance startBlk by one.  Likewise, if highestItem has an offset of 0
1475 	 * we could scan one fewer blocks.  However, such an optimization does not
1476 	 * seem worth troubling over, currently.
1477 	 */
1478 	startBlk = ItemPointerGetBlockNumberNoCheck(&lowestItem);
1479 
1480 	numBlks = ItemPointerGetBlockNumberNoCheck(&highestItem) -
1481 		ItemPointerGetBlockNumberNoCheck(&lowestItem) + 1;
1482 
1483 	/* Set the start block and number of blocks to scan */
1484 	heap_setscanlimits(sscan, startBlk, numBlks);
1485 
1486 	/* Finally, set the TID range in sscan */
1487 	ItemPointerCopy(&lowestItem, &sscan->rs_mintid);
1488 	ItemPointerCopy(&highestItem, &sscan->rs_maxtid);
1489 }
1490 
1491 bool
heap_getnextslot_tidrange(TableScanDesc sscan,ScanDirection direction,TupleTableSlot * slot)1492 heap_getnextslot_tidrange(TableScanDesc sscan, ScanDirection direction,
1493 						  TupleTableSlot *slot)
1494 {
1495 	HeapScanDesc scan = (HeapScanDesc) sscan;
1496 	ItemPointer mintid = &sscan->rs_mintid;
1497 	ItemPointer maxtid = &sscan->rs_maxtid;
1498 
1499 	/* Note: no locking manipulations needed */
1500 	for (;;)
1501 	{
1502 		if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1503 			heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1504 		else
1505 			heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1506 
1507 		if (scan->rs_ctup.t_data == NULL)
1508 		{
1509 			ExecClearTuple(slot);
1510 			return false;
1511 		}
1512 
1513 		/*
1514 		 * heap_set_tidrange will have used heap_setscanlimits to limit the
1515 		 * range of pages we scan to only ones that can contain the TID range
1516 		 * we're scanning for.  Here we must filter out any tuples from these
1517 		 * pages that are outwith that range.
1518 		 */
1519 		if (ItemPointerCompare(&scan->rs_ctup.t_self, mintid) < 0)
1520 		{
1521 			ExecClearTuple(slot);
1522 
1523 			/*
1524 			 * When scanning backwards, the TIDs will be in descending order.
1525 			 * Future tuples in this direction will be lower still, so we can
1526 			 * just return false to indicate there will be no more tuples.
1527 			 */
1528 			if (ScanDirectionIsBackward(direction))
1529 				return false;
1530 
1531 			continue;
1532 		}
1533 
1534 		/*
1535 		 * Likewise for the final page, we must filter out TIDs greater than
1536 		 * maxtid.
1537 		 */
1538 		if (ItemPointerCompare(&scan->rs_ctup.t_self, maxtid) > 0)
1539 		{
1540 			ExecClearTuple(slot);
1541 
1542 			/*
1543 			 * When scanning forward, the TIDs will be in ascending order.
1544 			 * Future tuples in this direction will be higher still, so we can
1545 			 * just return false to indicate there will be no more tuples.
1546 			 */
1547 			if (ScanDirectionIsForward(direction))
1548 				return false;
1549 			continue;
1550 		}
1551 
1552 		break;
1553 	}
1554 
1555 	/*
1556 	 * if we get here it means we have a new current scan tuple, so point to
1557 	 * the proper return buffer and return the tuple.
1558 	 */
1559 	pgstat_count_heap_getnext(scan->rs_base.rs_rd);
1560 
1561 	ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, scan->rs_cbuf);
1562 	return true;
1563 }
1564 
1565 /*
1566  *	heap_fetch		- retrieve tuple with given tid
1567  *
1568  * On entry, tuple->t_self is the TID to fetch.  We pin the buffer holding
1569  * the tuple, fill in the remaining fields of *tuple, and check the tuple
1570  * against the specified snapshot.
1571  *
1572  * If successful (tuple found and passes snapshot time qual), then *userbuf
1573  * is set to the buffer holding the tuple and true is returned.  The caller
1574  * must unpin the buffer when done with the tuple.
1575  *
1576  * If the tuple is not found (ie, item number references a deleted slot),
1577  * then tuple->t_data is set to NULL and false is returned.
1578  *
1579  * If the tuple is found but fails the time qual check, then false is returned
1580  * but tuple->t_data is left pointing to the tuple.
1581  *
1582  * heap_fetch does not follow HOT chains: only the exact TID requested will
1583  * be fetched.
1584  *
1585  * It is somewhat inconsistent that we ereport() on invalid block number but
1586  * return false on invalid item number.  There are a couple of reasons though.
1587  * One is that the caller can relatively easily check the block number for
1588  * validity, but cannot check the item number without reading the page
1589  * himself.  Another is that when we are following a t_ctid link, we can be
1590  * reasonably confident that the page number is valid (since VACUUM shouldn't
1591  * truncate off the destination page without having killed the referencing
1592  * tuple first), but the item number might well not be good.
1593  */
1594 bool
heap_fetch(Relation relation,Snapshot snapshot,HeapTuple tuple,Buffer * userbuf)1595 heap_fetch(Relation relation,
1596 		   Snapshot snapshot,
1597 		   HeapTuple tuple,
1598 		   Buffer *userbuf)
1599 {
1600 	ItemPointer tid = &(tuple->t_self);
1601 	ItemId		lp;
1602 	Buffer		buffer;
1603 	Page		page;
1604 	OffsetNumber offnum;
1605 	bool		valid;
1606 
1607 	/*
1608 	 * Fetch and pin the appropriate page of the relation.
1609 	 */
1610 	buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1611 
1612 	/*
1613 	 * Need share lock on buffer to examine tuple commit status.
1614 	 */
1615 	LockBuffer(buffer, BUFFER_LOCK_SHARE);
1616 	page = BufferGetPage(buffer);
1617 	TestForOldSnapshot(snapshot, relation, page);
1618 
1619 	/*
1620 	 * We'd better check for out-of-range offnum in case of VACUUM since the
1621 	 * TID was obtained.
1622 	 */
1623 	offnum = ItemPointerGetOffsetNumber(tid);
1624 	if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1625 	{
1626 		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1627 		ReleaseBuffer(buffer);
1628 		*userbuf = InvalidBuffer;
1629 		tuple->t_data = NULL;
1630 		return false;
1631 	}
1632 
1633 	/*
1634 	 * get the item line pointer corresponding to the requested tid
1635 	 */
1636 	lp = PageGetItemId(page, offnum);
1637 
1638 	/*
1639 	 * Must check for deleted tuple.
1640 	 */
1641 	if (!ItemIdIsNormal(lp))
1642 	{
1643 		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1644 		ReleaseBuffer(buffer);
1645 		*userbuf = InvalidBuffer;
1646 		tuple->t_data = NULL;
1647 		return false;
1648 	}
1649 
1650 	/*
1651 	 * fill in *tuple fields
1652 	 */
1653 	tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1654 	tuple->t_len = ItemIdGetLength(lp);
1655 	tuple->t_tableOid = RelationGetRelid(relation);
1656 
1657 	/*
1658 	 * check tuple visibility, then release lock
1659 	 */
1660 	valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1661 
1662 	if (valid)
1663 		PredicateLockTID(relation, &(tuple->t_self), snapshot,
1664 						 HeapTupleHeaderGetXmin(tuple->t_data));
1665 
1666 	HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1667 
1668 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1669 
1670 	if (valid)
1671 	{
1672 		/*
1673 		 * All checks passed, so return the tuple as valid. Caller is now
1674 		 * responsible for releasing the buffer.
1675 		 */
1676 		*userbuf = buffer;
1677 
1678 		return true;
1679 	}
1680 
1681 	/* Tuple failed time qual */
1682 	ReleaseBuffer(buffer);
1683 	*userbuf = InvalidBuffer;
1684 
1685 	return false;
1686 }
1687 
1688 /*
1689  *	heap_hot_search_buffer	- search HOT chain for tuple satisfying snapshot
1690  *
1691  * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1692  * of a HOT chain), and buffer is the buffer holding this tuple.  We search
1693  * for the first chain member satisfying the given snapshot.  If one is
1694  * found, we update *tid to reference that tuple's offset number, and
1695  * return true.  If no match, return false without modifying *tid.
1696  *
1697  * heapTuple is a caller-supplied buffer.  When a match is found, we return
1698  * the tuple here, in addition to updating *tid.  If no match is found, the
1699  * contents of this buffer on return are undefined.
1700  *
1701  * If all_dead is not NULL, we check non-visible tuples to see if they are
1702  * globally dead; *all_dead is set true if all members of the HOT chain
1703  * are vacuumable, false if not.
1704  *
1705  * Unlike heap_fetch, the caller must already have pin and (at least) share
1706  * lock on the buffer; it is still pinned/locked at exit.  Also unlike
1707  * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
1708  */
1709 bool
heap_hot_search_buffer(ItemPointer tid,Relation relation,Buffer buffer,Snapshot snapshot,HeapTuple heapTuple,bool * all_dead,bool first_call)1710 heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
1711 					   Snapshot snapshot, HeapTuple heapTuple,
1712 					   bool *all_dead, bool first_call)
1713 {
1714 	Page		dp = (Page) BufferGetPage(buffer);
1715 	TransactionId prev_xmax = InvalidTransactionId;
1716 	BlockNumber blkno;
1717 	OffsetNumber offnum;
1718 	bool		at_chain_start;
1719 	bool		valid;
1720 	bool		skip;
1721 	GlobalVisState *vistest = NULL;
1722 
1723 	/* If this is not the first call, previous call returned a (live!) tuple */
1724 	if (all_dead)
1725 		*all_dead = first_call;
1726 
1727 	blkno = ItemPointerGetBlockNumber(tid);
1728 	offnum = ItemPointerGetOffsetNumber(tid);
1729 	at_chain_start = first_call;
1730 	skip = !first_call;
1731 
1732 	/* XXX: we should assert that a snapshot is pushed or registered */
1733 	Assert(TransactionIdIsValid(RecentXmin));
1734 	Assert(BufferGetBlockNumber(buffer) == blkno);
1735 
1736 	/* Scan through possible multiple members of HOT-chain */
1737 	for (;;)
1738 	{
1739 		ItemId		lp;
1740 
1741 		/* check for bogus TID */
1742 		if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
1743 			break;
1744 
1745 		lp = PageGetItemId(dp, offnum);
1746 
1747 		/* check for unused, dead, or redirected items */
1748 		if (!ItemIdIsNormal(lp))
1749 		{
1750 			/* We should only see a redirect at start of chain */
1751 			if (ItemIdIsRedirected(lp) && at_chain_start)
1752 			{
1753 				/* Follow the redirect */
1754 				offnum = ItemIdGetRedirect(lp);
1755 				at_chain_start = false;
1756 				continue;
1757 			}
1758 			/* else must be end of chain */
1759 			break;
1760 		}
1761 
1762 		/*
1763 		 * Update heapTuple to point to the element of the HOT chain we're
1764 		 * currently investigating. Having t_self set correctly is important
1765 		 * because the SSI checks and the *Satisfies routine for historical
1766 		 * MVCC snapshots need the correct tid to decide about the visibility.
1767 		 */
1768 		heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
1769 		heapTuple->t_len = ItemIdGetLength(lp);
1770 		heapTuple->t_tableOid = RelationGetRelid(relation);
1771 		ItemPointerSet(&heapTuple->t_self, blkno, offnum);
1772 
1773 		/*
1774 		 * Shouldn't see a HEAP_ONLY tuple at chain start.
1775 		 */
1776 		if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
1777 			break;
1778 
1779 		/*
1780 		 * The xmin should match the previous xmax value, else chain is
1781 		 * broken.
1782 		 */
1783 		if (TransactionIdIsValid(prev_xmax) &&
1784 			!TransactionIdEquals(prev_xmax,
1785 								 HeapTupleHeaderGetXmin(heapTuple->t_data)))
1786 			break;
1787 
1788 		/*
1789 		 * When first_call is true (and thus, skip is initially false) we'll
1790 		 * return the first tuple we find.  But on later passes, heapTuple
1791 		 * will initially be pointing to the tuple we returned last time.
1792 		 * Returning it again would be incorrect (and would loop forever), so
1793 		 * we skip it and return the next match we find.
1794 		 */
1795 		if (!skip)
1796 		{
1797 			/* If it's visible per the snapshot, we must return it */
1798 			valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
1799 			HeapCheckForSerializableConflictOut(valid, relation, heapTuple,
1800 												buffer, snapshot);
1801 
1802 			if (valid)
1803 			{
1804 				ItemPointerSetOffsetNumber(tid, offnum);
1805 				PredicateLockTID(relation, &heapTuple->t_self, snapshot,
1806 								 HeapTupleHeaderGetXmin(heapTuple->t_data));
1807 				if (all_dead)
1808 					*all_dead = false;
1809 				return true;
1810 			}
1811 		}
1812 		skip = false;
1813 
1814 		/*
1815 		 * If we can't see it, maybe no one else can either.  At caller
1816 		 * request, check whether all chain members are dead to all
1817 		 * transactions.
1818 		 *
1819 		 * Note: if you change the criterion here for what is "dead", fix the
1820 		 * planner's get_actual_variable_range() function to match.
1821 		 */
1822 		if (all_dead && *all_dead)
1823 		{
1824 			if (!vistest)
1825 				vistest = GlobalVisTestFor(relation);
1826 
1827 			if (!HeapTupleIsSurelyDead(heapTuple, vistest))
1828 				*all_dead = false;
1829 		}
1830 
1831 		/*
1832 		 * Check to see if HOT chain continues past this tuple; if so fetch
1833 		 * the next offnum and loop around.
1834 		 */
1835 		if (HeapTupleIsHotUpdated(heapTuple))
1836 		{
1837 			Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
1838 				   blkno);
1839 			offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
1840 			at_chain_start = false;
1841 			prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
1842 		}
1843 		else
1844 			break;				/* end of chain */
1845 	}
1846 
1847 	return false;
1848 }
1849 
1850 /*
1851  *	heap_get_latest_tid -  get the latest tid of a specified tuple
1852  *
1853  * Actually, this gets the latest version that is visible according to the
1854  * scan's snapshot.  Create a scan using SnapshotDirty to get the very latest,
1855  * possibly uncommitted version.
1856  *
1857  * *tid is both an input and an output parameter: it is updated to
1858  * show the latest version of the row.  Note that it will not be changed
1859  * if no version of the row passes the snapshot test.
1860  */
1861 void
heap_get_latest_tid(TableScanDesc sscan,ItemPointer tid)1862 heap_get_latest_tid(TableScanDesc sscan,
1863 					ItemPointer tid)
1864 {
1865 	Relation	relation = sscan->rs_rd;
1866 	Snapshot	snapshot = sscan->rs_snapshot;
1867 	ItemPointerData ctid;
1868 	TransactionId priorXmax;
1869 
1870 	/*
1871 	 * table_tuple_get_latest_tid() verified that the passed in tid is valid.
1872 	 * Assume that t_ctid links are valid however - there shouldn't be invalid
1873 	 * ones in the table.
1874 	 */
1875 	Assert(ItemPointerIsValid(tid));
1876 
1877 	/*
1878 	 * Loop to chase down t_ctid links.  At top of loop, ctid is the tuple we
1879 	 * need to examine, and *tid is the TID we will return if ctid turns out
1880 	 * to be bogus.
1881 	 *
1882 	 * Note that we will loop until we reach the end of the t_ctid chain.
1883 	 * Depending on the snapshot passed, there might be at most one visible
1884 	 * version of the row, but we don't try to optimize for that.
1885 	 */
1886 	ctid = *tid;
1887 	priorXmax = InvalidTransactionId;	/* cannot check first XMIN */
1888 	for (;;)
1889 	{
1890 		Buffer		buffer;
1891 		Page		page;
1892 		OffsetNumber offnum;
1893 		ItemId		lp;
1894 		HeapTupleData tp;
1895 		bool		valid;
1896 
1897 		/*
1898 		 * Read, pin, and lock the page.
1899 		 */
1900 		buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
1901 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
1902 		page = BufferGetPage(buffer);
1903 		TestForOldSnapshot(snapshot, relation, page);
1904 
1905 		/*
1906 		 * Check for bogus item number.  This is not treated as an error
1907 		 * condition because it can happen while following a t_ctid link. We
1908 		 * just assume that the prior tid is OK and return it unchanged.
1909 		 */
1910 		offnum = ItemPointerGetOffsetNumber(&ctid);
1911 		if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1912 		{
1913 			UnlockReleaseBuffer(buffer);
1914 			break;
1915 		}
1916 		lp = PageGetItemId(page, offnum);
1917 		if (!ItemIdIsNormal(lp))
1918 		{
1919 			UnlockReleaseBuffer(buffer);
1920 			break;
1921 		}
1922 
1923 		/* OK to access the tuple */
1924 		tp.t_self = ctid;
1925 		tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
1926 		tp.t_len = ItemIdGetLength(lp);
1927 		tp.t_tableOid = RelationGetRelid(relation);
1928 
1929 		/*
1930 		 * After following a t_ctid link, we might arrive at an unrelated
1931 		 * tuple.  Check for XMIN match.
1932 		 */
1933 		if (TransactionIdIsValid(priorXmax) &&
1934 			!TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
1935 		{
1936 			UnlockReleaseBuffer(buffer);
1937 			break;
1938 		}
1939 
1940 		/*
1941 		 * Check tuple visibility; if visible, set it as the new result
1942 		 * candidate.
1943 		 */
1944 		valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
1945 		HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
1946 		if (valid)
1947 			*tid = ctid;
1948 
1949 		/*
1950 		 * If there's a valid t_ctid link, follow it, else we're done.
1951 		 */
1952 		if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
1953 			HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
1954 			HeapTupleHeaderIndicatesMovedPartitions(tp.t_data) ||
1955 			ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
1956 		{
1957 			UnlockReleaseBuffer(buffer);
1958 			break;
1959 		}
1960 
1961 		ctid = tp.t_data->t_ctid;
1962 		priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
1963 		UnlockReleaseBuffer(buffer);
1964 	}							/* end of loop */
1965 }
1966 
1967 
1968 /*
1969  * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
1970  *
1971  * This is called after we have waited for the XMAX transaction to terminate.
1972  * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
1973  * be set on exit.  If the transaction committed, we set the XMAX_COMMITTED
1974  * hint bit if possible --- but beware that that may not yet be possible,
1975  * if the transaction committed asynchronously.
1976  *
1977  * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
1978  * even if it commits.
1979  *
1980  * Hence callers should look only at XMAX_INVALID.
1981  *
1982  * Note this is not allowed for tuples whose xmax is a multixact.
1983  */
1984 static void
UpdateXmaxHintBits(HeapTupleHeader tuple,Buffer buffer,TransactionId xid)1985 UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
1986 {
1987 	Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid));
1988 	Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
1989 
1990 	if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
1991 	{
1992 		if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
1993 			TransactionIdDidCommit(xid))
1994 			HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
1995 								 xid);
1996 		else
1997 			HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
1998 								 InvalidTransactionId);
1999 	}
2000 }
2001 
2002 
2003 /*
2004  * GetBulkInsertState - prepare status object for a bulk insert
2005  */
2006 BulkInsertState
GetBulkInsertState(void)2007 GetBulkInsertState(void)
2008 {
2009 	BulkInsertState bistate;
2010 
2011 	bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
2012 	bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
2013 	bistate->current_buf = InvalidBuffer;
2014 	return bistate;
2015 }
2016 
2017 /*
2018  * FreeBulkInsertState - clean up after finishing a bulk insert
2019  */
2020 void
FreeBulkInsertState(BulkInsertState bistate)2021 FreeBulkInsertState(BulkInsertState bistate)
2022 {
2023 	if (bistate->current_buf != InvalidBuffer)
2024 		ReleaseBuffer(bistate->current_buf);
2025 	FreeAccessStrategy(bistate->strategy);
2026 	pfree(bistate);
2027 }
2028 
2029 /*
2030  * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
2031  */
2032 void
ReleaseBulkInsertStatePin(BulkInsertState bistate)2033 ReleaseBulkInsertStatePin(BulkInsertState bistate)
2034 {
2035 	if (bistate->current_buf != InvalidBuffer)
2036 		ReleaseBuffer(bistate->current_buf);
2037 	bistate->current_buf = InvalidBuffer;
2038 }
2039 
2040 
2041 /*
2042  *	heap_insert		- insert tuple into a heap
2043  *
2044  * The new tuple is stamped with current transaction ID and the specified
2045  * command ID.
2046  *
2047  * See table_tuple_insert for comments about most of the input flags, except
2048  * that this routine directly takes a tuple rather than a slot.
2049  *
2050  * There's corresponding HEAP_INSERT_ options to all the TABLE_INSERT_
2051  * options, and there additionally is HEAP_INSERT_SPECULATIVE which is used to
2052  * implement table_tuple_insert_speculative().
2053  *
2054  * On return the header fields of *tup are updated to match the stored tuple;
2055  * in particular tup->t_self receives the actual TID where the tuple was
2056  * stored.  But note that any toasting of fields within the tuple data is NOT
2057  * reflected into *tup.
2058  */
2059 void
heap_insert(Relation relation,HeapTuple tup,CommandId cid,int options,BulkInsertState bistate)2060 heap_insert(Relation relation, HeapTuple tup, CommandId cid,
2061 			int options, BulkInsertState bistate)
2062 {
2063 	TransactionId xid = GetCurrentTransactionId();
2064 	HeapTuple	heaptup;
2065 	Buffer		buffer;
2066 	Buffer		vmbuffer = InvalidBuffer;
2067 	bool		all_visible_cleared = false;
2068 
2069 	/* Cheap, simplistic check that the tuple matches the rel's rowtype. */
2070 	Assert(HeapTupleHeaderGetNatts(tup->t_data) <=
2071 		   RelationGetNumberOfAttributes(relation));
2072 
2073 	/*
2074 	 * Fill in tuple header fields and toast the tuple if necessary.
2075 	 *
2076 	 * Note: below this point, heaptup is the data we actually intend to store
2077 	 * into the relation; tup is the caller's original untoasted data.
2078 	 */
2079 	heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
2080 
2081 	/*
2082 	 * Find buffer to insert this tuple into.  If the page is all visible,
2083 	 * this will also pin the requisite visibility map page.
2084 	 */
2085 	buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
2086 									   InvalidBuffer, options, bistate,
2087 									   &vmbuffer, NULL);
2088 
2089 	/*
2090 	 * We're about to do the actual insert -- but check for conflict first, to
2091 	 * avoid possibly having to roll back work we've just done.
2092 	 *
2093 	 * This is safe without a recheck as long as there is no possibility of
2094 	 * another process scanning the page between this check and the insert
2095 	 * being visible to the scan (i.e., an exclusive buffer content lock is
2096 	 * continuously held from this point until the tuple insert is visible).
2097 	 *
2098 	 * For a heap insert, we only need to check for table-level SSI locks. Our
2099 	 * new tuple can't possibly conflict with existing tuple locks, and heap
2100 	 * page locks are only consolidated versions of tuple locks; they do not
2101 	 * lock "gaps" as index page locks do.  So we don't need to specify a
2102 	 * buffer when making the call, which makes for a faster check.
2103 	 */
2104 	CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
2105 
2106 	/* NO EREPORT(ERROR) from here till changes are logged */
2107 	START_CRIT_SECTION();
2108 
2109 	RelationPutHeapTuple(relation, buffer, heaptup,
2110 						 (options & HEAP_INSERT_SPECULATIVE) != 0);
2111 
2112 	if (PageIsAllVisible(BufferGetPage(buffer)))
2113 	{
2114 		all_visible_cleared = true;
2115 		PageClearAllVisible(BufferGetPage(buffer));
2116 		visibilitymap_clear(relation,
2117 							ItemPointerGetBlockNumber(&(heaptup->t_self)),
2118 							vmbuffer, VISIBILITYMAP_VALID_BITS);
2119 	}
2120 
2121 	/*
2122 	 * XXX Should we set PageSetPrunable on this page ?
2123 	 *
2124 	 * The inserting transaction may eventually abort thus making this tuple
2125 	 * DEAD and hence available for pruning. Though we don't want to optimize
2126 	 * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
2127 	 * aborted tuple will never be pruned until next vacuum is triggered.
2128 	 *
2129 	 * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
2130 	 */
2131 
2132 	MarkBufferDirty(buffer);
2133 
2134 	/* XLOG stuff */
2135 	if (RelationNeedsWAL(relation))
2136 	{
2137 		xl_heap_insert xlrec;
2138 		xl_heap_header xlhdr;
2139 		XLogRecPtr	recptr;
2140 		Page		page = BufferGetPage(buffer);
2141 		uint8		info = XLOG_HEAP_INSERT;
2142 		int			bufflags = 0;
2143 
2144 		/*
2145 		 * If this is a catalog, we need to transmit combo CIDs to properly
2146 		 * decode, so log that as well.
2147 		 */
2148 		if (RelationIsAccessibleInLogicalDecoding(relation))
2149 			log_heap_new_cid(relation, heaptup);
2150 
2151 		/*
2152 		 * If this is the single and first tuple on page, we can reinit the
2153 		 * page instead of restoring the whole thing.  Set flag, and hide
2154 		 * buffer references from XLogInsert.
2155 		 */
2156 		if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
2157 			PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
2158 		{
2159 			info |= XLOG_HEAP_INIT_PAGE;
2160 			bufflags |= REGBUF_WILL_INIT;
2161 		}
2162 
2163 		xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
2164 		xlrec.flags = 0;
2165 		if (all_visible_cleared)
2166 			xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
2167 		if (options & HEAP_INSERT_SPECULATIVE)
2168 			xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
2169 		Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
2170 
2171 		/*
2172 		 * For logical decoding, we need the tuple even if we're doing a full
2173 		 * page write, so make sure it's included even if we take a full-page
2174 		 * image. (XXX We could alternatively store a pointer into the FPW).
2175 		 */
2176 		if (RelationIsLogicallyLogged(relation) &&
2177 			!(options & HEAP_INSERT_NO_LOGICAL))
2178 		{
2179 			xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2180 			bufflags |= REGBUF_KEEP_DATA;
2181 
2182 			if (IsToastRelation(relation))
2183 				xlrec.flags |= XLH_INSERT_ON_TOAST_RELATION;
2184 		}
2185 
2186 		XLogBeginInsert();
2187 		XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
2188 
2189 		xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
2190 		xlhdr.t_infomask = heaptup->t_data->t_infomask;
2191 		xlhdr.t_hoff = heaptup->t_data->t_hoff;
2192 
2193 		/*
2194 		 * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
2195 		 * write the whole page to the xlog, we don't need to store
2196 		 * xl_heap_header in the xlog.
2197 		 */
2198 		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2199 		XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
2200 		/* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2201 		XLogRegisterBufData(0,
2202 							(char *) heaptup->t_data + SizeofHeapTupleHeader,
2203 							heaptup->t_len - SizeofHeapTupleHeader);
2204 
2205 		/* filtering by origin on a row level is much more efficient */
2206 		XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2207 
2208 		recptr = XLogInsert(RM_HEAP_ID, info);
2209 
2210 		PageSetLSN(page, recptr);
2211 	}
2212 
2213 	END_CRIT_SECTION();
2214 
2215 	UnlockReleaseBuffer(buffer);
2216 	if (vmbuffer != InvalidBuffer)
2217 		ReleaseBuffer(vmbuffer);
2218 
2219 	/*
2220 	 * If tuple is cachable, mark it for invalidation from the caches in case
2221 	 * we abort.  Note it is OK to do this after releasing the buffer, because
2222 	 * the heaptup data structure is all in local memory, not in the shared
2223 	 * buffer.
2224 	 */
2225 	CacheInvalidateHeapTuple(relation, heaptup, NULL);
2226 
2227 	/* Note: speculative insertions are counted too, even if aborted later */
2228 	pgstat_count_heap_insert(relation, 1);
2229 
2230 	/*
2231 	 * If heaptup is a private copy, release it.  Don't forget to copy t_self
2232 	 * back to the caller's image, too.
2233 	 */
2234 	if (heaptup != tup)
2235 	{
2236 		tup->t_self = heaptup->t_self;
2237 		heap_freetuple(heaptup);
2238 	}
2239 }
2240 
2241 /*
2242  * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2243  * tuple header fields and toasts the tuple if necessary.  Returns a toasted
2244  * version of the tuple if it was toasted, or the original tuple if not. Note
2245  * that in any case, the header fields are also set in the original tuple.
2246  */
2247 static HeapTuple
heap_prepare_insert(Relation relation,HeapTuple tup,TransactionId xid,CommandId cid,int options)2248 heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
2249 					CommandId cid, int options)
2250 {
2251 	/*
2252 	 * To allow parallel inserts, we need to ensure that they are safe to be
2253 	 * performed in workers. We have the infrastructure to allow parallel
2254 	 * inserts in general except for the cases where inserts generate a new
2255 	 * CommandId (eg. inserts into a table having a foreign key column).
2256 	 */
2257 	if (IsParallelWorker())
2258 		ereport(ERROR,
2259 				(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2260 				 errmsg("cannot insert tuples in a parallel worker")));
2261 
2262 	tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2263 	tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2264 	tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2265 	HeapTupleHeaderSetXmin(tup->t_data, xid);
2266 	if (options & HEAP_INSERT_FROZEN)
2267 		HeapTupleHeaderSetXminFrozen(tup->t_data);
2268 
2269 	HeapTupleHeaderSetCmin(tup->t_data, cid);
2270 	HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2271 	tup->t_tableOid = RelationGetRelid(relation);
2272 
2273 	/*
2274 	 * If the new tuple is too big for storage or contains already toasted
2275 	 * out-of-line attributes from some other relation, invoke the toaster.
2276 	 */
2277 	if (relation->rd_rel->relkind != RELKIND_RELATION &&
2278 		relation->rd_rel->relkind != RELKIND_MATVIEW)
2279 	{
2280 		/* toast table entries should never be recursively toasted */
2281 		Assert(!HeapTupleHasExternal(tup));
2282 		return tup;
2283 	}
2284 	else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2285 		return heap_toast_insert_or_update(relation, tup, NULL, options);
2286 	else
2287 		return tup;
2288 }
2289 
2290 /*
2291  *	heap_multi_insert	- insert multiple tuples into a heap
2292  *
2293  * This is like heap_insert(), but inserts multiple tuples in one operation.
2294  * That's faster than calling heap_insert() in a loop, because when multiple
2295  * tuples can be inserted on a single page, we can write just a single WAL
2296  * record covering all of them, and only need to lock/unlock the page once.
2297  *
2298  * Note: this leaks memory into the current memory context. You can create a
2299  * temporary context before calling this, if that's a problem.
2300  */
2301 void
heap_multi_insert(Relation relation,TupleTableSlot ** slots,int ntuples,CommandId cid,int options,BulkInsertState bistate)2302 heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
2303 				  CommandId cid, int options, BulkInsertState bistate)
2304 {
2305 	TransactionId xid = GetCurrentTransactionId();
2306 	HeapTuple  *heaptuples;
2307 	int			i;
2308 	int			ndone;
2309 	PGAlignedBlock scratch;
2310 	Page		page;
2311 	Buffer		vmbuffer = InvalidBuffer;
2312 	bool		needwal;
2313 	Size		saveFreeSpace;
2314 	bool		need_tuple_data = RelationIsLogicallyLogged(relation);
2315 	bool		need_cids = RelationIsAccessibleInLogicalDecoding(relation);
2316 
2317 	/* currently not needed (thus unsupported) for heap_multi_insert() */
2318 	AssertArg(!(options & HEAP_INSERT_NO_LOGICAL));
2319 
2320 	needwal = RelationNeedsWAL(relation);
2321 	saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
2322 												   HEAP_DEFAULT_FILLFACTOR);
2323 
2324 	/* Toast and set header data in all the slots */
2325 	heaptuples = palloc(ntuples * sizeof(HeapTuple));
2326 	for (i = 0; i < ntuples; i++)
2327 	{
2328 		HeapTuple	tuple;
2329 
2330 		tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
2331 		slots[i]->tts_tableOid = RelationGetRelid(relation);
2332 		tuple->t_tableOid = slots[i]->tts_tableOid;
2333 		heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
2334 											options);
2335 	}
2336 
2337 	/*
2338 	 * We're about to do the actual inserts -- but check for conflict first,
2339 	 * to minimize the possibility of having to roll back work we've just
2340 	 * done.
2341 	 *
2342 	 * A check here does not definitively prevent a serialization anomaly;
2343 	 * that check MUST be done at least past the point of acquiring an
2344 	 * exclusive buffer content lock on every buffer that will be affected,
2345 	 * and MAY be done after all inserts are reflected in the buffers and
2346 	 * those locks are released; otherwise there is a race condition.  Since
2347 	 * multiple buffers can be locked and unlocked in the loop below, and it
2348 	 * would not be feasible to identify and lock all of those buffers before
2349 	 * the loop, we must do a final check at the end.
2350 	 *
2351 	 * The check here could be omitted with no loss of correctness; it is
2352 	 * present strictly as an optimization.
2353 	 *
2354 	 * For heap inserts, we only need to check for table-level SSI locks. Our
2355 	 * new tuples can't possibly conflict with existing tuple locks, and heap
2356 	 * page locks are only consolidated versions of tuple locks; they do not
2357 	 * lock "gaps" as index page locks do.  So we don't need to specify a
2358 	 * buffer when making the call, which makes for a faster check.
2359 	 */
2360 	CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
2361 
2362 	ndone = 0;
2363 	while (ndone < ntuples)
2364 	{
2365 		Buffer		buffer;
2366 		bool		starting_with_empty_page;
2367 		bool		all_visible_cleared = false;
2368 		bool		all_frozen_set = false;
2369 		int			nthispage;
2370 
2371 		CHECK_FOR_INTERRUPTS();
2372 
2373 		/*
2374 		 * Find buffer where at least the next tuple will fit.  If the page is
2375 		 * all-visible, this will also pin the requisite visibility map page.
2376 		 *
2377 		 * Also pin visibility map page if COPY FREEZE inserts tuples into an
2378 		 * empty page. See all_frozen_set below.
2379 		 */
2380 		buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2381 										   InvalidBuffer, options, bistate,
2382 										   &vmbuffer, NULL);
2383 		page = BufferGetPage(buffer);
2384 
2385 		starting_with_empty_page = PageGetMaxOffsetNumber(page) == 0;
2386 
2387 		if (starting_with_empty_page && (options & HEAP_INSERT_FROZEN))
2388 			all_frozen_set = true;
2389 
2390 		/* NO EREPORT(ERROR) from here till changes are logged */
2391 		START_CRIT_SECTION();
2392 
2393 		/*
2394 		 * RelationGetBufferForTuple has ensured that the first tuple fits.
2395 		 * Put that on the page, and then as many other tuples as fit.
2396 		 */
2397 		RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2398 
2399 		/*
2400 		 * For logical decoding we need combo CIDs to properly decode the
2401 		 * catalog.
2402 		 */
2403 		if (needwal && need_cids)
2404 			log_heap_new_cid(relation, heaptuples[ndone]);
2405 
2406 		for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2407 		{
2408 			HeapTuple	heaptup = heaptuples[ndone + nthispage];
2409 
2410 			if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2411 				break;
2412 
2413 			RelationPutHeapTuple(relation, buffer, heaptup, false);
2414 
2415 			/*
2416 			 * For logical decoding we need combo CIDs to properly decode the
2417 			 * catalog.
2418 			 */
2419 			if (needwal && need_cids)
2420 				log_heap_new_cid(relation, heaptup);
2421 		}
2422 
2423 		/*
2424 		 * If the page is all visible, need to clear that, unless we're only
2425 		 * going to add further frozen rows to it.
2426 		 *
2427 		 * If we're only adding already frozen rows to a previously empty
2428 		 * page, mark it as all-visible.
2429 		 */
2430 		if (PageIsAllVisible(page) && !(options & HEAP_INSERT_FROZEN))
2431 		{
2432 			all_visible_cleared = true;
2433 			PageClearAllVisible(page);
2434 			visibilitymap_clear(relation,
2435 								BufferGetBlockNumber(buffer),
2436 								vmbuffer, VISIBILITYMAP_VALID_BITS);
2437 		}
2438 		else if (all_frozen_set)
2439 			PageSetAllVisible(page);
2440 
2441 		/*
2442 		 * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2443 		 */
2444 
2445 		MarkBufferDirty(buffer);
2446 
2447 		/* XLOG stuff */
2448 		if (needwal)
2449 		{
2450 			XLogRecPtr	recptr;
2451 			xl_heap_multi_insert *xlrec;
2452 			uint8		info = XLOG_HEAP2_MULTI_INSERT;
2453 			char	   *tupledata;
2454 			int			totaldatalen;
2455 			char	   *scratchptr = scratch.data;
2456 			bool		init;
2457 			int			bufflags = 0;
2458 
2459 			/*
2460 			 * If the page was previously empty, we can reinit the page
2461 			 * instead of restoring the whole thing.
2462 			 */
2463 			init = starting_with_empty_page;
2464 
2465 			/* allocate xl_heap_multi_insert struct from the scratch area */
2466 			xlrec = (xl_heap_multi_insert *) scratchptr;
2467 			scratchptr += SizeOfHeapMultiInsert;
2468 
2469 			/*
2470 			 * Allocate offsets array. Unless we're reinitializing the page,
2471 			 * in that case the tuples are stored in order starting at
2472 			 * FirstOffsetNumber and we don't need to store the offsets
2473 			 * explicitly.
2474 			 */
2475 			if (!init)
2476 				scratchptr += nthispage * sizeof(OffsetNumber);
2477 
2478 			/* the rest of the scratch space is used for tuple data */
2479 			tupledata = scratchptr;
2480 
2481 			/* check that the mutually exclusive flags are not both set */
2482 			Assert(!(all_visible_cleared && all_frozen_set));
2483 
2484 			xlrec->flags = 0;
2485 			if (all_visible_cleared)
2486 				xlrec->flags = XLH_INSERT_ALL_VISIBLE_CLEARED;
2487 			if (all_frozen_set)
2488 				xlrec->flags = XLH_INSERT_ALL_FROZEN_SET;
2489 
2490 			xlrec->ntuples = nthispage;
2491 
2492 			/*
2493 			 * Write out an xl_multi_insert_tuple and the tuple data itself
2494 			 * for each tuple.
2495 			 */
2496 			for (i = 0; i < nthispage; i++)
2497 			{
2498 				HeapTuple	heaptup = heaptuples[ndone + i];
2499 				xl_multi_insert_tuple *tuphdr;
2500 				int			datalen;
2501 
2502 				if (!init)
2503 					xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2504 				/* xl_multi_insert_tuple needs two-byte alignment. */
2505 				tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
2506 				scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2507 
2508 				tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2509 				tuphdr->t_infomask = heaptup->t_data->t_infomask;
2510 				tuphdr->t_hoff = heaptup->t_data->t_hoff;
2511 
2512 				/* write bitmap [+ padding] [+ oid] + data */
2513 				datalen = heaptup->t_len - SizeofHeapTupleHeader;
2514 				memcpy(scratchptr,
2515 					   (char *) heaptup->t_data + SizeofHeapTupleHeader,
2516 					   datalen);
2517 				tuphdr->datalen = datalen;
2518 				scratchptr += datalen;
2519 			}
2520 			totaldatalen = scratchptr - tupledata;
2521 			Assert((scratchptr - scratch.data) < BLCKSZ);
2522 
2523 			if (need_tuple_data)
2524 				xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2525 
2526 			/*
2527 			 * Signal that this is the last xl_heap_multi_insert record
2528 			 * emitted by this call to heap_multi_insert(). Needed for logical
2529 			 * decoding so it knows when to cleanup temporary data.
2530 			 */
2531 			if (ndone + nthispage == ntuples)
2532 				xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
2533 
2534 			if (init)
2535 			{
2536 				info |= XLOG_HEAP_INIT_PAGE;
2537 				bufflags |= REGBUF_WILL_INIT;
2538 			}
2539 
2540 			/*
2541 			 * If we're doing logical decoding, include the new tuple data
2542 			 * even if we take a full-page image of the page.
2543 			 */
2544 			if (need_tuple_data)
2545 				bufflags |= REGBUF_KEEP_DATA;
2546 
2547 			XLogBeginInsert();
2548 			XLogRegisterData((char *) xlrec, tupledata - scratch.data);
2549 			XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2550 
2551 			XLogRegisterBufData(0, tupledata, totaldatalen);
2552 
2553 			/* filtering by origin on a row level is much more efficient */
2554 			XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2555 
2556 			recptr = XLogInsert(RM_HEAP2_ID, info);
2557 
2558 			PageSetLSN(page, recptr);
2559 		}
2560 
2561 		END_CRIT_SECTION();
2562 
2563 		/*
2564 		 * If we've frozen everything on the page, update the visibilitymap.
2565 		 * We're already holding pin on the vmbuffer.
2566 		 */
2567 		if (all_frozen_set)
2568 		{
2569 			Assert(PageIsAllVisible(page));
2570 			Assert(visibilitymap_pin_ok(BufferGetBlockNumber(buffer), vmbuffer));
2571 
2572 			/*
2573 			 * It's fine to use InvalidTransactionId here - this is only used
2574 			 * when HEAP_INSERT_FROZEN is specified, which intentionally
2575 			 * violates visibility rules.
2576 			 */
2577 			visibilitymap_set(relation, BufferGetBlockNumber(buffer), buffer,
2578 							  InvalidXLogRecPtr, vmbuffer,
2579 							  InvalidTransactionId,
2580 							  VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
2581 		}
2582 
2583 		UnlockReleaseBuffer(buffer);
2584 		ndone += nthispage;
2585 
2586 		/*
2587 		 * NB: Only release vmbuffer after inserting all tuples - it's fairly
2588 		 * likely that we'll insert into subsequent heap pages that are likely
2589 		 * to use the same vm page.
2590 		 */
2591 	}
2592 
2593 	/* We're done with inserting all tuples, so release the last vmbuffer. */
2594 	if (vmbuffer != InvalidBuffer)
2595 		ReleaseBuffer(vmbuffer);
2596 
2597 	/*
2598 	 * We're done with the actual inserts.  Check for conflicts again, to
2599 	 * ensure that all rw-conflicts in to these inserts are detected.  Without
2600 	 * this final check, a sequential scan of the heap may have locked the
2601 	 * table after the "before" check, missing one opportunity to detect the
2602 	 * conflict, and then scanned the table before the new tuples were there,
2603 	 * missing the other chance to detect the conflict.
2604 	 *
2605 	 * For heap inserts, we only need to check for table-level SSI locks. Our
2606 	 * new tuples can't possibly conflict with existing tuple locks, and heap
2607 	 * page locks are only consolidated versions of tuple locks; they do not
2608 	 * lock "gaps" as index page locks do.  So we don't need to specify a
2609 	 * buffer when making the call.
2610 	 */
2611 	CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
2612 
2613 	/*
2614 	 * If tuples are cachable, mark them for invalidation from the caches in
2615 	 * case we abort.  Note it is OK to do this after releasing the buffer,
2616 	 * because the heaptuples data structure is all in local memory, not in
2617 	 * the shared buffer.
2618 	 */
2619 	if (IsCatalogRelation(relation))
2620 	{
2621 		for (i = 0; i < ntuples; i++)
2622 			CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
2623 	}
2624 
2625 	/* copy t_self fields back to the caller's slots */
2626 	for (i = 0; i < ntuples; i++)
2627 		slots[i]->tts_tid = heaptuples[i]->t_self;
2628 
2629 	pgstat_count_heap_insert(relation, ntuples);
2630 }
2631 
2632 /*
2633  *	simple_heap_insert - insert a tuple
2634  *
2635  * Currently, this routine differs from heap_insert only in supplying
2636  * a default command ID and not allowing access to the speedup options.
2637  *
2638  * This should be used rather than using heap_insert directly in most places
2639  * where we are modifying system catalogs.
2640  */
2641 void
simple_heap_insert(Relation relation,HeapTuple tup)2642 simple_heap_insert(Relation relation, HeapTuple tup)
2643 {
2644 	heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2645 }
2646 
2647 /*
2648  * Given infomask/infomask2, compute the bits that must be saved in the
2649  * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2650  * xl_heap_lock_updated WAL records.
2651  *
2652  * See fix_infomask_from_infobits.
2653  */
2654 static uint8
compute_infobits(uint16 infomask,uint16 infomask2)2655 compute_infobits(uint16 infomask, uint16 infomask2)
2656 {
2657 	return
2658 		((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
2659 		((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
2660 		((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
2661 	/* note we ignore HEAP_XMAX_SHR_LOCK here */
2662 		((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
2663 		((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2664 		 XLHL_KEYS_UPDATED : 0);
2665 }
2666 
2667 /*
2668  * Given two versions of the same t_infomask for a tuple, compare them and
2669  * return whether the relevant status for a tuple Xmax has changed.  This is
2670  * used after a buffer lock has been released and reacquired: we want to ensure
2671  * that the tuple state continues to be the same it was when we previously
2672  * examined it.
2673  *
2674  * Note the Xmax field itself must be compared separately.
2675  */
2676 static inline bool
xmax_infomask_changed(uint16 new_infomask,uint16 old_infomask)2677 xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
2678 {
2679 	const uint16 interesting =
2680 	HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK;
2681 
2682 	if ((new_infomask & interesting) != (old_infomask & interesting))
2683 		return true;
2684 
2685 	return false;
2686 }
2687 
2688 /*
2689  *	heap_delete - delete a tuple
2690  *
2691  * See table_tuple_delete() for an explanation of the parameters, except that
2692  * this routine directly takes a tuple rather than a slot.
2693  *
2694  * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2695  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2696  * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
2697  * generated by another transaction).
2698  */
2699 TM_Result
heap_delete(Relation relation,ItemPointer tid,CommandId cid,Snapshot crosscheck,bool wait,TM_FailureData * tmfd,bool changingPart)2700 heap_delete(Relation relation, ItemPointer tid,
2701 			CommandId cid, Snapshot crosscheck, bool wait,
2702 			TM_FailureData *tmfd, bool changingPart)
2703 {
2704 	TM_Result	result;
2705 	TransactionId xid = GetCurrentTransactionId();
2706 	ItemId		lp;
2707 	HeapTupleData tp;
2708 	Page		page;
2709 	BlockNumber block;
2710 	Buffer		buffer;
2711 	Buffer		vmbuffer = InvalidBuffer;
2712 	TransactionId new_xmax;
2713 	uint16		new_infomask,
2714 				new_infomask2;
2715 	bool		have_tuple_lock = false;
2716 	bool		iscombo;
2717 	bool		all_visible_cleared = false;
2718 	HeapTuple	old_key_tuple = NULL;	/* replica identity of the tuple */
2719 	bool		old_key_copied = false;
2720 
2721 	Assert(ItemPointerIsValid(tid));
2722 
2723 	/*
2724 	 * Forbid this during a parallel operation, lest it allocate a combo CID.
2725 	 * Other workers might need that combo CID for visibility checks, and we
2726 	 * have no provision for broadcasting it to them.
2727 	 */
2728 	if (IsInParallelMode())
2729 		ereport(ERROR,
2730 				(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2731 				 errmsg("cannot delete tuples during a parallel operation")));
2732 
2733 	block = ItemPointerGetBlockNumber(tid);
2734 	buffer = ReadBuffer(relation, block);
2735 	page = BufferGetPage(buffer);
2736 
2737 	/*
2738 	 * Before locking the buffer, pin the visibility map page if it appears to
2739 	 * be necessary.  Since we haven't got the lock yet, someone else might be
2740 	 * in the middle of changing this, so we'll need to recheck after we have
2741 	 * the lock.
2742 	 */
2743 	if (PageIsAllVisible(page))
2744 		visibilitymap_pin(relation, block, &vmbuffer);
2745 
2746 	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2747 
2748 	/*
2749 	 * If we didn't pin the visibility map page and the page has become all
2750 	 * visible while we were busy locking the buffer, we'll have to unlock and
2751 	 * re-lock, to avoid holding the buffer lock across an I/O.  That's a bit
2752 	 * unfortunate, but hopefully shouldn't happen often.
2753 	 */
2754 	if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
2755 	{
2756 		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2757 		visibilitymap_pin(relation, block, &vmbuffer);
2758 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2759 	}
2760 
2761 	lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
2762 	Assert(ItemIdIsNormal(lp));
2763 
2764 	tp.t_tableOid = RelationGetRelid(relation);
2765 	tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2766 	tp.t_len = ItemIdGetLength(lp);
2767 	tp.t_self = *tid;
2768 
2769 l1:
2770 	result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
2771 
2772 	if (result == TM_Invisible)
2773 	{
2774 		UnlockReleaseBuffer(buffer);
2775 		ereport(ERROR,
2776 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2777 				 errmsg("attempted to delete invisible tuple")));
2778 	}
2779 	else if (result == TM_BeingModified && wait)
2780 	{
2781 		TransactionId xwait;
2782 		uint16		infomask;
2783 
2784 		/* must copy state data before unlocking buffer */
2785 		xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
2786 		infomask = tp.t_data->t_infomask;
2787 
2788 		/*
2789 		 * Sleep until concurrent transaction ends -- except when there's a
2790 		 * single locker and it's our own transaction.  Note we don't care
2791 		 * which lock mode the locker has, because we need the strongest one.
2792 		 *
2793 		 * Before sleeping, we need to acquire tuple lock to establish our
2794 		 * priority for the tuple (see heap_lock_tuple).  LockTuple will
2795 		 * release us when we are next-in-line for the tuple.
2796 		 *
2797 		 * If we are forced to "start over" below, we keep the tuple lock;
2798 		 * this arranges that we stay at the head of the line while rechecking
2799 		 * tuple state.
2800 		 */
2801 		if (infomask & HEAP_XMAX_IS_MULTI)
2802 		{
2803 			bool		current_is_member = false;
2804 
2805 			if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
2806 										LockTupleExclusive, &current_is_member))
2807 			{
2808 				LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2809 
2810 				/*
2811 				 * Acquire the lock, if necessary (but skip it when we're
2812 				 * requesting a lock and already have one; avoids deadlock).
2813 				 */
2814 				if (!current_is_member)
2815 					heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2816 										 LockWaitBlock, &have_tuple_lock);
2817 
2818 				/* wait for multixact */
2819 				MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask,
2820 								relation, &(tp.t_self), XLTW_Delete,
2821 								NULL);
2822 				LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2823 
2824 				/*
2825 				 * If xwait had just locked the tuple then some other xact
2826 				 * could update this tuple before we get to this point.  Check
2827 				 * for xmax change, and start over if so.
2828 				 */
2829 				if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2830 					!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2831 										 xwait))
2832 					goto l1;
2833 			}
2834 
2835 			/*
2836 			 * You might think the multixact is necessarily done here, but not
2837 			 * so: it could have surviving members, namely our own xact or
2838 			 * other subxacts of this backend.  It is legal for us to delete
2839 			 * the tuple in either case, however (the latter case is
2840 			 * essentially a situation of upgrading our former shared lock to
2841 			 * exclusive).  We don't bother changing the on-disk hint bits
2842 			 * since we are about to overwrite the xmax altogether.
2843 			 */
2844 		}
2845 		else if (!TransactionIdIsCurrentTransactionId(xwait))
2846 		{
2847 			/*
2848 			 * Wait for regular transaction to end; but first, acquire tuple
2849 			 * lock.
2850 			 */
2851 			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2852 			heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2853 								 LockWaitBlock, &have_tuple_lock);
2854 			XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
2855 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2856 
2857 			/*
2858 			 * xwait is done, but if xwait had just locked the tuple then some
2859 			 * other xact could update this tuple before we get to this point.
2860 			 * Check for xmax change, and start over if so.
2861 			 */
2862 			if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2863 				!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2864 									 xwait))
2865 				goto l1;
2866 
2867 			/* Otherwise check if it committed or aborted */
2868 			UpdateXmaxHintBits(tp.t_data, buffer, xwait);
2869 		}
2870 
2871 		/*
2872 		 * We may overwrite if previous xmax aborted, or if it committed but
2873 		 * only locked the tuple without updating it.
2874 		 */
2875 		if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2876 			HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) ||
2877 			HeapTupleHeaderIsOnlyLocked(tp.t_data))
2878 			result = TM_Ok;
2879 		else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
2880 			result = TM_Updated;
2881 		else
2882 			result = TM_Deleted;
2883 	}
2884 
2885 	if (crosscheck != InvalidSnapshot && result == TM_Ok)
2886 	{
2887 		/* Perform additional check for transaction-snapshot mode RI updates */
2888 		if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
2889 			result = TM_Updated;
2890 	}
2891 
2892 	if (result != TM_Ok)
2893 	{
2894 		Assert(result == TM_SelfModified ||
2895 			   result == TM_Updated ||
2896 			   result == TM_Deleted ||
2897 			   result == TM_BeingModified);
2898 		Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
2899 		Assert(result != TM_Updated ||
2900 			   !ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid));
2901 		tmfd->ctid = tp.t_data->t_ctid;
2902 		tmfd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
2903 		if (result == TM_SelfModified)
2904 			tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
2905 		else
2906 			tmfd->cmax = InvalidCommandId;
2907 		UnlockReleaseBuffer(buffer);
2908 		if (have_tuple_lock)
2909 			UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
2910 		if (vmbuffer != InvalidBuffer)
2911 			ReleaseBuffer(vmbuffer);
2912 		return result;
2913 	}
2914 
2915 	/*
2916 	 * We're about to do the actual delete -- check for conflict first, to
2917 	 * avoid possibly having to roll back work we've just done.
2918 	 *
2919 	 * This is safe without a recheck as long as there is no possibility of
2920 	 * another process scanning the page between this check and the delete
2921 	 * being visible to the scan (i.e., an exclusive buffer content lock is
2922 	 * continuously held from this point until the tuple delete is visible).
2923 	 */
2924 	CheckForSerializableConflictIn(relation, tid, BufferGetBlockNumber(buffer));
2925 
2926 	/* replace cid with a combo CID if necessary */
2927 	HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
2928 
2929 	/*
2930 	 * Compute replica identity tuple before entering the critical section so
2931 	 * we don't PANIC upon a memory allocation failure.
2932 	 */
2933 	old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
2934 
2935 	/*
2936 	 * If this is the first possibly-multixact-able operation in the current
2937 	 * transaction, set my per-backend OldestMemberMXactId setting. We can be
2938 	 * certain that the transaction will never become a member of any older
2939 	 * MultiXactIds than that.  (We have to do this even if we end up just
2940 	 * using our own TransactionId below, since some other backend could
2941 	 * incorporate our XID into a MultiXact immediately afterwards.)
2942 	 */
2943 	MultiXactIdSetOldestMember();
2944 
2945 	compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data),
2946 							  tp.t_data->t_infomask, tp.t_data->t_infomask2,
2947 							  xid, LockTupleExclusive, true,
2948 							  &new_xmax, &new_infomask, &new_infomask2);
2949 
2950 	START_CRIT_SECTION();
2951 
2952 	/*
2953 	 * If this transaction commits, the tuple will become DEAD sooner or
2954 	 * later.  Set flag that this page is a candidate for pruning once our xid
2955 	 * falls below the OldestXmin horizon.  If the transaction finally aborts,
2956 	 * the subsequent page pruning will be a no-op and the hint will be
2957 	 * cleared.
2958 	 */
2959 	PageSetPrunable(page, xid);
2960 
2961 	if (PageIsAllVisible(page))
2962 	{
2963 		all_visible_cleared = true;
2964 		PageClearAllVisible(page);
2965 		visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
2966 							vmbuffer, VISIBILITYMAP_VALID_BITS);
2967 	}
2968 
2969 	/* store transaction information of xact deleting the tuple */
2970 	tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
2971 	tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
2972 	tp.t_data->t_infomask |= new_infomask;
2973 	tp.t_data->t_infomask2 |= new_infomask2;
2974 	HeapTupleHeaderClearHotUpdated(tp.t_data);
2975 	HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
2976 	HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
2977 	/* Make sure there is no forward chain link in t_ctid */
2978 	tp.t_data->t_ctid = tp.t_self;
2979 
2980 	/* Signal that this is actually a move into another partition */
2981 	if (changingPart)
2982 		HeapTupleHeaderSetMovedPartitions(tp.t_data);
2983 
2984 	MarkBufferDirty(buffer);
2985 
2986 	/*
2987 	 * XLOG stuff
2988 	 *
2989 	 * NB: heap_abort_speculative() uses the same xlog record and replay
2990 	 * routines.
2991 	 */
2992 	if (RelationNeedsWAL(relation))
2993 	{
2994 		xl_heap_delete xlrec;
2995 		xl_heap_header xlhdr;
2996 		XLogRecPtr	recptr;
2997 
2998 		/*
2999 		 * For logical decode we need combo CIDs to properly decode the
3000 		 * catalog
3001 		 */
3002 		if (RelationIsAccessibleInLogicalDecoding(relation))
3003 			log_heap_new_cid(relation, &tp);
3004 
3005 		xlrec.flags = 0;
3006 		if (all_visible_cleared)
3007 			xlrec.flags |= XLH_DELETE_ALL_VISIBLE_CLEARED;
3008 		if (changingPart)
3009 			xlrec.flags |= XLH_DELETE_IS_PARTITION_MOVE;
3010 		xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
3011 											  tp.t_data->t_infomask2);
3012 		xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
3013 		xlrec.xmax = new_xmax;
3014 
3015 		if (old_key_tuple != NULL)
3016 		{
3017 			if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
3018 				xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE;
3019 			else
3020 				xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY;
3021 		}
3022 
3023 		XLogBeginInsert();
3024 		XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
3025 
3026 		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3027 
3028 		/*
3029 		 * Log replica identity of the deleted tuple if there is one
3030 		 */
3031 		if (old_key_tuple != NULL)
3032 		{
3033 			xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
3034 			xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
3035 			xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
3036 
3037 			XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
3038 			XLogRegisterData((char *) old_key_tuple->t_data
3039 							 + SizeofHeapTupleHeader,
3040 							 old_key_tuple->t_len
3041 							 - SizeofHeapTupleHeader);
3042 		}
3043 
3044 		/* filtering by origin on a row level is much more efficient */
3045 		XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
3046 
3047 		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
3048 
3049 		PageSetLSN(page, recptr);
3050 	}
3051 
3052 	END_CRIT_SECTION();
3053 
3054 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3055 
3056 	if (vmbuffer != InvalidBuffer)
3057 		ReleaseBuffer(vmbuffer);
3058 
3059 	/*
3060 	 * If the tuple has toasted out-of-line attributes, we need to delete
3061 	 * those items too.  We have to do this before releasing the buffer
3062 	 * because we need to look at the contents of the tuple, but it's OK to
3063 	 * release the content lock on the buffer first.
3064 	 */
3065 	if (relation->rd_rel->relkind != RELKIND_RELATION &&
3066 		relation->rd_rel->relkind != RELKIND_MATVIEW)
3067 	{
3068 		/* toast table entries should never be recursively toasted */
3069 		Assert(!HeapTupleHasExternal(&tp));
3070 	}
3071 	else if (HeapTupleHasExternal(&tp))
3072 		heap_toast_delete(relation, &tp, false);
3073 
3074 	/*
3075 	 * Mark tuple for invalidation from system caches at next command
3076 	 * boundary. We have to do this before releasing the buffer because we
3077 	 * need to look at the contents of the tuple.
3078 	 */
3079 	CacheInvalidateHeapTuple(relation, &tp, NULL);
3080 
3081 	/* Now we can release the buffer */
3082 	ReleaseBuffer(buffer);
3083 
3084 	/*
3085 	 * Release the lmgr tuple lock, if we had it.
3086 	 */
3087 	if (have_tuple_lock)
3088 		UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
3089 
3090 	pgstat_count_heap_delete(relation);
3091 
3092 	if (old_key_tuple != NULL && old_key_copied)
3093 		heap_freetuple(old_key_tuple);
3094 
3095 	return TM_Ok;
3096 }
3097 
3098 /*
3099  *	simple_heap_delete - delete a tuple
3100  *
3101  * This routine may be used to delete a tuple when concurrent updates of
3102  * the target tuple are not expected (for example, because we have a lock
3103  * on the relation associated with the tuple).  Any failure is reported
3104  * via ereport().
3105  */
3106 void
simple_heap_delete(Relation relation,ItemPointer tid)3107 simple_heap_delete(Relation relation, ItemPointer tid)
3108 {
3109 	TM_Result	result;
3110 	TM_FailureData tmfd;
3111 
3112 	result = heap_delete(relation, tid,
3113 						 GetCurrentCommandId(true), InvalidSnapshot,
3114 						 true /* wait for commit */ ,
3115 						 &tmfd, false /* changingPart */ );
3116 	switch (result)
3117 	{
3118 		case TM_SelfModified:
3119 			/* Tuple was already updated in current command? */
3120 			elog(ERROR, "tuple already updated by self");
3121 			break;
3122 
3123 		case TM_Ok:
3124 			/* done successfully */
3125 			break;
3126 
3127 		case TM_Updated:
3128 			elog(ERROR, "tuple concurrently updated");
3129 			break;
3130 
3131 		case TM_Deleted:
3132 			elog(ERROR, "tuple concurrently deleted");
3133 			break;
3134 
3135 		default:
3136 			elog(ERROR, "unrecognized heap_delete status: %u", result);
3137 			break;
3138 	}
3139 }
3140 
3141 /*
3142  *	heap_update - replace a tuple
3143  *
3144  * See table_tuple_update() for an explanation of the parameters, except that
3145  * this routine directly takes a tuple rather than a slot.
3146  *
3147  * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
3148  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
3149  * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
3150  * generated by another transaction).
3151  */
3152 TM_Result
heap_update(Relation relation,ItemPointer otid,HeapTuple newtup,CommandId cid,Snapshot crosscheck,bool wait,TM_FailureData * tmfd,LockTupleMode * lockmode)3153 heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
3154 			CommandId cid, Snapshot crosscheck, bool wait,
3155 			TM_FailureData *tmfd, LockTupleMode *lockmode)
3156 {
3157 	TM_Result	result;
3158 	TransactionId xid = GetCurrentTransactionId();
3159 	Bitmapset  *hot_attrs;
3160 	Bitmapset  *key_attrs;
3161 	Bitmapset  *id_attrs;
3162 	Bitmapset  *interesting_attrs;
3163 	Bitmapset  *modified_attrs;
3164 	ItemId		lp;
3165 	HeapTupleData oldtup;
3166 	HeapTuple	heaptup;
3167 	HeapTuple	old_key_tuple = NULL;
3168 	bool		old_key_copied = false;
3169 	Page		page;
3170 	BlockNumber block;
3171 	MultiXactStatus mxact_status;
3172 	Buffer		buffer,
3173 				newbuf,
3174 				vmbuffer = InvalidBuffer,
3175 				vmbuffer_new = InvalidBuffer;
3176 	bool		need_toast;
3177 	Size		newtupsize,
3178 				pagefree;
3179 	bool		have_tuple_lock = false;
3180 	bool		iscombo;
3181 	bool		use_hot_update = false;
3182 	bool		hot_attrs_checked = false;
3183 	bool		key_intact;
3184 	bool		all_visible_cleared = false;
3185 	bool		all_visible_cleared_new = false;
3186 	bool		checked_lockers;
3187 	bool		locker_remains;
3188 	TransactionId xmax_new_tuple,
3189 				xmax_old_tuple;
3190 	uint16		infomask_old_tuple,
3191 				infomask2_old_tuple,
3192 				infomask_new_tuple,
3193 				infomask2_new_tuple;
3194 
3195 	Assert(ItemPointerIsValid(otid));
3196 
3197 	/* Cheap, simplistic check that the tuple matches the rel's rowtype. */
3198 	Assert(HeapTupleHeaderGetNatts(newtup->t_data) <=
3199 		   RelationGetNumberOfAttributes(relation));
3200 
3201 	/*
3202 	 * Forbid this during a parallel operation, lest it allocate a combo CID.
3203 	 * Other workers might need that combo CID for visibility checks, and we
3204 	 * have no provision for broadcasting it to them.
3205 	 */
3206 	if (IsInParallelMode())
3207 		ereport(ERROR,
3208 				(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3209 				 errmsg("cannot update tuples during a parallel operation")));
3210 
3211 	/*
3212 	 * Fetch the list of attributes to be checked for various operations.
3213 	 *
3214 	 * For HOT considerations, this is wasted effort if we fail to update or
3215 	 * have to put the new tuple on a different page.  But we must compute the
3216 	 * list before obtaining buffer lock --- in the worst case, if we are
3217 	 * doing an update on one of the relevant system catalogs, we could
3218 	 * deadlock if we try to fetch the list later.  In any case, the relcache
3219 	 * caches the data so this is usually pretty cheap.
3220 	 *
3221 	 * We also need columns used by the replica identity and columns that are
3222 	 * considered the "key" of rows in the table.
3223 	 *
3224 	 * Note that we get copies of each bitmap, so we need not worry about
3225 	 * relcache flush happening midway through.
3226 	 */
3227 	hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_ALL);
3228 	key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
3229 	id_attrs = RelationGetIndexAttrBitmap(relation,
3230 										  INDEX_ATTR_BITMAP_IDENTITY_KEY);
3231 
3232 
3233 	block = ItemPointerGetBlockNumber(otid);
3234 	buffer = ReadBuffer(relation, block);
3235 	page = BufferGetPage(buffer);
3236 
3237 	interesting_attrs = NULL;
3238 
3239 	/*
3240 	 * If the page is already full, there is hardly any chance of doing a HOT
3241 	 * update on this page. It might be wasteful effort to look for index
3242 	 * column updates only to later reject HOT updates for lack of space in
3243 	 * the same page. So we be conservative and only fetch hot_attrs if the
3244 	 * page is not already full. Since we are already holding a pin on the
3245 	 * buffer, there is no chance that the buffer can get cleaned up
3246 	 * concurrently and even if that was possible, in the worst case we lose a
3247 	 * chance to do a HOT update.
3248 	 */
3249 	if (!PageIsFull(page))
3250 	{
3251 		interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
3252 		hot_attrs_checked = true;
3253 	}
3254 	interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
3255 	interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
3256 
3257 	/*
3258 	 * Before locking the buffer, pin the visibility map page if it appears to
3259 	 * be necessary.  Since we haven't got the lock yet, someone else might be
3260 	 * in the middle of changing this, so we'll need to recheck after we have
3261 	 * the lock.
3262 	 */
3263 	if (PageIsAllVisible(page))
3264 		visibilitymap_pin(relation, block, &vmbuffer);
3265 
3266 	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3267 
3268 	lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
3269 	Assert(ItemIdIsNormal(lp));
3270 
3271 	/*
3272 	 * Fill in enough data in oldtup for HeapDetermineModifiedColumns to work
3273 	 * properly.
3274 	 */
3275 	oldtup.t_tableOid = RelationGetRelid(relation);
3276 	oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3277 	oldtup.t_len = ItemIdGetLength(lp);
3278 	oldtup.t_self = *otid;
3279 
3280 	/* the new tuple is ready, except for this: */
3281 	newtup->t_tableOid = RelationGetRelid(relation);
3282 
3283 	/* Determine columns modified by the update. */
3284 	modified_attrs = HeapDetermineModifiedColumns(relation, interesting_attrs,
3285 												  &oldtup, newtup);
3286 
3287 	/*
3288 	 * If we're not updating any "key" column, we can grab a weaker lock type.
3289 	 * This allows for more concurrency when we are running simultaneously
3290 	 * with foreign key checks.
3291 	 *
3292 	 * Note that if a column gets detoasted while executing the update, but
3293 	 * the value ends up being the same, this test will fail and we will use
3294 	 * the stronger lock.  This is acceptable; the important case to optimize
3295 	 * is updates that don't manipulate key columns, not those that
3296 	 * serendipitously arrive at the same key values.
3297 	 */
3298 	if (!bms_overlap(modified_attrs, key_attrs))
3299 	{
3300 		*lockmode = LockTupleNoKeyExclusive;
3301 		mxact_status = MultiXactStatusNoKeyUpdate;
3302 		key_intact = true;
3303 
3304 		/*
3305 		 * If this is the first possibly-multixact-able operation in the
3306 		 * current transaction, set my per-backend OldestMemberMXactId
3307 		 * setting. We can be certain that the transaction will never become a
3308 		 * member of any older MultiXactIds than that.  (We have to do this
3309 		 * even if we end up just using our own TransactionId below, since
3310 		 * some other backend could incorporate our XID into a MultiXact
3311 		 * immediately afterwards.)
3312 		 */
3313 		MultiXactIdSetOldestMember();
3314 	}
3315 	else
3316 	{
3317 		*lockmode = LockTupleExclusive;
3318 		mxact_status = MultiXactStatusUpdate;
3319 		key_intact = false;
3320 	}
3321 
3322 	/*
3323 	 * Note: beyond this point, use oldtup not otid to refer to old tuple.
3324 	 * otid may very well point at newtup->t_self, which we will overwrite
3325 	 * with the new tuple's location, so there's great risk of confusion if we
3326 	 * use otid anymore.
3327 	 */
3328 
3329 l2:
3330 	checked_lockers = false;
3331 	locker_remains = false;
3332 	result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3333 
3334 	/* see below about the "no wait" case */
3335 	Assert(result != TM_BeingModified || wait);
3336 
3337 	if (result == TM_Invisible)
3338 	{
3339 		UnlockReleaseBuffer(buffer);
3340 		ereport(ERROR,
3341 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3342 				 errmsg("attempted to update invisible tuple")));
3343 	}
3344 	else if (result == TM_BeingModified && wait)
3345 	{
3346 		TransactionId xwait;
3347 		uint16		infomask;
3348 		bool		can_continue = false;
3349 
3350 		/*
3351 		 * XXX note that we don't consider the "no wait" case here.  This
3352 		 * isn't a problem currently because no caller uses that case, but it
3353 		 * should be fixed if such a caller is introduced.  It wasn't a
3354 		 * problem previously because this code would always wait, but now
3355 		 * that some tuple locks do not conflict with one of the lock modes we
3356 		 * use, it is possible that this case is interesting to handle
3357 		 * specially.
3358 		 *
3359 		 * This may cause failures with third-party code that calls
3360 		 * heap_update directly.
3361 		 */
3362 
3363 		/* must copy state data before unlocking buffer */
3364 		xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3365 		infomask = oldtup.t_data->t_infomask;
3366 
3367 		/*
3368 		 * Now we have to do something about the existing locker.  If it's a
3369 		 * multi, sleep on it; we might be awakened before it is completely
3370 		 * gone (or even not sleep at all in some cases); we need to preserve
3371 		 * it as locker, unless it is gone completely.
3372 		 *
3373 		 * If it's not a multi, we need to check for sleeping conditions
3374 		 * before actually going to sleep.  If the update doesn't conflict
3375 		 * with the locks, we just continue without sleeping (but making sure
3376 		 * it is preserved).
3377 		 *
3378 		 * Before sleeping, we need to acquire tuple lock to establish our
3379 		 * priority for the tuple (see heap_lock_tuple).  LockTuple will
3380 		 * release us when we are next-in-line for the tuple.  Note we must
3381 		 * not acquire the tuple lock until we're sure we're going to sleep;
3382 		 * otherwise we're open for race conditions with other transactions
3383 		 * holding the tuple lock which sleep on us.
3384 		 *
3385 		 * If we are forced to "start over" below, we keep the tuple lock;
3386 		 * this arranges that we stay at the head of the line while rechecking
3387 		 * tuple state.
3388 		 */
3389 		if (infomask & HEAP_XMAX_IS_MULTI)
3390 		{
3391 			TransactionId update_xact;
3392 			int			remain;
3393 			bool		current_is_member = false;
3394 
3395 			if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3396 										*lockmode, &current_is_member))
3397 			{
3398 				LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3399 
3400 				/*
3401 				 * Acquire the lock, if necessary (but skip it when we're
3402 				 * requesting a lock and already have one; avoids deadlock).
3403 				 */
3404 				if (!current_is_member)
3405 					heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3406 										 LockWaitBlock, &have_tuple_lock);
3407 
3408 				/* wait for multixact */
3409 				MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
3410 								relation, &oldtup.t_self, XLTW_Update,
3411 								&remain);
3412 				checked_lockers = true;
3413 				locker_remains = remain != 0;
3414 				LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3415 
3416 				/*
3417 				 * If xwait had just locked the tuple then some other xact
3418 				 * could update this tuple before we get to this point.  Check
3419 				 * for xmax change, and start over if so.
3420 				 */
3421 				if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3422 										  infomask) ||
3423 					!TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3424 										 xwait))
3425 					goto l2;
3426 			}
3427 
3428 			/*
3429 			 * Note that the multixact may not be done by now.  It could have
3430 			 * surviving members; our own xact or other subxacts of this
3431 			 * backend, and also any other concurrent transaction that locked
3432 			 * the tuple with LockTupleKeyShare if we only got
3433 			 * LockTupleNoKeyExclusive.  If this is the case, we have to be
3434 			 * careful to mark the updated tuple with the surviving members in
3435 			 * Xmax.
3436 			 *
3437 			 * Note that there could have been another update in the
3438 			 * MultiXact. In that case, we need to check whether it committed
3439 			 * or aborted. If it aborted we are safe to update it again;
3440 			 * otherwise there is an update conflict, and we have to return
3441 			 * TableTuple{Deleted, Updated} below.
3442 			 *
3443 			 * In the LockTupleExclusive case, we still need to preserve the
3444 			 * surviving members: those would include the tuple locks we had
3445 			 * before this one, which are important to keep in case this
3446 			 * subxact aborts.
3447 			 */
3448 			if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3449 				update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
3450 			else
3451 				update_xact = InvalidTransactionId;
3452 
3453 			/*
3454 			 * There was no UPDATE in the MultiXact; or it aborted. No
3455 			 * TransactionIdIsInProgress() call needed here, since we called
3456 			 * MultiXactIdWait() above.
3457 			 */
3458 			if (!TransactionIdIsValid(update_xact) ||
3459 				TransactionIdDidAbort(update_xact))
3460 				can_continue = true;
3461 		}
3462 		else if (TransactionIdIsCurrentTransactionId(xwait))
3463 		{
3464 			/*
3465 			 * The only locker is ourselves; we can avoid grabbing the tuple
3466 			 * lock here, but must preserve our locking information.
3467 			 */
3468 			checked_lockers = true;
3469 			locker_remains = true;
3470 			can_continue = true;
3471 		}
3472 		else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
3473 		{
3474 			/*
3475 			 * If it's just a key-share locker, and we're not changing the key
3476 			 * columns, we don't need to wait for it to end; but we need to
3477 			 * preserve it as locker.
3478 			 */
3479 			checked_lockers = true;
3480 			locker_remains = true;
3481 			can_continue = true;
3482 		}
3483 		else
3484 		{
3485 			/*
3486 			 * Wait for regular transaction to end; but first, acquire tuple
3487 			 * lock.
3488 			 */
3489 			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3490 			heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3491 								 LockWaitBlock, &have_tuple_lock);
3492 			XactLockTableWait(xwait, relation, &oldtup.t_self,
3493 							  XLTW_Update);
3494 			checked_lockers = true;
3495 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3496 
3497 			/*
3498 			 * xwait is done, but if xwait had just locked the tuple then some
3499 			 * other xact could update this tuple before we get to this point.
3500 			 * Check for xmax change, and start over if so.
3501 			 */
3502 			if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3503 				!TransactionIdEquals(xwait,
3504 									 HeapTupleHeaderGetRawXmax(oldtup.t_data)))
3505 				goto l2;
3506 
3507 			/* Otherwise check if it committed or aborted */
3508 			UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3509 			if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3510 				can_continue = true;
3511 		}
3512 
3513 		if (can_continue)
3514 			result = TM_Ok;
3515 		else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid))
3516 			result = TM_Updated;
3517 		else
3518 			result = TM_Deleted;
3519 	}
3520 
3521 	if (crosscheck != InvalidSnapshot && result == TM_Ok)
3522 	{
3523 		/* Perform additional check for transaction-snapshot mode RI updates */
3524 		if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
3525 		{
3526 			result = TM_Updated;
3527 			Assert(!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3528 		}
3529 	}
3530 
3531 	if (result != TM_Ok)
3532 	{
3533 		Assert(result == TM_SelfModified ||
3534 			   result == TM_Updated ||
3535 			   result == TM_Deleted ||
3536 			   result == TM_BeingModified);
3537 		Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3538 		Assert(result != TM_Updated ||
3539 			   !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3540 		tmfd->ctid = oldtup.t_data->t_ctid;
3541 		tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3542 		if (result == TM_SelfModified)
3543 			tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3544 		else
3545 			tmfd->cmax = InvalidCommandId;
3546 		UnlockReleaseBuffer(buffer);
3547 		if (have_tuple_lock)
3548 			UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3549 		if (vmbuffer != InvalidBuffer)
3550 			ReleaseBuffer(vmbuffer);
3551 		bms_free(hot_attrs);
3552 		bms_free(key_attrs);
3553 		bms_free(id_attrs);
3554 		bms_free(modified_attrs);
3555 		bms_free(interesting_attrs);
3556 		return result;
3557 	}
3558 
3559 	/*
3560 	 * If we didn't pin the visibility map page and the page has become all
3561 	 * visible while we were busy locking the buffer, or during some
3562 	 * subsequent window during which we had it unlocked, we'll have to unlock
3563 	 * and re-lock, to avoid holding the buffer lock across an I/O.  That's a
3564 	 * bit unfortunate, especially since we'll now have to recheck whether the
3565 	 * tuple has been locked or updated under us, but hopefully it won't
3566 	 * happen very often.
3567 	 */
3568 	if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3569 	{
3570 		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3571 		visibilitymap_pin(relation, block, &vmbuffer);
3572 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3573 		goto l2;
3574 	}
3575 
3576 	/* Fill in transaction status data */
3577 
3578 	/*
3579 	 * If the tuple we're updating is locked, we need to preserve the locking
3580 	 * info in the old tuple's Xmax.  Prepare a new Xmax value for this.
3581 	 */
3582 	compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3583 							  oldtup.t_data->t_infomask,
3584 							  oldtup.t_data->t_infomask2,
3585 							  xid, *lockmode, true,
3586 							  &xmax_old_tuple, &infomask_old_tuple,
3587 							  &infomask2_old_tuple);
3588 
3589 	/*
3590 	 * And also prepare an Xmax value for the new copy of the tuple.  If there
3591 	 * was no xmax previously, or there was one but all lockers are now gone,
3592 	 * then use InvalidXid; otherwise, get the xmax from the old tuple.  (In
3593 	 * rare cases that might also be InvalidXid and yet not have the
3594 	 * HEAP_XMAX_INVALID bit set; that's fine.)
3595 	 */
3596 	if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3597 		HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
3598 		(checked_lockers && !locker_remains))
3599 		xmax_new_tuple = InvalidTransactionId;
3600 	else
3601 		xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3602 
3603 	if (!TransactionIdIsValid(xmax_new_tuple))
3604 	{
3605 		infomask_new_tuple = HEAP_XMAX_INVALID;
3606 		infomask2_new_tuple = 0;
3607 	}
3608 	else
3609 	{
3610 		/*
3611 		 * If we found a valid Xmax for the new tuple, then the infomask bits
3612 		 * to use on the new tuple depend on what was there on the old one.
3613 		 * Note that since we're doing an update, the only possibility is that
3614 		 * the lockers had FOR KEY SHARE lock.
3615 		 */
3616 		if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3617 		{
3618 			GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
3619 								   &infomask2_new_tuple);
3620 		}
3621 		else
3622 		{
3623 			infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
3624 			infomask2_new_tuple = 0;
3625 		}
3626 	}
3627 
3628 	/*
3629 	 * Prepare the new tuple with the appropriate initial values of Xmin and
3630 	 * Xmax, as well as initial infomask bits as computed above.
3631 	 */
3632 	newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3633 	newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3634 	HeapTupleHeaderSetXmin(newtup->t_data, xid);
3635 	HeapTupleHeaderSetCmin(newtup->t_data, cid);
3636 	newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3637 	newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3638 	HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
3639 
3640 	/*
3641 	 * Replace cid with a combo CID if necessary.  Note that we already put
3642 	 * the plain cid into the new tuple.
3643 	 */
3644 	HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
3645 
3646 	/*
3647 	 * If the toaster needs to be activated, OR if the new tuple will not fit
3648 	 * on the same page as the old, then we need to release the content lock
3649 	 * (but not the pin!) on the old tuple's buffer while we are off doing
3650 	 * TOAST and/or table-file-extension work.  We must mark the old tuple to
3651 	 * show that it's locked, else other processes may try to update it
3652 	 * themselves.
3653 	 *
3654 	 * We need to invoke the toaster if there are already any out-of-line
3655 	 * toasted values present, or if the new tuple is over-threshold.
3656 	 */
3657 	if (relation->rd_rel->relkind != RELKIND_RELATION &&
3658 		relation->rd_rel->relkind != RELKIND_MATVIEW)
3659 	{
3660 		/* toast table entries should never be recursively toasted */
3661 		Assert(!HeapTupleHasExternal(&oldtup));
3662 		Assert(!HeapTupleHasExternal(newtup));
3663 		need_toast = false;
3664 	}
3665 	else
3666 		need_toast = (HeapTupleHasExternal(&oldtup) ||
3667 					  HeapTupleHasExternal(newtup) ||
3668 					  newtup->t_len > TOAST_TUPLE_THRESHOLD);
3669 
3670 	pagefree = PageGetHeapFreeSpace(page);
3671 
3672 	newtupsize = MAXALIGN(newtup->t_len);
3673 
3674 	if (need_toast || newtupsize > pagefree)
3675 	{
3676 		TransactionId xmax_lock_old_tuple;
3677 		uint16		infomask_lock_old_tuple,
3678 					infomask2_lock_old_tuple;
3679 		bool		cleared_all_frozen = false;
3680 
3681 		/*
3682 		 * To prevent concurrent sessions from updating the tuple, we have to
3683 		 * temporarily mark it locked, while we release the page-level lock.
3684 		 *
3685 		 * To satisfy the rule that any xid potentially appearing in a buffer
3686 		 * written out to disk, we unfortunately have to WAL log this
3687 		 * temporary modification.  We can reuse xl_heap_lock for this
3688 		 * purpose.  If we crash/error before following through with the
3689 		 * actual update, xmax will be of an aborted transaction, allowing
3690 		 * other sessions to proceed.
3691 		 */
3692 
3693 		/*
3694 		 * Compute xmax / infomask appropriate for locking the tuple. This has
3695 		 * to be done separately from the combo that's going to be used for
3696 		 * updating, because the potentially created multixact would otherwise
3697 		 * be wrong.
3698 		 */
3699 		compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3700 								  oldtup.t_data->t_infomask,
3701 								  oldtup.t_data->t_infomask2,
3702 								  xid, *lockmode, false,
3703 								  &xmax_lock_old_tuple, &infomask_lock_old_tuple,
3704 								  &infomask2_lock_old_tuple);
3705 
3706 		Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
3707 
3708 		START_CRIT_SECTION();
3709 
3710 		/* Clear obsolete visibility flags ... */
3711 		oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3712 		oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3713 		HeapTupleClearHotUpdated(&oldtup);
3714 		/* ... and store info about transaction updating this tuple */
3715 		Assert(TransactionIdIsValid(xmax_lock_old_tuple));
3716 		HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
3717 		oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
3718 		oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
3719 		HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3720 
3721 		/* temporarily make it look not-updated, but locked */
3722 		oldtup.t_data->t_ctid = oldtup.t_self;
3723 
3724 		/*
3725 		 * Clear all-frozen bit on visibility map if needed. We could
3726 		 * immediately reset ALL_VISIBLE, but given that the WAL logging
3727 		 * overhead would be unchanged, that doesn't seem necessarily
3728 		 * worthwhile.
3729 		 */
3730 		if (PageIsAllVisible(page) &&
3731 			visibilitymap_clear(relation, block, vmbuffer,
3732 								VISIBILITYMAP_ALL_FROZEN))
3733 			cleared_all_frozen = true;
3734 
3735 		MarkBufferDirty(buffer);
3736 
3737 		if (RelationNeedsWAL(relation))
3738 		{
3739 			xl_heap_lock xlrec;
3740 			XLogRecPtr	recptr;
3741 
3742 			XLogBeginInsert();
3743 			XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3744 
3745 			xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
3746 			xlrec.locking_xid = xmax_lock_old_tuple;
3747 			xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
3748 												  oldtup.t_data->t_infomask2);
3749 			xlrec.flags =
3750 				cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
3751 			XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
3752 			recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
3753 			PageSetLSN(page, recptr);
3754 		}
3755 
3756 		END_CRIT_SECTION();
3757 
3758 		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3759 
3760 		/*
3761 		 * Let the toaster do its thing, if needed.
3762 		 *
3763 		 * Note: below this point, heaptup is the data we actually intend to
3764 		 * store into the relation; newtup is the caller's original untoasted
3765 		 * data.
3766 		 */
3767 		if (need_toast)
3768 		{
3769 			/* Note we always use WAL and FSM during updates */
3770 			heaptup = heap_toast_insert_or_update(relation, newtup, &oldtup, 0);
3771 			newtupsize = MAXALIGN(heaptup->t_len);
3772 		}
3773 		else
3774 			heaptup = newtup;
3775 
3776 		/*
3777 		 * Now, do we need a new page for the tuple, or not?  This is a bit
3778 		 * tricky since someone else could have added tuples to the page while
3779 		 * we weren't looking.  We have to recheck the available space after
3780 		 * reacquiring the buffer lock.  But don't bother to do that if the
3781 		 * former amount of free space is still not enough; it's unlikely
3782 		 * there's more free now than before.
3783 		 *
3784 		 * What's more, if we need to get a new page, we will need to acquire
3785 		 * buffer locks on both old and new pages.  To avoid deadlock against
3786 		 * some other backend trying to get the same two locks in the other
3787 		 * order, we must be consistent about the order we get the locks in.
3788 		 * We use the rule "lock the lower-numbered page of the relation
3789 		 * first".  To implement this, we must do RelationGetBufferForTuple
3790 		 * while not holding the lock on the old page, and we must rely on it
3791 		 * to get the locks on both pages in the correct order.
3792 		 *
3793 		 * Another consideration is that we need visibility map page pin(s) if
3794 		 * we will have to clear the all-visible flag on either page.  If we
3795 		 * call RelationGetBufferForTuple, we rely on it to acquire any such
3796 		 * pins; but if we don't, we have to handle that here.  Hence we need
3797 		 * a loop.
3798 		 */
3799 		for (;;)
3800 		{
3801 			if (newtupsize > pagefree)
3802 			{
3803 				/* It doesn't fit, must use RelationGetBufferForTuple. */
3804 				newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
3805 												   buffer, 0, NULL,
3806 												   &vmbuffer_new, &vmbuffer);
3807 				/* We're all done. */
3808 				break;
3809 			}
3810 			/* Acquire VM page pin if needed and we don't have it. */
3811 			if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3812 				visibilitymap_pin(relation, block, &vmbuffer);
3813 			/* Re-acquire the lock on the old tuple's page. */
3814 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3815 			/* Re-check using the up-to-date free space */
3816 			pagefree = PageGetHeapFreeSpace(page);
3817 			if (newtupsize > pagefree ||
3818 				(vmbuffer == InvalidBuffer && PageIsAllVisible(page)))
3819 			{
3820 				/*
3821 				 * Rats, it doesn't fit anymore, or somebody just now set the
3822 				 * all-visible flag.  We must now unlock and loop to avoid
3823 				 * deadlock.  Fortunately, this path should seldom be taken.
3824 				 */
3825 				LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3826 			}
3827 			else
3828 			{
3829 				/* We're all done. */
3830 				newbuf = buffer;
3831 				break;
3832 			}
3833 		}
3834 	}
3835 	else
3836 	{
3837 		/* No TOAST work needed, and it'll fit on same page */
3838 		newbuf = buffer;
3839 		heaptup = newtup;
3840 	}
3841 
3842 	/*
3843 	 * We're about to do the actual update -- check for conflict first, to
3844 	 * avoid possibly having to roll back work we've just done.
3845 	 *
3846 	 * This is safe without a recheck as long as there is no possibility of
3847 	 * another process scanning the pages between this check and the update
3848 	 * being visible to the scan (i.e., exclusive buffer content lock(s) are
3849 	 * continuously held from this point until the tuple update is visible).
3850 	 *
3851 	 * For the new tuple the only check needed is at the relation level, but
3852 	 * since both tuples are in the same relation and the check for oldtup
3853 	 * will include checking the relation level, there is no benefit to a
3854 	 * separate check for the new tuple.
3855 	 */
3856 	CheckForSerializableConflictIn(relation, &oldtup.t_self,
3857 								   BufferGetBlockNumber(buffer));
3858 
3859 	/*
3860 	 * At this point newbuf and buffer are both pinned and locked, and newbuf
3861 	 * has enough space for the new tuple.  If they are the same buffer, only
3862 	 * one pin is held.
3863 	 */
3864 
3865 	if (newbuf == buffer)
3866 	{
3867 		/*
3868 		 * Since the new tuple is going into the same page, we might be able
3869 		 * to do a HOT update.  Check if any of the index columns have been
3870 		 * changed. If the page was already full, we may have skipped checking
3871 		 * for index columns, and also can't do a HOT update.
3872 		 */
3873 		if (hot_attrs_checked && !bms_overlap(modified_attrs, hot_attrs))
3874 			use_hot_update = true;
3875 	}
3876 	else
3877 	{
3878 		/* Set a hint that the old page could use prune/defrag */
3879 		PageSetFull(page);
3880 	}
3881 
3882 	/*
3883 	 * Compute replica identity tuple before entering the critical section so
3884 	 * we don't PANIC upon a memory allocation failure.
3885 	 * ExtractReplicaIdentity() will return NULL if nothing needs to be
3886 	 * logged.
3887 	 */
3888 	old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
3889 										   bms_overlap(modified_attrs, id_attrs),
3890 										   &old_key_copied);
3891 
3892 	/* NO EREPORT(ERROR) from here till changes are logged */
3893 	START_CRIT_SECTION();
3894 
3895 	/*
3896 	 * If this transaction commits, the old tuple will become DEAD sooner or
3897 	 * later.  Set flag that this page is a candidate for pruning once our xid
3898 	 * falls below the OldestXmin horizon.  If the transaction finally aborts,
3899 	 * the subsequent page pruning will be a no-op and the hint will be
3900 	 * cleared.
3901 	 *
3902 	 * XXX Should we set hint on newbuf as well?  If the transaction aborts,
3903 	 * there would be a prunable tuple in the newbuf; but for now we choose
3904 	 * not to optimize for aborts.  Note that heap_xlog_update must be kept in
3905 	 * sync if this decision changes.
3906 	 */
3907 	PageSetPrunable(page, xid);
3908 
3909 	if (use_hot_update)
3910 	{
3911 		/* Mark the old tuple as HOT-updated */
3912 		HeapTupleSetHotUpdated(&oldtup);
3913 		/* And mark the new tuple as heap-only */
3914 		HeapTupleSetHeapOnly(heaptup);
3915 		/* Mark the caller's copy too, in case different from heaptup */
3916 		HeapTupleSetHeapOnly(newtup);
3917 	}
3918 	else
3919 	{
3920 		/* Make sure tuples are correctly marked as not-HOT */
3921 		HeapTupleClearHotUpdated(&oldtup);
3922 		HeapTupleClearHeapOnly(heaptup);
3923 		HeapTupleClearHeapOnly(newtup);
3924 	}
3925 
3926 	RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
3927 
3928 
3929 	/* Clear obsolete visibility flags, possibly set by ourselves above... */
3930 	oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3931 	oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3932 	/* ... and store info about transaction updating this tuple */
3933 	Assert(TransactionIdIsValid(xmax_old_tuple));
3934 	HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
3935 	oldtup.t_data->t_infomask |= infomask_old_tuple;
3936 	oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
3937 	HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3938 
3939 	/* record address of new tuple in t_ctid of old one */
3940 	oldtup.t_data->t_ctid = heaptup->t_self;
3941 
3942 	/* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
3943 	if (PageIsAllVisible(BufferGetPage(buffer)))
3944 	{
3945 		all_visible_cleared = true;
3946 		PageClearAllVisible(BufferGetPage(buffer));
3947 		visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3948 							vmbuffer, VISIBILITYMAP_VALID_BITS);
3949 	}
3950 	if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
3951 	{
3952 		all_visible_cleared_new = true;
3953 		PageClearAllVisible(BufferGetPage(newbuf));
3954 		visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
3955 							vmbuffer_new, VISIBILITYMAP_VALID_BITS);
3956 	}
3957 
3958 	if (newbuf != buffer)
3959 		MarkBufferDirty(newbuf);
3960 	MarkBufferDirty(buffer);
3961 
3962 	/* XLOG stuff */
3963 	if (RelationNeedsWAL(relation))
3964 	{
3965 		XLogRecPtr	recptr;
3966 
3967 		/*
3968 		 * For logical decoding we need combo CIDs to properly decode the
3969 		 * catalog.
3970 		 */
3971 		if (RelationIsAccessibleInLogicalDecoding(relation))
3972 		{
3973 			log_heap_new_cid(relation, &oldtup);
3974 			log_heap_new_cid(relation, heaptup);
3975 		}
3976 
3977 		recptr = log_heap_update(relation, buffer,
3978 								 newbuf, &oldtup, heaptup,
3979 								 old_key_tuple,
3980 								 all_visible_cleared,
3981 								 all_visible_cleared_new);
3982 		if (newbuf != buffer)
3983 		{
3984 			PageSetLSN(BufferGetPage(newbuf), recptr);
3985 		}
3986 		PageSetLSN(BufferGetPage(buffer), recptr);
3987 	}
3988 
3989 	END_CRIT_SECTION();
3990 
3991 	if (newbuf != buffer)
3992 		LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
3993 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3994 
3995 	/*
3996 	 * Mark old tuple for invalidation from system caches at next command
3997 	 * boundary, and mark the new tuple for invalidation in case we abort. We
3998 	 * have to do this before releasing the buffer because oldtup is in the
3999 	 * buffer.  (heaptup is all in local memory, but it's necessary to process
4000 	 * both tuple versions in one call to inval.c so we can avoid redundant
4001 	 * sinval messages.)
4002 	 */
4003 	CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
4004 
4005 	/* Now we can release the buffer(s) */
4006 	if (newbuf != buffer)
4007 		ReleaseBuffer(newbuf);
4008 	ReleaseBuffer(buffer);
4009 	if (BufferIsValid(vmbuffer_new))
4010 		ReleaseBuffer(vmbuffer_new);
4011 	if (BufferIsValid(vmbuffer))
4012 		ReleaseBuffer(vmbuffer);
4013 
4014 	/*
4015 	 * Release the lmgr tuple lock, if we had it.
4016 	 */
4017 	if (have_tuple_lock)
4018 		UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
4019 
4020 	pgstat_count_heap_update(relation, use_hot_update);
4021 
4022 	/*
4023 	 * If heaptup is a private copy, release it.  Don't forget to copy t_self
4024 	 * back to the caller's image, too.
4025 	 */
4026 	if (heaptup != newtup)
4027 	{
4028 		newtup->t_self = heaptup->t_self;
4029 		heap_freetuple(heaptup);
4030 	}
4031 
4032 	if (old_key_tuple != NULL && old_key_copied)
4033 		heap_freetuple(old_key_tuple);
4034 
4035 	bms_free(hot_attrs);
4036 	bms_free(key_attrs);
4037 	bms_free(id_attrs);
4038 	bms_free(modified_attrs);
4039 	bms_free(interesting_attrs);
4040 
4041 	return TM_Ok;
4042 }
4043 
4044 /*
4045  * Check if the specified attribute's value is same in both given tuples.
4046  * Subroutine for HeapDetermineModifiedColumns.
4047  */
4048 static bool
heap_tuple_attr_equals(TupleDesc tupdesc,int attrnum,HeapTuple tup1,HeapTuple tup2)4049 heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
4050 					   HeapTuple tup1, HeapTuple tup2)
4051 {
4052 	Datum		value1,
4053 				value2;
4054 	bool		isnull1,
4055 				isnull2;
4056 	Form_pg_attribute att;
4057 
4058 	/*
4059 	 * If it's a whole-tuple reference, say "not equal".  It's not really
4060 	 * worth supporting this case, since it could only succeed after a no-op
4061 	 * update, which is hardly a case worth optimizing for.
4062 	 */
4063 	if (attrnum == 0)
4064 		return false;
4065 
4066 	/*
4067 	 * Likewise, automatically say "not equal" for any system attribute other
4068 	 * than tableOID; we cannot expect these to be consistent in a HOT chain,
4069 	 * or even to be set correctly yet in the new tuple.
4070 	 */
4071 	if (attrnum < 0)
4072 	{
4073 		if (attrnum != TableOidAttributeNumber)
4074 			return false;
4075 	}
4076 
4077 	/*
4078 	 * Extract the corresponding values.  XXX this is pretty inefficient if
4079 	 * there are many indexed columns.  Should HeapDetermineModifiedColumns do
4080 	 * a single heap_deform_tuple call on each tuple, instead?	But that
4081 	 * doesn't work for system columns ...
4082 	 */
4083 	value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
4084 	value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);
4085 
4086 	/*
4087 	 * If one value is NULL and other is not, then they are certainly not
4088 	 * equal
4089 	 */
4090 	if (isnull1 != isnull2)
4091 		return false;
4092 
4093 	/*
4094 	 * If both are NULL, they can be considered equal.
4095 	 */
4096 	if (isnull1)
4097 		return true;
4098 
4099 	/*
4100 	 * We do simple binary comparison of the two datums.  This may be overly
4101 	 * strict because there can be multiple binary representations for the
4102 	 * same logical value.  But we should be OK as long as there are no false
4103 	 * positives.  Using a type-specific equality operator is messy because
4104 	 * there could be multiple notions of equality in different operator
4105 	 * classes; furthermore, we cannot safely invoke user-defined functions
4106 	 * while holding exclusive buffer lock.
4107 	 */
4108 	if (attrnum <= 0)
4109 	{
4110 		/* The only allowed system columns are OIDs, so do this */
4111 		return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
4112 	}
4113 	else
4114 	{
4115 		Assert(attrnum <= tupdesc->natts);
4116 		att = TupleDescAttr(tupdesc, attrnum - 1);
4117 		return datumIsEqual(value1, value2, att->attbyval, att->attlen);
4118 	}
4119 }
4120 
4121 /*
4122  * Check which columns are being updated.
4123  *
4124  * Given an updated tuple, determine (and return into the output bitmapset),
4125  * from those listed as interesting, the set of columns that changed.
4126  *
4127  * The input bitmapset is destructively modified; that is OK since this is
4128  * invoked at most once in heap_update.
4129  */
4130 static Bitmapset *
HeapDetermineModifiedColumns(Relation relation,Bitmapset * interesting_cols,HeapTuple oldtup,HeapTuple newtup)4131 HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols,
4132 							 HeapTuple oldtup, HeapTuple newtup)
4133 {
4134 	int			attnum;
4135 	Bitmapset  *modified = NULL;
4136 
4137 	while ((attnum = bms_first_member(interesting_cols)) >= 0)
4138 	{
4139 		attnum += FirstLowInvalidHeapAttributeNumber;
4140 
4141 		if (!heap_tuple_attr_equals(RelationGetDescr(relation),
4142 									attnum, oldtup, newtup))
4143 			modified = bms_add_member(modified,
4144 									  attnum - FirstLowInvalidHeapAttributeNumber);
4145 	}
4146 
4147 	return modified;
4148 }
4149 
4150 /*
4151  *	simple_heap_update - replace a tuple
4152  *
4153  * This routine may be used to update a tuple when concurrent updates of
4154  * the target tuple are not expected (for example, because we have a lock
4155  * on the relation associated with the tuple).  Any failure is reported
4156  * via ereport().
4157  */
4158 void
simple_heap_update(Relation relation,ItemPointer otid,HeapTuple tup)4159 simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
4160 {
4161 	TM_Result	result;
4162 	TM_FailureData tmfd;
4163 	LockTupleMode lockmode;
4164 
4165 	result = heap_update(relation, otid, tup,
4166 						 GetCurrentCommandId(true), InvalidSnapshot,
4167 						 true /* wait for commit */ ,
4168 						 &tmfd, &lockmode);
4169 	switch (result)
4170 	{
4171 		case TM_SelfModified:
4172 			/* Tuple was already updated in current command? */
4173 			elog(ERROR, "tuple already updated by self");
4174 			break;
4175 
4176 		case TM_Ok:
4177 			/* done successfully */
4178 			break;
4179 
4180 		case TM_Updated:
4181 			elog(ERROR, "tuple concurrently updated");
4182 			break;
4183 
4184 		case TM_Deleted:
4185 			elog(ERROR, "tuple concurrently deleted");
4186 			break;
4187 
4188 		default:
4189 			elog(ERROR, "unrecognized heap_update status: %u", result);
4190 			break;
4191 	}
4192 }
4193 
4194 
4195 /*
4196  * Return the MultiXactStatus corresponding to the given tuple lock mode.
4197  */
4198 static MultiXactStatus
get_mxact_status_for_lock(LockTupleMode mode,bool is_update)4199 get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
4200 {
4201 	int			retval;
4202 
4203 	if (is_update)
4204 		retval = tupleLockExtraInfo[mode].updstatus;
4205 	else
4206 		retval = tupleLockExtraInfo[mode].lockstatus;
4207 
4208 	if (retval == -1)
4209 		elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4210 			 is_update ? "true" : "false");
4211 
4212 	return (MultiXactStatus) retval;
4213 }
4214 
4215 /*
4216  *	heap_lock_tuple - lock a tuple in shared or exclusive mode
4217  *
4218  * Note that this acquires a buffer pin, which the caller must release.
4219  *
4220  * Input parameters:
4221  *	relation: relation containing tuple (caller must hold suitable lock)
4222  *	tid: TID of tuple to lock
4223  *	cid: current command ID (used for visibility test, and stored into
4224  *		tuple's cmax if lock is successful)
4225  *	mode: indicates if shared or exclusive tuple lock is desired
4226  *	wait_policy: what to do if tuple lock is not available
4227  *	follow_updates: if true, follow the update chain to also lock descendant
4228  *		tuples.
4229  *
4230  * Output parameters:
4231  *	*tuple: all fields filled in
4232  *	*buffer: set to buffer holding tuple (pinned but not locked at exit)
4233  *	*tmfd: filled in failure cases (see below)
4234  *
4235  * Function results are the same as the ones for table_tuple_lock().
4236  *
4237  * In the failure cases other than TM_Invisible, the routine fills
4238  * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
4239  * if necessary), and t_cmax (the last only for TM_SelfModified,
4240  * since we cannot obtain cmax from a combo CID generated by another
4241  * transaction).
4242  * See comments for struct TM_FailureData for additional info.
4243  *
4244  * See README.tuplock for a thorough explanation of this mechanism.
4245  */
4246 TM_Result
heap_lock_tuple(Relation relation,HeapTuple tuple,CommandId cid,LockTupleMode mode,LockWaitPolicy wait_policy,bool follow_updates,Buffer * buffer,TM_FailureData * tmfd)4247 heap_lock_tuple(Relation relation, HeapTuple tuple,
4248 				CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
4249 				bool follow_updates,
4250 				Buffer *buffer, TM_FailureData *tmfd)
4251 {
4252 	TM_Result	result;
4253 	ItemPointer tid = &(tuple->t_self);
4254 	ItemId		lp;
4255 	Page		page;
4256 	Buffer		vmbuffer = InvalidBuffer;
4257 	BlockNumber block;
4258 	TransactionId xid,
4259 				xmax;
4260 	uint16		old_infomask,
4261 				new_infomask,
4262 				new_infomask2;
4263 	bool		first_time = true;
4264 	bool		skip_tuple_lock = false;
4265 	bool		have_tuple_lock = false;
4266 	bool		cleared_all_frozen = false;
4267 
4268 	*buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4269 	block = ItemPointerGetBlockNumber(tid);
4270 
4271 	/*
4272 	 * Before locking the buffer, pin the visibility map page if it appears to
4273 	 * be necessary.  Since we haven't got the lock yet, someone else might be
4274 	 * in the middle of changing this, so we'll need to recheck after we have
4275 	 * the lock.
4276 	 */
4277 	if (PageIsAllVisible(BufferGetPage(*buffer)))
4278 		visibilitymap_pin(relation, block, &vmbuffer);
4279 
4280 	LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4281 
4282 	page = BufferGetPage(*buffer);
4283 	lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
4284 	Assert(ItemIdIsNormal(lp));
4285 
4286 	tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4287 	tuple->t_len = ItemIdGetLength(lp);
4288 	tuple->t_tableOid = RelationGetRelid(relation);
4289 
4290 l3:
4291 	result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4292 
4293 	if (result == TM_Invisible)
4294 	{
4295 		/*
4296 		 * This is possible, but only when locking a tuple for ON CONFLICT
4297 		 * UPDATE.  We return this value here rather than throwing an error in
4298 		 * order to give that case the opportunity to throw a more specific
4299 		 * error.
4300 		 */
4301 		result = TM_Invisible;
4302 		goto out_locked;
4303 	}
4304 	else if (result == TM_BeingModified ||
4305 			 result == TM_Updated ||
4306 			 result == TM_Deleted)
4307 	{
4308 		TransactionId xwait;
4309 		uint16		infomask;
4310 		uint16		infomask2;
4311 		bool		require_sleep;
4312 		ItemPointerData t_ctid;
4313 
4314 		/* must copy state data before unlocking buffer */
4315 		xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
4316 		infomask = tuple->t_data->t_infomask;
4317 		infomask2 = tuple->t_data->t_infomask2;
4318 		ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4319 
4320 		LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4321 
4322 		/*
4323 		 * If any subtransaction of the current top transaction already holds
4324 		 * a lock as strong as or stronger than what we're requesting, we
4325 		 * effectively hold the desired lock already.  We *must* succeed
4326 		 * without trying to take the tuple lock, else we will deadlock
4327 		 * against anyone wanting to acquire a stronger lock.
4328 		 *
4329 		 * Note we only do this the first time we loop on the HTSU result;
4330 		 * there is no point in testing in subsequent passes, because
4331 		 * evidently our own transaction cannot have acquired a new lock after
4332 		 * the first time we checked.
4333 		 */
4334 		if (first_time)
4335 		{
4336 			first_time = false;
4337 
4338 			if (infomask & HEAP_XMAX_IS_MULTI)
4339 			{
4340 				int			i;
4341 				int			nmembers;
4342 				MultiXactMember *members;
4343 
4344 				/*
4345 				 * We don't need to allow old multixacts here; if that had
4346 				 * been the case, HeapTupleSatisfiesUpdate would have returned
4347 				 * MayBeUpdated and we wouldn't be here.
4348 				 */
4349 				nmembers =
4350 					GetMultiXactIdMembers(xwait, &members, false,
4351 										  HEAP_XMAX_IS_LOCKED_ONLY(infomask));
4352 
4353 				for (i = 0; i < nmembers; i++)
4354 				{
4355 					/* only consider members of our own transaction */
4356 					if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4357 						continue;
4358 
4359 					if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4360 					{
4361 						pfree(members);
4362 						result = TM_Ok;
4363 						goto out_unlocked;
4364 					}
4365 					else
4366 					{
4367 						/*
4368 						 * Disable acquisition of the heavyweight tuple lock.
4369 						 * Otherwise, when promoting a weaker lock, we might
4370 						 * deadlock with another locker that has acquired the
4371 						 * heavyweight tuple lock and is waiting for our
4372 						 * transaction to finish.
4373 						 *
4374 						 * Note that in this case we still need to wait for
4375 						 * the multixact if required, to avoid acquiring
4376 						 * conflicting locks.
4377 						 */
4378 						skip_tuple_lock = true;
4379 					}
4380 				}
4381 
4382 				if (members)
4383 					pfree(members);
4384 			}
4385 			else if (TransactionIdIsCurrentTransactionId(xwait))
4386 			{
4387 				switch (mode)
4388 				{
4389 					case LockTupleKeyShare:
4390 						Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
4391 							   HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4392 							   HEAP_XMAX_IS_EXCL_LOCKED(infomask));
4393 						result = TM_Ok;
4394 						goto out_unlocked;
4395 					case LockTupleShare:
4396 						if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4397 							HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4398 						{
4399 							result = TM_Ok;
4400 							goto out_unlocked;
4401 						}
4402 						break;
4403 					case LockTupleNoKeyExclusive:
4404 						if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4405 						{
4406 							result = TM_Ok;
4407 							goto out_unlocked;
4408 						}
4409 						break;
4410 					case LockTupleExclusive:
4411 						if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
4412 							infomask2 & HEAP_KEYS_UPDATED)
4413 						{
4414 							result = TM_Ok;
4415 							goto out_unlocked;
4416 						}
4417 						break;
4418 				}
4419 			}
4420 		}
4421 
4422 		/*
4423 		 * Initially assume that we will have to wait for the locking
4424 		 * transaction(s) to finish.  We check various cases below in which
4425 		 * this can be turned off.
4426 		 */
4427 		require_sleep = true;
4428 		if (mode == LockTupleKeyShare)
4429 		{
4430 			/*
4431 			 * If we're requesting KeyShare, and there's no update present, we
4432 			 * don't need to wait.  Even if there is an update, we can still
4433 			 * continue if the key hasn't been modified.
4434 			 *
4435 			 * However, if there are updates, we need to walk the update chain
4436 			 * to mark future versions of the row as locked, too.  That way,
4437 			 * if somebody deletes that future version, we're protected
4438 			 * against the key going away.  This locking of future versions
4439 			 * could block momentarily, if a concurrent transaction is
4440 			 * deleting a key; or it could return a value to the effect that
4441 			 * the transaction deleting the key has already committed.  So we
4442 			 * do this before re-locking the buffer; otherwise this would be
4443 			 * prone to deadlocks.
4444 			 *
4445 			 * Note that the TID we're locking was grabbed before we unlocked
4446 			 * the buffer.  For it to change while we're not looking, the
4447 			 * other properties we're testing for below after re-locking the
4448 			 * buffer would also change, in which case we would restart this
4449 			 * loop above.
4450 			 */
4451 			if (!(infomask2 & HEAP_KEYS_UPDATED))
4452 			{
4453 				bool		updated;
4454 
4455 				updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
4456 
4457 				/*
4458 				 * If there are updates, follow the update chain; bail out if
4459 				 * that cannot be done.
4460 				 */
4461 				if (follow_updates && updated)
4462 				{
4463 					TM_Result	res;
4464 
4465 					res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4466 												  GetCurrentTransactionId(),
4467 												  mode);
4468 					if (res != TM_Ok)
4469 					{
4470 						result = res;
4471 						/* recovery code expects to have buffer lock held */
4472 						LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4473 						goto failed;
4474 					}
4475 				}
4476 
4477 				LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4478 
4479 				/*
4480 				 * Make sure it's still an appropriate lock, else start over.
4481 				 * Also, if it wasn't updated before we released the lock, but
4482 				 * is updated now, we start over too; the reason is that we
4483 				 * now need to follow the update chain to lock the new
4484 				 * versions.
4485 				 */
4486 				if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4487 					((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4488 					 !updated))
4489 					goto l3;
4490 
4491 				/* Things look okay, so we can skip sleeping */
4492 				require_sleep = false;
4493 
4494 				/*
4495 				 * Note we allow Xmax to change here; other updaters/lockers
4496 				 * could have modified it before we grabbed the buffer lock.
4497 				 * However, this is not a problem, because with the recheck we
4498 				 * just did we ensure that they still don't conflict with the
4499 				 * lock we want.
4500 				 */
4501 			}
4502 		}
4503 		else if (mode == LockTupleShare)
4504 		{
4505 			/*
4506 			 * If we're requesting Share, we can similarly avoid sleeping if
4507 			 * there's no update and no exclusive lock present.
4508 			 */
4509 			if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
4510 				!HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4511 			{
4512 				LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4513 
4514 				/*
4515 				 * Make sure it's still an appropriate lock, else start over.
4516 				 * See above about allowing xmax to change.
4517 				 */
4518 				if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4519 					HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
4520 					goto l3;
4521 				require_sleep = false;
4522 			}
4523 		}
4524 		else if (mode == LockTupleNoKeyExclusive)
4525 		{
4526 			/*
4527 			 * If we're requesting NoKeyExclusive, we might also be able to
4528 			 * avoid sleeping; just ensure that there no conflicting lock
4529 			 * already acquired.
4530 			 */
4531 			if (infomask & HEAP_XMAX_IS_MULTI)
4532 			{
4533 				if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
4534 											 mode, NULL))
4535 				{
4536 					/*
4537 					 * No conflict, but if the xmax changed under us in the
4538 					 * meantime, start over.
4539 					 */
4540 					LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4541 					if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4542 						!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4543 											 xwait))
4544 						goto l3;
4545 
4546 					/* otherwise, we're good */
4547 					require_sleep = false;
4548 				}
4549 			}
4550 			else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
4551 			{
4552 				LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4553 
4554 				/* if the xmax changed in the meantime, start over */
4555 				if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4556 					!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4557 										 xwait))
4558 					goto l3;
4559 				/* otherwise, we're good */
4560 				require_sleep = false;
4561 			}
4562 		}
4563 
4564 		/*
4565 		 * As a check independent from those above, we can also avoid sleeping
4566 		 * if the current transaction is the sole locker of the tuple.  Note
4567 		 * that the strength of the lock already held is irrelevant; this is
4568 		 * not about recording the lock in Xmax (which will be done regardless
4569 		 * of this optimization, below).  Also, note that the cases where we
4570 		 * hold a lock stronger than we are requesting are already handled
4571 		 * above by not doing anything.
4572 		 *
4573 		 * Note we only deal with the non-multixact case here; MultiXactIdWait
4574 		 * is well equipped to deal with this situation on its own.
4575 		 */
4576 		if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
4577 			TransactionIdIsCurrentTransactionId(xwait))
4578 		{
4579 			/* ... but if the xmax changed in the meantime, start over */
4580 			LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4581 			if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4582 				!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4583 									 xwait))
4584 				goto l3;
4585 			Assert(HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask));
4586 			require_sleep = false;
4587 		}
4588 
4589 		/*
4590 		 * Time to sleep on the other transaction/multixact, if necessary.
4591 		 *
4592 		 * If the other transaction is an update/delete that's already
4593 		 * committed, then sleeping cannot possibly do any good: if we're
4594 		 * required to sleep, get out to raise an error instead.
4595 		 *
4596 		 * By here, we either have already acquired the buffer exclusive lock,
4597 		 * or we must wait for the locking transaction or multixact; so below
4598 		 * we ensure that we grab buffer lock after the sleep.
4599 		 */
4600 		if (require_sleep && (result == TM_Updated || result == TM_Deleted))
4601 		{
4602 			LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4603 			goto failed;
4604 		}
4605 		else if (require_sleep)
4606 		{
4607 			/*
4608 			 * Acquire tuple lock to establish our priority for the tuple, or
4609 			 * die trying.  LockTuple will release us when we are next-in-line
4610 			 * for the tuple.  We must do this even if we are share-locking,
4611 			 * but not if we already have a weaker lock on the tuple.
4612 			 *
4613 			 * If we are forced to "start over" below, we keep the tuple lock;
4614 			 * this arranges that we stay at the head of the line while
4615 			 * rechecking tuple state.
4616 			 */
4617 			if (!skip_tuple_lock &&
4618 				!heap_acquire_tuplock(relation, tid, mode, wait_policy,
4619 									  &have_tuple_lock))
4620 			{
4621 				/*
4622 				 * This can only happen if wait_policy is Skip and the lock
4623 				 * couldn't be obtained.
4624 				 */
4625 				result = TM_WouldBlock;
4626 				/* recovery code expects to have buffer lock held */
4627 				LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4628 				goto failed;
4629 			}
4630 
4631 			if (infomask & HEAP_XMAX_IS_MULTI)
4632 			{
4633 				MultiXactStatus status = get_mxact_status_for_lock(mode, false);
4634 
4635 				/* We only ever lock tuples, never update them */
4636 				if (status >= MultiXactStatusNoKeyUpdate)
4637 					elog(ERROR, "invalid lock mode in heap_lock_tuple");
4638 
4639 				/* wait for multixact to end, or die trying  */
4640 				switch (wait_policy)
4641 				{
4642 					case LockWaitBlock:
4643 						MultiXactIdWait((MultiXactId) xwait, status, infomask,
4644 										relation, &tuple->t_self, XLTW_Lock, NULL);
4645 						break;
4646 					case LockWaitSkip:
4647 						if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4648 														status, infomask, relation,
4649 														NULL))
4650 						{
4651 							result = TM_WouldBlock;
4652 							/* recovery code expects to have buffer lock held */
4653 							LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4654 							goto failed;
4655 						}
4656 						break;
4657 					case LockWaitError:
4658 						if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4659 														status, infomask, relation,
4660 														NULL))
4661 							ereport(ERROR,
4662 									(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4663 									 errmsg("could not obtain lock on row in relation \"%s\"",
4664 											RelationGetRelationName(relation))));
4665 
4666 						break;
4667 				}
4668 
4669 				/*
4670 				 * Of course, the multixact might not be done here: if we're
4671 				 * requesting a light lock mode, other transactions with light
4672 				 * locks could still be alive, as well as locks owned by our
4673 				 * own xact or other subxacts of this backend.  We need to
4674 				 * preserve the surviving MultiXact members.  Note that it
4675 				 * isn't absolutely necessary in the latter case, but doing so
4676 				 * is simpler.
4677 				 */
4678 			}
4679 			else
4680 			{
4681 				/* wait for regular transaction to end, or die trying */
4682 				switch (wait_policy)
4683 				{
4684 					case LockWaitBlock:
4685 						XactLockTableWait(xwait, relation, &tuple->t_self,
4686 										  XLTW_Lock);
4687 						break;
4688 					case LockWaitSkip:
4689 						if (!ConditionalXactLockTableWait(xwait))
4690 						{
4691 							result = TM_WouldBlock;
4692 							/* recovery code expects to have buffer lock held */
4693 							LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4694 							goto failed;
4695 						}
4696 						break;
4697 					case LockWaitError:
4698 						if (!ConditionalXactLockTableWait(xwait))
4699 							ereport(ERROR,
4700 									(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4701 									 errmsg("could not obtain lock on row in relation \"%s\"",
4702 											RelationGetRelationName(relation))));
4703 						break;
4704 				}
4705 			}
4706 
4707 			/* if there are updates, follow the update chain */
4708 			if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
4709 			{
4710 				TM_Result	res;
4711 
4712 				res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4713 											  GetCurrentTransactionId(),
4714 											  mode);
4715 				if (res != TM_Ok)
4716 				{
4717 					result = res;
4718 					/* recovery code expects to have buffer lock held */
4719 					LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4720 					goto failed;
4721 				}
4722 			}
4723 
4724 			LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4725 
4726 			/*
4727 			 * xwait is done, but if xwait had just locked the tuple then some
4728 			 * other xact could update this tuple before we get to this point.
4729 			 * Check for xmax change, and start over if so.
4730 			 */
4731 			if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4732 				!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4733 									 xwait))
4734 				goto l3;
4735 
4736 			if (!(infomask & HEAP_XMAX_IS_MULTI))
4737 			{
4738 				/*
4739 				 * Otherwise check if it committed or aborted.  Note we cannot
4740 				 * be here if the tuple was only locked by somebody who didn't
4741 				 * conflict with us; that would have been handled above.  So
4742 				 * that transaction must necessarily be gone by now.  But
4743 				 * don't check for this in the multixact case, because some
4744 				 * locker transactions might still be running.
4745 				 */
4746 				UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
4747 			}
4748 		}
4749 
4750 		/* By here, we're certain that we hold buffer exclusive lock again */
4751 
4752 		/*
4753 		 * We may lock if previous xmax aborted, or if it committed but only
4754 		 * locked the tuple without updating it; or if we didn't have to wait
4755 		 * at all for whatever reason.
4756 		 */
4757 		if (!require_sleep ||
4758 			(tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
4759 			HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4760 			HeapTupleHeaderIsOnlyLocked(tuple->t_data))
4761 			result = TM_Ok;
4762 		else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
4763 			result = TM_Updated;
4764 		else
4765 			result = TM_Deleted;
4766 	}
4767 
4768 failed:
4769 	if (result != TM_Ok)
4770 	{
4771 		Assert(result == TM_SelfModified || result == TM_Updated ||
4772 			   result == TM_Deleted || result == TM_WouldBlock);
4773 		Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
4774 		Assert(result != TM_Updated ||
4775 			   !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
4776 		tmfd->ctid = tuple->t_data->t_ctid;
4777 		tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
4778 		if (result == TM_SelfModified)
4779 			tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
4780 		else
4781 			tmfd->cmax = InvalidCommandId;
4782 		goto out_locked;
4783 	}
4784 
4785 	/*
4786 	 * If we didn't pin the visibility map page and the page has become all
4787 	 * visible while we were busy locking the buffer, or during some
4788 	 * subsequent window during which we had it unlocked, we'll have to unlock
4789 	 * and re-lock, to avoid holding the buffer lock across I/O.  That's a bit
4790 	 * unfortunate, especially since we'll now have to recheck whether the
4791 	 * tuple has been locked or updated under us, but hopefully it won't
4792 	 * happen very often.
4793 	 */
4794 	if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
4795 	{
4796 		LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4797 		visibilitymap_pin(relation, block, &vmbuffer);
4798 		LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4799 		goto l3;
4800 	}
4801 
4802 	xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
4803 	old_infomask = tuple->t_data->t_infomask;
4804 
4805 	/*
4806 	 * If this is the first possibly-multixact-able operation in the current
4807 	 * transaction, set my per-backend OldestMemberMXactId setting. We can be
4808 	 * certain that the transaction will never become a member of any older
4809 	 * MultiXactIds than that.  (We have to do this even if we end up just
4810 	 * using our own TransactionId below, since some other backend could
4811 	 * incorporate our XID into a MultiXact immediately afterwards.)
4812 	 */
4813 	MultiXactIdSetOldestMember();
4814 
4815 	/*
4816 	 * Compute the new xmax and infomask to store into the tuple.  Note we do
4817 	 * not modify the tuple just yet, because that would leave it in the wrong
4818 	 * state if multixact.c elogs.
4819 	 */
4820 	compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
4821 							  GetCurrentTransactionId(), mode, false,
4822 							  &xid, &new_infomask, &new_infomask2);
4823 
4824 	START_CRIT_SECTION();
4825 
4826 	/*
4827 	 * Store transaction information of xact locking the tuple.
4828 	 *
4829 	 * Note: Cmax is meaningless in this context, so don't set it; this avoids
4830 	 * possibly generating a useless combo CID.  Moreover, if we're locking a
4831 	 * previously updated tuple, it's important to preserve the Cmax.
4832 	 *
4833 	 * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
4834 	 * we would break the HOT chain.
4835 	 */
4836 	tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
4837 	tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4838 	tuple->t_data->t_infomask |= new_infomask;
4839 	tuple->t_data->t_infomask2 |= new_infomask2;
4840 	if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
4841 		HeapTupleHeaderClearHotUpdated(tuple->t_data);
4842 	HeapTupleHeaderSetXmax(tuple->t_data, xid);
4843 
4844 	/*
4845 	 * Make sure there is no forward chain link in t_ctid.  Note that in the
4846 	 * cases where the tuple has been updated, we must not overwrite t_ctid,
4847 	 * because it was set by the updater.  Moreover, if the tuple has been
4848 	 * updated, we need to follow the update chain to lock the new versions of
4849 	 * the tuple as well.
4850 	 */
4851 	if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
4852 		tuple->t_data->t_ctid = *tid;
4853 
4854 	/* Clear only the all-frozen bit on visibility map if needed */
4855 	if (PageIsAllVisible(page) &&
4856 		visibilitymap_clear(relation, block, vmbuffer,
4857 							VISIBILITYMAP_ALL_FROZEN))
4858 		cleared_all_frozen = true;
4859 
4860 
4861 	MarkBufferDirty(*buffer);
4862 
4863 	/*
4864 	 * XLOG stuff.  You might think that we don't need an XLOG record because
4865 	 * there is no state change worth restoring after a crash.  You would be
4866 	 * wrong however: we have just written either a TransactionId or a
4867 	 * MultiXactId that may never have been seen on disk before, and we need
4868 	 * to make sure that there are XLOG entries covering those ID numbers.
4869 	 * Else the same IDs might be re-used after a crash, which would be
4870 	 * disastrous if this page made it to disk before the crash.  Essentially
4871 	 * we have to enforce the WAL log-before-data rule even in this case.
4872 	 * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
4873 	 * entries for everything anyway.)
4874 	 */
4875 	if (RelationNeedsWAL(relation))
4876 	{
4877 		xl_heap_lock xlrec;
4878 		XLogRecPtr	recptr;
4879 
4880 		XLogBeginInsert();
4881 		XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
4882 
4883 		xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
4884 		xlrec.locking_xid = xid;
4885 		xlrec.infobits_set = compute_infobits(new_infomask,
4886 											  tuple->t_data->t_infomask2);
4887 		xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
4888 		XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
4889 
4890 		/* we don't decode row locks atm, so no need to log the origin */
4891 
4892 		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
4893 
4894 		PageSetLSN(page, recptr);
4895 	}
4896 
4897 	END_CRIT_SECTION();
4898 
4899 	result = TM_Ok;
4900 
4901 out_locked:
4902 	LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4903 
4904 out_unlocked:
4905 	if (BufferIsValid(vmbuffer))
4906 		ReleaseBuffer(vmbuffer);
4907 
4908 	/*
4909 	 * Don't update the visibility map here. Locking a tuple doesn't change
4910 	 * visibility info.
4911 	 */
4912 
4913 	/*
4914 	 * Now that we have successfully marked the tuple as locked, we can
4915 	 * release the lmgr tuple lock, if we had it.
4916 	 */
4917 	if (have_tuple_lock)
4918 		UnlockTupleTuplock(relation, tid, mode);
4919 
4920 	return result;
4921 }
4922 
4923 /*
4924  * Acquire heavyweight lock on the given tuple, in preparation for acquiring
4925  * its normal, Xmax-based tuple lock.
4926  *
4927  * have_tuple_lock is an input and output parameter: on input, it indicates
4928  * whether the lock has previously been acquired (and this function does
4929  * nothing in that case).  If this function returns success, have_tuple_lock
4930  * has been flipped to true.
4931  *
4932  * Returns false if it was unable to obtain the lock; this can only happen if
4933  * wait_policy is Skip.
4934  */
4935 static bool
heap_acquire_tuplock(Relation relation,ItemPointer tid,LockTupleMode mode,LockWaitPolicy wait_policy,bool * have_tuple_lock)4936 heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
4937 					 LockWaitPolicy wait_policy, bool *have_tuple_lock)
4938 {
4939 	if (*have_tuple_lock)
4940 		return true;
4941 
4942 	switch (wait_policy)
4943 	{
4944 		case LockWaitBlock:
4945 			LockTupleTuplock(relation, tid, mode);
4946 			break;
4947 
4948 		case LockWaitSkip:
4949 			if (!ConditionalLockTupleTuplock(relation, tid, mode))
4950 				return false;
4951 			break;
4952 
4953 		case LockWaitError:
4954 			if (!ConditionalLockTupleTuplock(relation, tid, mode))
4955 				ereport(ERROR,
4956 						(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4957 						 errmsg("could not obtain lock on row in relation \"%s\"",
4958 								RelationGetRelationName(relation))));
4959 			break;
4960 	}
4961 	*have_tuple_lock = true;
4962 
4963 	return true;
4964 }
4965 
4966 /*
4967  * Given an original set of Xmax and infomask, and a transaction (identified by
4968  * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
4969  * corresponding infomasks to use on the tuple.
4970  *
4971  * Note that this might have side effects such as creating a new MultiXactId.
4972  *
4973  * Most callers will have called HeapTupleSatisfiesUpdate before this function;
4974  * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
4975  * but it was not running anymore. There is a race condition, which is that the
4976  * MultiXactId may have finished since then, but that uncommon case is handled
4977  * either here, or within MultiXactIdExpand.
4978  *
4979  * There is a similar race condition possible when the old xmax was a regular
4980  * TransactionId.  We test TransactionIdIsInProgress again just to narrow the
4981  * window, but it's still possible to end up creating an unnecessary
4982  * MultiXactId.  Fortunately this is harmless.
4983  */
4984 static void
compute_new_xmax_infomask(TransactionId xmax,uint16 old_infomask,uint16 old_infomask2,TransactionId add_to_xmax,LockTupleMode mode,bool is_update,TransactionId * result_xmax,uint16 * result_infomask,uint16 * result_infomask2)4985 compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
4986 						  uint16 old_infomask2, TransactionId add_to_xmax,
4987 						  LockTupleMode mode, bool is_update,
4988 						  TransactionId *result_xmax, uint16 *result_infomask,
4989 						  uint16 *result_infomask2)
4990 {
4991 	TransactionId new_xmax;
4992 	uint16		new_infomask,
4993 				new_infomask2;
4994 
4995 	Assert(TransactionIdIsCurrentTransactionId(add_to_xmax));
4996 
4997 l5:
4998 	new_infomask = 0;
4999 	new_infomask2 = 0;
5000 	if (old_infomask & HEAP_XMAX_INVALID)
5001 	{
5002 		/*
5003 		 * No previous locker; we just insert our own TransactionId.
5004 		 *
5005 		 * Note that it's critical that this case be the first one checked,
5006 		 * because there are several blocks below that come back to this one
5007 		 * to implement certain optimizations; old_infomask might contain
5008 		 * other dirty bits in those cases, but we don't really care.
5009 		 */
5010 		if (is_update)
5011 		{
5012 			new_xmax = add_to_xmax;
5013 			if (mode == LockTupleExclusive)
5014 				new_infomask2 |= HEAP_KEYS_UPDATED;
5015 		}
5016 		else
5017 		{
5018 			new_infomask |= HEAP_XMAX_LOCK_ONLY;
5019 			switch (mode)
5020 			{
5021 				case LockTupleKeyShare:
5022 					new_xmax = add_to_xmax;
5023 					new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
5024 					break;
5025 				case LockTupleShare:
5026 					new_xmax = add_to_xmax;
5027 					new_infomask |= HEAP_XMAX_SHR_LOCK;
5028 					break;
5029 				case LockTupleNoKeyExclusive:
5030 					new_xmax = add_to_xmax;
5031 					new_infomask |= HEAP_XMAX_EXCL_LOCK;
5032 					break;
5033 				case LockTupleExclusive:
5034 					new_xmax = add_to_xmax;
5035 					new_infomask |= HEAP_XMAX_EXCL_LOCK;
5036 					new_infomask2 |= HEAP_KEYS_UPDATED;
5037 					break;
5038 				default:
5039 					new_xmax = InvalidTransactionId;	/* silence compiler */
5040 					elog(ERROR, "invalid lock mode");
5041 			}
5042 		}
5043 	}
5044 	else if (old_infomask & HEAP_XMAX_IS_MULTI)
5045 	{
5046 		MultiXactStatus new_status;
5047 
5048 		/*
5049 		 * Currently we don't allow XMAX_COMMITTED to be set for multis, so
5050 		 * cross-check.
5051 		 */
5052 		Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
5053 
5054 		/*
5055 		 * A multixact together with LOCK_ONLY set but neither lock bit set
5056 		 * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
5057 		 * anymore.  This check is critical for databases upgraded by
5058 		 * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
5059 		 * that such multis are never passed.
5060 		 */
5061 		if (HEAP_LOCKED_UPGRADED(old_infomask))
5062 		{
5063 			old_infomask &= ~HEAP_XMAX_IS_MULTI;
5064 			old_infomask |= HEAP_XMAX_INVALID;
5065 			goto l5;
5066 		}
5067 
5068 		/*
5069 		 * If the XMAX is already a MultiXactId, then we need to expand it to
5070 		 * include add_to_xmax; but if all the members were lockers and are
5071 		 * all gone, we can do away with the IS_MULTI bit and just set
5072 		 * add_to_xmax as the only locker/updater.  If all lockers are gone
5073 		 * and we have an updater that aborted, we can also do without a
5074 		 * multi.
5075 		 *
5076 		 * The cost of doing GetMultiXactIdMembers would be paid by
5077 		 * MultiXactIdExpand if we weren't to do this, so this check is not
5078 		 * incurring extra work anyhow.
5079 		 */
5080 		if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
5081 		{
5082 			if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
5083 				!TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax,
5084 																old_infomask)))
5085 			{
5086 				/*
5087 				 * Reset these bits and restart; otherwise fall through to
5088 				 * create a new multi below.
5089 				 */
5090 				old_infomask &= ~HEAP_XMAX_IS_MULTI;
5091 				old_infomask |= HEAP_XMAX_INVALID;
5092 				goto l5;
5093 			}
5094 		}
5095 
5096 		new_status = get_mxact_status_for_lock(mode, is_update);
5097 
5098 		new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5099 									 new_status);
5100 		GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5101 	}
5102 	else if (old_infomask & HEAP_XMAX_COMMITTED)
5103 	{
5104 		/*
5105 		 * It's a committed update, so we need to preserve him as updater of
5106 		 * the tuple.
5107 		 */
5108 		MultiXactStatus status;
5109 		MultiXactStatus new_status;
5110 
5111 		if (old_infomask2 & HEAP_KEYS_UPDATED)
5112 			status = MultiXactStatusUpdate;
5113 		else
5114 			status = MultiXactStatusNoKeyUpdate;
5115 
5116 		new_status = get_mxact_status_for_lock(mode, is_update);
5117 
5118 		/*
5119 		 * since it's not running, it's obviously impossible for the old
5120 		 * updater to be identical to the current one, so we need not check
5121 		 * for that case as we do in the block above.
5122 		 */
5123 		new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5124 		GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5125 	}
5126 	else if (TransactionIdIsInProgress(xmax))
5127 	{
5128 		/*
5129 		 * If the XMAX is a valid, in-progress TransactionId, then we need to
5130 		 * create a new MultiXactId that includes both the old locker or
5131 		 * updater and our own TransactionId.
5132 		 */
5133 		MultiXactStatus new_status;
5134 		MultiXactStatus old_status;
5135 		LockTupleMode old_mode;
5136 
5137 		if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5138 		{
5139 			if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5140 				old_status = MultiXactStatusForKeyShare;
5141 			else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5142 				old_status = MultiXactStatusForShare;
5143 			else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5144 			{
5145 				if (old_infomask2 & HEAP_KEYS_UPDATED)
5146 					old_status = MultiXactStatusForUpdate;
5147 				else
5148 					old_status = MultiXactStatusForNoKeyUpdate;
5149 			}
5150 			else
5151 			{
5152 				/*
5153 				 * LOCK_ONLY can be present alone only when a page has been
5154 				 * upgraded by pg_upgrade.  But in that case,
5155 				 * TransactionIdIsInProgress() should have returned false.  We
5156 				 * assume it's no longer locked in this case.
5157 				 */
5158 				elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5159 				old_infomask |= HEAP_XMAX_INVALID;
5160 				old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
5161 				goto l5;
5162 			}
5163 		}
5164 		else
5165 		{
5166 			/* it's an update, but which kind? */
5167 			if (old_infomask2 & HEAP_KEYS_UPDATED)
5168 				old_status = MultiXactStatusUpdate;
5169 			else
5170 				old_status = MultiXactStatusNoKeyUpdate;
5171 		}
5172 
5173 		old_mode = TUPLOCK_from_mxstatus(old_status);
5174 
5175 		/*
5176 		 * If the lock to be acquired is for the same TransactionId as the
5177 		 * existing lock, there's an optimization possible: consider only the
5178 		 * strongest of both locks as the only one present, and restart.
5179 		 */
5180 		if (xmax == add_to_xmax)
5181 		{
5182 			/*
5183 			 * Note that it's not possible for the original tuple to be
5184 			 * updated: we wouldn't be here because the tuple would have been
5185 			 * invisible and we wouldn't try to update it.  As a subtlety,
5186 			 * this code can also run when traversing an update chain to lock
5187 			 * future versions of a tuple.  But we wouldn't be here either,
5188 			 * because the add_to_xmax would be different from the original
5189 			 * updater.
5190 			 */
5191 			Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5192 
5193 			/* acquire the strongest of both */
5194 			if (mode < old_mode)
5195 				mode = old_mode;
5196 			/* mustn't touch is_update */
5197 
5198 			old_infomask |= HEAP_XMAX_INVALID;
5199 			goto l5;
5200 		}
5201 
5202 		/* otherwise, just fall back to creating a new multixact */
5203 		new_status = get_mxact_status_for_lock(mode, is_update);
5204 		new_xmax = MultiXactIdCreate(xmax, old_status,
5205 									 add_to_xmax, new_status);
5206 		GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5207 	}
5208 	else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
5209 			 TransactionIdDidCommit(xmax))
5210 	{
5211 		/*
5212 		 * It's a committed update, so we gotta preserve him as updater of the
5213 		 * tuple.
5214 		 */
5215 		MultiXactStatus status;
5216 		MultiXactStatus new_status;
5217 
5218 		if (old_infomask2 & HEAP_KEYS_UPDATED)
5219 			status = MultiXactStatusUpdate;
5220 		else
5221 			status = MultiXactStatusNoKeyUpdate;
5222 
5223 		new_status = get_mxact_status_for_lock(mode, is_update);
5224 
5225 		/*
5226 		 * since it's not running, it's obviously impossible for the old
5227 		 * updater to be identical to the current one, so we need not check
5228 		 * for that case as we do in the block above.
5229 		 */
5230 		new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5231 		GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5232 	}
5233 	else
5234 	{
5235 		/*
5236 		 * Can get here iff the locking/updating transaction was running when
5237 		 * the infomask was extracted from the tuple, but finished before
5238 		 * TransactionIdIsInProgress got to run.  Deal with it as if there was
5239 		 * no locker at all in the first place.
5240 		 */
5241 		old_infomask |= HEAP_XMAX_INVALID;
5242 		goto l5;
5243 	}
5244 
5245 	*result_infomask = new_infomask;
5246 	*result_infomask2 = new_infomask2;
5247 	*result_xmax = new_xmax;
5248 }
5249 
5250 /*
5251  * Subroutine for heap_lock_updated_tuple_rec.
5252  *
5253  * Given a hypothetical multixact status held by the transaction identified
5254  * with the given xid, does the current transaction need to wait, fail, or can
5255  * it continue if it wanted to acquire a lock of the given mode?  "needwait"
5256  * is set to true if waiting is necessary; if it can continue, then TM_Ok is
5257  * returned.  If the lock is already held by the current transaction, return
5258  * TM_SelfModified.  In case of a conflict with another transaction, a
5259  * different HeapTupleSatisfiesUpdate return code is returned.
5260  *
5261  * The held status is said to be hypothetical because it might correspond to a
5262  * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
5263  * way for simplicity of API.
5264  */
5265 static TM_Result
test_lockmode_for_conflict(MultiXactStatus status,TransactionId xid,LockTupleMode mode,HeapTuple tup,bool * needwait)5266 test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
5267 						   LockTupleMode mode, HeapTuple tup,
5268 						   bool *needwait)
5269 {
5270 	MultiXactStatus wantedstatus;
5271 
5272 	*needwait = false;
5273 	wantedstatus = get_mxact_status_for_lock(mode, false);
5274 
5275 	/*
5276 	 * Note: we *must* check TransactionIdIsInProgress before
5277 	 * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c
5278 	 * for an explanation.
5279 	 */
5280 	if (TransactionIdIsCurrentTransactionId(xid))
5281 	{
5282 		/*
5283 		 * The tuple has already been locked by our own transaction.  This is
5284 		 * very rare but can happen if multiple transactions are trying to
5285 		 * lock an ancient version of the same tuple.
5286 		 */
5287 		return TM_SelfModified;
5288 	}
5289 	else if (TransactionIdIsInProgress(xid))
5290 	{
5291 		/*
5292 		 * If the locking transaction is running, what we do depends on
5293 		 * whether the lock modes conflict: if they do, then we must wait for
5294 		 * it to finish; otherwise we can fall through to lock this tuple
5295 		 * version without waiting.
5296 		 */
5297 		if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5298 								LOCKMODE_from_mxstatus(wantedstatus)))
5299 		{
5300 			*needwait = true;
5301 		}
5302 
5303 		/*
5304 		 * If we set needwait above, then this value doesn't matter;
5305 		 * otherwise, this value signals to caller that it's okay to proceed.
5306 		 */
5307 		return TM_Ok;
5308 	}
5309 	else if (TransactionIdDidAbort(xid))
5310 		return TM_Ok;
5311 	else if (TransactionIdDidCommit(xid))
5312 	{
5313 		/*
5314 		 * The other transaction committed.  If it was only a locker, then the
5315 		 * lock is completely gone now and we can return success; but if it
5316 		 * was an update, then what we do depends on whether the two lock
5317 		 * modes conflict.  If they conflict, then we must report error to
5318 		 * caller. But if they don't, we can fall through to allow the current
5319 		 * transaction to lock the tuple.
5320 		 *
5321 		 * Note: the reason we worry about ISUPDATE here is because as soon as
5322 		 * a transaction ends, all its locks are gone and meaningless, and
5323 		 * thus we can ignore them; whereas its updates persist.  In the
5324 		 * TransactionIdIsInProgress case, above, we don't need to check
5325 		 * because we know the lock is still "alive" and thus a conflict needs
5326 		 * always be checked.
5327 		 */
5328 		if (!ISUPDATE_from_mxstatus(status))
5329 			return TM_Ok;
5330 
5331 		if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5332 								LOCKMODE_from_mxstatus(wantedstatus)))
5333 		{
5334 			/* bummer */
5335 			if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid))
5336 				return TM_Updated;
5337 			else
5338 				return TM_Deleted;
5339 		}
5340 
5341 		return TM_Ok;
5342 	}
5343 
5344 	/* Not in progress, not aborted, not committed -- must have crashed */
5345 	return TM_Ok;
5346 }
5347 
5348 
5349 /*
5350  * Recursive part of heap_lock_updated_tuple
5351  *
5352  * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5353  * xid with the given mode; if this tuple is updated, recurse to lock the new
5354  * version as well.
5355  */
5356 static TM_Result
heap_lock_updated_tuple_rec(Relation rel,ItemPointer tid,TransactionId xid,LockTupleMode mode)5357 heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
5358 							LockTupleMode mode)
5359 {
5360 	TM_Result	result;
5361 	ItemPointerData tupid;
5362 	HeapTupleData mytup;
5363 	Buffer		buf;
5364 	uint16		new_infomask,
5365 				new_infomask2,
5366 				old_infomask,
5367 				old_infomask2;
5368 	TransactionId xmax,
5369 				new_xmax;
5370 	TransactionId priorXmax = InvalidTransactionId;
5371 	bool		cleared_all_frozen = false;
5372 	bool		pinned_desired_page;
5373 	Buffer		vmbuffer = InvalidBuffer;
5374 	BlockNumber block;
5375 
5376 	ItemPointerCopy(tid, &tupid);
5377 
5378 	for (;;)
5379 	{
5380 		new_infomask = 0;
5381 		new_xmax = InvalidTransactionId;
5382 		block = ItemPointerGetBlockNumber(&tupid);
5383 		ItemPointerCopy(&tupid, &(mytup.t_self));
5384 
5385 		if (!heap_fetch(rel, SnapshotAny, &mytup, &buf))
5386 		{
5387 			/*
5388 			 * if we fail to find the updated version of the tuple, it's
5389 			 * because it was vacuumed/pruned away after its creator
5390 			 * transaction aborted.  So behave as if we got to the end of the
5391 			 * chain, and there's no further tuple to lock: return success to
5392 			 * caller.
5393 			 */
5394 			result = TM_Ok;
5395 			goto out_unlocked;
5396 		}
5397 
5398 l4:
5399 		CHECK_FOR_INTERRUPTS();
5400 
5401 		/*
5402 		 * Before locking the buffer, pin the visibility map page if it
5403 		 * appears to be necessary.  Since we haven't got the lock yet,
5404 		 * someone else might be in the middle of changing this, so we'll need
5405 		 * to recheck after we have the lock.
5406 		 */
5407 		if (PageIsAllVisible(BufferGetPage(buf)))
5408 		{
5409 			visibilitymap_pin(rel, block, &vmbuffer);
5410 			pinned_desired_page = true;
5411 		}
5412 		else
5413 			pinned_desired_page = false;
5414 
5415 		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5416 
5417 		/*
5418 		 * If we didn't pin the visibility map page and the page has become
5419 		 * all visible while we were busy locking the buffer, we'll have to
5420 		 * unlock and re-lock, to avoid holding the buffer lock across I/O.
5421 		 * That's a bit unfortunate, but hopefully shouldn't happen often.
5422 		 *
5423 		 * Note: in some paths through this function, we will reach here
5424 		 * holding a pin on a vm page that may or may not be the one matching
5425 		 * this page.  If this page isn't all-visible, we won't use the vm
5426 		 * page, but we hold onto such a pin till the end of the function.
5427 		 */
5428 		if (!pinned_desired_page && PageIsAllVisible(BufferGetPage(buf)))
5429 		{
5430 			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5431 			visibilitymap_pin(rel, block, &vmbuffer);
5432 			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5433 		}
5434 
5435 		/*
5436 		 * Check the tuple XMIN against prior XMAX, if any.  If we reached the
5437 		 * end of the chain, we're done, so return success.
5438 		 */
5439 		if (TransactionIdIsValid(priorXmax) &&
5440 			!TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data),
5441 								 priorXmax))
5442 		{
5443 			result = TM_Ok;
5444 			goto out_locked;
5445 		}
5446 
5447 		/*
5448 		 * Also check Xmin: if this tuple was created by an aborted
5449 		 * (sub)transaction, then we already locked the last live one in the
5450 		 * chain, thus we're done, so return success.
5451 		 */
5452 		if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data)))
5453 		{
5454 			result = TM_Ok;
5455 			goto out_locked;
5456 		}
5457 
5458 		old_infomask = mytup.t_data->t_infomask;
5459 		old_infomask2 = mytup.t_data->t_infomask2;
5460 		xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5461 
5462 		/*
5463 		 * If this tuple version has been updated or locked by some concurrent
5464 		 * transaction(s), what we do depends on whether our lock mode
5465 		 * conflicts with what those other transactions hold, and also on the
5466 		 * status of them.
5467 		 */
5468 		if (!(old_infomask & HEAP_XMAX_INVALID))
5469 		{
5470 			TransactionId rawxmax;
5471 			bool		needwait;
5472 
5473 			rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5474 			if (old_infomask & HEAP_XMAX_IS_MULTI)
5475 			{
5476 				int			nmembers;
5477 				int			i;
5478 				MultiXactMember *members;
5479 
5480 				/*
5481 				 * We don't need a test for pg_upgrade'd tuples: this is only
5482 				 * applied to tuples after the first in an update chain.  Said
5483 				 * first tuple in the chain may well be locked-in-9.2-and-
5484 				 * pg_upgraded, but that one was already locked by our caller,
5485 				 * not us; and any subsequent ones cannot be because our
5486 				 * caller must necessarily have obtained a snapshot later than
5487 				 * the pg_upgrade itself.
5488 				 */
5489 				Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5490 
5491 				nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5492 												 HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5493 				for (i = 0; i < nmembers; i++)
5494 				{
5495 					result = test_lockmode_for_conflict(members[i].status,
5496 														members[i].xid,
5497 														mode,
5498 														&mytup,
5499 														&needwait);
5500 
5501 					/*
5502 					 * If the tuple was already locked by ourselves in a
5503 					 * previous iteration of this (say heap_lock_tuple was
5504 					 * forced to restart the locking loop because of a change
5505 					 * in xmax), then we hold the lock already on this tuple
5506 					 * version and we don't need to do anything; and this is
5507 					 * not an error condition either.  We just need to skip
5508 					 * this tuple and continue locking the next version in the
5509 					 * update chain.
5510 					 */
5511 					if (result == TM_SelfModified)
5512 					{
5513 						pfree(members);
5514 						goto next;
5515 					}
5516 
5517 					if (needwait)
5518 					{
5519 						LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5520 						XactLockTableWait(members[i].xid, rel,
5521 										  &mytup.t_self,
5522 										  XLTW_LockUpdated);
5523 						pfree(members);
5524 						goto l4;
5525 					}
5526 					if (result != TM_Ok)
5527 					{
5528 						pfree(members);
5529 						goto out_locked;
5530 					}
5531 				}
5532 				if (members)
5533 					pfree(members);
5534 			}
5535 			else
5536 			{
5537 				MultiXactStatus status;
5538 
5539 				/*
5540 				 * For a non-multi Xmax, we first need to compute the
5541 				 * corresponding MultiXactStatus by using the infomask bits.
5542 				 */
5543 				if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5544 				{
5545 					if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5546 						status = MultiXactStatusForKeyShare;
5547 					else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5548 						status = MultiXactStatusForShare;
5549 					else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5550 					{
5551 						if (old_infomask2 & HEAP_KEYS_UPDATED)
5552 							status = MultiXactStatusForUpdate;
5553 						else
5554 							status = MultiXactStatusForNoKeyUpdate;
5555 					}
5556 					else
5557 					{
5558 						/*
5559 						 * LOCK_ONLY present alone (a pg_upgraded tuple marked
5560 						 * as share-locked in the old cluster) shouldn't be
5561 						 * seen in the middle of an update chain.
5562 						 */
5563 						elog(ERROR, "invalid lock status in tuple");
5564 					}
5565 				}
5566 				else
5567 				{
5568 					/* it's an update, but which kind? */
5569 					if (old_infomask2 & HEAP_KEYS_UPDATED)
5570 						status = MultiXactStatusUpdate;
5571 					else
5572 						status = MultiXactStatusNoKeyUpdate;
5573 				}
5574 
5575 				result = test_lockmode_for_conflict(status, rawxmax, mode,
5576 													&mytup, &needwait);
5577 
5578 				/*
5579 				 * If the tuple was already locked by ourselves in a previous
5580 				 * iteration of this (say heap_lock_tuple was forced to
5581 				 * restart the locking loop because of a change in xmax), then
5582 				 * we hold the lock already on this tuple version and we don't
5583 				 * need to do anything; and this is not an error condition
5584 				 * either.  We just need to skip this tuple and continue
5585 				 * locking the next version in the update chain.
5586 				 */
5587 				if (result == TM_SelfModified)
5588 					goto next;
5589 
5590 				if (needwait)
5591 				{
5592 					LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5593 					XactLockTableWait(rawxmax, rel, &mytup.t_self,
5594 									  XLTW_LockUpdated);
5595 					goto l4;
5596 				}
5597 				if (result != TM_Ok)
5598 				{
5599 					goto out_locked;
5600 				}
5601 			}
5602 		}
5603 
5604 		/* compute the new Xmax and infomask values for the tuple ... */
5605 		compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
5606 								  xid, mode, false,
5607 								  &new_xmax, &new_infomask, &new_infomask2);
5608 
5609 		if (PageIsAllVisible(BufferGetPage(buf)) &&
5610 			visibilitymap_clear(rel, block, vmbuffer,
5611 								VISIBILITYMAP_ALL_FROZEN))
5612 			cleared_all_frozen = true;
5613 
5614 		START_CRIT_SECTION();
5615 
5616 		/* ... and set them */
5617 		HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
5618 		mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
5619 		mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5620 		mytup.t_data->t_infomask |= new_infomask;
5621 		mytup.t_data->t_infomask2 |= new_infomask2;
5622 
5623 		MarkBufferDirty(buf);
5624 
5625 		/* XLOG stuff */
5626 		if (RelationNeedsWAL(rel))
5627 		{
5628 			xl_heap_lock_updated xlrec;
5629 			XLogRecPtr	recptr;
5630 			Page		page = BufferGetPage(buf);
5631 
5632 			XLogBeginInsert();
5633 			XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
5634 
5635 			xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
5636 			xlrec.xmax = new_xmax;
5637 			xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2);
5638 			xlrec.flags =
5639 				cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5640 
5641 			XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated);
5642 
5643 			recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED);
5644 
5645 			PageSetLSN(page, recptr);
5646 		}
5647 
5648 		END_CRIT_SECTION();
5649 
5650 next:
5651 		/* if we find the end of update chain, we're done. */
5652 		if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
5653 			HeapTupleHeaderIndicatesMovedPartitions(mytup.t_data) ||
5654 			ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
5655 			HeapTupleHeaderIsOnlyLocked(mytup.t_data))
5656 		{
5657 			result = TM_Ok;
5658 			goto out_locked;
5659 		}
5660 
5661 		/* tail recursion */
5662 		priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data);
5663 		ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
5664 		UnlockReleaseBuffer(buf);
5665 	}
5666 
5667 	result = TM_Ok;
5668 
5669 out_locked:
5670 	UnlockReleaseBuffer(buf);
5671 
5672 out_unlocked:
5673 	if (vmbuffer != InvalidBuffer)
5674 		ReleaseBuffer(vmbuffer);
5675 
5676 	return result;
5677 }
5678 
5679 /*
5680  * heap_lock_updated_tuple
5681  *		Follow update chain when locking an updated tuple, acquiring locks (row
5682  *		marks) on the updated versions.
5683  *
5684  * The initial tuple is assumed to be already locked.
5685  *
5686  * This function doesn't check visibility, it just unconditionally marks the
5687  * tuple(s) as locked.  If any tuple in the updated chain is being deleted
5688  * concurrently (or updated with the key being modified), sleep until the
5689  * transaction doing it is finished.
5690  *
5691  * Note that we don't acquire heavyweight tuple locks on the tuples we walk
5692  * when we have to wait for other transactions to release them, as opposed to
5693  * what heap_lock_tuple does.  The reason is that having more than one
5694  * transaction walking the chain is probably uncommon enough that risk of
5695  * starvation is not likely: one of the preconditions for being here is that
5696  * the snapshot in use predates the update that created this tuple (because we
5697  * started at an earlier version of the tuple), but at the same time such a
5698  * transaction cannot be using repeatable read or serializable isolation
5699  * levels, because that would lead to a serializability failure.
5700  */
5701 static TM_Result
heap_lock_updated_tuple(Relation rel,HeapTuple tuple,ItemPointer ctid,TransactionId xid,LockTupleMode mode)5702 heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
5703 						TransactionId xid, LockTupleMode mode)
5704 {
5705 	/*
5706 	 * If the tuple has not been updated, or has moved into another partition
5707 	 * (effectively a delete) stop here.
5708 	 */
5709 	if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) &&
5710 		!ItemPointerEquals(&tuple->t_self, ctid))
5711 	{
5712 		/*
5713 		 * If this is the first possibly-multixact-able operation in the
5714 		 * current transaction, set my per-backend OldestMemberMXactId
5715 		 * setting. We can be certain that the transaction will never become a
5716 		 * member of any older MultiXactIds than that.  (We have to do this
5717 		 * even if we end up just using our own TransactionId below, since
5718 		 * some other backend could incorporate our XID into a MultiXact
5719 		 * immediately afterwards.)
5720 		 */
5721 		MultiXactIdSetOldestMember();
5722 
5723 		return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
5724 	}
5725 
5726 	/* nothing to lock */
5727 	return TM_Ok;
5728 }
5729 
5730 /*
5731  *	heap_finish_speculative - mark speculative insertion as successful
5732  *
5733  * To successfully finish a speculative insertion we have to clear speculative
5734  * token from tuple.  To do so the t_ctid field, which will contain a
5735  * speculative token value, is modified in place to point to the tuple itself,
5736  * which is characteristic of a newly inserted ordinary tuple.
5737  *
5738  * NB: It is not ok to commit without either finishing or aborting a
5739  * speculative insertion.  We could treat speculative tuples of committed
5740  * transactions implicitly as completed, but then we would have to be prepared
5741  * to deal with speculative tokens on committed tuples.  That wouldn't be
5742  * difficult - no-one looks at the ctid field of a tuple with invalid xmax -
5743  * but clearing the token at completion isn't very expensive either.
5744  * An explicit confirmation WAL record also makes logical decoding simpler.
5745  */
5746 void
heap_finish_speculative(Relation relation,ItemPointer tid)5747 heap_finish_speculative(Relation relation, ItemPointer tid)
5748 {
5749 	Buffer		buffer;
5750 	Page		page;
5751 	OffsetNumber offnum;
5752 	ItemId		lp = NULL;
5753 	HeapTupleHeader htup;
5754 
5755 	buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
5756 	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5757 	page = (Page) BufferGetPage(buffer);
5758 
5759 	offnum = ItemPointerGetOffsetNumber(tid);
5760 	if (PageGetMaxOffsetNumber(page) >= offnum)
5761 		lp = PageGetItemId(page, offnum);
5762 
5763 	if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
5764 		elog(ERROR, "invalid lp");
5765 
5766 	htup = (HeapTupleHeader) PageGetItem(page, lp);
5767 
5768 	/* SpecTokenOffsetNumber should be distinguishable from any real offset */
5769 	StaticAssertStmt(MaxOffsetNumber < SpecTokenOffsetNumber,
5770 					 "invalid speculative token constant");
5771 
5772 	/* NO EREPORT(ERROR) from here till changes are logged */
5773 	START_CRIT_SECTION();
5774 
5775 	Assert(HeapTupleHeaderIsSpeculative(htup));
5776 
5777 	MarkBufferDirty(buffer);
5778 
5779 	/*
5780 	 * Replace the speculative insertion token with a real t_ctid, pointing to
5781 	 * itself like it does on regular tuples.
5782 	 */
5783 	htup->t_ctid = *tid;
5784 
5785 	/* XLOG stuff */
5786 	if (RelationNeedsWAL(relation))
5787 	{
5788 		xl_heap_confirm xlrec;
5789 		XLogRecPtr	recptr;
5790 
5791 		xlrec.offnum = ItemPointerGetOffsetNumber(tid);
5792 
5793 		XLogBeginInsert();
5794 
5795 		/* We want the same filtering on this as on a plain insert */
5796 		XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
5797 
5798 		XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm);
5799 		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5800 
5801 		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM);
5802 
5803 		PageSetLSN(page, recptr);
5804 	}
5805 
5806 	END_CRIT_SECTION();
5807 
5808 	UnlockReleaseBuffer(buffer);
5809 }
5810 
5811 /*
5812  *	heap_abort_speculative - kill a speculatively inserted tuple
5813  *
5814  * Marks a tuple that was speculatively inserted in the same command as dead,
5815  * by setting its xmin as invalid.  That makes it immediately appear as dead
5816  * to all transactions, including our own.  In particular, it makes
5817  * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend
5818  * inserting a duplicate key value won't unnecessarily wait for our whole
5819  * transaction to finish (it'll just wait for our speculative insertion to
5820  * finish).
5821  *
5822  * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks
5823  * that arise due to a mutual dependency that is not user visible.  By
5824  * definition, unprincipled deadlocks cannot be prevented by the user
5825  * reordering lock acquisition in client code, because the implementation level
5826  * lock acquisitions are not under the user's direct control.  If speculative
5827  * inserters did not take this precaution, then under high concurrency they
5828  * could deadlock with each other, which would not be acceptable.
5829  *
5830  * This is somewhat redundant with heap_delete, but we prefer to have a
5831  * dedicated routine with stripped down requirements.  Note that this is also
5832  * used to delete the TOAST tuples created during speculative insertion.
5833  *
5834  * This routine does not affect logical decoding as it only looks at
5835  * confirmation records.
5836  */
5837 void
heap_abort_speculative(Relation relation,ItemPointer tid)5838 heap_abort_speculative(Relation relation, ItemPointer tid)
5839 {
5840 	TransactionId xid = GetCurrentTransactionId();
5841 	ItemId		lp;
5842 	HeapTupleData tp;
5843 	Page		page;
5844 	BlockNumber block;
5845 	Buffer		buffer;
5846 	TransactionId prune_xid;
5847 
5848 	Assert(ItemPointerIsValid(tid));
5849 
5850 	block = ItemPointerGetBlockNumber(tid);
5851 	buffer = ReadBuffer(relation, block);
5852 	page = BufferGetPage(buffer);
5853 
5854 	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5855 
5856 	/*
5857 	 * Page can't be all visible, we just inserted into it, and are still
5858 	 * running.
5859 	 */
5860 	Assert(!PageIsAllVisible(page));
5861 
5862 	lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
5863 	Assert(ItemIdIsNormal(lp));
5864 
5865 	tp.t_tableOid = RelationGetRelid(relation);
5866 	tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
5867 	tp.t_len = ItemIdGetLength(lp);
5868 	tp.t_self = *tid;
5869 
5870 	/*
5871 	 * Sanity check that the tuple really is a speculatively inserted tuple,
5872 	 * inserted by us.
5873 	 */
5874 	if (tp.t_data->t_choice.t_heap.t_xmin != xid)
5875 		elog(ERROR, "attempted to kill a tuple inserted by another transaction");
5876 	if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
5877 		elog(ERROR, "attempted to kill a non-speculative tuple");
5878 	Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data));
5879 
5880 	/*
5881 	 * No need to check for serializable conflicts here.  There is never a
5882 	 * need for a combo CID, either.  No need to extract replica identity, or
5883 	 * do anything special with infomask bits.
5884 	 */
5885 
5886 	START_CRIT_SECTION();
5887 
5888 	/*
5889 	 * The tuple will become DEAD immediately.  Flag that this page is a
5890 	 * candidate for pruning by setting xmin to TransactionXmin. While not
5891 	 * immediately prunable, it is the oldest xid we can cheaply determine
5892 	 * that's safe against wraparound / being older than the table's
5893 	 * relfrozenxid.  To defend against the unlikely case of a new relation
5894 	 * having a newer relfrozenxid than our TransactionXmin, use relfrozenxid
5895 	 * if so (vacuum can't subsequently move relfrozenxid to beyond
5896 	 * TransactionXmin, so there's no race here).
5897 	 */
5898 	Assert(TransactionIdIsValid(TransactionXmin));
5899 	if (TransactionIdPrecedes(TransactionXmin, relation->rd_rel->relfrozenxid))
5900 		prune_xid = relation->rd_rel->relfrozenxid;
5901 	else
5902 		prune_xid = TransactionXmin;
5903 	PageSetPrunable(page, prune_xid);
5904 
5905 	/* store transaction information of xact deleting the tuple */
5906 	tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
5907 	tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5908 
5909 	/*
5910 	 * Set the tuple header xmin to InvalidTransactionId.  This makes the
5911 	 * tuple immediately invisible everyone.  (In particular, to any
5912 	 * transactions waiting on the speculative token, woken up later.)
5913 	 */
5914 	HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId);
5915 
5916 	/* Clear the speculative insertion token too */
5917 	tp.t_data->t_ctid = tp.t_self;
5918 
5919 	MarkBufferDirty(buffer);
5920 
5921 	/*
5922 	 * XLOG stuff
5923 	 *
5924 	 * The WAL records generated here match heap_delete().  The same recovery
5925 	 * routines are used.
5926 	 */
5927 	if (RelationNeedsWAL(relation))
5928 	{
5929 		xl_heap_delete xlrec;
5930 		XLogRecPtr	recptr;
5931 
5932 		xlrec.flags = XLH_DELETE_IS_SUPER;
5933 		xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
5934 											  tp.t_data->t_infomask2);
5935 		xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
5936 		xlrec.xmax = xid;
5937 
5938 		XLogBeginInsert();
5939 		XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
5940 		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5941 
5942 		/* No replica identity & replication origin logged */
5943 
5944 		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
5945 
5946 		PageSetLSN(page, recptr);
5947 	}
5948 
5949 	END_CRIT_SECTION();
5950 
5951 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5952 
5953 	if (HeapTupleHasExternal(&tp))
5954 	{
5955 		Assert(!IsToastRelation(relation));
5956 		heap_toast_delete(relation, &tp, true);
5957 	}
5958 
5959 	/*
5960 	 * Never need to mark tuple for invalidation, since catalogs don't support
5961 	 * speculative insertion
5962 	 */
5963 
5964 	/* Now we can release the buffer */
5965 	ReleaseBuffer(buffer);
5966 
5967 	/* count deletion, as we counted the insertion too */
5968 	pgstat_count_heap_delete(relation);
5969 }
5970 
5971 /*
5972  * heap_inplace_update - update a tuple "in place" (ie, overwrite it)
5973  *
5974  * Overwriting violates both MVCC and transactional safety, so the uses
5975  * of this function in Postgres are extremely limited.  Nonetheless we
5976  * find some places to use it.
5977  *
5978  * The tuple cannot change size, and therefore it's reasonable to assume
5979  * that its null bitmap (if any) doesn't change either.  So we just
5980  * overwrite the data portion of the tuple without touching the null
5981  * bitmap or any of the header fields.
5982  *
5983  * tuple is an in-memory tuple structure containing the data to be written
5984  * over the target tuple.  Also, tuple->t_self identifies the target tuple.
5985  *
5986  * Note that the tuple updated here had better not come directly from the
5987  * syscache if the relation has a toast relation as this tuple could
5988  * include toast values that have been expanded, causing a failure here.
5989  */
5990 void
heap_inplace_update(Relation relation,HeapTuple tuple)5991 heap_inplace_update(Relation relation, HeapTuple tuple)
5992 {
5993 	Buffer		buffer;
5994 	Page		page;
5995 	OffsetNumber offnum;
5996 	ItemId		lp = NULL;
5997 	HeapTupleHeader htup;
5998 	uint32		oldlen;
5999 	uint32		newlen;
6000 
6001 	/*
6002 	 * For now, we don't allow parallel updates.  Unlike a regular update,
6003 	 * this should never create a combo CID, so it might be possible to relax
6004 	 * this restriction, but not without more thought and testing.  It's not
6005 	 * clear that it would be useful, anyway.
6006 	 */
6007 	if (IsInParallelMode())
6008 		ereport(ERROR,
6009 				(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
6010 				 errmsg("cannot update tuples during a parallel operation")));
6011 
6012 	buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
6013 	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
6014 	page = (Page) BufferGetPage(buffer);
6015 
6016 	offnum = ItemPointerGetOffsetNumber(&(tuple->t_self));
6017 	if (PageGetMaxOffsetNumber(page) >= offnum)
6018 		lp = PageGetItemId(page, offnum);
6019 
6020 	if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
6021 		elog(ERROR, "invalid lp");
6022 
6023 	htup = (HeapTupleHeader) PageGetItem(page, lp);
6024 
6025 	oldlen = ItemIdGetLength(lp) - htup->t_hoff;
6026 	newlen = tuple->t_len - tuple->t_data->t_hoff;
6027 	if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
6028 		elog(ERROR, "wrong tuple length");
6029 
6030 	/* NO EREPORT(ERROR) from here till changes are logged */
6031 	START_CRIT_SECTION();
6032 
6033 	memcpy((char *) htup + htup->t_hoff,
6034 		   (char *) tuple->t_data + tuple->t_data->t_hoff,
6035 		   newlen);
6036 
6037 	MarkBufferDirty(buffer);
6038 
6039 	/* XLOG stuff */
6040 	if (RelationNeedsWAL(relation))
6041 	{
6042 		xl_heap_inplace xlrec;
6043 		XLogRecPtr	recptr;
6044 
6045 		xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
6046 
6047 		XLogBeginInsert();
6048 		XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
6049 
6050 		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
6051 		XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
6052 
6053 		/* inplace updates aren't decoded atm, don't log the origin */
6054 
6055 		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
6056 
6057 		PageSetLSN(page, recptr);
6058 	}
6059 
6060 	END_CRIT_SECTION();
6061 
6062 	UnlockReleaseBuffer(buffer);
6063 
6064 	/*
6065 	 * Send out shared cache inval if necessary.  Note that because we only
6066 	 * pass the new version of the tuple, this mustn't be used for any
6067 	 * operations that could change catcache lookup keys.  But we aren't
6068 	 * bothering with index updates either, so that's true a fortiori.
6069 	 */
6070 	if (!IsBootstrapProcessingMode())
6071 		CacheInvalidateHeapTuple(relation, tuple, NULL);
6072 }
6073 
6074 #define		FRM_NOOP				0x0001
6075 #define		FRM_INVALIDATE_XMAX		0x0002
6076 #define		FRM_RETURN_IS_XID		0x0004
6077 #define		FRM_RETURN_IS_MULTI		0x0008
6078 #define		FRM_MARK_COMMITTED		0x0010
6079 
6080 /*
6081  * FreezeMultiXactId
6082  *		Determine what to do during freezing when a tuple is marked by a
6083  *		MultiXactId.
6084  *
6085  * NB -- this might have the side-effect of creating a new MultiXactId!
6086  *
6087  * "flags" is an output value; it's used to tell caller what to do on return.
6088  * Possible flags are:
6089  * FRM_NOOP
6090  *		don't do anything -- keep existing Xmax
6091  * FRM_INVALIDATE_XMAX
6092  *		mark Xmax as InvalidTransactionId and set XMAX_INVALID flag.
6093  * FRM_RETURN_IS_XID
6094  *		The Xid return value is a single update Xid to set as xmax.
6095  * FRM_MARK_COMMITTED
6096  *		Xmax can be marked as HEAP_XMAX_COMMITTED
6097  * FRM_RETURN_IS_MULTI
6098  *		The return value is a new MultiXactId to set as new Xmax.
6099  *		(caller must obtain proper infomask bits using GetMultiXactIdHintBits)
6100  */
6101 static TransactionId
FreezeMultiXactId(MultiXactId multi,uint16 t_infomask,TransactionId relfrozenxid,TransactionId relminmxid,TransactionId cutoff_xid,MultiXactId cutoff_multi,uint16 * flags)6102 FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
6103 				  TransactionId relfrozenxid, TransactionId relminmxid,
6104 				  TransactionId cutoff_xid, MultiXactId cutoff_multi,
6105 				  uint16 *flags)
6106 {
6107 	TransactionId xid = InvalidTransactionId;
6108 	int			i;
6109 	MultiXactMember *members;
6110 	int			nmembers;
6111 	bool		need_replace;
6112 	int			nnewmembers;
6113 	MultiXactMember *newmembers;
6114 	bool		has_lockers;
6115 	TransactionId update_xid;
6116 	bool		update_committed;
6117 
6118 	*flags = 0;
6119 
6120 	/* We should only be called in Multis */
6121 	Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6122 
6123 	if (!MultiXactIdIsValid(multi) ||
6124 		HEAP_LOCKED_UPGRADED(t_infomask))
6125 	{
6126 		/* Ensure infomask bits are appropriately set/reset */
6127 		*flags |= FRM_INVALIDATE_XMAX;
6128 		return InvalidTransactionId;
6129 	}
6130 	else if (MultiXactIdPrecedes(multi, relminmxid))
6131 		ereport(ERROR,
6132 				(errcode(ERRCODE_DATA_CORRUPTED),
6133 				 errmsg_internal("found multixact %u from before relminmxid %u",
6134 								 multi, relminmxid)));
6135 	else if (MultiXactIdPrecedes(multi, cutoff_multi))
6136 	{
6137 		/*
6138 		 * This old multi cannot possibly have members still running, but
6139 		 * verify just in case.  If it was a locker only, it can be removed
6140 		 * without any further consideration; but if it contained an update,
6141 		 * we might need to preserve it.
6142 		 */
6143 		if (MultiXactIdIsRunning(multi,
6144 								 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)))
6145 			ereport(ERROR,
6146 					(errcode(ERRCODE_DATA_CORRUPTED),
6147 					 errmsg_internal("multixact %u from before cutoff %u found to be still running",
6148 									 multi, cutoff_multi)));
6149 
6150 		if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
6151 		{
6152 			*flags |= FRM_INVALIDATE_XMAX;
6153 			xid = InvalidTransactionId; /* not strictly necessary */
6154 		}
6155 		else
6156 		{
6157 			/* replace multi by update xid */
6158 			xid = MultiXactIdGetUpdateXid(multi, t_infomask);
6159 
6160 			/* wasn't only a lock, xid needs to be valid */
6161 			Assert(TransactionIdIsValid(xid));
6162 
6163 			if (TransactionIdPrecedes(xid, relfrozenxid))
6164 				ereport(ERROR,
6165 						(errcode(ERRCODE_DATA_CORRUPTED),
6166 						 errmsg_internal("found update xid %u from before relfrozenxid %u",
6167 										 xid, relfrozenxid)));
6168 
6169 			/*
6170 			 * If the xid is older than the cutoff, it has to have aborted,
6171 			 * otherwise the tuple would have gotten pruned away.
6172 			 */
6173 			if (TransactionIdPrecedes(xid, cutoff_xid))
6174 			{
6175 				if (TransactionIdDidCommit(xid))
6176 					ereport(ERROR,
6177 							(errcode(ERRCODE_DATA_CORRUPTED),
6178 							 errmsg_internal("cannot freeze committed update xid %u", xid)));
6179 				*flags |= FRM_INVALIDATE_XMAX;
6180 				xid = InvalidTransactionId; /* not strictly necessary */
6181 			}
6182 			else
6183 			{
6184 				*flags |= FRM_RETURN_IS_XID;
6185 			}
6186 		}
6187 
6188 		return xid;
6189 	}
6190 
6191 	/*
6192 	 * This multixact might have or might not have members still running, but
6193 	 * we know it's valid and is newer than the cutoff point for multis.
6194 	 * However, some member(s) of it may be below the cutoff for Xids, so we
6195 	 * need to walk the whole members array to figure out what to do, if
6196 	 * anything.
6197 	 */
6198 
6199 	nmembers =
6200 		GetMultiXactIdMembers(multi, &members, false,
6201 							  HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
6202 	if (nmembers <= 0)
6203 	{
6204 		/* Nothing worth keeping */
6205 		*flags |= FRM_INVALIDATE_XMAX;
6206 		return InvalidTransactionId;
6207 	}
6208 
6209 	/* is there anything older than the cutoff? */
6210 	need_replace = false;
6211 	for (i = 0; i < nmembers; i++)
6212 	{
6213 		if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
6214 		{
6215 			need_replace = true;
6216 			break;
6217 		}
6218 	}
6219 
6220 	/*
6221 	 * In the simplest case, there is no member older than the cutoff; we can
6222 	 * keep the existing MultiXactId as is.
6223 	 */
6224 	if (!need_replace)
6225 	{
6226 		*flags |= FRM_NOOP;
6227 		pfree(members);
6228 		return InvalidTransactionId;
6229 	}
6230 
6231 	/*
6232 	 * If the multi needs to be updated, figure out which members do we need
6233 	 * to keep.
6234 	 */
6235 	nnewmembers = 0;
6236 	newmembers = palloc(sizeof(MultiXactMember) * nmembers);
6237 	has_lockers = false;
6238 	update_xid = InvalidTransactionId;
6239 	update_committed = false;
6240 
6241 	for (i = 0; i < nmembers; i++)
6242 	{
6243 		/*
6244 		 * Determine whether to keep this member or ignore it.
6245 		 */
6246 		if (ISUPDATE_from_mxstatus(members[i].status))
6247 		{
6248 			TransactionId xid = members[i].xid;
6249 
6250 			Assert(TransactionIdIsValid(xid));
6251 			if (TransactionIdPrecedes(xid, relfrozenxid))
6252 				ereport(ERROR,
6253 						(errcode(ERRCODE_DATA_CORRUPTED),
6254 						 errmsg_internal("found update xid %u from before relfrozenxid %u",
6255 										 xid, relfrozenxid)));
6256 
6257 			/*
6258 			 * It's an update; should we keep it?  If the transaction is known
6259 			 * aborted or crashed then it's okay to ignore it, otherwise not.
6260 			 * Note that an updater older than cutoff_xid cannot possibly be
6261 			 * committed, because HeapTupleSatisfiesVacuum would have returned
6262 			 * HEAPTUPLE_DEAD and we would not be trying to freeze the tuple.
6263 			 *
6264 			 * As with all tuple visibility routines, it's critical to test
6265 			 * TransactionIdIsInProgress before TransactionIdDidCommit,
6266 			 * because of race conditions explained in detail in
6267 			 * heapam_visibility.c.
6268 			 */
6269 			if (TransactionIdIsCurrentTransactionId(xid) ||
6270 				TransactionIdIsInProgress(xid))
6271 			{
6272 				Assert(!TransactionIdIsValid(update_xid));
6273 				update_xid = xid;
6274 			}
6275 			else if (TransactionIdDidCommit(xid))
6276 			{
6277 				/*
6278 				 * The transaction committed, so we can tell caller to set
6279 				 * HEAP_XMAX_COMMITTED.  (We can only do this because we know
6280 				 * the transaction is not running.)
6281 				 */
6282 				Assert(!TransactionIdIsValid(update_xid));
6283 				update_committed = true;
6284 				update_xid = xid;
6285 			}
6286 			else
6287 			{
6288 				/*
6289 				 * Not in progress, not committed -- must be aborted or
6290 				 * crashed; we can ignore it.
6291 				 */
6292 			}
6293 
6294 			/*
6295 			 * Since the tuple wasn't marked HEAPTUPLE_DEAD by vacuum, the
6296 			 * update Xid cannot possibly be older than the xid cutoff. The
6297 			 * presence of such a tuple would cause corruption, so be paranoid
6298 			 * and check.
6299 			 */
6300 			if (TransactionIdIsValid(update_xid) &&
6301 				TransactionIdPrecedes(update_xid, cutoff_xid))
6302 				ereport(ERROR,
6303 						(errcode(ERRCODE_DATA_CORRUPTED),
6304 						 errmsg_internal("found update xid %u from before xid cutoff %u",
6305 										 update_xid, cutoff_xid)));
6306 
6307 			/*
6308 			 * If we determined that it's an Xid corresponding to an update
6309 			 * that must be retained, additionally add it to the list of
6310 			 * members of the new Multi, in case we end up using that.  (We
6311 			 * might still decide to use only an update Xid and not a multi,
6312 			 * but it's easier to maintain the list as we walk the old members
6313 			 * list.)
6314 			 */
6315 			if (TransactionIdIsValid(update_xid))
6316 				newmembers[nnewmembers++] = members[i];
6317 		}
6318 		else
6319 		{
6320 			/* We only keep lockers if they are still running */
6321 			if (TransactionIdIsCurrentTransactionId(members[i].xid) ||
6322 				TransactionIdIsInProgress(members[i].xid))
6323 			{
6324 				/* running locker cannot possibly be older than the cutoff */
6325 				Assert(!TransactionIdPrecedes(members[i].xid, cutoff_xid));
6326 				newmembers[nnewmembers++] = members[i];
6327 				has_lockers = true;
6328 			}
6329 		}
6330 	}
6331 
6332 	pfree(members);
6333 
6334 	if (nnewmembers == 0)
6335 	{
6336 		/* nothing worth keeping!? Tell caller to remove the whole thing */
6337 		*flags |= FRM_INVALIDATE_XMAX;
6338 		xid = InvalidTransactionId;
6339 	}
6340 	else if (TransactionIdIsValid(update_xid) && !has_lockers)
6341 	{
6342 		/*
6343 		 * If there's a single member and it's an update, pass it back alone
6344 		 * without creating a new Multi.  (XXX we could do this when there's a
6345 		 * single remaining locker, too, but that would complicate the API too
6346 		 * much; moreover, the case with the single updater is more
6347 		 * interesting, because those are longer-lived.)
6348 		 */
6349 		Assert(nnewmembers == 1);
6350 		*flags |= FRM_RETURN_IS_XID;
6351 		if (update_committed)
6352 			*flags |= FRM_MARK_COMMITTED;
6353 		xid = update_xid;
6354 	}
6355 	else
6356 	{
6357 		/*
6358 		 * Create a new multixact with the surviving members of the previous
6359 		 * one, to set as new Xmax in the tuple.
6360 		 */
6361 		xid = MultiXactIdCreateFromMembers(nnewmembers, newmembers);
6362 		*flags |= FRM_RETURN_IS_MULTI;
6363 	}
6364 
6365 	pfree(newmembers);
6366 
6367 	return xid;
6368 }
6369 
6370 /*
6371  * heap_prepare_freeze_tuple
6372  *
6373  * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
6374  * are older than the specified cutoff XID and cutoff MultiXactId.  If so,
6375  * setup enough state (in the *frz output argument) to later execute and
6376  * WAL-log what we would need to do, and return true.  Return false if nothing
6377  * is to be changed.  In addition, set *totally_frozen_p to true if the tuple
6378  * will be totally frozen after these operations are performed and false if
6379  * more freezing will eventually be required.
6380  *
6381  * Caller is responsible for setting the offset field, if appropriate.
6382  *
6383  * It is assumed that the caller has checked the tuple with
6384  * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
6385  * (else we should be removing the tuple, not freezing it).
6386  *
6387  * NB: cutoff_xid *must* be <= the current global xmin, to ensure that any
6388  * XID older than it could neither be running nor seen as running by any
6389  * open transaction.  This ensures that the replacement will not change
6390  * anyone's idea of the tuple state.
6391  * Similarly, cutoff_multi must be less than or equal to the smallest
6392  * MultiXactId used by any transaction currently open.
6393  *
6394  * If the tuple is in a shared buffer, caller must hold an exclusive lock on
6395  * that buffer.
6396  *
6397  * NB: It is not enough to set hint bits to indicate something is
6398  * committed/invalid -- they might not be set on a standby, or after crash
6399  * recovery.  We really need to remove old xids.
6400  */
6401 bool
heap_prepare_freeze_tuple(HeapTupleHeader tuple,TransactionId relfrozenxid,TransactionId relminmxid,TransactionId cutoff_xid,TransactionId cutoff_multi,xl_heap_freeze_tuple * frz,bool * totally_frozen_p)6402 heap_prepare_freeze_tuple(HeapTupleHeader tuple,
6403 						  TransactionId relfrozenxid, TransactionId relminmxid,
6404 						  TransactionId cutoff_xid, TransactionId cutoff_multi,
6405 						  xl_heap_freeze_tuple *frz, bool *totally_frozen_p)
6406 {
6407 	bool		changed = false;
6408 	bool		xmax_already_frozen = false;
6409 	bool		xmin_frozen;
6410 	bool		freeze_xmax;
6411 	TransactionId xid;
6412 
6413 	frz->frzflags = 0;
6414 	frz->t_infomask2 = tuple->t_infomask2;
6415 	frz->t_infomask = tuple->t_infomask;
6416 	frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
6417 
6418 	/*
6419 	 * Process xmin.  xmin_frozen has two slightly different meanings: in the
6420 	 * !XidIsNormal case, it means "the xmin doesn't need any freezing" (it's
6421 	 * already a permanent value), while in the block below it is set true to
6422 	 * mean "xmin won't need freezing after what we do to it here" (false
6423 	 * otherwise).  In both cases we're allowed to set totally_frozen, as far
6424 	 * as xmin is concerned.
6425 	 */
6426 	xid = HeapTupleHeaderGetXmin(tuple);
6427 	if (!TransactionIdIsNormal(xid))
6428 		xmin_frozen = true;
6429 	else
6430 	{
6431 		if (TransactionIdPrecedes(xid, relfrozenxid))
6432 			ereport(ERROR,
6433 					(errcode(ERRCODE_DATA_CORRUPTED),
6434 					 errmsg_internal("found xmin %u from before relfrozenxid %u",
6435 									 xid, relfrozenxid)));
6436 
6437 		xmin_frozen = TransactionIdPrecedes(xid, cutoff_xid);
6438 		if (xmin_frozen)
6439 		{
6440 			if (!TransactionIdDidCommit(xid))
6441 				ereport(ERROR,
6442 						(errcode(ERRCODE_DATA_CORRUPTED),
6443 						 errmsg_internal("uncommitted xmin %u from before xid cutoff %u needs to be frozen",
6444 										 xid, cutoff_xid)));
6445 
6446 			frz->t_infomask |= HEAP_XMIN_FROZEN;
6447 			changed = true;
6448 		}
6449 	}
6450 
6451 	/*
6452 	 * Process xmax.  To thoroughly examine the current Xmax value we need to
6453 	 * resolve a MultiXactId to its member Xids, in case some of them are
6454 	 * below the given cutoff for Xids.  In that case, those values might need
6455 	 * freezing, too.  Also, if a multi needs freezing, we cannot simply take
6456 	 * it out --- if there's a live updater Xid, it needs to be kept.
6457 	 *
6458 	 * Make sure to keep heap_tuple_needs_freeze in sync with this.
6459 	 */
6460 	xid = HeapTupleHeaderGetRawXmax(tuple);
6461 
6462 	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
6463 	{
6464 		TransactionId newxmax;
6465 		uint16		flags;
6466 
6467 		newxmax = FreezeMultiXactId(xid, tuple->t_infomask,
6468 									relfrozenxid, relminmxid,
6469 									cutoff_xid, cutoff_multi, &flags);
6470 
6471 		freeze_xmax = (flags & FRM_INVALIDATE_XMAX);
6472 
6473 		if (flags & FRM_RETURN_IS_XID)
6474 		{
6475 			/*
6476 			 * NB -- some of these transformations are only valid because we
6477 			 * know the return Xid is a tuple updater (i.e. not merely a
6478 			 * locker.) Also note that the only reason we don't explicitly
6479 			 * worry about HEAP_KEYS_UPDATED is because it lives in
6480 			 * t_infomask2 rather than t_infomask.
6481 			 */
6482 			frz->t_infomask &= ~HEAP_XMAX_BITS;
6483 			frz->xmax = newxmax;
6484 			if (flags & FRM_MARK_COMMITTED)
6485 				frz->t_infomask |= HEAP_XMAX_COMMITTED;
6486 			changed = true;
6487 		}
6488 		else if (flags & FRM_RETURN_IS_MULTI)
6489 		{
6490 			uint16		newbits;
6491 			uint16		newbits2;
6492 
6493 			/*
6494 			 * We can't use GetMultiXactIdHintBits directly on the new multi
6495 			 * here; that routine initializes the masks to all zeroes, which
6496 			 * would lose other bits we need.  Doing it this way ensures all
6497 			 * unrelated bits remain untouched.
6498 			 */
6499 			frz->t_infomask &= ~HEAP_XMAX_BITS;
6500 			frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6501 			GetMultiXactIdHintBits(newxmax, &newbits, &newbits2);
6502 			frz->t_infomask |= newbits;
6503 			frz->t_infomask2 |= newbits2;
6504 
6505 			frz->xmax = newxmax;
6506 
6507 			changed = true;
6508 		}
6509 	}
6510 	else if (TransactionIdIsNormal(xid))
6511 	{
6512 		if (TransactionIdPrecedes(xid, relfrozenxid))
6513 			ereport(ERROR,
6514 					(errcode(ERRCODE_DATA_CORRUPTED),
6515 					 errmsg_internal("found xmax %u from before relfrozenxid %u",
6516 									 xid, relfrozenxid)));
6517 
6518 		if (TransactionIdPrecedes(xid, cutoff_xid))
6519 		{
6520 			/*
6521 			 * If we freeze xmax, make absolutely sure that it's not an XID
6522 			 * that is important.  (Note, a lock-only xmax can be removed
6523 			 * independent of committedness, since a committed lock holder has
6524 			 * released the lock).
6525 			 */
6526 			if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
6527 				TransactionIdDidCommit(xid))
6528 				ereport(ERROR,
6529 						(errcode(ERRCODE_DATA_CORRUPTED),
6530 						 errmsg_internal("cannot freeze committed xmax %u",
6531 										 xid)));
6532 			freeze_xmax = true;
6533 		}
6534 		else
6535 			freeze_xmax = false;
6536 	}
6537 	else if ((tuple->t_infomask & HEAP_XMAX_INVALID) ||
6538 			 !TransactionIdIsValid(HeapTupleHeaderGetRawXmax(tuple)))
6539 	{
6540 		freeze_xmax = false;
6541 		xmax_already_frozen = true;
6542 	}
6543 	else
6544 		ereport(ERROR,
6545 				(errcode(ERRCODE_DATA_CORRUPTED),
6546 				 errmsg_internal("found xmax %u (infomask 0x%04x) not frozen, not multi, not normal",
6547 								 xid, tuple->t_infomask)));
6548 
6549 	if (freeze_xmax)
6550 	{
6551 		Assert(!xmax_already_frozen);
6552 
6553 		frz->xmax = InvalidTransactionId;
6554 
6555 		/*
6556 		 * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
6557 		 * LOCKED.  Normalize to INVALID just to be sure no one gets confused.
6558 		 * Also get rid of the HEAP_KEYS_UPDATED bit.
6559 		 */
6560 		frz->t_infomask &= ~HEAP_XMAX_BITS;
6561 		frz->t_infomask |= HEAP_XMAX_INVALID;
6562 		frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
6563 		frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6564 		changed = true;
6565 	}
6566 
6567 	/*
6568 	 * Old-style VACUUM FULL is gone, but we have to keep this code as long as
6569 	 * we support having MOVED_OFF/MOVED_IN tuples in the database.
6570 	 */
6571 	if (tuple->t_infomask & HEAP_MOVED)
6572 	{
6573 		xid = HeapTupleHeaderGetXvac(tuple);
6574 
6575 		/*
6576 		 * For Xvac, we ignore the cutoff_xid and just always perform the
6577 		 * freeze operation.  The oldest release in which such a value can
6578 		 * actually be set is PostgreSQL 8.4, because old-style VACUUM FULL
6579 		 * was removed in PostgreSQL 9.0.  Note that if we were to respect
6580 		 * cutoff_xid here, we'd need to make surely to clear totally_frozen
6581 		 * when we skipped freezing on that basis.
6582 		 */
6583 		if (TransactionIdIsNormal(xid))
6584 		{
6585 			/*
6586 			 * If a MOVED_OFF tuple is not dead, the xvac transaction must
6587 			 * have failed; whereas a non-dead MOVED_IN tuple must mean the
6588 			 * xvac transaction succeeded.
6589 			 */
6590 			if (tuple->t_infomask & HEAP_MOVED_OFF)
6591 				frz->frzflags |= XLH_INVALID_XVAC;
6592 			else
6593 				frz->frzflags |= XLH_FREEZE_XVAC;
6594 
6595 			/*
6596 			 * Might as well fix the hint bits too; usually XMIN_COMMITTED
6597 			 * will already be set here, but there's a small chance not.
6598 			 */
6599 			Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
6600 			frz->t_infomask |= HEAP_XMIN_COMMITTED;
6601 			changed = true;
6602 		}
6603 	}
6604 
6605 	*totally_frozen_p = (xmin_frozen &&
6606 						 (freeze_xmax || xmax_already_frozen));
6607 	return changed;
6608 }
6609 
6610 /*
6611  * heap_execute_freeze_tuple
6612  *		Execute the prepared freezing of a tuple.
6613  *
6614  * Caller is responsible for ensuring that no other backend can access the
6615  * storage underlying this tuple, either by holding an exclusive lock on the
6616  * buffer containing it (which is what lazy VACUUM does), or by having it be
6617  * in private storage (which is what CLUSTER and friends do).
6618  *
6619  * Note: it might seem we could make the changes without exclusive lock, since
6620  * TransactionId read/write is assumed atomic anyway.  However there is a race
6621  * condition: someone who just fetched an old XID that we overwrite here could
6622  * conceivably not finish checking the XID against pg_xact before we finish
6623  * the VACUUM and perhaps truncate off the part of pg_xact he needs.  Getting
6624  * exclusive lock ensures no other backend is in process of checking the
6625  * tuple status.  Also, getting exclusive lock makes it safe to adjust the
6626  * infomask bits.
6627  *
6628  * NB: All code in here must be safe to execute during crash recovery!
6629  */
6630 void
heap_execute_freeze_tuple(HeapTupleHeader tuple,xl_heap_freeze_tuple * frz)6631 heap_execute_freeze_tuple(HeapTupleHeader tuple, xl_heap_freeze_tuple *frz)
6632 {
6633 	HeapTupleHeaderSetXmax(tuple, frz->xmax);
6634 
6635 	if (frz->frzflags & XLH_FREEZE_XVAC)
6636 		HeapTupleHeaderSetXvac(tuple, FrozenTransactionId);
6637 
6638 	if (frz->frzflags & XLH_INVALID_XVAC)
6639 		HeapTupleHeaderSetXvac(tuple, InvalidTransactionId);
6640 
6641 	tuple->t_infomask = frz->t_infomask;
6642 	tuple->t_infomask2 = frz->t_infomask2;
6643 }
6644 
6645 /*
6646  * heap_freeze_tuple
6647  *		Freeze tuple in place, without WAL logging.
6648  *
6649  * Useful for callers like CLUSTER that perform their own WAL logging.
6650  */
6651 bool
heap_freeze_tuple(HeapTupleHeader tuple,TransactionId relfrozenxid,TransactionId relminmxid,TransactionId cutoff_xid,TransactionId cutoff_multi)6652 heap_freeze_tuple(HeapTupleHeader tuple,
6653 				  TransactionId relfrozenxid, TransactionId relminmxid,
6654 				  TransactionId cutoff_xid, TransactionId cutoff_multi)
6655 {
6656 	xl_heap_freeze_tuple frz;
6657 	bool		do_freeze;
6658 	bool		tuple_totally_frozen;
6659 
6660 	do_freeze = heap_prepare_freeze_tuple(tuple,
6661 										  relfrozenxid, relminmxid,
6662 										  cutoff_xid, cutoff_multi,
6663 										  &frz, &tuple_totally_frozen);
6664 
6665 	/*
6666 	 * Note that because this is not a WAL-logged operation, we don't need to
6667 	 * fill in the offset in the freeze record.
6668 	 */
6669 
6670 	if (do_freeze)
6671 		heap_execute_freeze_tuple(tuple, &frz);
6672 	return do_freeze;
6673 }
6674 
6675 /*
6676  * For a given MultiXactId, return the hint bits that should be set in the
6677  * tuple's infomask.
6678  *
6679  * Normally this should be called for a multixact that was just created, and
6680  * so is on our local cache, so the GetMembers call is fast.
6681  */
6682 static void
GetMultiXactIdHintBits(MultiXactId multi,uint16 * new_infomask,uint16 * new_infomask2)6683 GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
6684 					   uint16 *new_infomask2)
6685 {
6686 	int			nmembers;
6687 	MultiXactMember *members;
6688 	int			i;
6689 	uint16		bits = HEAP_XMAX_IS_MULTI;
6690 	uint16		bits2 = 0;
6691 	bool		has_update = false;
6692 	LockTupleMode strongest = LockTupleKeyShare;
6693 
6694 	/*
6695 	 * We only use this in multis we just created, so they cannot be values
6696 	 * pre-pg_upgrade.
6697 	 */
6698 	nmembers = GetMultiXactIdMembers(multi, &members, false, false);
6699 
6700 	for (i = 0; i < nmembers; i++)
6701 	{
6702 		LockTupleMode mode;
6703 
6704 		/*
6705 		 * Remember the strongest lock mode held by any member of the
6706 		 * multixact.
6707 		 */
6708 		mode = TUPLOCK_from_mxstatus(members[i].status);
6709 		if (mode > strongest)
6710 			strongest = mode;
6711 
6712 		/* See what other bits we need */
6713 		switch (members[i].status)
6714 		{
6715 			case MultiXactStatusForKeyShare:
6716 			case MultiXactStatusForShare:
6717 			case MultiXactStatusForNoKeyUpdate:
6718 				break;
6719 
6720 			case MultiXactStatusForUpdate:
6721 				bits2 |= HEAP_KEYS_UPDATED;
6722 				break;
6723 
6724 			case MultiXactStatusNoKeyUpdate:
6725 				has_update = true;
6726 				break;
6727 
6728 			case MultiXactStatusUpdate:
6729 				bits2 |= HEAP_KEYS_UPDATED;
6730 				has_update = true;
6731 				break;
6732 		}
6733 	}
6734 
6735 	if (strongest == LockTupleExclusive ||
6736 		strongest == LockTupleNoKeyExclusive)
6737 		bits |= HEAP_XMAX_EXCL_LOCK;
6738 	else if (strongest == LockTupleShare)
6739 		bits |= HEAP_XMAX_SHR_LOCK;
6740 	else if (strongest == LockTupleKeyShare)
6741 		bits |= HEAP_XMAX_KEYSHR_LOCK;
6742 
6743 	if (!has_update)
6744 		bits |= HEAP_XMAX_LOCK_ONLY;
6745 
6746 	if (nmembers > 0)
6747 		pfree(members);
6748 
6749 	*new_infomask = bits;
6750 	*new_infomask2 = bits2;
6751 }
6752 
6753 /*
6754  * MultiXactIdGetUpdateXid
6755  *
6756  * Given a multixact Xmax and corresponding infomask, which does not have the
6757  * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating
6758  * transaction.
6759  *
6760  * Caller is expected to check the status of the updating transaction, if
6761  * necessary.
6762  */
6763 static TransactionId
MultiXactIdGetUpdateXid(TransactionId xmax,uint16 t_infomask)6764 MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask)
6765 {
6766 	TransactionId update_xact = InvalidTransactionId;
6767 	MultiXactMember *members;
6768 	int			nmembers;
6769 
6770 	Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
6771 	Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6772 
6773 	/*
6774 	 * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from
6775 	 * pre-pg_upgrade.
6776 	 */
6777 	nmembers = GetMultiXactIdMembers(xmax, &members, false, false);
6778 
6779 	if (nmembers > 0)
6780 	{
6781 		int			i;
6782 
6783 		for (i = 0; i < nmembers; i++)
6784 		{
6785 			/* Ignore lockers */
6786 			if (!ISUPDATE_from_mxstatus(members[i].status))
6787 				continue;
6788 
6789 			/* there can be at most one updater */
6790 			Assert(update_xact == InvalidTransactionId);
6791 			update_xact = members[i].xid;
6792 #ifndef USE_ASSERT_CHECKING
6793 
6794 			/*
6795 			 * in an assert-enabled build, walk the whole array to ensure
6796 			 * there's no other updater.
6797 			 */
6798 			break;
6799 #endif
6800 		}
6801 
6802 		pfree(members);
6803 	}
6804 
6805 	return update_xact;
6806 }
6807 
6808 /*
6809  * HeapTupleGetUpdateXid
6810  *		As above, but use a HeapTupleHeader
6811  *
6812  * See also HeapTupleHeaderGetUpdateXid, which can be used without previously
6813  * checking the hint bits.
6814  */
6815 TransactionId
HeapTupleGetUpdateXid(HeapTupleHeader tuple)6816 HeapTupleGetUpdateXid(HeapTupleHeader tuple)
6817 {
6818 	return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(tuple),
6819 								   tuple->t_infomask);
6820 }
6821 
6822 /*
6823  * Does the given multixact conflict with the current transaction grabbing a
6824  * tuple lock of the given strength?
6825  *
6826  * The passed infomask pairs up with the given multixact in the tuple header.
6827  *
6828  * If current_is_member is not NULL, it is set to 'true' if the current
6829  * transaction is a member of the given multixact.
6830  */
6831 static bool
DoesMultiXactIdConflict(MultiXactId multi,uint16 infomask,LockTupleMode lockmode,bool * current_is_member)6832 DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
6833 						LockTupleMode lockmode, bool *current_is_member)
6834 {
6835 	int			nmembers;
6836 	MultiXactMember *members;
6837 	bool		result = false;
6838 	LOCKMODE	wanted = tupleLockExtraInfo[lockmode].hwlock;
6839 
6840 	if (HEAP_LOCKED_UPGRADED(infomask))
6841 		return false;
6842 
6843 	nmembers = GetMultiXactIdMembers(multi, &members, false,
6844 									 HEAP_XMAX_IS_LOCKED_ONLY(infomask));
6845 	if (nmembers >= 0)
6846 	{
6847 		int			i;
6848 
6849 		for (i = 0; i < nmembers; i++)
6850 		{
6851 			TransactionId memxid;
6852 			LOCKMODE	memlockmode;
6853 
6854 			if (result && (current_is_member == NULL || *current_is_member))
6855 				break;
6856 
6857 			memlockmode = LOCKMODE_from_mxstatus(members[i].status);
6858 
6859 			/* ignore members from current xact (but track their presence) */
6860 			memxid = members[i].xid;
6861 			if (TransactionIdIsCurrentTransactionId(memxid))
6862 			{
6863 				if (current_is_member != NULL)
6864 					*current_is_member = true;
6865 				continue;
6866 			}
6867 			else if (result)
6868 				continue;
6869 
6870 			/* ignore members that don't conflict with the lock we want */
6871 			if (!DoLockModesConflict(memlockmode, wanted))
6872 				continue;
6873 
6874 			if (ISUPDATE_from_mxstatus(members[i].status))
6875 			{
6876 				/* ignore aborted updaters */
6877 				if (TransactionIdDidAbort(memxid))
6878 					continue;
6879 			}
6880 			else
6881 			{
6882 				/* ignore lockers-only that are no longer in progress */
6883 				if (!TransactionIdIsInProgress(memxid))
6884 					continue;
6885 			}
6886 
6887 			/*
6888 			 * Whatever remains are either live lockers that conflict with our
6889 			 * wanted lock, and updaters that are not aborted.  Those conflict
6890 			 * with what we want.  Set up to return true, but keep going to
6891 			 * look for the current transaction among the multixact members,
6892 			 * if needed.
6893 			 */
6894 			result = true;
6895 		}
6896 		pfree(members);
6897 	}
6898 
6899 	return result;
6900 }
6901 
6902 /*
6903  * Do_MultiXactIdWait
6904  *		Actual implementation for the two functions below.
6905  *
6906  * 'multi', 'status' and 'infomask' indicate what to sleep on (the status is
6907  * needed to ensure we only sleep on conflicting members, and the infomask is
6908  * used to optimize multixact access in case it's a lock-only multi); 'nowait'
6909  * indicates whether to use conditional lock acquisition, to allow callers to
6910  * fail if lock is unavailable.  'rel', 'ctid' and 'oper' are used to set up
6911  * context information for error messages.  'remaining', if not NULL, receives
6912  * the number of members that are still running, including any (non-aborted)
6913  * subtransactions of our own transaction.
6914  *
6915  * We do this by sleeping on each member using XactLockTableWait.  Any
6916  * members that belong to the current backend are *not* waited for, however;
6917  * this would not merely be useless but would lead to Assert failure inside
6918  * XactLockTableWait.  By the time this returns, it is certain that all
6919  * transactions *of other backends* that were members of the MultiXactId
6920  * that conflict with the requested status are dead (and no new ones can have
6921  * been added, since it is not legal to add members to an existing
6922  * MultiXactId).
6923  *
6924  * But by the time we finish sleeping, someone else may have changed the Xmax
6925  * of the containing tuple, so the caller needs to iterate on us somehow.
6926  *
6927  * Note that in case we return false, the number of remaining members is
6928  * not to be trusted.
6929  */
6930 static bool
Do_MultiXactIdWait(MultiXactId multi,MultiXactStatus status,uint16 infomask,bool nowait,Relation rel,ItemPointer ctid,XLTW_Oper oper,int * remaining)6931 Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
6932 				   uint16 infomask, bool nowait,
6933 				   Relation rel, ItemPointer ctid, XLTW_Oper oper,
6934 				   int *remaining)
6935 {
6936 	bool		result = true;
6937 	MultiXactMember *members;
6938 	int			nmembers;
6939 	int			remain = 0;
6940 
6941 	/* for pre-pg_upgrade tuples, no need to sleep at all */
6942 	nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
6943 		GetMultiXactIdMembers(multi, &members, false,
6944 							  HEAP_XMAX_IS_LOCKED_ONLY(infomask));
6945 
6946 	if (nmembers >= 0)
6947 	{
6948 		int			i;
6949 
6950 		for (i = 0; i < nmembers; i++)
6951 		{
6952 			TransactionId memxid = members[i].xid;
6953 			MultiXactStatus memstatus = members[i].status;
6954 
6955 			if (TransactionIdIsCurrentTransactionId(memxid))
6956 			{
6957 				remain++;
6958 				continue;
6959 			}
6960 
6961 			if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus),
6962 									 LOCKMODE_from_mxstatus(status)))
6963 			{
6964 				if (remaining && TransactionIdIsInProgress(memxid))
6965 					remain++;
6966 				continue;
6967 			}
6968 
6969 			/*
6970 			 * This member conflicts with our multi, so we have to sleep (or
6971 			 * return failure, if asked to avoid waiting.)
6972 			 *
6973 			 * Note that we don't set up an error context callback ourselves,
6974 			 * but instead we pass the info down to XactLockTableWait.  This
6975 			 * might seem a bit wasteful because the context is set up and
6976 			 * tore down for each member of the multixact, but in reality it
6977 			 * should be barely noticeable, and it avoids duplicate code.
6978 			 */
6979 			if (nowait)
6980 			{
6981 				result = ConditionalXactLockTableWait(memxid);
6982 				if (!result)
6983 					break;
6984 			}
6985 			else
6986 				XactLockTableWait(memxid, rel, ctid, oper);
6987 		}
6988 
6989 		pfree(members);
6990 	}
6991 
6992 	if (remaining)
6993 		*remaining = remain;
6994 
6995 	return result;
6996 }
6997 
6998 /*
6999  * MultiXactIdWait
7000  *		Sleep on a MultiXactId.
7001  *
7002  * By the time we finish sleeping, someone else may have changed the Xmax
7003  * of the containing tuple, so the caller needs to iterate on us somehow.
7004  *
7005  * We return (in *remaining, if not NULL) the number of members that are still
7006  * running, including any (non-aborted) subtransactions of our own transaction.
7007  */
7008 static void
MultiXactIdWait(MultiXactId multi,MultiXactStatus status,uint16 infomask,Relation rel,ItemPointer ctid,XLTW_Oper oper,int * remaining)7009 MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
7010 				Relation rel, ItemPointer ctid, XLTW_Oper oper,
7011 				int *remaining)
7012 {
7013 	(void) Do_MultiXactIdWait(multi, status, infomask, false,
7014 							  rel, ctid, oper, remaining);
7015 }
7016 
7017 /*
7018  * ConditionalMultiXactIdWait
7019  *		As above, but only lock if we can get the lock without blocking.
7020  *
7021  * By the time we finish sleeping, someone else may have changed the Xmax
7022  * of the containing tuple, so the caller needs to iterate on us somehow.
7023  *
7024  * If the multixact is now all gone, return true.  Returns false if some
7025  * transactions might still be running.
7026  *
7027  * We return (in *remaining, if not NULL) the number of members that are still
7028  * running, including any (non-aborted) subtransactions of our own transaction.
7029  */
7030 static bool
ConditionalMultiXactIdWait(MultiXactId multi,MultiXactStatus status,uint16 infomask,Relation rel,int * remaining)7031 ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
7032 						   uint16 infomask, Relation rel, int *remaining)
7033 {
7034 	return Do_MultiXactIdWait(multi, status, infomask, true,
7035 							  rel, NULL, XLTW_None, remaining);
7036 }
7037 
7038 /*
7039  * heap_tuple_needs_eventual_freeze
7040  *
7041  * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
7042  * will eventually require freezing.  Similar to heap_tuple_needs_freeze,
7043  * but there's no cutoff, since we're trying to figure out whether freezing
7044  * will ever be needed, not whether it's needed now.
7045  */
7046 bool
heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)7047 heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
7048 {
7049 	TransactionId xid;
7050 
7051 	/*
7052 	 * If xmin is a normal transaction ID, this tuple is definitely not
7053 	 * frozen.
7054 	 */
7055 	xid = HeapTupleHeaderGetXmin(tuple);
7056 	if (TransactionIdIsNormal(xid))
7057 		return true;
7058 
7059 	/*
7060 	 * If xmax is a valid xact or multixact, this tuple is also not frozen.
7061 	 */
7062 	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7063 	{
7064 		MultiXactId multi;
7065 
7066 		multi = HeapTupleHeaderGetRawXmax(tuple);
7067 		if (MultiXactIdIsValid(multi))
7068 			return true;
7069 	}
7070 	else
7071 	{
7072 		xid = HeapTupleHeaderGetRawXmax(tuple);
7073 		if (TransactionIdIsNormal(xid))
7074 			return true;
7075 	}
7076 
7077 	if (tuple->t_infomask & HEAP_MOVED)
7078 	{
7079 		xid = HeapTupleHeaderGetXvac(tuple);
7080 		if (TransactionIdIsNormal(xid))
7081 			return true;
7082 	}
7083 
7084 	return false;
7085 }
7086 
7087 /*
7088  * heap_tuple_needs_freeze
7089  *
7090  * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
7091  * are older than the specified cutoff XID or MultiXactId.  If so, return true.
7092  *
7093  * It doesn't matter whether the tuple is alive or dead, we are checking
7094  * to see if a tuple needs to be removed or frozen to avoid wraparound.
7095  *
7096  * NB: Cannot rely on hint bits here, they might not be set after a crash or
7097  * on a standby.
7098  */
7099 bool
heap_tuple_needs_freeze(HeapTupleHeader tuple,TransactionId cutoff_xid,MultiXactId cutoff_multi,Buffer buf)7100 heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid,
7101 						MultiXactId cutoff_multi, Buffer buf)
7102 {
7103 	TransactionId xid;
7104 
7105 	xid = HeapTupleHeaderGetXmin(tuple);
7106 	if (TransactionIdIsNormal(xid) &&
7107 		TransactionIdPrecedes(xid, cutoff_xid))
7108 		return true;
7109 
7110 	/*
7111 	 * The considerations for multixacts are complicated; look at
7112 	 * heap_prepare_freeze_tuple for justifications.  This routine had better
7113 	 * be in sync with that one!
7114 	 */
7115 	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7116 	{
7117 		MultiXactId multi;
7118 
7119 		multi = HeapTupleHeaderGetRawXmax(tuple);
7120 		if (!MultiXactIdIsValid(multi))
7121 		{
7122 			/* no xmax set, ignore */
7123 			;
7124 		}
7125 		else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
7126 			return true;
7127 		else if (MultiXactIdPrecedes(multi, cutoff_multi))
7128 			return true;
7129 		else
7130 		{
7131 			MultiXactMember *members;
7132 			int			nmembers;
7133 			int			i;
7134 
7135 			/* need to check whether any member of the mxact is too old */
7136 
7137 			nmembers = GetMultiXactIdMembers(multi, &members, false,
7138 											 HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask));
7139 
7140 			for (i = 0; i < nmembers; i++)
7141 			{
7142 				if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
7143 				{
7144 					pfree(members);
7145 					return true;
7146 				}
7147 			}
7148 			if (nmembers > 0)
7149 				pfree(members);
7150 		}
7151 	}
7152 	else
7153 	{
7154 		xid = HeapTupleHeaderGetRawXmax(tuple);
7155 		if (TransactionIdIsNormal(xid) &&
7156 			TransactionIdPrecedes(xid, cutoff_xid))
7157 			return true;
7158 	}
7159 
7160 	if (tuple->t_infomask & HEAP_MOVED)
7161 	{
7162 		xid = HeapTupleHeaderGetXvac(tuple);
7163 		if (TransactionIdIsNormal(xid) &&
7164 			TransactionIdPrecedes(xid, cutoff_xid))
7165 			return true;
7166 	}
7167 
7168 	return false;
7169 }
7170 
7171 /*
7172  * If 'tuple' contains any visible XID greater than latestRemovedXid,
7173  * ratchet forwards latestRemovedXid to the greatest one found.
7174  * This is used as the basis for generating Hot Standby conflicts, so
7175  * if a tuple was never visible then removing it should not conflict
7176  * with queries.
7177  */
7178 void
HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,TransactionId * latestRemovedXid)7179 HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
7180 									   TransactionId *latestRemovedXid)
7181 {
7182 	TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
7183 	TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple);
7184 	TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
7185 
7186 	if (tuple->t_infomask & HEAP_MOVED)
7187 	{
7188 		if (TransactionIdPrecedes(*latestRemovedXid, xvac))
7189 			*latestRemovedXid = xvac;
7190 	}
7191 
7192 	/*
7193 	 * Ignore tuples inserted by an aborted transaction or if the tuple was
7194 	 * updated/deleted by the inserting transaction.
7195 	 *
7196 	 * Look for a committed hint bit, or if no xmin bit is set, check clog.
7197 	 */
7198 	if (HeapTupleHeaderXminCommitted(tuple) ||
7199 		(!HeapTupleHeaderXminInvalid(tuple) && TransactionIdDidCommit(xmin)))
7200 	{
7201 		if (xmax != xmin &&
7202 			TransactionIdFollows(xmax, *latestRemovedXid))
7203 			*latestRemovedXid = xmax;
7204 	}
7205 
7206 	/* *latestRemovedXid may still be invalid at end */
7207 }
7208 
7209 #ifdef USE_PREFETCH
7210 /*
7211  * Helper function for heap_index_delete_tuples.  Issues prefetch requests for
7212  * prefetch_count buffers.  The prefetch_state keeps track of all the buffers
7213  * we can prefetch, and which have already been prefetched; each call to this
7214  * function picks up where the previous call left off.
7215  *
7216  * Note: we expect the deltids array to be sorted in an order that groups TIDs
7217  * by heap block, with all TIDs for each block appearing together in exactly
7218  * one group.
7219  */
7220 static void
index_delete_prefetch_buffer(Relation rel,IndexDeletePrefetchState * prefetch_state,int prefetch_count)7221 index_delete_prefetch_buffer(Relation rel,
7222 							 IndexDeletePrefetchState *prefetch_state,
7223 							 int prefetch_count)
7224 {
7225 	BlockNumber cur_hblkno = prefetch_state->cur_hblkno;
7226 	int			count = 0;
7227 	int			i;
7228 	int			ndeltids = prefetch_state->ndeltids;
7229 	TM_IndexDelete *deltids = prefetch_state->deltids;
7230 
7231 	for (i = prefetch_state->next_item;
7232 		 i < ndeltids && count < prefetch_count;
7233 		 i++)
7234 	{
7235 		ItemPointer htid = &deltids[i].tid;
7236 
7237 		if (cur_hblkno == InvalidBlockNumber ||
7238 			ItemPointerGetBlockNumber(htid) != cur_hblkno)
7239 		{
7240 			cur_hblkno = ItemPointerGetBlockNumber(htid);
7241 			PrefetchBuffer(rel, MAIN_FORKNUM, cur_hblkno);
7242 			count++;
7243 		}
7244 	}
7245 
7246 	/*
7247 	 * Save the prefetch position so that next time we can continue from that
7248 	 * position.
7249 	 */
7250 	prefetch_state->next_item = i;
7251 	prefetch_state->cur_hblkno = cur_hblkno;
7252 }
7253 #endif
7254 
7255 /*
7256  * heapam implementation of tableam's index_delete_tuples interface.
7257  *
7258  * This helper function is called by index AMs during index tuple deletion.
7259  * See tableam header comments for an explanation of the interface implemented
7260  * here and a general theory of operation.  Note that each call here is either
7261  * a simple index deletion call, or a bottom-up index deletion call.
7262  *
7263  * It's possible for this to generate a fair amount of I/O, since we may be
7264  * deleting hundreds of tuples from a single index block.  To amortize that
7265  * cost to some degree, this uses prefetching and combines repeat accesses to
7266  * the same heap block.
7267  */
7268 TransactionId
heap_index_delete_tuples(Relation rel,TM_IndexDeleteOp * delstate)7269 heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
7270 {
7271 	/* Initial assumption is that earlier pruning took care of conflict */
7272 	TransactionId latestRemovedXid = InvalidTransactionId;
7273 	BlockNumber blkno = InvalidBlockNumber;
7274 	Buffer		buf = InvalidBuffer;
7275 	Page		page = NULL;
7276 	OffsetNumber maxoff = InvalidOffsetNumber;
7277 	TransactionId priorXmax;
7278 #ifdef USE_PREFETCH
7279 	IndexDeletePrefetchState prefetch_state;
7280 	int			prefetch_distance;
7281 #endif
7282 	SnapshotData SnapshotNonVacuumable;
7283 	int			finalndeltids = 0,
7284 				nblocksaccessed = 0;
7285 
7286 	/* State that's only used in bottom-up index deletion case */
7287 	int			nblocksfavorable = 0;
7288 	int			curtargetfreespace = delstate->bottomupfreespace,
7289 				lastfreespace = 0,
7290 				actualfreespace = 0;
7291 	bool		bottomup_final_block = false;
7292 
7293 	InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(rel));
7294 
7295 	/* Sort caller's deltids array by TID for further processing */
7296 	index_delete_sort(delstate);
7297 
7298 	/*
7299 	 * Bottom-up case: resort deltids array in an order attuned to where the
7300 	 * greatest number of promising TIDs are to be found, and determine how
7301 	 * many blocks from the start of sorted array should be considered
7302 	 * favorable.  This will also shrink the deltids array in order to
7303 	 * eliminate completely unfavorable blocks up front.
7304 	 */
7305 	if (delstate->bottomup)
7306 		nblocksfavorable = bottomup_sort_and_shrink(delstate);
7307 
7308 #ifdef USE_PREFETCH
7309 	/* Initialize prefetch state. */
7310 	prefetch_state.cur_hblkno = InvalidBlockNumber;
7311 	prefetch_state.next_item = 0;
7312 	prefetch_state.ndeltids = delstate->ndeltids;
7313 	prefetch_state.deltids = delstate->deltids;
7314 
7315 	/*
7316 	 * Determine the prefetch distance that we will attempt to maintain.
7317 	 *
7318 	 * Since the caller holds a buffer lock somewhere in rel, we'd better make
7319 	 * sure that isn't a catalog relation before we call code that does
7320 	 * syscache lookups, to avoid risk of deadlock.
7321 	 */
7322 	if (IsCatalogRelation(rel))
7323 		prefetch_distance = maintenance_io_concurrency;
7324 	else
7325 		prefetch_distance =
7326 			get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace);
7327 
7328 	/* Cap initial prefetch distance for bottom-up deletion caller */
7329 	if (delstate->bottomup)
7330 	{
7331 		Assert(nblocksfavorable >= 1);
7332 		Assert(nblocksfavorable <= BOTTOMUP_MAX_NBLOCKS);
7333 		prefetch_distance = Min(prefetch_distance, nblocksfavorable);
7334 	}
7335 
7336 	/* Start prefetching. */
7337 	index_delete_prefetch_buffer(rel, &prefetch_state, prefetch_distance);
7338 #endif
7339 
7340 	/* Iterate over deltids, determine which to delete, check their horizon */
7341 	Assert(delstate->ndeltids > 0);
7342 	for (int i = 0; i < delstate->ndeltids; i++)
7343 	{
7344 		TM_IndexDelete *ideltid = &delstate->deltids[i];
7345 		TM_IndexStatus *istatus = delstate->status + ideltid->id;
7346 		ItemPointer htid = &ideltid->tid;
7347 		OffsetNumber offnum;
7348 
7349 		/*
7350 		 * Read buffer, and perform required extra steps each time a new block
7351 		 * is encountered.  Avoid refetching if it's the same block as the one
7352 		 * from the last htid.
7353 		 */
7354 		if (blkno == InvalidBlockNumber ||
7355 			ItemPointerGetBlockNumber(htid) != blkno)
7356 		{
7357 			/*
7358 			 * Consider giving up early for bottom-up index deletion caller
7359 			 * first. (Only prefetch next-next block afterwards, when it
7360 			 * becomes clear that we're at least going to access the next
7361 			 * block in line.)
7362 			 *
7363 			 * Sometimes the first block frees so much space for bottom-up
7364 			 * caller that the deletion process can end without accessing any
7365 			 * more blocks.  It is usually necessary to access 2 or 3 blocks
7366 			 * per bottom-up deletion operation, though.
7367 			 */
7368 			if (delstate->bottomup)
7369 			{
7370 				/*
7371 				 * We often allow caller to delete a few additional items
7372 				 * whose entries we reached after the point that space target
7373 				 * from caller was satisfied.  The cost of accessing the page
7374 				 * was already paid at that point, so it made sense to finish
7375 				 * it off.  When that happened, we finalize everything here
7376 				 * (by finishing off the whole bottom-up deletion operation
7377 				 * without needlessly paying the cost of accessing any more
7378 				 * blocks).
7379 				 */
7380 				if (bottomup_final_block)
7381 					break;
7382 
7383 				/*
7384 				 * Give up when we didn't enable our caller to free any
7385 				 * additional space as a result of processing the page that we
7386 				 * just finished up with.  This rule is the main way in which
7387 				 * we keep the cost of bottom-up deletion under control.
7388 				 */
7389 				if (nblocksaccessed >= 1 && actualfreespace == lastfreespace)
7390 					break;
7391 				lastfreespace = actualfreespace;	/* for next time */
7392 
7393 				/*
7394 				 * Deletion operation (which is bottom-up) will definitely
7395 				 * access the next block in line.  Prepare for that now.
7396 				 *
7397 				 * Decay target free space so that we don't hang on for too
7398 				 * long with a marginal case. (Space target is only truly
7399 				 * helpful when it allows us to recognize that we don't need
7400 				 * to access more than 1 or 2 blocks to satisfy caller due to
7401 				 * agreeable workload characteristics.)
7402 				 *
7403 				 * We are a bit more patient when we encounter contiguous
7404 				 * blocks, though: these are treated as favorable blocks.  The
7405 				 * decay process is only applied when the next block in line
7406 				 * is not a favorable/contiguous block.  This is not an
7407 				 * exception to the general rule; we still insist on finding
7408 				 * at least one deletable item per block accessed.  See
7409 				 * bottomup_nblocksfavorable() for full details of the theory
7410 				 * behind favorable blocks and heap block locality in general.
7411 				 *
7412 				 * Note: The first block in line is always treated as a
7413 				 * favorable block, so the earliest possible point that the
7414 				 * decay can be applied is just before we access the second
7415 				 * block in line.  The Assert() verifies this for us.
7416 				 */
7417 				Assert(nblocksaccessed > 0 || nblocksfavorable > 0);
7418 				if (nblocksfavorable > 0)
7419 					nblocksfavorable--;
7420 				else
7421 					curtargetfreespace /= 2;
7422 			}
7423 
7424 			/* release old buffer */
7425 			if (BufferIsValid(buf))
7426 				UnlockReleaseBuffer(buf);
7427 
7428 			blkno = ItemPointerGetBlockNumber(htid);
7429 			buf = ReadBuffer(rel, blkno);
7430 			nblocksaccessed++;
7431 			Assert(!delstate->bottomup ||
7432 				   nblocksaccessed <= BOTTOMUP_MAX_NBLOCKS);
7433 
7434 #ifdef USE_PREFETCH
7435 
7436 			/*
7437 			 * To maintain the prefetch distance, prefetch one more page for
7438 			 * each page we read.
7439 			 */
7440 			index_delete_prefetch_buffer(rel, &prefetch_state, 1);
7441 #endif
7442 
7443 			LockBuffer(buf, BUFFER_LOCK_SHARE);
7444 
7445 			page = BufferGetPage(buf);
7446 			maxoff = PageGetMaxOffsetNumber(page);
7447 		}
7448 
7449 		if (istatus->knowndeletable)
7450 			Assert(!delstate->bottomup && !istatus->promising);
7451 		else
7452 		{
7453 			ItemPointerData tmp = *htid;
7454 			HeapTupleData heapTuple;
7455 
7456 			/* Are any tuples from this HOT chain non-vacuumable? */
7457 			if (heap_hot_search_buffer(&tmp, rel, buf, &SnapshotNonVacuumable,
7458 									   &heapTuple, NULL, true))
7459 				continue;		/* can't delete entry */
7460 
7461 			/* Caller will delete, since whole HOT chain is vacuumable */
7462 			istatus->knowndeletable = true;
7463 
7464 			/* Maintain index free space info for bottom-up deletion case */
7465 			if (delstate->bottomup)
7466 			{
7467 				Assert(istatus->freespace > 0);
7468 				actualfreespace += istatus->freespace;
7469 				if (actualfreespace >= curtargetfreespace)
7470 					bottomup_final_block = true;
7471 			}
7472 		}
7473 
7474 		/*
7475 		 * Maintain latestRemovedXid value for deletion operation as a whole
7476 		 * by advancing current value using heap tuple headers.  This is
7477 		 * loosely based on the logic for pruning a HOT chain.
7478 		 */
7479 		offnum = ItemPointerGetOffsetNumber(htid);
7480 		priorXmax = InvalidTransactionId;	/* cannot check first XMIN */
7481 		for (;;)
7482 		{
7483 			ItemId		lp;
7484 			HeapTupleHeader htup;
7485 
7486 			/* Some sanity checks */
7487 			if (offnum < FirstOffsetNumber || offnum > maxoff)
7488 				break;
7489 
7490 			lp = PageGetItemId(page, offnum);
7491 			if (ItemIdIsRedirected(lp))
7492 			{
7493 				offnum = ItemIdGetRedirect(lp);
7494 				continue;
7495 			}
7496 
7497 			/*
7498 			 * We'll often encounter LP_DEAD line pointers (especially with an
7499 			 * entry marked knowndeletable by our caller up front).  No heap
7500 			 * tuple headers get examined for an htid that leads us to an
7501 			 * LP_DEAD item.  This is okay because the earlier pruning
7502 			 * operation that made the line pointer LP_DEAD in the first place
7503 			 * must have considered the original tuple header as part of
7504 			 * generating its own latestRemovedXid value.
7505 			 *
7506 			 * Relying on XLOG_HEAP2_PRUNE records like this is the same
7507 			 * strategy that index vacuuming uses in all cases.  Index VACUUM
7508 			 * WAL records don't even have a latestRemovedXid field of their
7509 			 * own for this reason.
7510 			 */
7511 			if (!ItemIdIsNormal(lp))
7512 				break;
7513 
7514 			htup = (HeapTupleHeader) PageGetItem(page, lp);
7515 
7516 			/*
7517 			 * Check the tuple XMIN against prior XMAX, if any
7518 			 */
7519 			if (TransactionIdIsValid(priorXmax) &&
7520 				!TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
7521 				break;
7522 
7523 			HeapTupleHeaderAdvanceLatestRemovedXid(htup, &latestRemovedXid);
7524 
7525 			/*
7526 			 * If the tuple is not HOT-updated, then we are at the end of this
7527 			 * HOT-chain.  No need to visit later tuples from the same update
7528 			 * chain (they get their own index entries) -- just move on to
7529 			 * next htid from index AM caller.
7530 			 */
7531 			if (!HeapTupleHeaderIsHotUpdated(htup))
7532 				break;
7533 
7534 			/* Advance to next HOT chain member */
7535 			Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno);
7536 			offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
7537 			priorXmax = HeapTupleHeaderGetUpdateXid(htup);
7538 		}
7539 
7540 		/* Enable further/final shrinking of deltids for caller */
7541 		finalndeltids = i + 1;
7542 	}
7543 
7544 	UnlockReleaseBuffer(buf);
7545 
7546 	/*
7547 	 * Shrink deltids array to exclude non-deletable entries at the end.  This
7548 	 * is not just a minor optimization.  Final deltids array size might be
7549 	 * zero for a bottom-up caller.  Index AM is explicitly allowed to rely on
7550 	 * ndeltids being zero in all cases with zero total deletable entries.
7551 	 */
7552 	Assert(finalndeltids > 0 || delstate->bottomup);
7553 	delstate->ndeltids = finalndeltids;
7554 
7555 	return latestRemovedXid;
7556 }
7557 
7558 /*
7559  * Specialized inlineable comparison function for index_delete_sort()
7560  */
7561 static inline int
index_delete_sort_cmp(TM_IndexDelete * deltid1,TM_IndexDelete * deltid2)7562 index_delete_sort_cmp(TM_IndexDelete *deltid1, TM_IndexDelete *deltid2)
7563 {
7564 	ItemPointer tid1 = &deltid1->tid;
7565 	ItemPointer tid2 = &deltid2->tid;
7566 
7567 	{
7568 		BlockNumber blk1 = ItemPointerGetBlockNumber(tid1);
7569 		BlockNumber blk2 = ItemPointerGetBlockNumber(tid2);
7570 
7571 		if (blk1 != blk2)
7572 			return (blk1 < blk2) ? -1 : 1;
7573 	}
7574 	{
7575 		OffsetNumber pos1 = ItemPointerGetOffsetNumber(tid1);
7576 		OffsetNumber pos2 = ItemPointerGetOffsetNumber(tid2);
7577 
7578 		if (pos1 != pos2)
7579 			return (pos1 < pos2) ? -1 : 1;
7580 	}
7581 
7582 	Assert(false);
7583 
7584 	return 0;
7585 }
7586 
7587 /*
7588  * Sort deltids array from delstate by TID.  This prepares it for further
7589  * processing by heap_index_delete_tuples().
7590  *
7591  * This operation becomes a noticeable consumer of CPU cycles with some
7592  * workloads, so we go to the trouble of specialization/micro optimization.
7593  * We use shellsort for this because it's easy to specialize, compiles to
7594  * relatively few instructions, and is adaptive to presorted inputs/subsets
7595  * (which are typical here).
7596  */
7597 static void
index_delete_sort(TM_IndexDeleteOp * delstate)7598 index_delete_sort(TM_IndexDeleteOp *delstate)
7599 {
7600 	TM_IndexDelete *deltids = delstate->deltids;
7601 	int			ndeltids = delstate->ndeltids;
7602 	int			low = 0;
7603 
7604 	/*
7605 	 * Shellsort gap sequence (taken from Sedgewick-Incerpi paper).
7606 	 *
7607 	 * This implementation is fast with array sizes up to ~4500.  This covers
7608 	 * all supported BLCKSZ values.
7609 	 */
7610 	const int	gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1};
7611 
7612 	/* Think carefully before changing anything here -- keep swaps cheap */
7613 	StaticAssertStmt(sizeof(TM_IndexDelete) <= 8,
7614 					 "element size exceeds 8 bytes");
7615 
7616 	for (int g = 0; g < lengthof(gaps); g++)
7617 	{
7618 		for (int hi = gaps[g], i = low + hi; i < ndeltids; i++)
7619 		{
7620 			TM_IndexDelete d = deltids[i];
7621 			int			j = i;
7622 
7623 			while (j >= hi && index_delete_sort_cmp(&deltids[j - hi], &d) >= 0)
7624 			{
7625 				deltids[j] = deltids[j - hi];
7626 				j -= hi;
7627 			}
7628 			deltids[j] = d;
7629 		}
7630 	}
7631 }
7632 
7633 /*
7634  * Returns how many blocks should be considered favorable/contiguous for a
7635  * bottom-up index deletion pass.  This is a number of heap blocks that starts
7636  * from and includes the first block in line.
7637  *
7638  * There is always at least one favorable block during bottom-up index
7639  * deletion.  In the worst case (i.e. with totally random heap blocks) the
7640  * first block in line (the only favorable block) can be thought of as a
7641  * degenerate array of contiguous blocks that consists of a single block.
7642  * heap_index_delete_tuples() will expect this.
7643  *
7644  * Caller passes blockgroups, a description of the final order that deltids
7645  * will be sorted in for heap_index_delete_tuples() bottom-up index deletion
7646  * processing.  Note that deltids need not actually be sorted just yet (caller
7647  * only passes deltids to us so that we can interpret blockgroups).
7648  *
7649  * You might guess that the existence of contiguous blocks cannot matter much,
7650  * since in general the main factor that determines which blocks we visit is
7651  * the number of promising TIDs, which is a fixed hint from the index AM.
7652  * We're not really targeting the general case, though -- the actual goal is
7653  * to adapt our behavior to a wide variety of naturally occurring conditions.
7654  * The effects of most of the heuristics we apply are only noticeable in the
7655  * aggregate, over time and across many _related_ bottom-up index deletion
7656  * passes.
7657  *
7658  * Deeming certain blocks favorable allows heapam to recognize and adapt to
7659  * workloads where heap blocks visited during bottom-up index deletion can be
7660  * accessed contiguously, in the sense that each newly visited block is the
7661  * neighbor of the block that bottom-up deletion just finished processing (or
7662  * close enough to it).  It will likely be cheaper to access more favorable
7663  * blocks sooner rather than later (e.g. in this pass, not across a series of
7664  * related bottom-up passes).  Either way it is probably only a matter of time
7665  * (or a matter of further correlated version churn) before all blocks that
7666  * appear together as a single large batch of favorable blocks get accessed by
7667  * _some_ bottom-up pass.  Large batches of favorable blocks tend to either
7668  * appear almost constantly or not even once (it all depends on per-index
7669  * workload characteristics).
7670  *
7671  * Note that the blockgroups sort order applies a power-of-two bucketing
7672  * scheme that creates opportunities for contiguous groups of blocks to get
7673  * batched together, at least with workloads that are naturally amenable to
7674  * being driven by heap block locality.  This doesn't just enhance the spatial
7675  * locality of bottom-up heap block processing in the obvious way.  It also
7676  * enables temporal locality of access, since sorting by heap block number
7677  * naturally tends to make the bottom-up processing order deterministic.
7678  *
7679  * Consider the following example to get a sense of how temporal locality
7680  * might matter: There is a heap relation with several indexes, each of which
7681  * is low to medium cardinality.  It is subject to constant non-HOT updates.
7682  * The updates are skewed (in one part of the primary key, perhaps).  None of
7683  * the indexes are logically modified by the UPDATE statements (if they were
7684  * then bottom-up index deletion would not be triggered in the first place).
7685  * Naturally, each new round of index tuples (for each heap tuple that gets a
7686  * heap_update() call) will have the same heap TID in each and every index.
7687  * Since these indexes are low cardinality and never get logically modified,
7688  * heapam processing during bottom-up deletion passes will access heap blocks
7689  * in approximately sequential order.  Temporal locality of access occurs due
7690  * to bottom-up deletion passes behaving very similarly across each of the
7691  * indexes at any given moment.  This keeps the number of buffer misses needed
7692  * to visit heap blocks to a minimum.
7693  */
7694 static int
bottomup_nblocksfavorable(IndexDeleteCounts * blockgroups,int nblockgroups,TM_IndexDelete * deltids)7695 bottomup_nblocksfavorable(IndexDeleteCounts *blockgroups, int nblockgroups,
7696 						  TM_IndexDelete *deltids)
7697 {
7698 	int64		lastblock = -1;
7699 	int			nblocksfavorable = 0;
7700 
7701 	Assert(nblockgroups >= 1);
7702 	Assert(nblockgroups <= BOTTOMUP_MAX_NBLOCKS);
7703 
7704 	/*
7705 	 * We tolerate heap blocks that will be accessed only slightly out of
7706 	 * physical order.  Small blips occur when a pair of almost-contiguous
7707 	 * blocks happen to fall into different buckets (perhaps due only to a
7708 	 * small difference in npromisingtids that the bucketing scheme didn't
7709 	 * quite manage to ignore).  We effectively ignore these blips by applying
7710 	 * a small tolerance.  The precise tolerance we use is a little arbitrary,
7711 	 * but it works well enough in practice.
7712 	 */
7713 	for (int b = 0; b < nblockgroups; b++)
7714 	{
7715 		IndexDeleteCounts *group = blockgroups + b;
7716 		TM_IndexDelete *firstdtid = deltids + group->ifirsttid;
7717 		BlockNumber block = ItemPointerGetBlockNumber(&firstdtid->tid);
7718 
7719 		if (lastblock != -1 &&
7720 			((int64) block < lastblock - BOTTOMUP_TOLERANCE_NBLOCKS ||
7721 			 (int64) block > lastblock + BOTTOMUP_TOLERANCE_NBLOCKS))
7722 			break;
7723 
7724 		nblocksfavorable++;
7725 		lastblock = block;
7726 	}
7727 
7728 	/* Always indicate that there is at least 1 favorable block */
7729 	Assert(nblocksfavorable >= 1);
7730 
7731 	return nblocksfavorable;
7732 }
7733 
7734 /*
7735  * qsort comparison function for bottomup_sort_and_shrink()
7736  */
7737 static int
bottomup_sort_and_shrink_cmp(const void * arg1,const void * arg2)7738 bottomup_sort_and_shrink_cmp(const void *arg1, const void *arg2)
7739 {
7740 	const IndexDeleteCounts *group1 = (const IndexDeleteCounts *) arg1;
7741 	const IndexDeleteCounts *group2 = (const IndexDeleteCounts *) arg2;
7742 
7743 	/*
7744 	 * Most significant field is npromisingtids (which we invert the order of
7745 	 * so as to sort in desc order).
7746 	 *
7747 	 * Caller should have already normalized npromisingtids fields into
7748 	 * power-of-two values (buckets).
7749 	 */
7750 	if (group1->npromisingtids > group2->npromisingtids)
7751 		return -1;
7752 	if (group1->npromisingtids < group2->npromisingtids)
7753 		return 1;
7754 
7755 	/*
7756 	 * Tiebreak: desc ntids sort order.
7757 	 *
7758 	 * We cannot expect power-of-two values for ntids fields.  We should
7759 	 * behave as if they were already rounded up for us instead.
7760 	 */
7761 	if (group1->ntids != group2->ntids)
7762 	{
7763 		uint32		ntids1 = pg_nextpower2_32((uint32) group1->ntids);
7764 		uint32		ntids2 = pg_nextpower2_32((uint32) group2->ntids);
7765 
7766 		if (ntids1 > ntids2)
7767 			return -1;
7768 		if (ntids1 < ntids2)
7769 			return 1;
7770 	}
7771 
7772 	/*
7773 	 * Tiebreak: asc offset-into-deltids-for-block (offset to first TID for
7774 	 * block in deltids array) order.
7775 	 *
7776 	 * This is equivalent to sorting in ascending heap block number order
7777 	 * (among otherwise equal subsets of the array).  This approach allows us
7778 	 * to avoid accessing the out-of-line TID.  (We rely on the assumption
7779 	 * that the deltids array was sorted in ascending heap TID order when
7780 	 * these offsets to the first TID from each heap block group were formed.)
7781 	 */
7782 	if (group1->ifirsttid > group2->ifirsttid)
7783 		return 1;
7784 	if (group1->ifirsttid < group2->ifirsttid)
7785 		return -1;
7786 
7787 	pg_unreachable();
7788 
7789 	return 0;
7790 }
7791 
7792 /*
7793  * heap_index_delete_tuples() helper function for bottom-up deletion callers.
7794  *
7795  * Sorts deltids array in the order needed for useful processing by bottom-up
7796  * deletion.  The array should already be sorted in TID order when we're
7797  * called.  The sort process groups heap TIDs from deltids into heap block
7798  * groupings.  Earlier/more-promising groups/blocks are usually those that are
7799  * known to have the most "promising" TIDs.
7800  *
7801  * Sets new size of deltids array (ndeltids) in state.  deltids will only have
7802  * TIDs from the BOTTOMUP_MAX_NBLOCKS most promising heap blocks when we
7803  * return.  This often means that deltids will be shrunk to a small fraction
7804  * of its original size (we eliminate many heap blocks from consideration for
7805  * caller up front).
7806  *
7807  * Returns the number of "favorable" blocks.  See bottomup_nblocksfavorable()
7808  * for a definition and full details.
7809  */
7810 static int
bottomup_sort_and_shrink(TM_IndexDeleteOp * delstate)7811 bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate)
7812 {
7813 	IndexDeleteCounts *blockgroups;
7814 	TM_IndexDelete *reordereddeltids;
7815 	BlockNumber curblock = InvalidBlockNumber;
7816 	int			nblockgroups = 0;
7817 	int			ncopied = 0;
7818 	int			nblocksfavorable = 0;
7819 
7820 	Assert(delstate->bottomup);
7821 	Assert(delstate->ndeltids > 0);
7822 
7823 	/* Calculate per-heap-block count of TIDs */
7824 	blockgroups = palloc(sizeof(IndexDeleteCounts) * delstate->ndeltids);
7825 	for (int i = 0; i < delstate->ndeltids; i++)
7826 	{
7827 		TM_IndexDelete *ideltid = &delstate->deltids[i];
7828 		TM_IndexStatus *istatus = delstate->status + ideltid->id;
7829 		ItemPointer htid = &ideltid->tid;
7830 		bool		promising = istatus->promising;
7831 
7832 		if (curblock != ItemPointerGetBlockNumber(htid))
7833 		{
7834 			/* New block group */
7835 			nblockgroups++;
7836 
7837 			Assert(curblock < ItemPointerGetBlockNumber(htid) ||
7838 				   !BlockNumberIsValid(curblock));
7839 
7840 			curblock = ItemPointerGetBlockNumber(htid);
7841 			blockgroups[nblockgroups - 1].ifirsttid = i;
7842 			blockgroups[nblockgroups - 1].ntids = 1;
7843 			blockgroups[nblockgroups - 1].npromisingtids = 0;
7844 		}
7845 		else
7846 		{
7847 			blockgroups[nblockgroups - 1].ntids++;
7848 		}
7849 
7850 		if (promising)
7851 			blockgroups[nblockgroups - 1].npromisingtids++;
7852 	}
7853 
7854 	/*
7855 	 * We're about ready to sort block groups to determine the optimal order
7856 	 * for visiting heap blocks.  But before we do, round the number of
7857 	 * promising tuples for each block group up to the next power-of-two,
7858 	 * unless it is very low (less than 4), in which case we round up to 4.
7859 	 * npromisingtids is far too noisy to trust when choosing between a pair
7860 	 * of block groups that both have very low values.
7861 	 *
7862 	 * This scheme divides heap blocks/block groups into buckets.  Each bucket
7863 	 * contains blocks that have _approximately_ the same number of promising
7864 	 * TIDs as each other.  The goal is to ignore relatively small differences
7865 	 * in the total number of promising entries, so that the whole process can
7866 	 * give a little weight to heapam factors (like heap block locality)
7867 	 * instead.  This isn't a trade-off, really -- we have nothing to lose. It
7868 	 * would be foolish to interpret small differences in npromisingtids
7869 	 * values as anything more than noise.
7870 	 *
7871 	 * We tiebreak on nhtids when sorting block group subsets that have the
7872 	 * same npromisingtids, but this has the same issues as npromisingtids,
7873 	 * and so nhtids is subject to the same power-of-two bucketing scheme. The
7874 	 * only reason that we don't fix nhtids in the same way here too is that
7875 	 * we'll need accurate nhtids values after the sort.  We handle nhtids
7876 	 * bucketization dynamically instead (in the sort comparator).
7877 	 *
7878 	 * See bottomup_nblocksfavorable() for a full explanation of when and how
7879 	 * heap locality/favorable blocks can significantly influence when and how
7880 	 * heap blocks are accessed.
7881 	 */
7882 	for (int b = 0; b < nblockgroups; b++)
7883 	{
7884 		IndexDeleteCounts *group = blockgroups + b;
7885 
7886 		/* Better off falling back on nhtids with low npromisingtids */
7887 		if (group->npromisingtids <= 4)
7888 			group->npromisingtids = 4;
7889 		else
7890 			group->npromisingtids =
7891 				pg_nextpower2_32((uint32) group->npromisingtids);
7892 	}
7893 
7894 	/* Sort groups and rearrange caller's deltids array */
7895 	qsort(blockgroups, nblockgroups, sizeof(IndexDeleteCounts),
7896 		  bottomup_sort_and_shrink_cmp);
7897 	reordereddeltids = palloc(delstate->ndeltids * sizeof(TM_IndexDelete));
7898 
7899 	nblockgroups = Min(BOTTOMUP_MAX_NBLOCKS, nblockgroups);
7900 	/* Determine number of favorable blocks at the start of final deltids */
7901 	nblocksfavorable = bottomup_nblocksfavorable(blockgroups, nblockgroups,
7902 												 delstate->deltids);
7903 
7904 	for (int b = 0; b < nblockgroups; b++)
7905 	{
7906 		IndexDeleteCounts *group = blockgroups + b;
7907 		TM_IndexDelete *firstdtid = delstate->deltids + group->ifirsttid;
7908 
7909 		memcpy(reordereddeltids + ncopied, firstdtid,
7910 			   sizeof(TM_IndexDelete) * group->ntids);
7911 		ncopied += group->ntids;
7912 	}
7913 
7914 	/* Copy final grouped and sorted TIDs back into start of caller's array */
7915 	memcpy(delstate->deltids, reordereddeltids,
7916 		   sizeof(TM_IndexDelete) * ncopied);
7917 	delstate->ndeltids = ncopied;
7918 
7919 	pfree(reordereddeltids);
7920 	pfree(blockgroups);
7921 
7922 	return nblocksfavorable;
7923 }
7924 
7925 /*
7926  * Perform XLogInsert for a heap-freeze operation.  Caller must have already
7927  * modified the buffer and marked it dirty.
7928  */
7929 XLogRecPtr
log_heap_freeze(Relation reln,Buffer buffer,TransactionId cutoff_xid,xl_heap_freeze_tuple * tuples,int ntuples)7930 log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid,
7931 				xl_heap_freeze_tuple *tuples, int ntuples)
7932 {
7933 	xl_heap_freeze_page xlrec;
7934 	XLogRecPtr	recptr;
7935 
7936 	/* Caller should not call me on a non-WAL-logged relation */
7937 	Assert(RelationNeedsWAL(reln));
7938 	/* nor when there are no tuples to freeze */
7939 	Assert(ntuples > 0);
7940 
7941 	xlrec.cutoff_xid = cutoff_xid;
7942 	xlrec.ntuples = ntuples;
7943 
7944 	XLogBeginInsert();
7945 	XLogRegisterData((char *) &xlrec, SizeOfHeapFreezePage);
7946 
7947 	/*
7948 	 * The freeze plan array is not actually in the buffer, but pretend that
7949 	 * it is.  When XLogInsert stores the whole buffer, the freeze plan need
7950 	 * not be stored too.
7951 	 */
7952 	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
7953 	XLogRegisterBufData(0, (char *) tuples,
7954 						ntuples * sizeof(xl_heap_freeze_tuple));
7955 
7956 	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE);
7957 
7958 	return recptr;
7959 }
7960 
7961 /*
7962  * Perform XLogInsert for a heap-visible operation.  'block' is the block
7963  * being marked all-visible, and vm_buffer is the buffer containing the
7964  * corresponding visibility map block.  Both should have already been modified
7965  * and dirtied.
7966  *
7967  * If checksums are enabled, we also generate a full-page image of
7968  * heap_buffer, if necessary.
7969  */
7970 XLogRecPtr
log_heap_visible(RelFileNode rnode,Buffer heap_buffer,Buffer vm_buffer,TransactionId cutoff_xid,uint8 vmflags)7971 log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer,
7972 				 TransactionId cutoff_xid, uint8 vmflags)
7973 {
7974 	xl_heap_visible xlrec;
7975 	XLogRecPtr	recptr;
7976 	uint8		flags;
7977 
7978 	Assert(BufferIsValid(heap_buffer));
7979 	Assert(BufferIsValid(vm_buffer));
7980 
7981 	xlrec.cutoff_xid = cutoff_xid;
7982 	xlrec.flags = vmflags;
7983 	XLogBeginInsert();
7984 	XLogRegisterData((char *) &xlrec, SizeOfHeapVisible);
7985 
7986 	XLogRegisterBuffer(0, vm_buffer, 0);
7987 
7988 	flags = REGBUF_STANDARD;
7989 	if (!XLogHintBitIsNeeded())
7990 		flags |= REGBUF_NO_IMAGE;
7991 	XLogRegisterBuffer(1, heap_buffer, flags);
7992 
7993 	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE);
7994 
7995 	return recptr;
7996 }
7997 
7998 /*
7999  * Perform XLogInsert for a heap-update operation.  Caller must already
8000  * have modified the buffer(s) and marked them dirty.
8001  */
8002 static XLogRecPtr
log_heap_update(Relation reln,Buffer oldbuf,Buffer newbuf,HeapTuple oldtup,HeapTuple newtup,HeapTuple old_key_tuple,bool all_visible_cleared,bool new_all_visible_cleared)8003 log_heap_update(Relation reln, Buffer oldbuf,
8004 				Buffer newbuf, HeapTuple oldtup, HeapTuple newtup,
8005 				HeapTuple old_key_tuple,
8006 				bool all_visible_cleared, bool new_all_visible_cleared)
8007 {
8008 	xl_heap_update xlrec;
8009 	xl_heap_header xlhdr;
8010 	xl_heap_header xlhdr_idx;
8011 	uint8		info;
8012 	uint16		prefix_suffix[2];
8013 	uint16		prefixlen = 0,
8014 				suffixlen = 0;
8015 	XLogRecPtr	recptr;
8016 	Page		page = BufferGetPage(newbuf);
8017 	bool		need_tuple_data = RelationIsLogicallyLogged(reln);
8018 	bool		init;
8019 	int			bufflags;
8020 
8021 	/* Caller should not call me on a non-WAL-logged relation */
8022 	Assert(RelationNeedsWAL(reln));
8023 
8024 	XLogBeginInsert();
8025 
8026 	if (HeapTupleIsHeapOnly(newtup))
8027 		info = XLOG_HEAP_HOT_UPDATE;
8028 	else
8029 		info = XLOG_HEAP_UPDATE;
8030 
8031 	/*
8032 	 * If the old and new tuple are on the same page, we only need to log the
8033 	 * parts of the new tuple that were changed.  That saves on the amount of
8034 	 * WAL we need to write.  Currently, we just count any unchanged bytes in
8035 	 * the beginning and end of the tuple.  That's quick to check, and
8036 	 * perfectly covers the common case that only one field is updated.
8037 	 *
8038 	 * We could do this even if the old and new tuple are on different pages,
8039 	 * but only if we don't make a full-page image of the old page, which is
8040 	 * difficult to know in advance.  Also, if the old tuple is corrupt for
8041 	 * some reason, it would allow the corruption to propagate the new page,
8042 	 * so it seems best to avoid.  Under the general assumption that most
8043 	 * updates tend to create the new tuple version on the same page, there
8044 	 * isn't much to be gained by doing this across pages anyway.
8045 	 *
8046 	 * Skip this if we're taking a full-page image of the new page, as we
8047 	 * don't include the new tuple in the WAL record in that case.  Also
8048 	 * disable if wal_level='logical', as logical decoding needs to be able to
8049 	 * read the new tuple in whole from the WAL record alone.
8050 	 */
8051 	if (oldbuf == newbuf && !need_tuple_data &&
8052 		!XLogCheckBufferNeedsBackup(newbuf))
8053 	{
8054 		char	   *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
8055 		char	   *newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
8056 		int			oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
8057 		int			newlen = newtup->t_len - newtup->t_data->t_hoff;
8058 
8059 		/* Check for common prefix between old and new tuple */
8060 		for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++)
8061 		{
8062 			if (newp[prefixlen] != oldp[prefixlen])
8063 				break;
8064 		}
8065 
8066 		/*
8067 		 * Storing the length of the prefix takes 2 bytes, so we need to save
8068 		 * at least 3 bytes or there's no point.
8069 		 */
8070 		if (prefixlen < 3)
8071 			prefixlen = 0;
8072 
8073 		/* Same for suffix */
8074 		for (suffixlen = 0; suffixlen < Min(oldlen, newlen) - prefixlen; suffixlen++)
8075 		{
8076 			if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1])
8077 				break;
8078 		}
8079 		if (suffixlen < 3)
8080 			suffixlen = 0;
8081 	}
8082 
8083 	/* Prepare main WAL data chain */
8084 	xlrec.flags = 0;
8085 	if (all_visible_cleared)
8086 		xlrec.flags |= XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED;
8087 	if (new_all_visible_cleared)
8088 		xlrec.flags |= XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED;
8089 	if (prefixlen > 0)
8090 		xlrec.flags |= XLH_UPDATE_PREFIX_FROM_OLD;
8091 	if (suffixlen > 0)
8092 		xlrec.flags |= XLH_UPDATE_SUFFIX_FROM_OLD;
8093 	if (need_tuple_data)
8094 	{
8095 		xlrec.flags |= XLH_UPDATE_CONTAINS_NEW_TUPLE;
8096 		if (old_key_tuple)
8097 		{
8098 			if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
8099 				xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_TUPLE;
8100 			else
8101 				xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_KEY;
8102 		}
8103 	}
8104 
8105 	/* If new tuple is the single and first tuple on page... */
8106 	if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber &&
8107 		PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
8108 	{
8109 		info |= XLOG_HEAP_INIT_PAGE;
8110 		init = true;
8111 	}
8112 	else
8113 		init = false;
8114 
8115 	/* Prepare WAL data for the old page */
8116 	xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self);
8117 	xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
8118 	xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
8119 											  oldtup->t_data->t_infomask2);
8120 
8121 	/* Prepare WAL data for the new page */
8122 	xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
8123 	xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
8124 
8125 	bufflags = REGBUF_STANDARD;
8126 	if (init)
8127 		bufflags |= REGBUF_WILL_INIT;
8128 	if (need_tuple_data)
8129 		bufflags |= REGBUF_KEEP_DATA;
8130 
8131 	XLogRegisterBuffer(0, newbuf, bufflags);
8132 	if (oldbuf != newbuf)
8133 		XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD);
8134 
8135 	XLogRegisterData((char *) &xlrec, SizeOfHeapUpdate);
8136 
8137 	/*
8138 	 * Prepare WAL data for the new tuple.
8139 	 */
8140 	if (prefixlen > 0 || suffixlen > 0)
8141 	{
8142 		if (prefixlen > 0 && suffixlen > 0)
8143 		{
8144 			prefix_suffix[0] = prefixlen;
8145 			prefix_suffix[1] = suffixlen;
8146 			XLogRegisterBufData(0, (char *) &prefix_suffix, sizeof(uint16) * 2);
8147 		}
8148 		else if (prefixlen > 0)
8149 		{
8150 			XLogRegisterBufData(0, (char *) &prefixlen, sizeof(uint16));
8151 		}
8152 		else
8153 		{
8154 			XLogRegisterBufData(0, (char *) &suffixlen, sizeof(uint16));
8155 		}
8156 	}
8157 
8158 	xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
8159 	xlhdr.t_infomask = newtup->t_data->t_infomask;
8160 	xlhdr.t_hoff = newtup->t_data->t_hoff;
8161 	Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len);
8162 
8163 	/*
8164 	 * PG73FORMAT: write bitmap [+ padding] [+ oid] + data
8165 	 *
8166 	 * The 'data' doesn't include the common prefix or suffix.
8167 	 */
8168 	XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
8169 	if (prefixlen == 0)
8170 	{
8171 		XLogRegisterBufData(0,
8172 							((char *) newtup->t_data) + SizeofHeapTupleHeader,
8173 							newtup->t_len - SizeofHeapTupleHeader - suffixlen);
8174 	}
8175 	else
8176 	{
8177 		/*
8178 		 * Have to write the null bitmap and data after the common prefix as
8179 		 * two separate rdata entries.
8180 		 */
8181 		/* bitmap [+ padding] [+ oid] */
8182 		if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0)
8183 		{
8184 			XLogRegisterBufData(0,
8185 								((char *) newtup->t_data) + SizeofHeapTupleHeader,
8186 								newtup->t_data->t_hoff - SizeofHeapTupleHeader);
8187 		}
8188 
8189 		/* data after common prefix */
8190 		XLogRegisterBufData(0,
8191 							((char *) newtup->t_data) + newtup->t_data->t_hoff + prefixlen,
8192 							newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen);
8193 	}
8194 
8195 	/* We need to log a tuple identity */
8196 	if (need_tuple_data && old_key_tuple)
8197 	{
8198 		/* don't really need this, but its more comfy to decode */
8199 		xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
8200 		xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
8201 		xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
8202 
8203 		XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader);
8204 
8205 		/* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
8206 		XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader,
8207 						 old_key_tuple->t_len - SizeofHeapTupleHeader);
8208 	}
8209 
8210 	/* filtering by origin on a row level is much more efficient */
8211 	XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
8212 
8213 	recptr = XLogInsert(RM_HEAP_ID, info);
8214 
8215 	return recptr;
8216 }
8217 
8218 /*
8219  * Perform XLogInsert of an XLOG_HEAP2_NEW_CID record
8220  *
8221  * This is only used in wal_level >= WAL_LEVEL_LOGICAL, and only for catalog
8222  * tuples.
8223  */
8224 static XLogRecPtr
log_heap_new_cid(Relation relation,HeapTuple tup)8225 log_heap_new_cid(Relation relation, HeapTuple tup)
8226 {
8227 	xl_heap_new_cid xlrec;
8228 
8229 	XLogRecPtr	recptr;
8230 	HeapTupleHeader hdr = tup->t_data;
8231 
8232 	Assert(ItemPointerIsValid(&tup->t_self));
8233 	Assert(tup->t_tableOid != InvalidOid);
8234 
8235 	xlrec.top_xid = GetTopTransactionId();
8236 	xlrec.target_node = relation->rd_node;
8237 	xlrec.target_tid = tup->t_self;
8238 
8239 	/*
8240 	 * If the tuple got inserted & deleted in the same TX we definitely have a
8241 	 * combo CID, set cmin and cmax.
8242 	 */
8243 	if (hdr->t_infomask & HEAP_COMBOCID)
8244 	{
8245 		Assert(!(hdr->t_infomask & HEAP_XMAX_INVALID));
8246 		Assert(!HeapTupleHeaderXminInvalid(hdr));
8247 		xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
8248 		xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
8249 		xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr);
8250 	}
8251 	/* No combo CID, so only cmin or cmax can be set by this TX */
8252 	else
8253 	{
8254 		/*
8255 		 * Tuple inserted.
8256 		 *
8257 		 * We need to check for LOCK ONLY because multixacts might be
8258 		 * transferred to the new tuple in case of FOR KEY SHARE updates in
8259 		 * which case there will be an xmax, although the tuple just got
8260 		 * inserted.
8261 		 */
8262 		if (hdr->t_infomask & HEAP_XMAX_INVALID ||
8263 			HEAP_XMAX_IS_LOCKED_ONLY(hdr->t_infomask))
8264 		{
8265 			xlrec.cmin = HeapTupleHeaderGetRawCommandId(hdr);
8266 			xlrec.cmax = InvalidCommandId;
8267 		}
8268 		/* Tuple from a different tx updated or deleted. */
8269 		else
8270 		{
8271 			xlrec.cmin = InvalidCommandId;
8272 			xlrec.cmax = HeapTupleHeaderGetRawCommandId(hdr);
8273 
8274 		}
8275 		xlrec.combocid = InvalidCommandId;
8276 	}
8277 
8278 	/*
8279 	 * Note that we don't need to register the buffer here, because this
8280 	 * operation does not modify the page. The insert/update/delete that
8281 	 * called us certainly did, but that's WAL-logged separately.
8282 	 */
8283 	XLogBeginInsert();
8284 	XLogRegisterData((char *) &xlrec, SizeOfHeapNewCid);
8285 
8286 	/* will be looked at irrespective of origin */
8287 
8288 	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_NEW_CID);
8289 
8290 	return recptr;
8291 }
8292 
8293 /*
8294  * Build a heap tuple representing the configured REPLICA IDENTITY to represent
8295  * the old tuple in a UPDATE or DELETE.
8296  *
8297  * Returns NULL if there's no need to log an identity or if there's no suitable
8298  * key defined.
8299  *
8300  * key_changed should be false if caller knows that no replica identity
8301  * columns changed value.  It's always true in the DELETE case.
8302  *
8303  * *copy is set to true if the returned tuple is a modified copy rather than
8304  * the same tuple that was passed in.
8305  */
8306 static HeapTuple
ExtractReplicaIdentity(Relation relation,HeapTuple tp,bool key_changed,bool * copy)8307 ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_changed,
8308 					   bool *copy)
8309 {
8310 	TupleDesc	desc = RelationGetDescr(relation);
8311 	char		replident = relation->rd_rel->relreplident;
8312 	Bitmapset  *idattrs;
8313 	HeapTuple	key_tuple;
8314 	bool		nulls[MaxHeapAttributeNumber];
8315 	Datum		values[MaxHeapAttributeNumber];
8316 
8317 	*copy = false;
8318 
8319 	if (!RelationIsLogicallyLogged(relation))
8320 		return NULL;
8321 
8322 	if (replident == REPLICA_IDENTITY_NOTHING)
8323 		return NULL;
8324 
8325 	if (replident == REPLICA_IDENTITY_FULL)
8326 	{
8327 		/*
8328 		 * When logging the entire old tuple, it very well could contain
8329 		 * toasted columns. If so, force them to be inlined.
8330 		 */
8331 		if (HeapTupleHasExternal(tp))
8332 		{
8333 			*copy = true;
8334 			tp = toast_flatten_tuple(tp, desc);
8335 		}
8336 		return tp;
8337 	}
8338 
8339 	/* if the key hasn't changed and we're only logging the key, we're done */
8340 	if (!key_changed)
8341 		return NULL;
8342 
8343 	/* find out the replica identity columns */
8344 	idattrs = RelationGetIndexAttrBitmap(relation,
8345 										 INDEX_ATTR_BITMAP_IDENTITY_KEY);
8346 
8347 	/*
8348 	 * If there's no defined replica identity columns, treat as !key_changed.
8349 	 * (This case should not be reachable from heap_update, since that should
8350 	 * calculate key_changed accurately.  But heap_delete just passes constant
8351 	 * true for key_changed, so we can hit this case in deletes.)
8352 	 */
8353 	if (bms_is_empty(idattrs))
8354 		return NULL;
8355 
8356 	/*
8357 	 * Construct a new tuple containing only the replica identity columns,
8358 	 * with nulls elsewhere.  While we're at it, assert that the replica
8359 	 * identity columns aren't null.
8360 	 */
8361 	heap_deform_tuple(tp, desc, values, nulls);
8362 
8363 	for (int i = 0; i < desc->natts; i++)
8364 	{
8365 		if (bms_is_member(i + 1 - FirstLowInvalidHeapAttributeNumber,
8366 						  idattrs))
8367 			Assert(!nulls[i]);
8368 		else
8369 			nulls[i] = true;
8370 	}
8371 
8372 	key_tuple = heap_form_tuple(desc, values, nulls);
8373 	*copy = true;
8374 
8375 	bms_free(idattrs);
8376 
8377 	/*
8378 	 * If the tuple, which by here only contains indexed columns, still has
8379 	 * toasted columns, force them to be inlined. This is somewhat unlikely
8380 	 * since there's limits on the size of indexed columns, so we don't
8381 	 * duplicate toast_flatten_tuple()s functionality in the above loop over
8382 	 * the indexed columns, even if it would be more efficient.
8383 	 */
8384 	if (HeapTupleHasExternal(key_tuple))
8385 	{
8386 		HeapTuple	oldtup = key_tuple;
8387 
8388 		key_tuple = toast_flatten_tuple(oldtup, desc);
8389 		heap_freetuple(oldtup);
8390 	}
8391 
8392 	return key_tuple;
8393 }
8394 
8395 /*
8396  * Handles XLOG_HEAP2_PRUNE record type.
8397  *
8398  * Acquires a super-exclusive lock.
8399  */
8400 static void
heap_xlog_prune(XLogReaderState * record)8401 heap_xlog_prune(XLogReaderState *record)
8402 {
8403 	XLogRecPtr	lsn = record->EndRecPtr;
8404 	xl_heap_prune *xlrec = (xl_heap_prune *) XLogRecGetData(record);
8405 	Buffer		buffer;
8406 	RelFileNode rnode;
8407 	BlockNumber blkno;
8408 	XLogRedoAction action;
8409 
8410 	XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
8411 
8412 	/*
8413 	 * We're about to remove tuples. In Hot Standby mode, ensure that there's
8414 	 * no queries running for which the removed tuples are still visible.
8415 	 */
8416 	if (InHotStandby)
8417 		ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, rnode);
8418 
8419 	/*
8420 	 * If we have a full-page image, restore it (using a cleanup lock) and
8421 	 * we're done.
8422 	 */
8423 	action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true,
8424 										   &buffer);
8425 	if (action == BLK_NEEDS_REDO)
8426 	{
8427 		Page		page = (Page) BufferGetPage(buffer);
8428 		OffsetNumber *end;
8429 		OffsetNumber *redirected;
8430 		OffsetNumber *nowdead;
8431 		OffsetNumber *nowunused;
8432 		int			nredirected;
8433 		int			ndead;
8434 		int			nunused;
8435 		Size		datalen;
8436 
8437 		redirected = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen);
8438 
8439 		nredirected = xlrec->nredirected;
8440 		ndead = xlrec->ndead;
8441 		end = (OffsetNumber *) ((char *) redirected + datalen);
8442 		nowdead = redirected + (nredirected * 2);
8443 		nowunused = nowdead + ndead;
8444 		nunused = (end - nowunused);
8445 		Assert(nunused >= 0);
8446 
8447 		/* Update all line pointers per the record, and repair fragmentation */
8448 		heap_page_prune_execute(buffer,
8449 								redirected, nredirected,
8450 								nowdead, ndead,
8451 								nowunused, nunused);
8452 
8453 		/*
8454 		 * Note: we don't worry about updating the page's prunability hints.
8455 		 * At worst this will cause an extra prune cycle to occur soon.
8456 		 */
8457 
8458 		PageSetLSN(page, lsn);
8459 		MarkBufferDirty(buffer);
8460 	}
8461 
8462 	if (BufferIsValid(buffer))
8463 	{
8464 		Size		freespace = PageGetHeapFreeSpace(BufferGetPage(buffer));
8465 
8466 		UnlockReleaseBuffer(buffer);
8467 
8468 		/*
8469 		 * After pruning records from a page, it's useful to update the FSM
8470 		 * about it, as it may cause the page become target for insertions
8471 		 * later even if vacuum decides not to visit it (which is possible if
8472 		 * gets marked all-visible.)
8473 		 *
8474 		 * Do this regardless of a full-page image being applied, since the
8475 		 * FSM data is not in the page anyway.
8476 		 */
8477 		XLogRecordPageWithFreeSpace(rnode, blkno, freespace);
8478 	}
8479 }
8480 
8481 /*
8482  * Handles XLOG_HEAP2_VACUUM record type.
8483  *
8484  * Acquires an exclusive lock only.
8485  */
8486 static void
heap_xlog_vacuum(XLogReaderState * record)8487 heap_xlog_vacuum(XLogReaderState *record)
8488 {
8489 	XLogRecPtr	lsn = record->EndRecPtr;
8490 	xl_heap_vacuum *xlrec = (xl_heap_vacuum *) XLogRecGetData(record);
8491 	Buffer		buffer;
8492 	BlockNumber blkno;
8493 	XLogRedoAction action;
8494 
8495 	/*
8496 	 * If we have a full-page image, restore it	(without using a cleanup lock)
8497 	 * and we're done.
8498 	 */
8499 	action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, false,
8500 										   &buffer);
8501 	if (action == BLK_NEEDS_REDO)
8502 	{
8503 		Page		page = (Page) BufferGetPage(buffer);
8504 		OffsetNumber *nowunused;
8505 		Size		datalen;
8506 		OffsetNumber *offnum;
8507 
8508 		nowunused = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen);
8509 
8510 		/* Shouldn't be a record unless there's something to do */
8511 		Assert(xlrec->nunused > 0);
8512 
8513 		/* Update all now-unused line pointers */
8514 		offnum = nowunused;
8515 		for (int i = 0; i < xlrec->nunused; i++)
8516 		{
8517 			OffsetNumber off = *offnum++;
8518 			ItemId		lp = PageGetItemId(page, off);
8519 
8520 			Assert(ItemIdIsDead(lp) && !ItemIdHasStorage(lp));
8521 			ItemIdSetUnused(lp);
8522 		}
8523 
8524 		/* Attempt to truncate line pointer array now */
8525 		PageTruncateLinePointerArray(page);
8526 
8527 		PageSetLSN(page, lsn);
8528 		MarkBufferDirty(buffer);
8529 	}
8530 
8531 	if (BufferIsValid(buffer))
8532 	{
8533 		Size		freespace = PageGetHeapFreeSpace(BufferGetPage(buffer));
8534 		RelFileNode rnode;
8535 
8536 		XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
8537 
8538 		UnlockReleaseBuffer(buffer);
8539 
8540 		/*
8541 		 * After vacuuming LP_DEAD items from a page, it's useful to update
8542 		 * the FSM about it, as it may cause the page become target for
8543 		 * insertions later even if vacuum decides not to visit it (which is
8544 		 * possible if gets marked all-visible.)
8545 		 *
8546 		 * Do this regardless of a full-page image being applied, since the
8547 		 * FSM data is not in the page anyway.
8548 		 */
8549 		XLogRecordPageWithFreeSpace(rnode, blkno, freespace);
8550 	}
8551 }
8552 
8553 /*
8554  * Replay XLOG_HEAP2_VISIBLE record.
8555  *
8556  * The critical integrity requirement here is that we must never end up with
8557  * a situation where the visibility map bit is set, and the page-level
8558  * PD_ALL_VISIBLE bit is clear.  If that were to occur, then a subsequent
8559  * page modification would fail to clear the visibility map bit.
8560  */
8561 static void
heap_xlog_visible(XLogReaderState * record)8562 heap_xlog_visible(XLogReaderState *record)
8563 {
8564 	XLogRecPtr	lsn = record->EndRecPtr;
8565 	xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record);
8566 	Buffer		vmbuffer = InvalidBuffer;
8567 	Buffer		buffer;
8568 	Page		page;
8569 	RelFileNode rnode;
8570 	BlockNumber blkno;
8571 	XLogRedoAction action;
8572 
8573 	XLogRecGetBlockTag(record, 1, &rnode, NULL, &blkno);
8574 
8575 	/*
8576 	 * If there are any Hot Standby transactions running that have an xmin
8577 	 * horizon old enough that this page isn't all-visible for them, they
8578 	 * might incorrectly decide that an index-only scan can skip a heap fetch.
8579 	 *
8580 	 * NB: It might be better to throw some kind of "soft" conflict here that
8581 	 * forces any index-only scan that is in flight to perform heap fetches,
8582 	 * rather than killing the transaction outright.
8583 	 */
8584 	if (InHotStandby)
8585 		ResolveRecoveryConflictWithSnapshot(xlrec->cutoff_xid, rnode);
8586 
8587 	/*
8588 	 * Read the heap page, if it still exists. If the heap file has dropped or
8589 	 * truncated later in recovery, we don't need to update the page, but we'd
8590 	 * better still update the visibility map.
8591 	 */
8592 	action = XLogReadBufferForRedo(record, 1, &buffer);
8593 	if (action == BLK_NEEDS_REDO)
8594 	{
8595 		/*
8596 		 * We don't bump the LSN of the heap page when setting the visibility
8597 		 * map bit (unless checksums or wal_hint_bits is enabled, in which
8598 		 * case we must), because that would generate an unworkable volume of
8599 		 * full-page writes.  This exposes us to torn page hazards, but since
8600 		 * we're not inspecting the existing page contents in any way, we
8601 		 * don't care.
8602 		 *
8603 		 * However, all operations that clear the visibility map bit *do* bump
8604 		 * the LSN, and those operations will only be replayed if the XLOG LSN
8605 		 * follows the page LSN.  Thus, if the page LSN has advanced past our
8606 		 * XLOG record's LSN, we mustn't mark the page all-visible, because
8607 		 * the subsequent update won't be replayed to clear the flag.
8608 		 */
8609 		page = BufferGetPage(buffer);
8610 
8611 		PageSetAllVisible(page);
8612 
8613 		MarkBufferDirty(buffer);
8614 	}
8615 	else if (action == BLK_RESTORED)
8616 	{
8617 		/*
8618 		 * If heap block was backed up, we already restored it and there's
8619 		 * nothing more to do. (This can only happen with checksums or
8620 		 * wal_log_hints enabled.)
8621 		 */
8622 	}
8623 
8624 	if (BufferIsValid(buffer))
8625 	{
8626 		Size		space = PageGetFreeSpace(BufferGetPage(buffer));
8627 
8628 		UnlockReleaseBuffer(buffer);
8629 
8630 		/*
8631 		 * Since FSM is not WAL-logged and only updated heuristically, it
8632 		 * easily becomes stale in standbys.  If the standby is later promoted
8633 		 * and runs VACUUM, it will skip updating individual free space
8634 		 * figures for pages that became all-visible (or all-frozen, depending
8635 		 * on the vacuum mode,) which is troublesome when FreeSpaceMapVacuum
8636 		 * propagates too optimistic free space values to upper FSM layers;
8637 		 * later inserters try to use such pages only to find out that they
8638 		 * are unusable.  This can cause long stalls when there are many such
8639 		 * pages.
8640 		 *
8641 		 * Forestall those problems by updating FSM's idea about a page that
8642 		 * is becoming all-visible or all-frozen.
8643 		 *
8644 		 * Do this regardless of a full-page image being applied, since the
8645 		 * FSM data is not in the page anyway.
8646 		 */
8647 		if (xlrec->flags & VISIBILITYMAP_VALID_BITS)
8648 			XLogRecordPageWithFreeSpace(rnode, blkno, space);
8649 	}
8650 
8651 	/*
8652 	 * Even if we skipped the heap page update due to the LSN interlock, it's
8653 	 * still safe to update the visibility map.  Any WAL record that clears
8654 	 * the visibility map bit does so before checking the page LSN, so any
8655 	 * bits that need to be cleared will still be cleared.
8656 	 */
8657 	if (XLogReadBufferForRedoExtended(record, 0, RBM_ZERO_ON_ERROR, false,
8658 									  &vmbuffer) == BLK_NEEDS_REDO)
8659 	{
8660 		Page		vmpage = BufferGetPage(vmbuffer);
8661 		Relation	reln;
8662 
8663 		/* initialize the page if it was read as zeros */
8664 		if (PageIsNew(vmpage))
8665 			PageInit(vmpage, BLCKSZ, 0);
8666 
8667 		/*
8668 		 * XLogReadBufferForRedoExtended locked the buffer. But
8669 		 * visibilitymap_set will handle locking itself.
8670 		 */
8671 		LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);
8672 
8673 		reln = CreateFakeRelcacheEntry(rnode);
8674 		visibilitymap_pin(reln, blkno, &vmbuffer);
8675 
8676 		/*
8677 		 * Don't set the bit if replay has already passed this point.
8678 		 *
8679 		 * It might be safe to do this unconditionally; if replay has passed
8680 		 * this point, we'll replay at least as far this time as we did
8681 		 * before, and if this bit needs to be cleared, the record responsible
8682 		 * for doing so should be again replayed, and clear it.  For right
8683 		 * now, out of an abundance of conservatism, we use the same test here
8684 		 * we did for the heap page.  If this results in a dropped bit, no
8685 		 * real harm is done; and the next VACUUM will fix it.
8686 		 */
8687 		if (lsn > PageGetLSN(vmpage))
8688 			visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer,
8689 							  xlrec->cutoff_xid, xlrec->flags);
8690 
8691 		ReleaseBuffer(vmbuffer);
8692 		FreeFakeRelcacheEntry(reln);
8693 	}
8694 	else if (BufferIsValid(vmbuffer))
8695 		UnlockReleaseBuffer(vmbuffer);
8696 }
8697 
8698 /*
8699  * Replay XLOG_HEAP2_FREEZE_PAGE records
8700  */
8701 static void
heap_xlog_freeze_page(XLogReaderState * record)8702 heap_xlog_freeze_page(XLogReaderState *record)
8703 {
8704 	XLogRecPtr	lsn = record->EndRecPtr;
8705 	xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) XLogRecGetData(record);
8706 	TransactionId cutoff_xid = xlrec->cutoff_xid;
8707 	Buffer		buffer;
8708 	int			ntup;
8709 
8710 	/*
8711 	 * In Hot Standby mode, ensure that there's no queries running which still
8712 	 * consider the frozen xids as running.
8713 	 */
8714 	if (InHotStandby)
8715 	{
8716 		RelFileNode rnode;
8717 		TransactionId latestRemovedXid = cutoff_xid;
8718 
8719 		TransactionIdRetreat(latestRemovedXid);
8720 
8721 		XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
8722 		ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
8723 	}
8724 
8725 	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8726 	{
8727 		Page		page = BufferGetPage(buffer);
8728 		xl_heap_freeze_tuple *tuples;
8729 
8730 		tuples = (xl_heap_freeze_tuple *) XLogRecGetBlockData(record, 0, NULL);
8731 
8732 		/* now execute freeze plan for each frozen tuple */
8733 		for (ntup = 0; ntup < xlrec->ntuples; ntup++)
8734 		{
8735 			xl_heap_freeze_tuple *xlrec_tp;
8736 			ItemId		lp;
8737 			HeapTupleHeader tuple;
8738 
8739 			xlrec_tp = &tuples[ntup];
8740 			lp = PageGetItemId(page, xlrec_tp->offset); /* offsets are one-based */
8741 			tuple = (HeapTupleHeader) PageGetItem(page, lp);
8742 
8743 			heap_execute_freeze_tuple(tuple, xlrec_tp);
8744 		}
8745 
8746 		PageSetLSN(page, lsn);
8747 		MarkBufferDirty(buffer);
8748 	}
8749 	if (BufferIsValid(buffer))
8750 		UnlockReleaseBuffer(buffer);
8751 }
8752 
8753 /*
8754  * Given an "infobits" field from an XLog record, set the correct bits in the
8755  * given infomask and infomask2 for the tuple touched by the record.
8756  *
8757  * (This is the reverse of compute_infobits).
8758  */
8759 static void
fix_infomask_from_infobits(uint8 infobits,uint16 * infomask,uint16 * infomask2)8760 fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
8761 {
8762 	*infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY |
8763 				   HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK);
8764 	*infomask2 &= ~HEAP_KEYS_UPDATED;
8765 
8766 	if (infobits & XLHL_XMAX_IS_MULTI)
8767 		*infomask |= HEAP_XMAX_IS_MULTI;
8768 	if (infobits & XLHL_XMAX_LOCK_ONLY)
8769 		*infomask |= HEAP_XMAX_LOCK_ONLY;
8770 	if (infobits & XLHL_XMAX_EXCL_LOCK)
8771 		*infomask |= HEAP_XMAX_EXCL_LOCK;
8772 	/* note HEAP_XMAX_SHR_LOCK isn't considered here */
8773 	if (infobits & XLHL_XMAX_KEYSHR_LOCK)
8774 		*infomask |= HEAP_XMAX_KEYSHR_LOCK;
8775 
8776 	if (infobits & XLHL_KEYS_UPDATED)
8777 		*infomask2 |= HEAP_KEYS_UPDATED;
8778 }
8779 
8780 static void
heap_xlog_delete(XLogReaderState * record)8781 heap_xlog_delete(XLogReaderState *record)
8782 {
8783 	XLogRecPtr	lsn = record->EndRecPtr;
8784 	xl_heap_delete *xlrec = (xl_heap_delete *) XLogRecGetData(record);
8785 	Buffer		buffer;
8786 	Page		page;
8787 	ItemId		lp = NULL;
8788 	HeapTupleHeader htup;
8789 	BlockNumber blkno;
8790 	RelFileNode target_node;
8791 	ItemPointerData target_tid;
8792 
8793 	XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno);
8794 	ItemPointerSetBlockNumber(&target_tid, blkno);
8795 	ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
8796 
8797 	/*
8798 	 * The visibility map may need to be fixed even if the heap page is
8799 	 * already up-to-date.
8800 	 */
8801 	if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
8802 	{
8803 		Relation	reln = CreateFakeRelcacheEntry(target_node);
8804 		Buffer		vmbuffer = InvalidBuffer;
8805 
8806 		visibilitymap_pin(reln, blkno, &vmbuffer);
8807 		visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8808 		ReleaseBuffer(vmbuffer);
8809 		FreeFakeRelcacheEntry(reln);
8810 	}
8811 
8812 	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8813 	{
8814 		page = BufferGetPage(buffer);
8815 
8816 		if (PageGetMaxOffsetNumber(page) >= xlrec->offnum)
8817 			lp = PageGetItemId(page, xlrec->offnum);
8818 
8819 		if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp))
8820 			elog(PANIC, "invalid lp");
8821 
8822 		htup = (HeapTupleHeader) PageGetItem(page, lp);
8823 
8824 		htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8825 		htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8826 		HeapTupleHeaderClearHotUpdated(htup);
8827 		fix_infomask_from_infobits(xlrec->infobits_set,
8828 								   &htup->t_infomask, &htup->t_infomask2);
8829 		if (!(xlrec->flags & XLH_DELETE_IS_SUPER))
8830 			HeapTupleHeaderSetXmax(htup, xlrec->xmax);
8831 		else
8832 			HeapTupleHeaderSetXmin(htup, InvalidTransactionId);
8833 		HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
8834 
8835 		/* Mark the page as a candidate for pruning */
8836 		PageSetPrunable(page, XLogRecGetXid(record));
8837 
8838 		if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
8839 			PageClearAllVisible(page);
8840 
8841 		/* Make sure t_ctid is set correctly */
8842 		if (xlrec->flags & XLH_DELETE_IS_PARTITION_MOVE)
8843 			HeapTupleHeaderSetMovedPartitions(htup);
8844 		else
8845 			htup->t_ctid = target_tid;
8846 		PageSetLSN(page, lsn);
8847 		MarkBufferDirty(buffer);
8848 	}
8849 	if (BufferIsValid(buffer))
8850 		UnlockReleaseBuffer(buffer);
8851 }
8852 
8853 static void
heap_xlog_insert(XLogReaderState * record)8854 heap_xlog_insert(XLogReaderState *record)
8855 {
8856 	XLogRecPtr	lsn = record->EndRecPtr;
8857 	xl_heap_insert *xlrec = (xl_heap_insert *) XLogRecGetData(record);
8858 	Buffer		buffer;
8859 	Page		page;
8860 	union
8861 	{
8862 		HeapTupleHeaderData hdr;
8863 		char		data[MaxHeapTupleSize];
8864 	}			tbuf;
8865 	HeapTupleHeader htup;
8866 	xl_heap_header xlhdr;
8867 	uint32		newlen;
8868 	Size		freespace = 0;
8869 	RelFileNode target_node;
8870 	BlockNumber blkno;
8871 	ItemPointerData target_tid;
8872 	XLogRedoAction action;
8873 
8874 	XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno);
8875 	ItemPointerSetBlockNumber(&target_tid, blkno);
8876 	ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
8877 
8878 	/*
8879 	 * The visibility map may need to be fixed even if the heap page is
8880 	 * already up-to-date.
8881 	 */
8882 	if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8883 	{
8884 		Relation	reln = CreateFakeRelcacheEntry(target_node);
8885 		Buffer		vmbuffer = InvalidBuffer;
8886 
8887 		visibilitymap_pin(reln, blkno, &vmbuffer);
8888 		visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8889 		ReleaseBuffer(vmbuffer);
8890 		FreeFakeRelcacheEntry(reln);
8891 	}
8892 
8893 	/*
8894 	 * If we inserted the first and only tuple on the page, re-initialize the
8895 	 * page from scratch.
8896 	 */
8897 	if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
8898 	{
8899 		buffer = XLogInitBufferForRedo(record, 0);
8900 		page = BufferGetPage(buffer);
8901 		PageInit(page, BufferGetPageSize(buffer), 0);
8902 		action = BLK_NEEDS_REDO;
8903 	}
8904 	else
8905 		action = XLogReadBufferForRedo(record, 0, &buffer);
8906 	if (action == BLK_NEEDS_REDO)
8907 	{
8908 		Size		datalen;
8909 		char	   *data;
8910 
8911 		page = BufferGetPage(buffer);
8912 
8913 		if (PageGetMaxOffsetNumber(page) + 1 < xlrec->offnum)
8914 			elog(PANIC, "invalid max offset number");
8915 
8916 		data = XLogRecGetBlockData(record, 0, &datalen);
8917 
8918 		newlen = datalen - SizeOfHeapHeader;
8919 		Assert(datalen > SizeOfHeapHeader && newlen <= MaxHeapTupleSize);
8920 		memcpy((char *) &xlhdr, data, SizeOfHeapHeader);
8921 		data += SizeOfHeapHeader;
8922 
8923 		htup = &tbuf.hdr;
8924 		MemSet((char *) htup, 0, SizeofHeapTupleHeader);
8925 		/* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
8926 		memcpy((char *) htup + SizeofHeapTupleHeader,
8927 			   data,
8928 			   newlen);
8929 		newlen += SizeofHeapTupleHeader;
8930 		htup->t_infomask2 = xlhdr.t_infomask2;
8931 		htup->t_infomask = xlhdr.t_infomask;
8932 		htup->t_hoff = xlhdr.t_hoff;
8933 		HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
8934 		HeapTupleHeaderSetCmin(htup, FirstCommandId);
8935 		htup->t_ctid = target_tid;
8936 
8937 		if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum,
8938 						true, true) == InvalidOffsetNumber)
8939 			elog(PANIC, "failed to add tuple");
8940 
8941 		freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
8942 
8943 		PageSetLSN(page, lsn);
8944 
8945 		if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8946 			PageClearAllVisible(page);
8947 
8948 		/* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */
8949 		if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)
8950 			PageSetAllVisible(page);
8951 
8952 		MarkBufferDirty(buffer);
8953 	}
8954 	if (BufferIsValid(buffer))
8955 		UnlockReleaseBuffer(buffer);
8956 
8957 	/*
8958 	 * If the page is running low on free space, update the FSM as well.
8959 	 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
8960 	 * better than that without knowing the fill-factor for the table.
8961 	 *
8962 	 * XXX: Don't do this if the page was restored from full page image. We
8963 	 * don't bother to update the FSM in that case, it doesn't need to be
8964 	 * totally accurate anyway.
8965 	 */
8966 	if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
8967 		XLogRecordPageWithFreeSpace(target_node, blkno, freespace);
8968 }
8969 
8970 /*
8971  * Handles MULTI_INSERT record type.
8972  */
8973 static void
heap_xlog_multi_insert(XLogReaderState * record)8974 heap_xlog_multi_insert(XLogReaderState *record)
8975 {
8976 	XLogRecPtr	lsn = record->EndRecPtr;
8977 	xl_heap_multi_insert *xlrec;
8978 	RelFileNode rnode;
8979 	BlockNumber blkno;
8980 	Buffer		buffer;
8981 	Page		page;
8982 	union
8983 	{
8984 		HeapTupleHeaderData hdr;
8985 		char		data[MaxHeapTupleSize];
8986 	}			tbuf;
8987 	HeapTupleHeader htup;
8988 	uint32		newlen;
8989 	Size		freespace = 0;
8990 	int			i;
8991 	bool		isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0;
8992 	XLogRedoAction action;
8993 
8994 	/*
8995 	 * Insertion doesn't overwrite MVCC data, so no conflict processing is
8996 	 * required.
8997 	 */
8998 	xlrec = (xl_heap_multi_insert *) XLogRecGetData(record);
8999 
9000 	XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
9001 
9002 	/* check that the mutually exclusive flags are not both set */
9003 	Assert(!((xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) &&
9004 			 (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)));
9005 
9006 	/*
9007 	 * The visibility map may need to be fixed even if the heap page is
9008 	 * already up-to-date.
9009 	 */
9010 	if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
9011 	{
9012 		Relation	reln = CreateFakeRelcacheEntry(rnode);
9013 		Buffer		vmbuffer = InvalidBuffer;
9014 
9015 		visibilitymap_pin(reln, blkno, &vmbuffer);
9016 		visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
9017 		ReleaseBuffer(vmbuffer);
9018 		FreeFakeRelcacheEntry(reln);
9019 	}
9020 
9021 	if (isinit)
9022 	{
9023 		buffer = XLogInitBufferForRedo(record, 0);
9024 		page = BufferGetPage(buffer);
9025 		PageInit(page, BufferGetPageSize(buffer), 0);
9026 		action = BLK_NEEDS_REDO;
9027 	}
9028 	else
9029 		action = XLogReadBufferForRedo(record, 0, &buffer);
9030 	if (action == BLK_NEEDS_REDO)
9031 	{
9032 		char	   *tupdata;
9033 		char	   *endptr;
9034 		Size		len;
9035 
9036 		/* Tuples are stored as block data */
9037 		tupdata = XLogRecGetBlockData(record, 0, &len);
9038 		endptr = tupdata + len;
9039 
9040 		page = (Page) BufferGetPage(buffer);
9041 
9042 		for (i = 0; i < xlrec->ntuples; i++)
9043 		{
9044 			OffsetNumber offnum;
9045 			xl_multi_insert_tuple *xlhdr;
9046 
9047 			/*
9048 			 * If we're reinitializing the page, the tuples are stored in
9049 			 * order from FirstOffsetNumber. Otherwise there's an array of
9050 			 * offsets in the WAL record, and the tuples come after that.
9051 			 */
9052 			if (isinit)
9053 				offnum = FirstOffsetNumber + i;
9054 			else
9055 				offnum = xlrec->offsets[i];
9056 			if (PageGetMaxOffsetNumber(page) + 1 < offnum)
9057 				elog(PANIC, "invalid max offset number");
9058 
9059 			xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(tupdata);
9060 			tupdata = ((char *) xlhdr) + SizeOfMultiInsertTuple;
9061 
9062 			newlen = xlhdr->datalen;
9063 			Assert(newlen <= MaxHeapTupleSize);
9064 			htup = &tbuf.hdr;
9065 			MemSet((char *) htup, 0, SizeofHeapTupleHeader);
9066 			/* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
9067 			memcpy((char *) htup + SizeofHeapTupleHeader,
9068 				   (char *) tupdata,
9069 				   newlen);
9070 			tupdata += newlen;
9071 
9072 			newlen += SizeofHeapTupleHeader;
9073 			htup->t_infomask2 = xlhdr->t_infomask2;
9074 			htup->t_infomask = xlhdr->t_infomask;
9075 			htup->t_hoff = xlhdr->t_hoff;
9076 			HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
9077 			HeapTupleHeaderSetCmin(htup, FirstCommandId);
9078 			ItemPointerSetBlockNumber(&htup->t_ctid, blkno);
9079 			ItemPointerSetOffsetNumber(&htup->t_ctid, offnum);
9080 
9081 			offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
9082 			if (offnum == InvalidOffsetNumber)
9083 				elog(PANIC, "failed to add tuple");
9084 		}
9085 		if (tupdata != endptr)
9086 			elog(PANIC, "total tuple length mismatch");
9087 
9088 		freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
9089 
9090 		PageSetLSN(page, lsn);
9091 
9092 		if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
9093 			PageClearAllVisible(page);
9094 
9095 		/* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */
9096 		if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)
9097 			PageSetAllVisible(page);
9098 
9099 		MarkBufferDirty(buffer);
9100 	}
9101 	if (BufferIsValid(buffer))
9102 		UnlockReleaseBuffer(buffer);
9103 
9104 	/*
9105 	 * If the page is running low on free space, update the FSM as well.
9106 	 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
9107 	 * better than that without knowing the fill-factor for the table.
9108 	 *
9109 	 * XXX: Don't do this if the page was restored from full page image. We
9110 	 * don't bother to update the FSM in that case, it doesn't need to be
9111 	 * totally accurate anyway.
9112 	 */
9113 	if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
9114 		XLogRecordPageWithFreeSpace(rnode, blkno, freespace);
9115 }
9116 
9117 /*
9118  * Handles UPDATE and HOT_UPDATE
9119  */
9120 static void
heap_xlog_update(XLogReaderState * record,bool hot_update)9121 heap_xlog_update(XLogReaderState *record, bool hot_update)
9122 {
9123 	XLogRecPtr	lsn = record->EndRecPtr;
9124 	xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record);
9125 	RelFileNode rnode;
9126 	BlockNumber oldblk;
9127 	BlockNumber newblk;
9128 	ItemPointerData newtid;
9129 	Buffer		obuffer,
9130 				nbuffer;
9131 	Page		page;
9132 	OffsetNumber offnum;
9133 	ItemId		lp = NULL;
9134 	HeapTupleData oldtup;
9135 	HeapTupleHeader htup;
9136 	uint16		prefixlen = 0,
9137 				suffixlen = 0;
9138 	char	   *newp;
9139 	union
9140 	{
9141 		HeapTupleHeaderData hdr;
9142 		char		data[MaxHeapTupleSize];
9143 	}			tbuf;
9144 	xl_heap_header xlhdr;
9145 	uint32		newlen;
9146 	Size		freespace = 0;
9147 	XLogRedoAction oldaction;
9148 	XLogRedoAction newaction;
9149 
9150 	/* initialize to keep the compiler quiet */
9151 	oldtup.t_data = NULL;
9152 	oldtup.t_len = 0;
9153 
9154 	XLogRecGetBlockTag(record, 0, &rnode, NULL, &newblk);
9155 	if (XLogRecGetBlockTag(record, 1, NULL, NULL, &oldblk))
9156 	{
9157 		/* HOT updates are never done across pages */
9158 		Assert(!hot_update);
9159 	}
9160 	else
9161 		oldblk = newblk;
9162 
9163 	ItemPointerSet(&newtid, newblk, xlrec->new_offnum);
9164 
9165 	/*
9166 	 * The visibility map may need to be fixed even if the heap page is
9167 	 * already up-to-date.
9168 	 */
9169 	if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
9170 	{
9171 		Relation	reln = CreateFakeRelcacheEntry(rnode);
9172 		Buffer		vmbuffer = InvalidBuffer;
9173 
9174 		visibilitymap_pin(reln, oldblk, &vmbuffer);
9175 		visibilitymap_clear(reln, oldblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
9176 		ReleaseBuffer(vmbuffer);
9177 		FreeFakeRelcacheEntry(reln);
9178 	}
9179 
9180 	/*
9181 	 * In normal operation, it is important to lock the two pages in
9182 	 * page-number order, to avoid possible deadlocks against other update
9183 	 * operations going the other way.  However, during WAL replay there can
9184 	 * be no other update happening, so we don't need to worry about that. But
9185 	 * we *do* need to worry that we don't expose an inconsistent state to Hot
9186 	 * Standby queries --- so the original page can't be unlocked before we've
9187 	 * added the new tuple to the new page.
9188 	 */
9189 
9190 	/* Deal with old tuple version */
9191 	oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 0 : 1,
9192 									  &obuffer);
9193 	if (oldaction == BLK_NEEDS_REDO)
9194 	{
9195 		page = BufferGetPage(obuffer);
9196 		offnum = xlrec->old_offnum;
9197 		if (PageGetMaxOffsetNumber(page) >= offnum)
9198 			lp = PageGetItemId(page, offnum);
9199 
9200 		if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9201 			elog(PANIC, "invalid lp");
9202 
9203 		htup = (HeapTupleHeader) PageGetItem(page, lp);
9204 
9205 		oldtup.t_data = htup;
9206 		oldtup.t_len = ItemIdGetLength(lp);
9207 
9208 		htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
9209 		htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
9210 		if (hot_update)
9211 			HeapTupleHeaderSetHotUpdated(htup);
9212 		else
9213 			HeapTupleHeaderClearHotUpdated(htup);
9214 		fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask,
9215 								   &htup->t_infomask2);
9216 		HeapTupleHeaderSetXmax(htup, xlrec->old_xmax);
9217 		HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
9218 		/* Set forward chain link in t_ctid */
9219 		htup->t_ctid = newtid;
9220 
9221 		/* Mark the page as a candidate for pruning */
9222 		PageSetPrunable(page, XLogRecGetXid(record));
9223 
9224 		if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
9225 			PageClearAllVisible(page);
9226 
9227 		PageSetLSN(page, lsn);
9228 		MarkBufferDirty(obuffer);
9229 	}
9230 
9231 	/*
9232 	 * Read the page the new tuple goes into, if different from old.
9233 	 */
9234 	if (oldblk == newblk)
9235 	{
9236 		nbuffer = obuffer;
9237 		newaction = oldaction;
9238 	}
9239 	else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
9240 	{
9241 		nbuffer = XLogInitBufferForRedo(record, 0);
9242 		page = (Page) BufferGetPage(nbuffer);
9243 		PageInit(page, BufferGetPageSize(nbuffer), 0);
9244 		newaction = BLK_NEEDS_REDO;
9245 	}
9246 	else
9247 		newaction = XLogReadBufferForRedo(record, 0, &nbuffer);
9248 
9249 	/*
9250 	 * The visibility map may need to be fixed even if the heap page is
9251 	 * already up-to-date.
9252 	 */
9253 	if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
9254 	{
9255 		Relation	reln = CreateFakeRelcacheEntry(rnode);
9256 		Buffer		vmbuffer = InvalidBuffer;
9257 
9258 		visibilitymap_pin(reln, newblk, &vmbuffer);
9259 		visibilitymap_clear(reln, newblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
9260 		ReleaseBuffer(vmbuffer);
9261 		FreeFakeRelcacheEntry(reln);
9262 	}
9263 
9264 	/* Deal with new tuple */
9265 	if (newaction == BLK_NEEDS_REDO)
9266 	{
9267 		char	   *recdata;
9268 		char	   *recdata_end;
9269 		Size		datalen;
9270 		Size		tuplen;
9271 
9272 		recdata = XLogRecGetBlockData(record, 0, &datalen);
9273 		recdata_end = recdata + datalen;
9274 
9275 		page = BufferGetPage(nbuffer);
9276 
9277 		offnum = xlrec->new_offnum;
9278 		if (PageGetMaxOffsetNumber(page) + 1 < offnum)
9279 			elog(PANIC, "invalid max offset number");
9280 
9281 		if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD)
9282 		{
9283 			Assert(newblk == oldblk);
9284 			memcpy(&prefixlen, recdata, sizeof(uint16));
9285 			recdata += sizeof(uint16);
9286 		}
9287 		if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD)
9288 		{
9289 			Assert(newblk == oldblk);
9290 			memcpy(&suffixlen, recdata, sizeof(uint16));
9291 			recdata += sizeof(uint16);
9292 		}
9293 
9294 		memcpy((char *) &xlhdr, recdata, SizeOfHeapHeader);
9295 		recdata += SizeOfHeapHeader;
9296 
9297 		tuplen = recdata_end - recdata;
9298 		Assert(tuplen <= MaxHeapTupleSize);
9299 
9300 		htup = &tbuf.hdr;
9301 		MemSet((char *) htup, 0, SizeofHeapTupleHeader);
9302 
9303 		/*
9304 		 * Reconstruct the new tuple using the prefix and/or suffix from the
9305 		 * old tuple, and the data stored in the WAL record.
9306 		 */
9307 		newp = (char *) htup + SizeofHeapTupleHeader;
9308 		if (prefixlen > 0)
9309 		{
9310 			int			len;
9311 
9312 			/* copy bitmap [+ padding] [+ oid] from WAL record */
9313 			len = xlhdr.t_hoff - SizeofHeapTupleHeader;
9314 			memcpy(newp, recdata, len);
9315 			recdata += len;
9316 			newp += len;
9317 
9318 			/* copy prefix from old tuple */
9319 			memcpy(newp, (char *) oldtup.t_data + oldtup.t_data->t_hoff, prefixlen);
9320 			newp += prefixlen;
9321 
9322 			/* copy new tuple data from WAL record */
9323 			len = tuplen - (xlhdr.t_hoff - SizeofHeapTupleHeader);
9324 			memcpy(newp, recdata, len);
9325 			recdata += len;
9326 			newp += len;
9327 		}
9328 		else
9329 		{
9330 			/*
9331 			 * copy bitmap [+ padding] [+ oid] + data from record, all in one
9332 			 * go
9333 			 */
9334 			memcpy(newp, recdata, tuplen);
9335 			recdata += tuplen;
9336 			newp += tuplen;
9337 		}
9338 		Assert(recdata == recdata_end);
9339 
9340 		/* copy suffix from old tuple */
9341 		if (suffixlen > 0)
9342 			memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen);
9343 
9344 		newlen = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen;
9345 		htup->t_infomask2 = xlhdr.t_infomask2;
9346 		htup->t_infomask = xlhdr.t_infomask;
9347 		htup->t_hoff = xlhdr.t_hoff;
9348 
9349 		HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
9350 		HeapTupleHeaderSetCmin(htup, FirstCommandId);
9351 		HeapTupleHeaderSetXmax(htup, xlrec->new_xmax);
9352 		/* Make sure there is no forward chain link in t_ctid */
9353 		htup->t_ctid = newtid;
9354 
9355 		offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
9356 		if (offnum == InvalidOffsetNumber)
9357 			elog(PANIC, "failed to add tuple");
9358 
9359 		if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
9360 			PageClearAllVisible(page);
9361 
9362 		freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
9363 
9364 		PageSetLSN(page, lsn);
9365 		MarkBufferDirty(nbuffer);
9366 	}
9367 
9368 	if (BufferIsValid(nbuffer) && nbuffer != obuffer)
9369 		UnlockReleaseBuffer(nbuffer);
9370 	if (BufferIsValid(obuffer))
9371 		UnlockReleaseBuffer(obuffer);
9372 
9373 	/*
9374 	 * If the new page is running low on free space, update the FSM as well.
9375 	 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
9376 	 * better than that without knowing the fill-factor for the table.
9377 	 *
9378 	 * However, don't update the FSM on HOT updates, because after crash
9379 	 * recovery, either the old or the new tuple will certainly be dead and
9380 	 * prunable. After pruning, the page will have roughly as much free space
9381 	 * as it did before the update, assuming the new tuple is about the same
9382 	 * size as the old one.
9383 	 *
9384 	 * XXX: Don't do this if the page was restored from full page image. We
9385 	 * don't bother to update the FSM in that case, it doesn't need to be
9386 	 * totally accurate anyway.
9387 	 */
9388 	if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5)
9389 		XLogRecordPageWithFreeSpace(rnode, newblk, freespace);
9390 }
9391 
9392 static void
heap_xlog_confirm(XLogReaderState * record)9393 heap_xlog_confirm(XLogReaderState *record)
9394 {
9395 	XLogRecPtr	lsn = record->EndRecPtr;
9396 	xl_heap_confirm *xlrec = (xl_heap_confirm *) XLogRecGetData(record);
9397 	Buffer		buffer;
9398 	Page		page;
9399 	OffsetNumber offnum;
9400 	ItemId		lp = NULL;
9401 	HeapTupleHeader htup;
9402 
9403 	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9404 	{
9405 		page = BufferGetPage(buffer);
9406 
9407 		offnum = xlrec->offnum;
9408 		if (PageGetMaxOffsetNumber(page) >= offnum)
9409 			lp = PageGetItemId(page, offnum);
9410 
9411 		if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9412 			elog(PANIC, "invalid lp");
9413 
9414 		htup = (HeapTupleHeader) PageGetItem(page, lp);
9415 
9416 		/*
9417 		 * Confirm tuple as actually inserted
9418 		 */
9419 		ItemPointerSet(&htup->t_ctid, BufferGetBlockNumber(buffer), offnum);
9420 
9421 		PageSetLSN(page, lsn);
9422 		MarkBufferDirty(buffer);
9423 	}
9424 	if (BufferIsValid(buffer))
9425 		UnlockReleaseBuffer(buffer);
9426 }
9427 
9428 static void
heap_xlog_lock(XLogReaderState * record)9429 heap_xlog_lock(XLogReaderState *record)
9430 {
9431 	XLogRecPtr	lsn = record->EndRecPtr;
9432 	xl_heap_lock *xlrec = (xl_heap_lock *) XLogRecGetData(record);
9433 	Buffer		buffer;
9434 	Page		page;
9435 	OffsetNumber offnum;
9436 	ItemId		lp = NULL;
9437 	HeapTupleHeader htup;
9438 
9439 	/*
9440 	 * The visibility map may need to be fixed even if the heap page is
9441 	 * already up-to-date.
9442 	 */
9443 	if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
9444 	{
9445 		RelFileNode rnode;
9446 		Buffer		vmbuffer = InvalidBuffer;
9447 		BlockNumber block;
9448 		Relation	reln;
9449 
9450 		XLogRecGetBlockTag(record, 0, &rnode, NULL, &block);
9451 		reln = CreateFakeRelcacheEntry(rnode);
9452 
9453 		visibilitymap_pin(reln, block, &vmbuffer);
9454 		visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);
9455 
9456 		ReleaseBuffer(vmbuffer);
9457 		FreeFakeRelcacheEntry(reln);
9458 	}
9459 
9460 	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9461 	{
9462 		page = (Page) BufferGetPage(buffer);
9463 
9464 		offnum = xlrec->offnum;
9465 		if (PageGetMaxOffsetNumber(page) >= offnum)
9466 			lp = PageGetItemId(page, offnum);
9467 
9468 		if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9469 			elog(PANIC, "invalid lp");
9470 
9471 		htup = (HeapTupleHeader) PageGetItem(page, lp);
9472 
9473 		htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
9474 		htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
9475 		fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
9476 								   &htup->t_infomask2);
9477 
9478 		/*
9479 		 * Clear relevant update flags, but only if the modified infomask says
9480 		 * there's no update.
9481 		 */
9482 		if (HEAP_XMAX_IS_LOCKED_ONLY(htup->t_infomask))
9483 		{
9484 			HeapTupleHeaderClearHotUpdated(htup);
9485 			/* Make sure there is no forward chain link in t_ctid */
9486 			ItemPointerSet(&htup->t_ctid,
9487 						   BufferGetBlockNumber(buffer),
9488 						   offnum);
9489 		}
9490 		HeapTupleHeaderSetXmax(htup, xlrec->locking_xid);
9491 		HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
9492 		PageSetLSN(page, lsn);
9493 		MarkBufferDirty(buffer);
9494 	}
9495 	if (BufferIsValid(buffer))
9496 		UnlockReleaseBuffer(buffer);
9497 }
9498 
9499 static void
heap_xlog_lock_updated(XLogReaderState * record)9500 heap_xlog_lock_updated(XLogReaderState *record)
9501 {
9502 	XLogRecPtr	lsn = record->EndRecPtr;
9503 	xl_heap_lock_updated *xlrec;
9504 	Buffer		buffer;
9505 	Page		page;
9506 	OffsetNumber offnum;
9507 	ItemId		lp = NULL;
9508 	HeapTupleHeader htup;
9509 
9510 	xlrec = (xl_heap_lock_updated *) XLogRecGetData(record);
9511 
9512 	/*
9513 	 * The visibility map may need to be fixed even if the heap page is
9514 	 * already up-to-date.
9515 	 */
9516 	if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
9517 	{
9518 		RelFileNode rnode;
9519 		Buffer		vmbuffer = InvalidBuffer;
9520 		BlockNumber block;
9521 		Relation	reln;
9522 
9523 		XLogRecGetBlockTag(record, 0, &rnode, NULL, &block);
9524 		reln = CreateFakeRelcacheEntry(rnode);
9525 
9526 		visibilitymap_pin(reln, block, &vmbuffer);
9527 		visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);
9528 
9529 		ReleaseBuffer(vmbuffer);
9530 		FreeFakeRelcacheEntry(reln);
9531 	}
9532 
9533 	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9534 	{
9535 		page = BufferGetPage(buffer);
9536 
9537 		offnum = xlrec->offnum;
9538 		if (PageGetMaxOffsetNumber(page) >= offnum)
9539 			lp = PageGetItemId(page, offnum);
9540 
9541 		if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9542 			elog(PANIC, "invalid lp");
9543 
9544 		htup = (HeapTupleHeader) PageGetItem(page, lp);
9545 
9546 		htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
9547 		htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
9548 		fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
9549 								   &htup->t_infomask2);
9550 		HeapTupleHeaderSetXmax(htup, xlrec->xmax);
9551 
9552 		PageSetLSN(page, lsn);
9553 		MarkBufferDirty(buffer);
9554 	}
9555 	if (BufferIsValid(buffer))
9556 		UnlockReleaseBuffer(buffer);
9557 }
9558 
9559 static void
heap_xlog_inplace(XLogReaderState * record)9560 heap_xlog_inplace(XLogReaderState *record)
9561 {
9562 	XLogRecPtr	lsn = record->EndRecPtr;
9563 	xl_heap_inplace *xlrec = (xl_heap_inplace *) XLogRecGetData(record);
9564 	Buffer		buffer;
9565 	Page		page;
9566 	OffsetNumber offnum;
9567 	ItemId		lp = NULL;
9568 	HeapTupleHeader htup;
9569 	uint32		oldlen;
9570 	Size		newlen;
9571 
9572 	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
9573 	{
9574 		char	   *newtup = XLogRecGetBlockData(record, 0, &newlen);
9575 
9576 		page = BufferGetPage(buffer);
9577 
9578 		offnum = xlrec->offnum;
9579 		if (PageGetMaxOffsetNumber(page) >= offnum)
9580 			lp = PageGetItemId(page, offnum);
9581 
9582 		if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
9583 			elog(PANIC, "invalid lp");
9584 
9585 		htup = (HeapTupleHeader) PageGetItem(page, lp);
9586 
9587 		oldlen = ItemIdGetLength(lp) - htup->t_hoff;
9588 		if (oldlen != newlen)
9589 			elog(PANIC, "wrong tuple length");
9590 
9591 		memcpy((char *) htup + htup->t_hoff, newtup, newlen);
9592 
9593 		PageSetLSN(page, lsn);
9594 		MarkBufferDirty(buffer);
9595 	}
9596 	if (BufferIsValid(buffer))
9597 		UnlockReleaseBuffer(buffer);
9598 }
9599 
9600 void
heap_redo(XLogReaderState * record)9601 heap_redo(XLogReaderState *record)
9602 {
9603 	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
9604 
9605 	/*
9606 	 * These operations don't overwrite MVCC data so no conflict processing is
9607 	 * required. The ones in heap2 rmgr do.
9608 	 */
9609 
9610 	switch (info & XLOG_HEAP_OPMASK)
9611 	{
9612 		case XLOG_HEAP_INSERT:
9613 			heap_xlog_insert(record);
9614 			break;
9615 		case XLOG_HEAP_DELETE:
9616 			heap_xlog_delete(record);
9617 			break;
9618 		case XLOG_HEAP_UPDATE:
9619 			heap_xlog_update(record, false);
9620 			break;
9621 		case XLOG_HEAP_TRUNCATE:
9622 
9623 			/*
9624 			 * TRUNCATE is a no-op because the actions are already logged as
9625 			 * SMGR WAL records.  TRUNCATE WAL record only exists for logical
9626 			 * decoding.
9627 			 */
9628 			break;
9629 		case XLOG_HEAP_HOT_UPDATE:
9630 			heap_xlog_update(record, true);
9631 			break;
9632 		case XLOG_HEAP_CONFIRM:
9633 			heap_xlog_confirm(record);
9634 			break;
9635 		case XLOG_HEAP_LOCK:
9636 			heap_xlog_lock(record);
9637 			break;
9638 		case XLOG_HEAP_INPLACE:
9639 			heap_xlog_inplace(record);
9640 			break;
9641 		default:
9642 			elog(PANIC, "heap_redo: unknown op code %u", info);
9643 	}
9644 }
9645 
9646 void
heap2_redo(XLogReaderState * record)9647 heap2_redo(XLogReaderState *record)
9648 {
9649 	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
9650 
9651 	switch (info & XLOG_HEAP_OPMASK)
9652 	{
9653 		case XLOG_HEAP2_PRUNE:
9654 			heap_xlog_prune(record);
9655 			break;
9656 		case XLOG_HEAP2_VACUUM:
9657 			heap_xlog_vacuum(record);
9658 			break;
9659 		case XLOG_HEAP2_FREEZE_PAGE:
9660 			heap_xlog_freeze_page(record);
9661 			break;
9662 		case XLOG_HEAP2_VISIBLE:
9663 			heap_xlog_visible(record);
9664 			break;
9665 		case XLOG_HEAP2_MULTI_INSERT:
9666 			heap_xlog_multi_insert(record);
9667 			break;
9668 		case XLOG_HEAP2_LOCK_UPDATED:
9669 			heap_xlog_lock_updated(record);
9670 			break;
9671 		case XLOG_HEAP2_NEW_CID:
9672 
9673 			/*
9674 			 * Nothing to do on a real replay, only used during logical
9675 			 * decoding.
9676 			 */
9677 			break;
9678 		case XLOG_HEAP2_REWRITE:
9679 			heap_xlog_logical_rewrite(record);
9680 			break;
9681 		default:
9682 			elog(PANIC, "heap2_redo: unknown op code %u", info);
9683 	}
9684 }
9685 
9686 /*
9687  * Mask a heap page before performing consistency checks on it.
9688  */
9689 void
heap_mask(char * pagedata,BlockNumber blkno)9690 heap_mask(char *pagedata, BlockNumber blkno)
9691 {
9692 	Page		page = (Page) pagedata;
9693 	OffsetNumber off;
9694 
9695 	mask_page_lsn_and_checksum(page);
9696 
9697 	mask_page_hint_bits(page);
9698 	mask_unused_space(page);
9699 
9700 	for (off = 1; off <= PageGetMaxOffsetNumber(page); off++)
9701 	{
9702 		ItemId		iid = PageGetItemId(page, off);
9703 		char	   *page_item;
9704 
9705 		page_item = (char *) (page + ItemIdGetOffset(iid));
9706 
9707 		if (ItemIdIsNormal(iid))
9708 		{
9709 			HeapTupleHeader page_htup = (HeapTupleHeader) page_item;
9710 
9711 			/*
9712 			 * If xmin of a tuple is not yet frozen, we should ignore
9713 			 * differences in hint bits, since they can be set without
9714 			 * emitting WAL.
9715 			 */
9716 			if (!HeapTupleHeaderXminFrozen(page_htup))
9717 				page_htup->t_infomask &= ~HEAP_XACT_MASK;
9718 			else
9719 			{
9720 				/* Still we need to mask xmax hint bits. */
9721 				page_htup->t_infomask &= ~HEAP_XMAX_INVALID;
9722 				page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED;
9723 			}
9724 
9725 			/*
9726 			 * During replay, we set Command Id to FirstCommandId. Hence, mask
9727 			 * it. See heap_xlog_insert() for details.
9728 			 */
9729 			page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER;
9730 
9731 			/*
9732 			 * For a speculative tuple, heap_insert() does not set ctid in the
9733 			 * caller-passed heap tuple itself, leaving the ctid field to
9734 			 * contain a speculative token value - a per-backend monotonically
9735 			 * increasing identifier. Besides, it does not WAL-log ctid under
9736 			 * any circumstances.
9737 			 *
9738 			 * During redo, heap_xlog_insert() sets t_ctid to current block
9739 			 * number and self offset number. It doesn't care about any
9740 			 * speculative insertions on the primary. Hence, we set t_ctid to
9741 			 * current block number and self offset number to ignore any
9742 			 * inconsistency.
9743 			 */
9744 			if (HeapTupleHeaderIsSpeculative(page_htup))
9745 				ItemPointerSet(&page_htup->t_ctid, blkno, off);
9746 
9747 			/*
9748 			 * NB: Not ignoring ctid changes due to the tuple having moved
9749 			 * (i.e. HeapTupleHeaderIndicatesMovedPartitions), because that's
9750 			 * important information that needs to be in-sync between primary
9751 			 * and standby, and thus is WAL logged.
9752 			 */
9753 		}
9754 
9755 		/*
9756 		 * Ignore any padding bytes after the tuple, when the length of the
9757 		 * item is not MAXALIGNed.
9758 		 */
9759 		if (ItemIdHasStorage(iid))
9760 		{
9761 			int			len = ItemIdGetLength(iid);
9762 			int			padlen = MAXALIGN(len) - len;
9763 
9764 			if (padlen > 0)
9765 				memset(page_item + len, MASK_MARKER, padlen);
9766 		}
9767 	}
9768 }
9769 
9770 /*
9771  * HeapCheckForSerializableConflictOut
9772  *		We are reading a tuple.  If it's not visible, there may be a
9773  *		rw-conflict out with the inserter.  Otherwise, if it is visible to us
9774  *		but has been deleted, there may be a rw-conflict out with the deleter.
9775  *
9776  * We will determine the top level xid of the writing transaction with which
9777  * we may be in conflict, and ask CheckForSerializableConflictOut() to check
9778  * for overlap with our own transaction.
9779  *
9780  * This function should be called just about anywhere in heapam.c where a
9781  * tuple has been read. The caller must hold at least a shared lock on the
9782  * buffer, because this function might set hint bits on the tuple. There is
9783  * currently no known reason to call this function from an index AM.
9784  */
9785 void
HeapCheckForSerializableConflictOut(bool visible,Relation relation,HeapTuple tuple,Buffer buffer,Snapshot snapshot)9786 HeapCheckForSerializableConflictOut(bool visible, Relation relation,
9787 									HeapTuple tuple, Buffer buffer,
9788 									Snapshot snapshot)
9789 {
9790 	TransactionId xid;
9791 	HTSV_Result htsvResult;
9792 
9793 	if (!CheckForSerializableConflictOutNeeded(relation, snapshot))
9794 		return;
9795 
9796 	/*
9797 	 * Check to see whether the tuple has been written to by a concurrent
9798 	 * transaction, either to create it not visible to us, or to delete it
9799 	 * while it is visible to us.  The "visible" bool indicates whether the
9800 	 * tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else
9801 	 * is going on with it.
9802 	 *
9803 	 * In the event of a concurrently inserted tuple that also happens to have
9804 	 * been concurrently updated (by a separate transaction), the xmin of the
9805 	 * tuple will be used -- not the updater's xid.
9806 	 */
9807 	htsvResult = HeapTupleSatisfiesVacuum(tuple, TransactionXmin, buffer);
9808 	switch (htsvResult)
9809 	{
9810 		case HEAPTUPLE_LIVE:
9811 			if (visible)
9812 				return;
9813 			xid = HeapTupleHeaderGetXmin(tuple->t_data);
9814 			break;
9815 		case HEAPTUPLE_RECENTLY_DEAD:
9816 		case HEAPTUPLE_DELETE_IN_PROGRESS:
9817 			if (visible)
9818 				xid = HeapTupleHeaderGetUpdateXid(tuple->t_data);
9819 			else
9820 				xid = HeapTupleHeaderGetXmin(tuple->t_data);
9821 
9822 			if (TransactionIdPrecedes(xid, TransactionXmin))
9823 			{
9824 				/* This is like the HEAPTUPLE_DEAD case */
9825 				Assert(!visible);
9826 				return;
9827 			}
9828 			break;
9829 		case HEAPTUPLE_INSERT_IN_PROGRESS:
9830 			xid = HeapTupleHeaderGetXmin(tuple->t_data);
9831 			break;
9832 		case HEAPTUPLE_DEAD:
9833 			Assert(!visible);
9834 			return;
9835 		default:
9836 
9837 			/*
9838 			 * The only way to get to this default clause is if a new value is
9839 			 * added to the enum type without adding it to this switch
9840 			 * statement.  That's a bug, so elog.
9841 			 */
9842 			elog(ERROR, "unrecognized return value from HeapTupleSatisfiesVacuum: %u", htsvResult);
9843 
9844 			/*
9845 			 * In spite of having all enum values covered and calling elog on
9846 			 * this default, some compilers think this is a code path which
9847 			 * allows xid to be used below without initialization. Silence
9848 			 * that warning.
9849 			 */
9850 			xid = InvalidTransactionId;
9851 	}
9852 
9853 	Assert(TransactionIdIsValid(xid));
9854 	Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin));
9855 
9856 	/*
9857 	 * Find top level xid.  Bail out if xid is too early to be a conflict, or
9858 	 * if it's our own xid.
9859 	 */
9860 	if (TransactionIdEquals(xid, GetTopTransactionIdIfAny()))
9861 		return;
9862 	xid = SubTransGetTopmostTransaction(xid);
9863 	if (TransactionIdPrecedes(xid, TransactionXmin))
9864 		return;
9865 
9866 	CheckForSerializableConflictOut(relation, xid, snapshot);
9867 }
9868