1 /*-------------------------------------------------------------------------
2  *
3  * heapam.c
4  *	  heap access method code
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *	  src/backend/access/heap/heapam.c
12  *
13  *
14  * INTERFACE ROUTINES
15  *		heap_beginscan	- begin relation scan
16  *		heap_rescan		- restart a relation scan
17  *		heap_endscan	- end relation scan
18  *		heap_getnext	- retrieve next tuple in scan
19  *		heap_fetch		- retrieve tuple with given tid
20  *		heap_insert		- insert tuple into a relation
21  *		heap_multi_insert - insert multiple tuples into a relation
22  *		heap_delete		- delete a tuple from a relation
23  *		heap_update		- replace a tuple in a relation with another tuple
24  *
25  * NOTES
26  *	  This file contains the heap_ routines which implement
27  *	  the POSTGRES heap access method used for all POSTGRES
28  *	  relations.
29  *
30  *-------------------------------------------------------------------------
31  */
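
/*
 * A rough caller-side sketch of the scan routines listed above.  External
 * callers normally reach them through the tableam wrappers (table_beginscan
 * and friends) rather than calling them directly.  Here "rel" is assumed to
 * be an already-open relation and "snap" a registered MVCC snapshot; the
 * returned tuple is only valid until the next heap_getnext call:
 *
 *		TableScanDesc scan;
 *		HeapTuple	tup;
 *
 *		scan = heap_beginscan(rel, snap, 0, NULL, NULL,
 *							  SO_TYPE_SEQSCAN | SO_ALLOW_STRAT |
 *							  SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE);
 *		while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
 *		{
 *			... examine tup ...
 *		}
 *		heap_endscan(scan);
 */
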
32 #include "postgres.h"
33 
34 #include "access/bufmask.h"
35 #include "access/genam.h"
36 #include "access/heapam.h"
37 #include "access/heapam_xlog.h"
38 #include "access/heaptoast.h"
39 #include "access/hio.h"
40 #include "access/multixact.h"
41 #include "access/parallel.h"
42 #include "access/relscan.h"
43 #include "access/subtrans.h"
44 #include "access/sysattr.h"
45 #include "access/tableam.h"
46 #include "access/transam.h"
47 #include "access/valid.h"
48 #include "access/visibilitymap.h"
49 #include "access/xact.h"
50 #include "access/xlog.h"
51 #include "access/xloginsert.h"
52 #include "access/xlogutils.h"
53 #include "catalog/catalog.h"
54 #include "miscadmin.h"
55 #include "pgstat.h"
56 #include "port/atomics.h"
57 #include "storage/bufmgr.h"
58 #include "storage/freespace.h"
59 #include "storage/lmgr.h"
60 #include "storage/predicate.h"
61 #include "storage/procarray.h"
62 #include "storage/smgr.h"
63 #include "storage/spin.h"
64 #include "storage/standby.h"
65 #include "utils/datum.h"
66 #include "utils/inval.h"
67 #include "utils/lsyscache.h"
68 #include "utils/relcache.h"
69 #include "utils/snapmgr.h"
70 #include "utils/spccache.h"
71 
72 
73 static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
74 									 TransactionId xid, CommandId cid, int options);
75 static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
76 								  Buffer newbuf, HeapTuple oldtup,
77 								  HeapTuple newtup, HeapTuple old_key_tuple,
78 								  bool all_visible_cleared, bool new_all_visible_cleared);
79 static Bitmapset *HeapDetermineModifiedColumns(Relation relation,
80 											   Bitmapset *interesting_cols,
81 											   HeapTuple oldtup, HeapTuple newtup);
82 static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
83 								 LockTupleMode mode, LockWaitPolicy wait_policy,
84 								 bool *have_tuple_lock);
85 static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
86 									  uint16 old_infomask2, TransactionId add_to_xmax,
87 									  LockTupleMode mode, bool is_update,
88 									  TransactionId *result_xmax, uint16 *result_infomask,
89 									  uint16 *result_infomask2);
90 static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
91 										 ItemPointer ctid, TransactionId xid,
92 										 LockTupleMode mode);
93 static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
94 								   uint16 *new_infomask2);
95 static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
96 											 uint16 t_infomask);
97 static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
98 									LockTupleMode lockmode, bool *current_is_member);
99 static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
100 							Relation rel, ItemPointer ctid, XLTW_Oper oper,
101 							int *remaining);
102 static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
103 									   uint16 infomask, Relation rel, int *remaining);
104 static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
105 static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_changed,
106 										bool *copy);
107 
108 
109 /*
110  * Each tuple lock mode has a corresponding heavyweight lock, and one or two
111  * corresponding MultiXactStatuses (one to merely lock tuples, another one to
112  * update them).  This table (and the macros below) helps us determine the
113  * heavyweight lock mode and MultiXactStatus values to use for any particular
114  * tuple lock strength.
115  *
116  * Don't look at lockstatus/updstatus directly!  Use get_mxact_status_for_lock
117  * instead.
118  */
119 static const struct
120 {
121 	LOCKMODE	hwlock;
122 	int			lockstatus;
123 	int			updstatus;
124 }
125 
126 			tupleLockExtraInfo[MaxLockTupleMode + 1] =
127 {
128 	{							/* LockTupleKeyShare */
129 		AccessShareLock,
130 		MultiXactStatusForKeyShare,
131 		-1						/* KeyShare does not allow updating tuples */
132 	},
133 	{							/* LockTupleShare */
134 		RowShareLock,
135 		MultiXactStatusForShare,
136 		-1						/* Share does not allow updating tuples */
137 	},
138 	{							/* LockTupleNoKeyExclusive */
139 		ExclusiveLock,
140 		MultiXactStatusForNoKeyUpdate,
141 		MultiXactStatusNoKeyUpdate
142 	},
143 	{							/* LockTupleExclusive */
144 		AccessExclusiveLock,
145 		MultiXactStatusForUpdate,
146 		MultiXactStatusUpdate
147 	}
148 };
149 
150 /* Get the LOCKMODE for a given MultiXactStatus */
151 #define LOCKMODE_from_mxstatus(status) \
152 			(tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
153 
154 /*
155  * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
156  * This is more readable than having every caller translate it to lock.h's
157  * LOCKMODE.
158  */
159 #define LockTupleTuplock(rel, tup, mode) \
160 	LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
161 #define UnlockTupleTuplock(rel, tup, mode) \
162 	UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
163 #define ConditionalLockTupleTuplock(rel, tup, mode) \
164 	ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
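
/*
 * For illustration, a rough sketch of how the table above is used to take
 * and release the heavyweight lock for a given tuple lock strength; "rel" is
 * an open relation and "tid" the tuple's ItemPointer (placeholder names).
 * Per tupleLockExtraInfo, LockTupleExclusive resolves to AccessExclusiveLock:
 *
 *		LockTupleTuplock(rel, tid, LockTupleExclusive);
 *		... work on the tuple ...
 *		UnlockTupleTuplock(rel, tid, LockTupleExclusive);
 */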
165 
166 #ifdef USE_PREFETCH
167 /*
168  * heap_compute_xid_horizon_for_tuples and xid_horizon_prefetch_buffer use
169  * this structure to coordinate prefetching activity.
170  */
171 typedef struct
172 {
173 	BlockNumber cur_hblkno;
174 	int			next_item;
175 	int			nitems;
176 	ItemPointerData *tids;
177 } XidHorizonPrefetchState;
178 #endif
179 
180 /*
 * This table maps each particular MultiXactStatus value to the corresponding
 * tuple lock strength value.
183  */
184 static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
185 {
186 	LockTupleKeyShare,			/* ForKeyShare */
187 	LockTupleShare,				/* ForShare */
188 	LockTupleNoKeyExclusive,	/* ForNoKeyUpdate */
189 	LockTupleExclusive,			/* ForUpdate */
190 	LockTupleNoKeyExclusive,	/* NoKeyUpdate */
191 	LockTupleExclusive			/* Update */
192 };
193 
194 /* Get the LockTupleMode for a given MultiXactStatus */
195 #define TUPLOCK_from_mxstatus(status) \
196 			(MultiXactStatusLock[(status)])
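
/*
 * A small worked example of the two mappings above (the values follow
 * directly from the tables as given here):
 *
 *		TUPLOCK_from_mxstatus(MultiXactStatusForShare)	== LockTupleShare
 *		LOCKMODE_from_mxstatus(MultiXactStatusForShare) == RowShareLock
 *		LOCKMODE_from_mxstatus(MultiXactStatusUpdate)	== AccessExclusiveLock
 */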
197 
198 /* ----------------------------------------------------------------
199  *						 heap support routines
200  * ----------------------------------------------------------------
201  */
202 
203 /* ----------------
204  *		initscan - scan code common to heap_beginscan and heap_rescan
205  * ----------------
206  */
207 static void
initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
209 {
210 	ParallelBlockTableScanDesc bpscan = NULL;
211 	bool		allow_strat;
212 	bool		allow_sync;
213 
214 	/*
215 	 * Determine the number of blocks we have to scan.
216 	 *
217 	 * It is sufficient to do this once at scan start, since any tuples added
218 	 * while the scan is in progress will be invisible to my snapshot anyway.
219 	 * (That is not true when using a non-MVCC snapshot.  However, we couldn't
220 	 * guarantee to return tuples added after scan start anyway, since they
221 	 * might go into pages we already scanned.  To guarantee consistent
222 	 * results for a non-MVCC snapshot, the caller must hold some higher-level
223 	 * lock that ensures the interesting tuple(s) won't change.)
224 	 */
225 	if (scan->rs_base.rs_parallel != NULL)
226 	{
227 		bpscan = (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
228 		scan->rs_nblocks = bpscan->phs_nblocks;
229 	}
230 	else
231 		scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_base.rs_rd);
232 
233 	/*
234 	 * If the table is large relative to NBuffers, use a bulk-read access
235 	 * strategy and enable synchronized scanning (see syncscan.c).  Although
236 	 * the thresholds for these features could be different, we make them the
237 	 * same so that there are only two behaviors to tune rather than four.
238 	 * (However, some callers need to be able to disable one or both of these
239 	 * behaviors, independently of the size of the table; also there is a GUC
240 	 * variable that can disable synchronized scanning.)
241 	 *
242 	 * Note that table_block_parallelscan_initialize has a very similar test;
243 	 * if you change this, consider changing that one, too.
244 	 */
245 	if (!RelationUsesLocalBuffers(scan->rs_base.rs_rd) &&
246 		scan->rs_nblocks > NBuffers / 4)
247 	{
248 		allow_strat = (scan->rs_base.rs_flags & SO_ALLOW_STRAT) != 0;
249 		allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
250 	}
251 	else
252 		allow_strat = allow_sync = false;
253 
254 	if (allow_strat)
255 	{
256 		/* During a rescan, keep the previous strategy object. */
257 		if (scan->rs_strategy == NULL)
258 			scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
259 	}
260 	else
261 	{
262 		if (scan->rs_strategy != NULL)
263 			FreeAccessStrategy(scan->rs_strategy);
264 		scan->rs_strategy = NULL;
265 	}
266 
267 	if (scan->rs_base.rs_parallel != NULL)
268 	{
269 		/* For parallel scan, believe whatever ParallelTableScanDesc says. */
270 		if (scan->rs_base.rs_parallel->phs_syncscan)
271 			scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
272 		else
273 			scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
274 	}
275 	else if (keep_startblock)
276 	{
277 		/*
278 		 * When rescanning, we want to keep the previous startblock setting,
279 		 * so that rewinding a cursor doesn't generate surprising results.
280 		 * Reset the active syncscan setting, though.
281 		 */
282 		if (allow_sync && synchronize_seqscans)
283 			scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
284 		else
285 			scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
286 	}
287 	else if (allow_sync && synchronize_seqscans)
288 	{
289 		scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
290 		scan->rs_startblock = ss_get_location(scan->rs_base.rs_rd, scan->rs_nblocks);
291 	}
292 	else
293 	{
294 		scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
295 		scan->rs_startblock = 0;
296 	}
297 
298 	scan->rs_numblocks = InvalidBlockNumber;
299 	scan->rs_inited = false;
300 	scan->rs_ctup.t_data = NULL;
301 	ItemPointerSetInvalid(&scan->rs_ctup.t_self);
302 	scan->rs_cbuf = InvalidBuffer;
303 	scan->rs_cblock = InvalidBlockNumber;
304 
305 	/* page-at-a-time fields are always invalid when not rs_inited */
306 
307 	/*
308 	 * copy the scan key, if appropriate
309 	 */
310 	if (key != NULL)
311 		memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
312 
313 	/*
314 	 * Currently, we only have a stats counter for sequential heap scans (but
	 * e.g. for bitmap scans the underlying bitmap index scans will be counted,
316 	 * and for sample scans we update stats for tuple fetches).
317 	 */
318 	if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
319 		pgstat_count_heap_scan(scan->rs_base.rs_rd);
320 }
321 
322 /*
323  * heap_setscanlimits - restrict range of a heapscan
324  *
325  * startBlk is the page to start at
326  * numBlks is number of pages to scan (InvalidBlockNumber means "all")
327  */
328 void
heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
330 {
331 	HeapScanDesc scan = (HeapScanDesc) sscan;
332 
333 	Assert(!scan->rs_inited);	/* else too late to change */
334 	/* else rs_startblock is significant */
335 	Assert(!(scan->rs_base.rs_flags & SO_ALLOW_SYNC));
336 
337 	/* Check startBlk is valid (but allow case of zero blocks...) */
338 	Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
339 
340 	scan->rs_startblock = startBlk;
341 	scan->rs_numblocks = numBlks;
342 }
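
/*
 * A rough usage sketch: restrict an already-begun scan to a single block
 * "blkno" (a placeholder).  Per the asserts above, this must be done before
 * the first tuple is fetched, and only on a scan begun without SO_ALLOW_SYNC:
 *
 *		scan = heap_beginscan(rel, snap, 0, NULL, NULL, SO_TYPE_SEQSCAN);
 *		heap_setscanlimits(scan, blkno, 1);
 *		while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
 *			... only tuples from block "blkno" are returned ...
 */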
343 
344 /*
345  * heapgetpage - subroutine for heapgettup()
346  *
347  * This routine reads and pins the specified page of the relation.
348  * In page-at-a-time mode it performs additional work, namely determining
349  * which tuples on the page are visible.
350  */
351 void
heapgetpage(TableScanDesc sscan, BlockNumber page)
353 {
354 	HeapScanDesc scan = (HeapScanDesc) sscan;
355 	Buffer		buffer;
356 	Snapshot	snapshot;
357 	Page		dp;
358 	int			lines;
359 	int			ntup;
360 	OffsetNumber lineoff;
361 	ItemId		lpp;
362 	bool		all_visible;
363 
364 	Assert(page < scan->rs_nblocks);
365 
366 	/* release previous scan buffer, if any */
367 	if (BufferIsValid(scan->rs_cbuf))
368 	{
369 		ReleaseBuffer(scan->rs_cbuf);
370 		scan->rs_cbuf = InvalidBuffer;
371 	}
372 
373 	/*
374 	 * Be sure to check for interrupts at least once per page.  Checks at
375 	 * higher code levels won't be able to stop a seqscan that encounters many
376 	 * pages' worth of consecutive dead tuples.
377 	 */
378 	CHECK_FOR_INTERRUPTS();
379 
380 	/* read page using selected strategy */
381 	scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM, page,
382 									   RBM_NORMAL, scan->rs_strategy);
383 	scan->rs_cblock = page;
384 
385 	if (!(scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE))
386 		return;
387 
388 	buffer = scan->rs_cbuf;
389 	snapshot = scan->rs_base.rs_snapshot;
390 
391 	/*
392 	 * Prune and repair fragmentation for the whole page, if possible.
393 	 */
394 	heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
395 
396 	/*
397 	 * We must hold share lock on the buffer content while examining tuple
398 	 * visibility.  Afterwards, however, the tuples we have found to be
399 	 * visible are guaranteed good as long as we hold the buffer pin.
400 	 */
401 	LockBuffer(buffer, BUFFER_LOCK_SHARE);
402 
403 	dp = BufferGetPage(buffer);
404 	TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
405 	lines = PageGetMaxOffsetNumber(dp);
406 	ntup = 0;
407 
408 	/*
409 	 * If the all-visible flag indicates that all tuples on the page are
410 	 * visible to everyone, we can skip the per-tuple visibility tests.
411 	 *
412 	 * Note: In hot standby, a tuple that's already visible to all
413 	 * transactions in the master might still be invisible to a read-only
414 	 * transaction in the standby. We partly handle this problem by tracking
415 	 * the minimum xmin of visible tuples as the cut-off XID while marking a
	 * page all-visible on the master, and WAL-log that along with the
	 * visibility map SET operation.  In hot standby, we wait for (or abort)
	 * all transactions that might not be able to see one or more tuples on
	 * the page.  That's how index-only scans work fine in hot standby.  A
	 * crucial difference between index-only scans and heap scans is that the
	 * index-only scan completely relies on the visibility map, whereas a heap
	 * scan looks at the page-level PD_ALL_VISIBLE flag.  We are not sure if
423 	 * the page-level flag can be trusted in the same way, because it might
424 	 * get propagated somehow without being explicitly WAL-logged, e.g. via a
425 	 * full page write. Until we can prove that beyond doubt, let's check each
426 	 * tuple for visibility the hard way.
427 	 */
428 	all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;
429 
430 	for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
431 		 lineoff <= lines;
432 		 lineoff++, lpp++)
433 	{
434 		if (ItemIdIsNormal(lpp))
435 		{
436 			HeapTupleData loctup;
437 			bool		valid;
438 
439 			loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd);
440 			loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
441 			loctup.t_len = ItemIdGetLength(lpp);
442 			ItemPointerSet(&(loctup.t_self), page, lineoff);
443 
444 			if (all_visible)
445 				valid = true;
446 			else
447 				valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
448 
449 			HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
450 												&loctup, buffer, snapshot);
451 
452 			if (valid)
453 				scan->rs_vistuples[ntup++] = lineoff;
454 		}
455 	}
456 
457 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
458 
459 	Assert(ntup <= MaxHeapTuplesPerPage);
460 	scan->rs_ntuples = ntup;
461 }
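
/*
 * In page-at-a-time mode, a consumer of the state set up above can then walk
 * the visible tuples without re-checking visibility, roughly as
 * heapgettup_pagemode does below (a sketch only):
 *
 *		Page		dp = BufferGetPage(scan->rs_cbuf);
 *
 *		for (int i = 0; i < scan->rs_ntuples; i++)
 *		{
 *			OffsetNumber offnum = scan->rs_vistuples[i];
 *			ItemId		lpp = PageGetItemId(dp, offnum);
 *
 *			... build a HeapTuple from PageGetItem(dp, lpp) ...
 *		}
 */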
462 
463 /* ----------------
464  *		heapgettup - fetch next heap tuple
465  *
466  *		Initialize the scan if not already done; then advance to the next
467  *		tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
468  *		or set scan->rs_ctup.t_data = NULL if no more tuples.
469  *
470  * dir == NoMovementScanDirection means "re-fetch the tuple indicated
471  * by scan->rs_ctup".
472  *
473  * Note: the reason nkeys/key are passed separately, even though they are
474  * kept in the scan descriptor, is that the caller may not want us to check
475  * the scankeys.
476  *
477  * Note: when we fall off the end of the scan in either direction, we
478  * reset rs_inited.  This means that a further request with the same
479  * scan direction will restart the scan, which is a bit odd, but a
480  * request with the opposite scan direction will start a fresh scan
481  * in the proper direction.  The latter is required behavior for cursors,
482  * while the former case is generally undefined behavior in Postgres
483  * so we don't care too much.
484  * ----------------
485  */
486 static void
heapgettup(HeapScanDesc scan,
488 		   ScanDirection dir,
489 		   int nkeys,
490 		   ScanKey key)
491 {
492 	HeapTuple	tuple = &(scan->rs_ctup);
493 	Snapshot	snapshot = scan->rs_base.rs_snapshot;
494 	bool		backward = ScanDirectionIsBackward(dir);
495 	BlockNumber page;
496 	bool		finished;
497 	Page		dp;
498 	int			lines;
499 	OffsetNumber lineoff;
500 	int			linesleft;
501 	ItemId		lpp;
502 
503 	/*
504 	 * calculate next starting lineoff, given scan direction
505 	 */
506 	if (ScanDirectionIsForward(dir))
507 	{
508 		if (!scan->rs_inited)
509 		{
510 			/*
511 			 * return null immediately if relation is empty
512 			 */
513 			if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
514 			{
515 				Assert(!BufferIsValid(scan->rs_cbuf));
516 				tuple->t_data = NULL;
517 				return;
518 			}
519 			if (scan->rs_base.rs_parallel != NULL)
520 			{
521 				ParallelBlockTableScanDesc pbscan =
522 				(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
523 
524 				table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
525 														 pbscan);
526 
527 				page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
528 														 pbscan);
529 
530 				/* Other processes might have already finished the scan. */
531 				if (page == InvalidBlockNumber)
532 				{
533 					Assert(!BufferIsValid(scan->rs_cbuf));
534 					tuple->t_data = NULL;
535 					return;
536 				}
537 			}
538 			else
539 				page = scan->rs_startblock; /* first page */
540 			heapgetpage((TableScanDesc) scan, page);
541 			lineoff = FirstOffsetNumber;	/* first offnum */
542 			scan->rs_inited = true;
543 		}
544 		else
545 		{
546 			/* continue from previously returned page/tuple */
547 			page = scan->rs_cblock; /* current page */
548 			lineoff =			/* next offnum */
549 				OffsetNumberNext(ItemPointerGetOffsetNumber(&(tuple->t_self)));
550 		}
551 
552 		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
553 
554 		dp = BufferGetPage(scan->rs_cbuf);
555 		TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
556 		lines = PageGetMaxOffsetNumber(dp);
557 		/* page and lineoff now reference the physically next tid */
558 
559 		linesleft = lines - lineoff + 1;
560 	}
561 	else if (backward)
562 	{
563 		/* backward parallel scan not supported */
564 		Assert(scan->rs_base.rs_parallel == NULL);
565 
566 		if (!scan->rs_inited)
567 		{
568 			/*
569 			 * return null immediately if relation is empty
570 			 */
571 			if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
572 			{
573 				Assert(!BufferIsValid(scan->rs_cbuf));
574 				tuple->t_data = NULL;
575 				return;
576 			}
577 
578 			/*
579 			 * Disable reporting to syncscan logic in a backwards scan; it's
580 			 * not very likely anyone else is doing the same thing at the same
581 			 * time, and much more likely that we'll just bollix things for
582 			 * forward scanners.
583 			 */
584 			scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
585 
586 			/*
587 			 * Start from last page of the scan.  Ensure we take into account
588 			 * rs_numblocks if it's been adjusted by heap_setscanlimits().
589 			 */
590 			if (scan->rs_numblocks != InvalidBlockNumber)
591 				page = (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
592 			else if (scan->rs_startblock > 0)
593 				page = scan->rs_startblock - 1;
594 			else
595 				page = scan->rs_nblocks - 1;
596 			heapgetpage((TableScanDesc) scan, page);
597 		}
598 		else
599 		{
600 			/* continue from previously returned page/tuple */
601 			page = scan->rs_cblock; /* current page */
602 		}
603 
604 		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
605 
606 		dp = BufferGetPage(scan->rs_cbuf);
607 		TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
608 		lines = PageGetMaxOffsetNumber(dp);
609 
610 		if (!scan->rs_inited)
611 		{
612 			lineoff = lines;	/* final offnum */
613 			scan->rs_inited = true;
614 		}
615 		else
616 		{
617 			lineoff =			/* previous offnum */
618 				OffsetNumberPrev(ItemPointerGetOffsetNumber(&(tuple->t_self)));
619 		}
620 		/* page and lineoff now reference the physically previous tid */
621 
622 		linesleft = lineoff;
623 	}
624 	else
625 	{
626 		/*
627 		 * ``no movement'' scan direction: refetch prior tuple
628 		 */
629 		if (!scan->rs_inited)
630 		{
631 			Assert(!BufferIsValid(scan->rs_cbuf));
632 			tuple->t_data = NULL;
633 			return;
634 		}
635 
636 		page = ItemPointerGetBlockNumber(&(tuple->t_self));
637 		if (page != scan->rs_cblock)
638 			heapgetpage((TableScanDesc) scan, page);
639 
640 		/* Since the tuple was previously fetched, needn't lock page here */
641 		dp = BufferGetPage(scan->rs_cbuf);
642 		TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
643 		lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
644 		lpp = PageGetItemId(dp, lineoff);
645 		Assert(ItemIdIsNormal(lpp));
646 
647 		tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
648 		tuple->t_len = ItemIdGetLength(lpp);
649 
650 		return;
651 	}
652 
653 	/*
654 	 * advance the scan until we find a qualifying tuple or run out of stuff
655 	 * to scan
656 	 */
657 	lpp = PageGetItemId(dp, lineoff);
658 	for (;;)
659 	{
660 		while (linesleft > 0)
661 		{
662 			if (ItemIdIsNormal(lpp))
663 			{
664 				bool		valid;
665 
666 				tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
667 				tuple->t_len = ItemIdGetLength(lpp);
668 				ItemPointerSet(&(tuple->t_self), page, lineoff);
669 
670 				/*
671 				 * if current tuple qualifies, return it.
672 				 */
673 				valid = HeapTupleSatisfiesVisibility(tuple,
674 													 snapshot,
675 													 scan->rs_cbuf);
676 
677 				HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
678 													tuple, scan->rs_cbuf,
679 													snapshot);
680 
681 				if (valid && key != NULL)
682 					HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
683 								nkeys, key, valid);
684 
685 				if (valid)
686 				{
687 					LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
688 					return;
689 				}
690 			}
691 
692 			/*
693 			 * otherwise move to the next item on the page
694 			 */
695 			--linesleft;
696 			if (backward)
697 			{
698 				--lpp;			/* move back in this page's ItemId array */
699 				--lineoff;
700 			}
701 			else
702 			{
703 				++lpp;			/* move forward in this page's ItemId array */
704 				++lineoff;
705 			}
706 		}
707 
708 		/*
709 		 * if we get here, it means we've exhausted the items on this page and
710 		 * it's time to move to the next.
711 		 */
712 		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
713 
714 		/*
715 		 * advance to next/prior page and detect end of scan
716 		 */
717 		if (backward)
718 		{
719 			finished = (page == scan->rs_startblock) ||
720 				(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
721 			if (page == 0)
722 				page = scan->rs_nblocks;
723 			page--;
724 		}
725 		else if (scan->rs_base.rs_parallel != NULL)
726 		{
727 			ParallelBlockTableScanDesc pbscan =
728 			(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
729 
730 			page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
731 													 pbscan);
732 			finished = (page == InvalidBlockNumber);
733 		}
734 		else
735 		{
736 			page++;
737 			if (page >= scan->rs_nblocks)
738 				page = 0;
739 			finished = (page == scan->rs_startblock) ||
740 				(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
741 
742 			/*
743 			 * Report our new scan position for synchronization purposes. We
744 			 * don't do that when moving backwards, however. That would just
745 			 * mess up any other forward-moving scanners.
746 			 *
747 			 * Note: we do this before checking for end of scan so that the
748 			 * final state of the position hint is back at the start of the
749 			 * rel.  That's not strictly necessary, but otherwise when you run
750 			 * the same query multiple times the starting position would shift
751 			 * a little bit backwards on every invocation, which is confusing.
752 			 * We don't guarantee any specific ordering in general, though.
753 			 */
754 			if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
755 				ss_report_location(scan->rs_base.rs_rd, page);
756 		}
757 
758 		/*
759 		 * return NULL if we've exhausted all the pages
760 		 */
761 		if (finished)
762 		{
763 			if (BufferIsValid(scan->rs_cbuf))
764 				ReleaseBuffer(scan->rs_cbuf);
765 			scan->rs_cbuf = InvalidBuffer;
766 			scan->rs_cblock = InvalidBlockNumber;
767 			tuple->t_data = NULL;
768 			scan->rs_inited = false;
769 			return;
770 		}
771 
772 		heapgetpage((TableScanDesc) scan, page);
773 
774 		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
775 
776 		dp = BufferGetPage(scan->rs_cbuf);
777 		TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
778 		lines = PageGetMaxOffsetNumber((Page) dp);
779 		linesleft = lines;
780 		if (backward)
781 		{
782 			lineoff = lines;
783 			lpp = PageGetItemId(dp, lines);
784 		}
785 		else
786 		{
787 			lineoff = FirstOffsetNumber;
788 			lpp = PageGetItemId(dp, FirstOffsetNumber);
789 		}
790 	}
791 }
792 
793 /* ----------------
794  *		heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
795  *
796  *		Same API as heapgettup, but used in page-at-a-time mode
797  *
798  * The internal logic is much the same as heapgettup's too, but there are some
799  * differences: we do not take the buffer content lock (that only needs to
800  * happen inside heapgetpage), and we iterate through just the tuples listed
801  * in rs_vistuples[] rather than all tuples on the page.  Notice that
802  * lineindex is 0-based, where the corresponding loop variable lineoff in
803  * heapgettup is 1-based.
804  * ----------------
805  */
806 static void
heapgettup_pagemode(HeapScanDesc scan,
808 					ScanDirection dir,
809 					int nkeys,
810 					ScanKey key)
811 {
812 	HeapTuple	tuple = &(scan->rs_ctup);
813 	bool		backward = ScanDirectionIsBackward(dir);
814 	BlockNumber page;
815 	bool		finished;
816 	Page		dp;
817 	int			lines;
818 	int			lineindex;
819 	OffsetNumber lineoff;
820 	int			linesleft;
821 	ItemId		lpp;
822 
823 	/*
824 	 * calculate next starting lineindex, given scan direction
825 	 */
826 	if (ScanDirectionIsForward(dir))
827 	{
828 		if (!scan->rs_inited)
829 		{
830 			/*
831 			 * return null immediately if relation is empty
832 			 */
833 			if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
834 			{
835 				Assert(!BufferIsValid(scan->rs_cbuf));
836 				tuple->t_data = NULL;
837 				return;
838 			}
839 			if (scan->rs_base.rs_parallel != NULL)
840 			{
841 				ParallelBlockTableScanDesc pbscan =
842 				(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
843 
844 				table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
845 														 pbscan);
846 
847 				page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
848 														 pbscan);
849 
850 				/* Other processes might have already finished the scan. */
851 				if (page == InvalidBlockNumber)
852 				{
853 					Assert(!BufferIsValid(scan->rs_cbuf));
854 					tuple->t_data = NULL;
855 					return;
856 				}
857 			}
858 			else
859 				page = scan->rs_startblock; /* first page */
860 			heapgetpage((TableScanDesc) scan, page);
861 			lineindex = 0;
862 			scan->rs_inited = true;
863 		}
864 		else
865 		{
866 			/* continue from previously returned page/tuple */
867 			page = scan->rs_cblock; /* current page */
868 			lineindex = scan->rs_cindex + 1;
869 		}
870 
871 		dp = BufferGetPage(scan->rs_cbuf);
872 		TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
873 		lines = scan->rs_ntuples;
874 		/* page and lineindex now reference the next visible tid */
875 
876 		linesleft = lines - lineindex;
877 	}
878 	else if (backward)
879 	{
880 		/* backward parallel scan not supported */
881 		Assert(scan->rs_base.rs_parallel == NULL);
882 
883 		if (!scan->rs_inited)
884 		{
885 			/*
886 			 * return null immediately if relation is empty
887 			 */
888 			if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
889 			{
890 				Assert(!BufferIsValid(scan->rs_cbuf));
891 				tuple->t_data = NULL;
892 				return;
893 			}
894 
895 			/*
896 			 * Disable reporting to syncscan logic in a backwards scan; it's
897 			 * not very likely anyone else is doing the same thing at the same
898 			 * time, and much more likely that we'll just bollix things for
899 			 * forward scanners.
900 			 */
901 			scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
902 
903 			/*
904 			 * Start from last page of the scan.  Ensure we take into account
905 			 * rs_numblocks if it's been adjusted by heap_setscanlimits().
906 			 */
907 			if (scan->rs_numblocks != InvalidBlockNumber)
908 				page = (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
909 			else if (scan->rs_startblock > 0)
910 				page = scan->rs_startblock - 1;
911 			else
912 				page = scan->rs_nblocks - 1;
913 			heapgetpage((TableScanDesc) scan, page);
914 		}
915 		else
916 		{
917 			/* continue from previously returned page/tuple */
918 			page = scan->rs_cblock; /* current page */
919 		}
920 
921 		dp = BufferGetPage(scan->rs_cbuf);
922 		TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
923 		lines = scan->rs_ntuples;
924 
925 		if (!scan->rs_inited)
926 		{
927 			lineindex = lines - 1;
928 			scan->rs_inited = true;
929 		}
930 		else
931 		{
932 			lineindex = scan->rs_cindex - 1;
933 		}
934 		/* page and lineindex now reference the previous visible tid */
935 
936 		linesleft = lineindex + 1;
937 	}
938 	else
939 	{
940 		/*
941 		 * ``no movement'' scan direction: refetch prior tuple
942 		 */
943 		if (!scan->rs_inited)
944 		{
945 			Assert(!BufferIsValid(scan->rs_cbuf));
946 			tuple->t_data = NULL;
947 			return;
948 		}
949 
950 		page = ItemPointerGetBlockNumber(&(tuple->t_self));
951 		if (page != scan->rs_cblock)
952 			heapgetpage((TableScanDesc) scan, page);
953 
954 		/* Since the tuple was previously fetched, needn't lock page here */
955 		dp = BufferGetPage(scan->rs_cbuf);
956 		TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
957 		lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
958 		lpp = PageGetItemId(dp, lineoff);
959 		Assert(ItemIdIsNormal(lpp));
960 
961 		tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
962 		tuple->t_len = ItemIdGetLength(lpp);
963 
964 		/* check that rs_cindex is in sync */
965 		Assert(scan->rs_cindex < scan->rs_ntuples);
966 		Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);
967 
968 		return;
969 	}
970 
971 	/*
972 	 * advance the scan until we find a qualifying tuple or run out of stuff
973 	 * to scan
974 	 */
975 	for (;;)
976 	{
977 		while (linesleft > 0)
978 		{
979 			lineoff = scan->rs_vistuples[lineindex];
980 			lpp = PageGetItemId(dp, lineoff);
981 			Assert(ItemIdIsNormal(lpp));
982 
983 			tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
984 			tuple->t_len = ItemIdGetLength(lpp);
985 			ItemPointerSet(&(tuple->t_self), page, lineoff);
986 
987 			/*
988 			 * if current tuple qualifies, return it.
989 			 */
990 			if (key != NULL)
991 			{
992 				bool		valid;
993 
994 				HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
995 							nkeys, key, valid);
996 				if (valid)
997 				{
998 					scan->rs_cindex = lineindex;
999 					return;
1000 				}
1001 			}
1002 			else
1003 			{
1004 				scan->rs_cindex = lineindex;
1005 				return;
1006 			}
1007 
1008 			/*
1009 			 * otherwise move to the next item on the page
1010 			 */
1011 			--linesleft;
1012 			if (backward)
1013 				--lineindex;
1014 			else
1015 				++lineindex;
1016 		}
1017 
1018 		/*
1019 		 * if we get here, it means we've exhausted the items on this page and
1020 		 * it's time to move to the next.
1021 		 */
1022 		if (backward)
1023 		{
1024 			finished = (page == scan->rs_startblock) ||
1025 				(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1026 			if (page == 0)
1027 				page = scan->rs_nblocks;
1028 			page--;
1029 		}
1030 		else if (scan->rs_base.rs_parallel != NULL)
1031 		{
1032 			ParallelBlockTableScanDesc pbscan =
1033 			(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
1034 
1035 			page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
1036 													 pbscan);
1037 			finished = (page == InvalidBlockNumber);
1038 		}
1039 		else
1040 		{
1041 			page++;
1042 			if (page >= scan->rs_nblocks)
1043 				page = 0;
1044 			finished = (page == scan->rs_startblock) ||
1045 				(scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1046 
1047 			/*
1048 			 * Report our new scan position for synchronization purposes. We
1049 			 * don't do that when moving backwards, however. That would just
1050 			 * mess up any other forward-moving scanners.
1051 			 *
1052 			 * Note: we do this before checking for end of scan so that the
1053 			 * final state of the position hint is back at the start of the
1054 			 * rel.  That's not strictly necessary, but otherwise when you run
1055 			 * the same query multiple times the starting position would shift
1056 			 * a little bit backwards on every invocation, which is confusing.
1057 			 * We don't guarantee any specific ordering in general, though.
1058 			 */
1059 			if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
1060 				ss_report_location(scan->rs_base.rs_rd, page);
1061 		}
1062 
1063 		/*
1064 		 * return NULL if we've exhausted all the pages
1065 		 */
1066 		if (finished)
1067 		{
1068 			if (BufferIsValid(scan->rs_cbuf))
1069 				ReleaseBuffer(scan->rs_cbuf);
1070 			scan->rs_cbuf = InvalidBuffer;
1071 			scan->rs_cblock = InvalidBlockNumber;
1072 			tuple->t_data = NULL;
1073 			scan->rs_inited = false;
1074 			return;
1075 		}
1076 
1077 		heapgetpage((TableScanDesc) scan, page);
1078 
1079 		dp = BufferGetPage(scan->rs_cbuf);
1080 		TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
1081 		lines = scan->rs_ntuples;
1082 		linesleft = lines;
1083 		if (backward)
1084 			lineindex = lines - 1;
1085 		else
1086 			lineindex = 0;
1087 	}
1088 }
1089 
1090 
1091 #if defined(DISABLE_COMPLEX_MACRO)
1092 /*
 * This is formatted oddly so that the correspondence to the macro
 * definition in access/htup_details.h is maintained.
1095  */
1096 Datum
fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
1098 			bool *isnull)
1099 {
1100 	return (
1101 			(attnum) > 0 ?
1102 			(
1103 			 (*(isnull) = false),
1104 			 HeapTupleNoNulls(tup) ?
1105 			 (
1106 			  TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff >= 0 ?
1107 			  (
1108 			   fetchatt(TupleDescAttr((tupleDesc), (attnum) - 1),
1109 						(char *) (tup)->t_data + (tup)->t_data->t_hoff +
1110 						TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff)
1111 			   )
1112 			  :
1113 			  nocachegetattr((tup), (attnum), (tupleDesc))
1114 			  )
1115 			 :
1116 			 (
1117 			  att_isnull((attnum) - 1, (tup)->t_data->t_bits) ?
1118 			  (
1119 			   (*(isnull) = true),
1120 			   (Datum) NULL
1121 			   )
1122 			  :
1123 			  (
1124 			   nocachegetattr((tup), (attnum), (tupleDesc))
1125 			   )
1126 			  )
1127 			 )
1128 			:
1129 			(
1130 			 (Datum) NULL
1131 			 )
1132 		);
1133 }
1134 #endif							/* defined(DISABLE_COMPLEX_MACRO) */
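
/*
 * For readability, the expression above corresponds roughly to the following
 * control flow (a sketch only; it mirrors the fastgetattr macro in
 * access/htup_details.h and assumes attnum refers to a user attribute, since
 * for attnum <= 0 the function simply returns (Datum) NULL):
 *
 *		*isnull = false;
 *		if (HeapTupleNoNulls(tup))
 *		{
 *			if (TupleDescAttr(tupleDesc, attnum - 1)->attcacheoff >= 0)
 *				return fetchatt(TupleDescAttr(tupleDesc, attnum - 1),
 *								(char *) tup->t_data + tup->t_data->t_hoff +
 *								TupleDescAttr(tupleDesc, attnum - 1)->attcacheoff);
 *			return nocachegetattr(tup, attnum, tupleDesc);
 *		}
 *		if (att_isnull(attnum - 1, tup->t_data->t_bits))
 *		{
 *			*isnull = true;
 *			return (Datum) NULL;
 *		}
 *		return nocachegetattr(tup, attnum, tupleDesc);
 */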
1135 
1136 
1137 /* ----------------------------------------------------------------
1138  *					 heap access method interface
1139  * ----------------------------------------------------------------
1140  */
1141 
1142 
1143 TableScanDesc
heap_beginscan(Relation relation, Snapshot snapshot,
1145 			   int nkeys, ScanKey key,
1146 			   ParallelTableScanDesc parallel_scan,
1147 			   uint32 flags)
1148 {
1149 	HeapScanDesc scan;
1150 
1151 	/*
1152 	 * increment relation ref count while scanning relation
1153 	 *
1154 	 * This is just to make really sure the relcache entry won't go away while
1155 	 * the scan has a pointer to it.  Caller should be holding the rel open
1156 	 * anyway, so this is redundant in all normal scenarios...
1157 	 */
1158 	RelationIncrementReferenceCount(relation);
1159 
1160 	/*
1161 	 * allocate and initialize scan descriptor
1162 	 */
1163 	scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
1164 
1165 	scan->rs_base.rs_rd = relation;
1166 	scan->rs_base.rs_snapshot = snapshot;
1167 	scan->rs_base.rs_nkeys = nkeys;
1168 	scan->rs_base.rs_flags = flags;
1169 	scan->rs_base.rs_parallel = parallel_scan;
1170 	scan->rs_strategy = NULL;	/* set in initscan */
1171 
1172 	/*
	 * Disable page-at-a-time mode if it's not an MVCC-safe snapshot.
1174 	 */
1175 	if (!(snapshot && IsMVCCSnapshot(snapshot)))
1176 		scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
1177 
1178 	/*
1179 	 * For seqscan and sample scans in a serializable transaction, acquire a
1180 	 * predicate lock on the entire relation. This is required not only to
1181 	 * lock all the matching tuples, but also to conflict with new insertions
1182 	 * into the table. In an indexscan, we take page locks on the index pages
1183 	 * covering the range specified in the scan qual, but in a heap scan there
	 * is nothing more fine-grained to lock.  A bitmap scan is a different
	 * story: there we have already scanned the index and locked the index
1186 	 * pages covering the predicate. But in that case we still have to lock
1187 	 * any matching heap tuples. For sample scan we could optimize the locking
1188 	 * to be at least page-level granularity, but we'd need to add per-tuple
1189 	 * locking for that.
1190 	 */
1191 	if (scan->rs_base.rs_flags & (SO_TYPE_SEQSCAN | SO_TYPE_SAMPLESCAN))
1192 	{
1193 		/*
1194 		 * Ensure a missing snapshot is noticed reliably, even if the
1195 		 * isolation mode means predicate locking isn't performed (and
1196 		 * therefore the snapshot isn't used here).
1197 		 */
1198 		Assert(snapshot);
1199 		PredicateLockRelation(relation, snapshot);
1200 	}
1201 
1202 	/* we only need to set this up once */
1203 	scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1204 
1205 	/*
1206 	 * we do this here instead of in initscan() because heap_rescan also calls
1207 	 * initscan() and we don't want to allocate memory again
1208 	 */
1209 	if (nkeys > 0)
1210 		scan->rs_base.rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
1211 	else
1212 		scan->rs_base.rs_key = NULL;
1213 
1214 	initscan(scan, key, false);
1215 
1216 	return (TableScanDesc) scan;
1217 }
1218 
1219 void
heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params,
1221 			bool allow_strat, bool allow_sync, bool allow_pagemode)
1222 {
1223 	HeapScanDesc scan = (HeapScanDesc) sscan;
1224 
1225 	if (set_params)
1226 	{
1227 		if (allow_strat)
1228 			scan->rs_base.rs_flags |= SO_ALLOW_STRAT;
1229 		else
1230 			scan->rs_base.rs_flags &= ~SO_ALLOW_STRAT;
1231 
1232 		if (allow_sync)
1233 			scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
1234 		else
1235 			scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
1236 
1237 		if (allow_pagemode && scan->rs_base.rs_snapshot &&
1238 			IsMVCCSnapshot(scan->rs_base.rs_snapshot))
1239 			scan->rs_base.rs_flags |= SO_ALLOW_PAGEMODE;
1240 		else
1241 			scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
1242 	}
1243 
1244 	/*
1245 	 * unpin scan buffers
1246 	 */
1247 	if (BufferIsValid(scan->rs_cbuf))
1248 		ReleaseBuffer(scan->rs_cbuf);
1249 
1250 	/*
1251 	 * reinitialize scan descriptor
1252 	 */
1253 	initscan(scan, key, true);
1254 }
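
/*
 * A rough usage sketch: rewinding a scan (e.g. for a rewound cursor, normally
 * via the table_rescan wrapper) without changing the scan options:
 *
 *		heap_rescan(scan, NULL, false, false, false, false);
 *
 * With set_params passed as false, the three allow_* arguments are ignored
 * and the flags chosen at heap_beginscan time remain in effect.
 */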
1255 
1256 void
heap_endscan(TableScanDesc sscan)
1258 {
1259 	HeapScanDesc scan = (HeapScanDesc) sscan;
1260 
1261 	/* Note: no locking manipulations needed */
1262 
1263 	/*
1264 	 * unpin scan buffers
1265 	 */
1266 	if (BufferIsValid(scan->rs_cbuf))
1267 		ReleaseBuffer(scan->rs_cbuf);
1268 
1269 	/*
1270 	 * decrement relation reference count and free scan descriptor storage
1271 	 */
1272 	RelationDecrementReferenceCount(scan->rs_base.rs_rd);
1273 
1274 	if (scan->rs_base.rs_key)
1275 		pfree(scan->rs_base.rs_key);
1276 
1277 	if (scan->rs_strategy != NULL)
1278 		FreeAccessStrategy(scan->rs_strategy);
1279 
1280 	if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
1281 		UnregisterSnapshot(scan->rs_base.rs_snapshot);
1282 
1283 	pfree(scan);
1284 }
1285 
1286 HeapTuple
heap_getnext(TableScanDesc sscan, ScanDirection direction)
1288 {
1289 	HeapScanDesc scan = (HeapScanDesc) sscan;
1290 
1291 	/*
	 * This is still widely used directly, without going through the table AM
	 * interface, so add a safety check.  It's possible we should, at a later
	 * point, downgrade this to an assert.  The reason for checking the AM
	 * routine, rather than the AM oid, is that this allows writing regression
	 * tests that create another AM reusing the heap handler.
1297 	 */
1298 	if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine()))
1299 		ereport(ERROR,
1300 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1301 				 errmsg_internal("only heap AM is supported")));
1302 
1303 	/* Note: no locking manipulations needed */
1304 
1305 	if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)
1306 		heapgettup_pagemode(scan, direction,
1307 							scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1308 	else
1309 		heapgettup(scan, direction,
1310 				   scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1311 
1312 	if (scan->rs_ctup.t_data == NULL)
1313 		return NULL;
1314 
1315 	/*
1316 	 * if we get here it means we have a new current scan tuple, so point to
1317 	 * the proper return buffer and return the tuple.
1318 	 */
1319 
1320 	pgstat_count_heap_getnext(scan->rs_base.rs_rd);
1321 
1322 	return &scan->rs_ctup;
1323 }
1324 
1325 bool
heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
1327 {
1328 	HeapScanDesc scan = (HeapScanDesc) sscan;
1329 
1330 	/* Note: no locking manipulations needed */
1331 
1332 	if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1333 		heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1334 	else
1335 		heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1336 
1337 	if (scan->rs_ctup.t_data == NULL)
1338 	{
1339 		ExecClearTuple(slot);
1340 		return false;
1341 	}
1342 
1343 	/*
1344 	 * if we get here it means we have a new current scan tuple, so point to
1345 	 * the proper return buffer and return the tuple.
1346 	 */
1347 
1348 	pgstat_count_heap_getnext(scan->rs_base.rs_rd);
1349 
1350 	ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
1351 							 scan->rs_cbuf);
1352 	return true;
1353 }
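
/*
 * A rough sketch of the slot-based variant, assuming an open relation "rel",
 * an active scan "scan", and an executor context that can own slots
 * (placeholder names); callers normally use the table_scan_getnextslot
 * wrapper:
 *
 *		TupleTableSlot *slot = table_slot_create(rel, NULL);
 *
 *		while (heap_getnextslot(scan, ForwardScanDirection, slot))
 *		{
 *			... use slot_getattr() etc. on slot ...
 *		}
 *		ExecDropSingleTupleTableSlot(slot);
 */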
1354 
1355 /*
1356  *	heap_fetch		- retrieve tuple with given tid
1357  *
1358  * On entry, tuple->t_self is the TID to fetch.  We pin the buffer holding
1359  * the tuple, fill in the remaining fields of *tuple, and check the tuple
1360  * against the specified snapshot.
1361  *
1362  * If successful (tuple found and passes snapshot time qual), then *userbuf
1363  * is set to the buffer holding the tuple and true is returned.  The caller
1364  * must unpin the buffer when done with the tuple.
1365  *
1366  * If the tuple is not found (ie, item number references a deleted slot),
1367  * then tuple->t_data is set to NULL and false is returned.
1368  *
1369  * If the tuple is found but fails the time qual check, then false is returned
1370  * but tuple->t_data is left pointing to the tuple.
1371  *
1372  * heap_fetch does not follow HOT chains: only the exact TID requested will
1373  * be fetched.
1374  *
1375  * It is somewhat inconsistent that we ereport() on invalid block number but
1376  * return false on invalid item number.  There are a couple of reasons though.
1377  * One is that the caller can relatively easily check the block number for
1378  * validity, but cannot check the item number without reading the page
 * first.  Another is that when we are following a t_ctid link, we can be
1380  * reasonably confident that the page number is valid (since VACUUM shouldn't
1381  * truncate off the destination page without having killed the referencing
1382  * tuple first), but the item number might well not be good.
1383  */
1384 bool
heap_fetch(Relation relation,
1386 		   Snapshot snapshot,
1387 		   HeapTuple tuple,
1388 		   Buffer *userbuf)
1389 {
1390 	ItemPointer tid = &(tuple->t_self);
1391 	ItemId		lp;
1392 	Buffer		buffer;
1393 	Page		page;
1394 	OffsetNumber offnum;
1395 	bool		valid;
1396 
1397 	/*
1398 	 * Fetch and pin the appropriate page of the relation.
1399 	 */
1400 	buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1401 
1402 	/*
1403 	 * Need share lock on buffer to examine tuple commit status.
1404 	 */
1405 	LockBuffer(buffer, BUFFER_LOCK_SHARE);
1406 	page = BufferGetPage(buffer);
1407 	TestForOldSnapshot(snapshot, relation, page);
1408 
1409 	/*
	 * We'd better check for an out-of-range offnum, in case the page has been
	 * modified (e.g., by VACUUM) since the TID was obtained.
1412 	 */
1413 	offnum = ItemPointerGetOffsetNumber(tid);
1414 	if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1415 	{
1416 		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1417 		ReleaseBuffer(buffer);
1418 		*userbuf = InvalidBuffer;
1419 		tuple->t_data = NULL;
1420 		return false;
1421 	}
1422 
1423 	/*
1424 	 * get the item line pointer corresponding to the requested tid
1425 	 */
1426 	lp = PageGetItemId(page, offnum);
1427 
1428 	/*
1429 	 * Must check for deleted tuple.
1430 	 */
1431 	if (!ItemIdIsNormal(lp))
1432 	{
1433 		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1434 		ReleaseBuffer(buffer);
1435 		*userbuf = InvalidBuffer;
1436 		tuple->t_data = NULL;
1437 		return false;
1438 	}
1439 
1440 	/*
1441 	 * fill in *tuple fields
1442 	 */
1443 	tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1444 	tuple->t_len = ItemIdGetLength(lp);
1445 	tuple->t_tableOid = RelationGetRelid(relation);
1446 
1447 	/*
1448 	 * check tuple visibility, then release lock
1449 	 */
1450 	valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1451 
1452 	if (valid)
1453 		PredicateLockTID(relation, &(tuple->t_self), snapshot,
1454 						 HeapTupleHeaderGetXmin(tuple->t_data));
1455 
1456 	HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1457 
1458 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1459 
1460 	if (valid)
1461 	{
1462 		/*
1463 		 * All checks passed, so return the tuple as valid. Caller is now
1464 		 * responsible for releasing the buffer.
1465 		 */
1466 		*userbuf = buffer;
1467 
1468 		return true;
1469 	}
1470 
1471 	/* Tuple failed time qual */
1472 	ReleaseBuffer(buffer);
1473 	*userbuf = InvalidBuffer;
1474 
1475 	return false;
1476 }
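
/*
 * A rough usage sketch of the contract described above, assuming an open
 * relation "rel", a snapshot "snapshot", and a TID pointer "tid" (placeholder
 * names):
 *
 *		HeapTupleData tuple;
 *		Buffer		buf;
 *
 *		tuple.t_self = *tid;
 *		if (heap_fetch(rel, snapshot, &tuple, &buf))
 *		{
 *			... examine tuple.t_data while holding the buffer pin ...
 *			ReleaseBuffer(buf);
 *		}
 */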
1477 
1478 /*
1479  *	heap_hot_search_buffer	- search HOT chain for tuple satisfying snapshot
1480  *
1481  * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1482  * of a HOT chain), and buffer is the buffer holding this tuple.  We search
1483  * for the first chain member satisfying the given snapshot.  If one is
1484  * found, we update *tid to reference that tuple's offset number, and
1485  * return true.  If no match, return false without modifying *tid.
1486  *
1487  * heapTuple is a caller-supplied buffer.  When a match is found, we return
1488  * the tuple here, in addition to updating *tid.  If no match is found, the
1489  * contents of this buffer on return are undefined.
1490  *
1491  * If all_dead is not NULL, we check non-visible tuples to see if they are
1492  * globally dead; *all_dead is set true if all members of the HOT chain
1493  * are vacuumable, false if not.
1494  *
1495  * Unlike heap_fetch, the caller must already have pin and (at least) share
1496  * lock on the buffer; it is still pinned/locked at exit.  Also unlike
1497  * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
1498  */
1499 bool
heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
1501 					   Snapshot snapshot, HeapTuple heapTuple,
1502 					   bool *all_dead, bool first_call)
1503 {
1504 	Page		dp = (Page) BufferGetPage(buffer);
1505 	TransactionId prev_xmax = InvalidTransactionId;
1506 	BlockNumber blkno;
1507 	OffsetNumber offnum;
1508 	bool		at_chain_start;
1509 	bool		valid;
1510 	bool		skip;
1511 
1512 	/* If this is not the first call, previous call returned a (live!) tuple */
1513 	if (all_dead)
1514 		*all_dead = first_call;
1515 
1516 	blkno = ItemPointerGetBlockNumber(tid);
1517 	offnum = ItemPointerGetOffsetNumber(tid);
1518 	at_chain_start = first_call;
1519 	skip = !first_call;
1520 
1521 	Assert(TransactionIdIsValid(RecentGlobalXmin));
1522 	Assert(BufferGetBlockNumber(buffer) == blkno);
1523 
1524 	/* Scan through possible multiple members of HOT-chain */
1525 	for (;;)
1526 	{
1527 		ItemId		lp;
1528 
1529 		/* check for bogus TID */
1530 		if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
1531 			break;
1532 
1533 		lp = PageGetItemId(dp, offnum);
1534 
1535 		/* check for unused, dead, or redirected items */
1536 		if (!ItemIdIsNormal(lp))
1537 		{
1538 			/* We should only see a redirect at start of chain */
1539 			if (ItemIdIsRedirected(lp) && at_chain_start)
1540 			{
1541 				/* Follow the redirect */
1542 				offnum = ItemIdGetRedirect(lp);
1543 				at_chain_start = false;
1544 				continue;
1545 			}
1546 			/* else must be end of chain */
1547 			break;
1548 		}
1549 
1550 		/*
1551 		 * Update heapTuple to point to the element of the HOT chain we're
1552 		 * currently investigating. Having t_self set correctly is important
1553 		 * because the SSI checks and the *Satisfies routine for historical
1554 		 * MVCC snapshots need the correct tid to decide about the visibility.
1555 		 */
1556 		heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
1557 		heapTuple->t_len = ItemIdGetLength(lp);
1558 		heapTuple->t_tableOid = RelationGetRelid(relation);
1559 		ItemPointerSet(&heapTuple->t_self, blkno, offnum);
1560 
1561 		/*
1562 		 * Shouldn't see a HEAP_ONLY tuple at chain start.
1563 		 */
1564 		if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
1565 			break;
1566 
1567 		/*
1568 		 * The xmin should match the previous xmax value, else chain is
1569 		 * broken.
1570 		 */
1571 		if (TransactionIdIsValid(prev_xmax) &&
1572 			!TransactionIdEquals(prev_xmax,
1573 								 HeapTupleHeaderGetXmin(heapTuple->t_data)))
1574 			break;
1575 
1576 		/*
1577 		 * When first_call is true (and thus, skip is initially false) we'll
1578 		 * return the first tuple we find.  But on later passes, heapTuple
1579 		 * will initially be pointing to the tuple we returned last time.
1580 		 * Returning it again would be incorrect (and would loop forever), so
1581 		 * we skip it and return the next match we find.
1582 		 */
1583 		if (!skip)
1584 		{
1585 			/* If it's visible per the snapshot, we must return it */
1586 			valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
1587 			HeapCheckForSerializableConflictOut(valid, relation, heapTuple,
1588 												buffer, snapshot);
1589 
1590 			if (valid)
1591 			{
1592 				ItemPointerSetOffsetNumber(tid, offnum);
1593 				PredicateLockTID(relation, &heapTuple->t_self, snapshot,
1594 								 HeapTupleHeaderGetXmin(heapTuple->t_data));
1595 				if (all_dead)
1596 					*all_dead = false;
1597 				return true;
1598 			}
1599 		}
1600 		skip = false;
1601 
1602 		/*
1603 		 * If we can't see it, maybe no one else can either.  At caller
1604 		 * request, check whether all chain members are dead to all
1605 		 * transactions.
1606 		 *
1607 		 * Note: if you change the criterion here for what is "dead", fix the
1608 		 * planner's get_actual_variable_range() function to match.
1609 		 */
1610 		if (all_dead && *all_dead &&
1611 			!HeapTupleIsSurelyDead(heapTuple, RecentGlobalXmin))
1612 			*all_dead = false;
1613 
1614 		/*
1615 		 * Check to see if HOT chain continues past this tuple; if so fetch
1616 		 * the next offnum and loop around.
1617 		 */
1618 		if (HeapTupleIsHotUpdated(heapTuple))
1619 		{
1620 			Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
1621 				   blkno);
1622 			offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
1623 			at_chain_start = false;
1624 			prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
1625 		}
1626 		else
1627 			break;				/* end of chain */
1628 	}
1629 
1630 	return false;
1631 }
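
/*
 * A rough caller-side sketch, following the locking contract described in the
 * header comment above ("rel", "snapshot" and "tid" are placeholder names):
 *
 *		Buffer		buf;
 *		HeapTupleData heapTuple;
 *		bool		all_dead;
 *		bool		found;
 *
 *		buf = ReadBuffer(rel, ItemPointerGetBlockNumber(&tid));
 *		LockBuffer(buf, BUFFER_LOCK_SHARE);
 *		found = heap_hot_search_buffer(&tid, rel, buf, snapshot, &heapTuple,
 *									   &all_dead, true);
 *		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 *		ReleaseBuffer(buf);
 */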
1632 
1633 /*
1634  *	heap_get_latest_tid -  get the latest tid of a specified tuple
1635  *
1636  * Actually, this gets the latest version that is visible according to the
1637  * scan's snapshot.  Create a scan using SnapshotDirty to get the very latest,
1638  * possibly uncommitted version.
1639  *
1640  * *tid is both an input and an output parameter: it is updated to
1641  * show the latest version of the row.  Note that it will not be changed
1642  * if no version of the row passes the snapshot test.
1643  */
1644 void
heap_get_latest_tid(TableScanDesc sscan,
1646 					ItemPointer tid)
1647 {
1648 	Relation	relation = sscan->rs_rd;
1649 	Snapshot	snapshot = sscan->rs_snapshot;
1650 	ItemPointerData ctid;
1651 	TransactionId priorXmax;
1652 
1653 	/*
1654 	 * table_tuple_get_latest_tid() verified that the passed in tid is valid.
	 * We do assume that t_ctid links are valid, however; there shouldn't be
	 * invalid ones in the table.
1657 	 */
1658 	Assert(ItemPointerIsValid(tid));
1659 
1660 	/*
1661 	 * Loop to chase down t_ctid links.  At top of loop, ctid is the tuple we
1662 	 * need to examine, and *tid is the TID we will return if ctid turns out
1663 	 * to be bogus.
1664 	 *
1665 	 * Note that we will loop until we reach the end of the t_ctid chain.
1666 	 * Depending on the snapshot passed, there might be at most one visible
1667 	 * version of the row, but we don't try to optimize for that.
1668 	 */
1669 	ctid = *tid;
1670 	priorXmax = InvalidTransactionId;	/* cannot check first XMIN */
1671 	for (;;)
1672 	{
1673 		Buffer		buffer;
1674 		Page		page;
1675 		OffsetNumber offnum;
1676 		ItemId		lp;
1677 		HeapTupleData tp;
1678 		bool		valid;
1679 
1680 		/*
1681 		 * Read, pin, and lock the page.
1682 		 */
1683 		buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
1684 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
1685 		page = BufferGetPage(buffer);
1686 		TestForOldSnapshot(snapshot, relation, page);
1687 
1688 		/*
1689 		 * Check for bogus item number.  This is not treated as an error
1690 		 * condition because it can happen while following a t_ctid link. We
1691 		 * just assume that the prior tid is OK and return it unchanged.
1692 		 */
1693 		offnum = ItemPointerGetOffsetNumber(&ctid);
1694 		if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1695 		{
1696 			UnlockReleaseBuffer(buffer);
1697 			break;
1698 		}
1699 		lp = PageGetItemId(page, offnum);
1700 		if (!ItemIdIsNormal(lp))
1701 		{
1702 			UnlockReleaseBuffer(buffer);
1703 			break;
1704 		}
1705 
1706 		/* OK to access the tuple */
1707 		tp.t_self = ctid;
1708 		tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
1709 		tp.t_len = ItemIdGetLength(lp);
1710 		tp.t_tableOid = RelationGetRelid(relation);
1711 
1712 		/*
1713 		 * After following a t_ctid link, we might arrive at an unrelated
1714 		 * tuple.  Check for XMIN match.
1715 		 */
1716 		if (TransactionIdIsValid(priorXmax) &&
1717 			!TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
1718 		{
1719 			UnlockReleaseBuffer(buffer);
1720 			break;
1721 		}
1722 
1723 		/*
1724 		 * Check tuple visibility; if visible, set it as the new result
1725 		 * candidate.
1726 		 */
1727 		valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
1728 		HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
1729 		if (valid)
1730 			*tid = ctid;
1731 
1732 		/*
1733 		 * If there's a valid t_ctid link, follow it, else we're done.
1734 		 */
1735 		if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
1736 			HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
1737 			HeapTupleHeaderIndicatesMovedPartitions(tp.t_data) ||
1738 			ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
1739 		{
1740 			UnlockReleaseBuffer(buffer);
1741 			break;
1742 		}
1743 
1744 		ctid = tp.t_data->t_ctid;
1745 		priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
1746 		UnlockReleaseBuffer(buffer);
1747 	}							/* end of loop */
1748 }
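
#ifdef NOT_USED
/*
 * Illustrative sketch (not compiled): resolving the latest visible version
 * of a row.  The helper name is an assumption; the starting *tid is assumed
 * to point at a valid item, as table_tuple_get_latest_tid() would have
 * verified, and an active snapshot is assumed to be set up.
 */
static void
latest_tid_example(Relation relation, ItemPointer tid)
{
	TableScanDesc scan;

	scan = table_beginscan(relation, GetActiveSnapshot(), 0, NULL);

	/* *tid is updated in place if a newer visible version exists */
	heap_get_latest_tid(scan, tid);

	table_endscan(scan);
}
#endif							/* NOT_USED */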
1749 
1750 
1751 /*
1752  * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
1753  *
1754  * This is called after we have waited for the XMAX transaction to terminate.
1755  * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
1756  * be set on exit.  If the transaction committed, we set the XMAX_COMMITTED
1757  * hint bit if possible --- but beware that that may not yet be possible,
1758  * if the transaction committed asynchronously.
1759  *
1760  * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
1761  * even if it commits.
1762  *
1763  * Hence callers should look only at XMAX_INVALID.
1764  *
1765  * Note this is not allowed for tuples whose xmax is a multixact.
1766  */
1767 static void
1768 UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
1769 {
1770 	Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid));
1771 	Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
1772 
1773 	if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
1774 	{
1775 		if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
1776 			TransactionIdDidCommit(xid))
1777 			HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
1778 								 xid);
1779 		else
1780 			HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
1781 								 InvalidTransactionId);
1782 	}
1783 }
1784 
1785 
1786 /*
1787  * GetBulkInsertState - prepare status object for a bulk insert
1788  */
1789 BulkInsertState
1790 GetBulkInsertState(void)
1791 {
1792 	BulkInsertState bistate;
1793 
1794 	bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
1795 	bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
1796 	bistate->current_buf = InvalidBuffer;
1797 	return bistate;
1798 }
1799 
1800 /*
1801  * FreeBulkInsertState - clean up after finishing a bulk insert
1802  */
1803 void
1804 FreeBulkInsertState(BulkInsertState bistate)
1805 {
1806 	if (bistate->current_buf != InvalidBuffer)
1807 		ReleaseBuffer(bistate->current_buf);
1808 	FreeAccessStrategy(bistate->strategy);
1809 	pfree(bistate);
1810 }
1811 
1812 /*
1813  * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
1814  */
1815 void
1816 ReleaseBulkInsertStatePin(BulkInsertState bistate)
1817 {
1818 	if (bistate->current_buf != InvalidBuffer)
1819 		ReleaseBuffer(bistate->current_buf);
1820 	bistate->current_buf = InvalidBuffer;
1821 }
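
#ifdef NOT_USED
/*
 * Illustrative sketch (not compiled): the intended life cycle of a
 * BulkInsertState around a stream of heap_insert() calls, roughly as a
 * COPY-like code path would use it.  The helper name and the tuples array
 * are assumptions made for the example; a real caller would also maintain
 * indexes, fire triggers, and might pass flags such as HEAP_INSERT_SKIP_FSM.
 */
static void
bulk_insert_example(Relation relation, HeapTuple *tuples, int ntuples)
{
	BulkInsertState bistate = GetBulkInsertState();
	CommandId	cid = GetCurrentCommandId(true);
	int			i;

	for (i = 0; i < ntuples; i++)
		heap_insert(relation, tuples[i], cid, 0, bistate);

	/* releases the buffer pin and the buffer access strategy */
	FreeBulkInsertState(bistate);
}
#endif							/* NOT_USED */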
1822 
1823 
1824 /*
1825  *	heap_insert		- insert tuple into a heap
1826  *
1827  * The new tuple is stamped with current transaction ID and the specified
1828  * command ID.
1829  *
1830  * See table_tuple_insert for comments about most of the input flags, except
1831  * that this routine directly takes a tuple rather than a slot.
1832  *
1833  * There are corresponding HEAP_INSERT_ options for all the TABLE_INSERT_
1834  * options; in addition there is HEAP_INSERT_SPECULATIVE, which is used to
1835  * implement table_tuple_insert_speculative().
1836  *
1837  * On return the header fields of *tup are updated to match the stored tuple;
1838  * in particular tup->t_self receives the actual TID where the tuple was
1839  * stored.  But note that any toasting of fields within the tuple data is NOT
1840  * reflected into *tup.
1841  */
1842 void
1843 heap_insert(Relation relation, HeapTuple tup, CommandId cid,
1844 			int options, BulkInsertState bistate)
1845 {
1846 	TransactionId xid = GetCurrentTransactionId();
1847 	HeapTuple	heaptup;
1848 	Buffer		buffer;
1849 	Buffer		vmbuffer = InvalidBuffer;
1850 	bool		all_visible_cleared = false;
1851 
1852 	/* Cheap, simplistic check that the tuple matches the rel's rowtype. */
1853 	Assert(HeapTupleHeaderGetNatts(tup->t_data) <=
1854 		   RelationGetNumberOfAttributes(relation));
1855 
1856 	/*
1857 	 * Fill in tuple header fields and toast the tuple if necessary.
1858 	 *
1859 	 * Note: below this point, heaptup is the data we actually intend to store
1860 	 * into the relation; tup is the caller's original untoasted data.
1861 	 */
1862 	heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
1863 
1864 	/*
1865 	 * Find buffer to insert this tuple into.  If the page is all visible,
1866 	 * this will also pin the requisite visibility map page.
1867 	 */
1868 	buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
1869 									   InvalidBuffer, options, bistate,
1870 									   &vmbuffer, NULL);
1871 
1872 	/*
1873 	 * We're about to do the actual insert -- but check for conflict first, to
1874 	 * avoid possibly having to roll back work we've just done.
1875 	 *
1876 	 * This is safe without a recheck as long as there is no possibility of
1877 	 * another process scanning the page between this check and the insert
1878 	 * being visible to the scan (i.e., an exclusive buffer content lock is
1879 	 * continuously held from this point until the tuple insert is visible).
1880 	 *
1881 	 * For a heap insert, we only need to check for table-level SSI locks. Our
1882 	 * new tuple can't possibly conflict with existing tuple locks, and heap
1883 	 * page locks are only consolidated versions of tuple locks; they do not
1884 	 * lock "gaps" as index page locks do.  So we don't need to specify a
1885 	 * buffer when making the call, which makes for a faster check.
1886 	 */
1887 	CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
1888 
1889 	/* NO EREPORT(ERROR) from here till changes are logged */
1890 	START_CRIT_SECTION();
1891 
1892 	RelationPutHeapTuple(relation, buffer, heaptup,
1893 						 (options & HEAP_INSERT_SPECULATIVE) != 0);
1894 
1895 	if (PageIsAllVisible(BufferGetPage(buffer)))
1896 	{
1897 		all_visible_cleared = true;
1898 		PageClearAllVisible(BufferGetPage(buffer));
1899 		visibilitymap_clear(relation,
1900 							ItemPointerGetBlockNumber(&(heaptup->t_self)),
1901 							vmbuffer, VISIBILITYMAP_VALID_BITS);
1902 	}
1903 
1904 	/*
1905 	 * XXX Should we set PageSetPrunable on this page?
1906 	 *
1907 	 * The inserting transaction may eventually abort thus making this tuple
1908 	 * DEAD and hence available for pruning. Though we don't want to optimize
1909 	 * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
1910 	 * aborted tuple will never be pruned until next vacuum is triggered.
1911 	 *
1912 	 * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
1913 	 */
1914 
1915 	MarkBufferDirty(buffer);
1916 
1917 	/* XLOG stuff */
1918 	if (RelationNeedsWAL(relation))
1919 	{
1920 		xl_heap_insert xlrec;
1921 		xl_heap_header xlhdr;
1922 		XLogRecPtr	recptr;
1923 		Page		page = BufferGetPage(buffer);
1924 		uint8		info = XLOG_HEAP_INSERT;
1925 		int			bufflags = 0;
1926 
1927 		/*
1928 		 * If this is a catalog, we need to transmit combocids to properly
1929 		 * decode, so log that as well.
1930 		 */
1931 		if (RelationIsAccessibleInLogicalDecoding(relation))
1932 			log_heap_new_cid(relation, heaptup);
1933 
1934 		/*
1935 		 * If this is the first and only tuple on the page, we can reinit the
1936 		 * page instead of restoring the whole thing.  Set the flag, and hide
1937 		 * buffer references from XLogInsert.
1938 		 */
1939 		if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
1940 			PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
1941 		{
1942 			info |= XLOG_HEAP_INIT_PAGE;
1943 			bufflags |= REGBUF_WILL_INIT;
1944 		}
1945 
1946 		xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
1947 		xlrec.flags = 0;
1948 		if (all_visible_cleared)
1949 			xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
1950 		if (options & HEAP_INSERT_SPECULATIVE)
1951 			xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
1952 		Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
1953 
1954 		/*
1955 		 * For logical decoding, we need the tuple even if we're doing a full
1956 		 * page write, so make sure it's included even if we take a full-page
1957 		 * image. (XXX We could alternatively store a pointer into the FPW).
1958 		 */
1959 		if (RelationIsLogicallyLogged(relation) &&
1960 			!(options & HEAP_INSERT_NO_LOGICAL))
1961 		{
1962 			xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
1963 			bufflags |= REGBUF_KEEP_DATA;
1964 		}
1965 
1966 		XLogBeginInsert();
1967 		XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
1968 
1969 		xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
1970 		xlhdr.t_infomask = heaptup->t_data->t_infomask;
1971 		xlhdr.t_hoff = heaptup->t_data->t_hoff;
1972 
1973 		/*
1974 		 * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
1975 		 * write the whole page to the xlog, we don't need to store
1976 		 * xl_heap_header in the xlog.
1977 		 */
1978 		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
1979 		XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
1980 		/* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
1981 		XLogRegisterBufData(0,
1982 							(char *) heaptup->t_data + SizeofHeapTupleHeader,
1983 							heaptup->t_len - SizeofHeapTupleHeader);
1984 
1985 		/* filtering by origin on a row level is much more efficient */
1986 		XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
1987 
1988 		recptr = XLogInsert(RM_HEAP_ID, info);
1989 
1990 		PageSetLSN(page, recptr);
1991 	}
1992 
1993 	END_CRIT_SECTION();
1994 
1995 	UnlockReleaseBuffer(buffer);
1996 	if (vmbuffer != InvalidBuffer)
1997 		ReleaseBuffer(vmbuffer);
1998 
1999 	/*
2000 	 * If tuple is cachable, mark it for invalidation from the caches in case
2001 	 * we abort.  Note it is OK to do this after releasing the buffer, because
2002 	 * the heaptup data structure is all in local memory, not in the shared
2003 	 * buffer.
2004 	 */
2005 	CacheInvalidateHeapTuple(relation, heaptup, NULL);
2006 
2007 	/* Note: speculative insertions are counted too, even if aborted later */
2008 	pgstat_count_heap_insert(relation, 1);
2009 
2010 	/*
2011 	 * If heaptup is a private copy, release it.  Don't forget to copy t_self
2012 	 * back to the caller's image, too.
2013 	 */
2014 	if (heaptup != tup)
2015 	{
2016 		tup->t_self = heaptup->t_self;
2017 		heap_freetuple(heaptup);
2018 	}
2019 }
2020 
2021 /*
2022  * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2023  * tuple header fields and toasts the tuple if necessary.  Returns a toasted
2024  * version of the tuple if it was toasted, or the original tuple if not. Note
2025  * that in any case, the header fields are also set in the original tuple.
2026  */
2027 static HeapTuple
2028 heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
2029 					CommandId cid, int options)
2030 {
2031 	/*
2032 	 * To allow parallel inserts, we need to ensure that they are safe to be
2033 	 * performed in workers. We have the infrastructure to allow parallel
2034 	 * inserts in general except for the cases where inserts generate a new
2035 	 * CommandId (e.g., inserts into a table having a foreign key column).
2036 	 */
2037 	if (IsParallelWorker())
2038 		ereport(ERROR,
2039 				(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2040 				 errmsg("cannot insert tuples in a parallel worker")));
2041 
2042 	tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2043 	tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2044 	tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2045 	HeapTupleHeaderSetXmin(tup->t_data, xid);
2046 	if (options & HEAP_INSERT_FROZEN)
2047 		HeapTupleHeaderSetXminFrozen(tup->t_data);
2048 
2049 	HeapTupleHeaderSetCmin(tup->t_data, cid);
2050 	HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2051 	tup->t_tableOid = RelationGetRelid(relation);
2052 
2053 	/*
2054 	 * If the new tuple is too big for storage or contains already toasted
2055 	 * out-of-line attributes from some other relation, invoke the toaster.
2056 	 */
2057 	if (relation->rd_rel->relkind != RELKIND_RELATION &&
2058 		relation->rd_rel->relkind != RELKIND_MATVIEW)
2059 	{
2060 		/* toast table entries should never be recursively toasted */
2061 		Assert(!HeapTupleHasExternal(tup));
2062 		return tup;
2063 	}
2064 	else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2065 		return heap_toast_insert_or_update(relation, tup, NULL, options);
2066 	else
2067 		return tup;
2068 }
2069 
2070 /*
2071  *	heap_multi_insert	- insert multiple tuples into a heap
2072  *
2073  * This is like heap_insert(), but inserts multiple tuples in one operation.
2074  * That's faster than calling heap_insert() in a loop, because when multiple
2075  * tuples can be inserted on a single page, we can write just a single WAL
2076  * record covering all of them, and only need to lock/unlock the page once.
2077  *
2078  * Note: this leaks memory into the current memory context. You can create a
2079  * temporary context before calling this, if that's a problem.
2080  */
2081 void
2082 heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
2083 				  CommandId cid, int options, BulkInsertState bistate)
2084 {
2085 	TransactionId xid = GetCurrentTransactionId();
2086 	HeapTuple  *heaptuples;
2087 	int			i;
2088 	int			ndone;
2089 	PGAlignedBlock scratch;
2090 	Page		page;
2091 	bool		needwal;
2092 	Size		saveFreeSpace;
2093 	bool		need_tuple_data = RelationIsLogicallyLogged(relation);
2094 	bool		need_cids = RelationIsAccessibleInLogicalDecoding(relation);
2095 
2096 	/* currently not needed (thus unsupported) for heap_multi_insert() */
2097 	AssertArg(!(options & HEAP_INSERT_NO_LOGICAL));
2098 
2099 	needwal = RelationNeedsWAL(relation);
2100 	saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
2101 												   HEAP_DEFAULT_FILLFACTOR);
2102 
2103 	/* Toast and set header data in all the slots */
2104 	heaptuples = palloc(ntuples * sizeof(HeapTuple));
2105 	for (i = 0; i < ntuples; i++)
2106 	{
2107 		HeapTuple	tuple;
2108 
2109 		tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
2110 		slots[i]->tts_tableOid = RelationGetRelid(relation);
2111 		tuple->t_tableOid = slots[i]->tts_tableOid;
2112 		heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
2113 											options);
2114 	}
2115 
2116 	/*
2117 	 * We're about to do the actual inserts -- but check for conflict first,
2118 	 * to minimize the possibility of having to roll back work we've just
2119 	 * done.
2120 	 *
2121 	 * A check here does not definitively prevent a serialization anomaly;
2122 	 * that check MUST be done at least past the point of acquiring an
2123 	 * exclusive buffer content lock on every buffer that will be affected,
2124 	 * and MAY be done after all inserts are reflected in the buffers and
2125 	 * those locks are released; otherwise there is a race condition.  Since
2126 	 * multiple buffers can be locked and unlocked in the loop below, and it
2127 	 * would not be feasible to identify and lock all of those buffers before
2128 	 * the loop, we must do a final check at the end.
2129 	 *
2130 	 * The check here could be omitted with no loss of correctness; it is
2131 	 * present strictly as an optimization.
2132 	 *
2133 	 * For heap inserts, we only need to check for table-level SSI locks. Our
2134 	 * new tuples can't possibly conflict with existing tuple locks, and heap
2135 	 * page locks are only consolidated versions of tuple locks; they do not
2136 	 * lock "gaps" as index page locks do.  So we don't need to specify a
2137 	 * buffer when making the call, which makes for a faster check.
2138 	 */
2139 	CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
2140 
2141 	ndone = 0;
2142 	while (ndone < ntuples)
2143 	{
2144 		Buffer		buffer;
2145 		Buffer		vmbuffer = InvalidBuffer;
2146 		bool		all_visible_cleared = false;
2147 		int			nthispage;
2148 
2149 		CHECK_FOR_INTERRUPTS();
2150 
2151 		/*
2152 		 * Find buffer where at least the next tuple will fit.  If the page is
2153 		 * all-visible, this will also pin the requisite visibility map page.
2154 		 */
2155 		buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2156 										   InvalidBuffer, options, bistate,
2157 										   &vmbuffer, NULL);
2158 		page = BufferGetPage(buffer);
2159 
2160 		/* NO EREPORT(ERROR) from here till changes are logged */
2161 		START_CRIT_SECTION();
2162 
2163 		/*
2164 		 * RelationGetBufferForTuple has ensured that the first tuple fits.
2165 		 * Put that on the page, and then as many other tuples as fit.
2166 		 */
2167 		RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2168 
2169 		/*
2170 		 * Note that heap_multi_insert is not used for catalog tuples yet, but
2171 		 * this will cover the gap once that is the case.
2172 		 */
2173 		if (needwal && need_cids)
2174 			log_heap_new_cid(relation, heaptuples[ndone]);
2175 
2176 		for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2177 		{
2178 			HeapTuple	heaptup = heaptuples[ndone + nthispage];
2179 
2180 			if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2181 				break;
2182 
2183 			RelationPutHeapTuple(relation, buffer, heaptup, false);
2184 
2185 			/*
2186 			 * We don't use heap_multi_insert for catalog tuples yet, but
2187 			 * better be prepared...
2188 			 */
2189 			if (needwal && need_cids)
2190 				log_heap_new_cid(relation, heaptup);
2191 		}
2192 
2193 		if (PageIsAllVisible(page))
2194 		{
2195 			all_visible_cleared = true;
2196 			PageClearAllVisible(page);
2197 			visibilitymap_clear(relation,
2198 								BufferGetBlockNumber(buffer),
2199 								vmbuffer, VISIBILITYMAP_VALID_BITS);
2200 		}
2201 
2202 		/*
2203 		 * XXX Should we set PageSetPrunable on this page?  See heap_insert().
2204 		 */
2205 
2206 		MarkBufferDirty(buffer);
2207 
2208 		/* XLOG stuff */
2209 		if (needwal)
2210 		{
2211 			XLogRecPtr	recptr;
2212 			xl_heap_multi_insert *xlrec;
2213 			uint8		info = XLOG_HEAP2_MULTI_INSERT;
2214 			char	   *tupledata;
2215 			int			totaldatalen;
2216 			char	   *scratchptr = scratch.data;
2217 			bool		init;
2218 			int			bufflags = 0;
2219 
2220 			/*
2221 			 * If the page was previously empty, we can reinit the page
2222 			 * instead of restoring the whole thing.
2223 			 */
2224 			init = (ItemPointerGetOffsetNumber(&(heaptuples[ndone]->t_self)) == FirstOffsetNumber &&
2225 					PageGetMaxOffsetNumber(page) == FirstOffsetNumber + nthispage - 1);
2226 
2227 			/* allocate xl_heap_multi_insert struct from the scratch area */
2228 			xlrec = (xl_heap_multi_insert *) scratchptr;
2229 			scratchptr += SizeOfHeapMultiInsert;
2230 
2231 			/*
2232 			 * Allocate the offsets array, unless we're reinitializing the
2233 			 * page; in that case the tuples are stored in order starting at
2234 			 * FirstOffsetNumber and we don't need to store the offsets
2235 			 * explicitly.
2236 			 */
2237 			if (!init)
2238 				scratchptr += nthispage * sizeof(OffsetNumber);
2239 
2240 			/* the rest of the scratch space is used for tuple data */
2241 			tupledata = scratchptr;
2242 
2243 			xlrec->flags = all_visible_cleared ? XLH_INSERT_ALL_VISIBLE_CLEARED : 0;
2244 			xlrec->ntuples = nthispage;
2245 
2246 			/*
2247 			 * Write out an xl_multi_insert_tuple and the tuple data itself
2248 			 * for each tuple.
2249 			 */
2250 			for (i = 0; i < nthispage; i++)
2251 			{
2252 				HeapTuple	heaptup = heaptuples[ndone + i];
2253 				xl_multi_insert_tuple *tuphdr;
2254 				int			datalen;
2255 
2256 				if (!init)
2257 					xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2258 				/* xl_multi_insert_tuple needs two-byte alignment. */
2259 				tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
2260 				scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2261 
2262 				tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2263 				tuphdr->t_infomask = heaptup->t_data->t_infomask;
2264 				tuphdr->t_hoff = heaptup->t_data->t_hoff;
2265 
2266 				/* write bitmap [+ padding] [+ oid] + data */
2267 				datalen = heaptup->t_len - SizeofHeapTupleHeader;
2268 				memcpy(scratchptr,
2269 					   (char *) heaptup->t_data + SizeofHeapTupleHeader,
2270 					   datalen);
2271 				tuphdr->datalen = datalen;
2272 				scratchptr += datalen;
2273 			}
2274 			totaldatalen = scratchptr - tupledata;
2275 			Assert((scratchptr - scratch.data) < BLCKSZ);
2276 
2277 			if (need_tuple_data)
2278 				xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2279 
2280 			/*
2281 			 * Signal that this is the last xl_heap_multi_insert record
2282 			 * emitted by this call to heap_multi_insert(). Needed for logical
2283 			 * decoding so it knows when to cleanup temporary data.
2284 			 */
2285 			if (ndone + nthispage == ntuples)
2286 				xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
2287 
2288 			if (init)
2289 			{
2290 				info |= XLOG_HEAP_INIT_PAGE;
2291 				bufflags |= REGBUF_WILL_INIT;
2292 			}
2293 
2294 			/*
2295 			 * If we're doing logical decoding, include the new tuple data
2296 			 * even if we take a full-page image of the page.
2297 			 */
2298 			if (need_tuple_data)
2299 				bufflags |= REGBUF_KEEP_DATA;
2300 
2301 			XLogBeginInsert();
2302 			XLogRegisterData((char *) xlrec, tupledata - scratch.data);
2303 			XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2304 
2305 			XLogRegisterBufData(0, tupledata, totaldatalen);
2306 
2307 			/* filtering by origin on a row level is much more efficient */
2308 			XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2309 
2310 			recptr = XLogInsert(RM_HEAP2_ID, info);
2311 
2312 			PageSetLSN(page, recptr);
2313 		}
2314 
2315 		END_CRIT_SECTION();
2316 
2317 		UnlockReleaseBuffer(buffer);
2318 		if (vmbuffer != InvalidBuffer)
2319 			ReleaseBuffer(vmbuffer);
2320 
2321 		ndone += nthispage;
2322 	}
2323 
2324 	/*
2325 	 * We're done with the actual inserts.  Check for conflicts again, to
2326 	 * ensure that all rw-conflicts in to these inserts are detected.  Without
2327 	 * this final check, a sequential scan of the heap may have locked the
2328 	 * table after the "before" check, missing one opportunity to detect the
2329 	 * conflict, and then scanned the table before the new tuples were there,
2330 	 * missing the other chance to detect the conflict.
2331 	 *
2332 	 * For heap inserts, we only need to check for table-level SSI locks. Our
2333 	 * new tuples can't possibly conflict with existing tuple locks, and heap
2334 	 * page locks are only consolidated versions of tuple locks; they do not
2335 	 * lock "gaps" as index page locks do.  So we don't need to specify a
2336 	 * buffer when making the call.
2337 	 */
2338 	CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
2339 
2340 	/*
2341 	 * If tuples are cachable, mark them for invalidation from the caches in
2342 	 * case we abort.  Note it is OK to do this after releasing the buffer,
2343 	 * because the heaptuples data structure is all in local memory, not in
2344 	 * the shared buffer.
2345 	 */
2346 	if (IsCatalogRelation(relation))
2347 	{
2348 		for (i = 0; i < ntuples; i++)
2349 			CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
2350 	}
2351 
2352 	/* copy t_self fields back to the caller's slots */
2353 	for (i = 0; i < ntuples; i++)
2354 		slots[i]->tts_tid = heaptuples[i]->t_self;
2355 
2356 	pgstat_count_heap_insert(relation, ntuples);
2357 }
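
#ifdef NOT_USED
/*
 * Illustrative sketch (not compiled): feeding heap_multi_insert() from an
 * array of already-formed heap tuples via TupleTableSlots.  The helper name
 * and the tuples array are assumptions made for the example; real callers
 * normally arrive here through table_multi_insert().
 */
static void
multi_insert_example(Relation relation, HeapTuple *tuples, int ntuples)
{
	TupleTableSlot **slots = palloc(ntuples * sizeof(TupleTableSlot *));
	CommandId	cid = GetCurrentCommandId(true);
	int			i;

	for (i = 0; i < ntuples; i++)
	{
		slots[i] = MakeSingleTupleTableSlot(RelationGetDescr(relation),
											&TTSOpsHeapTuple);
		ExecStoreHeapTuple(tuples[i], slots[i], false);
	}

	heap_multi_insert(relation, slots, ntuples, cid, 0, NULL);

	for (i = 0; i < ntuples; i++)
		ExecDropSingleTupleTableSlot(slots[i]);
	pfree(slots);
}
#endif							/* NOT_USED */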
2358 
2359 /*
2360  *	simple_heap_insert - insert a tuple
2361  *
2362  * Currently, this routine differs from heap_insert only in supplying
2363  * a default command ID and not allowing access to the speedup options.
2364  *
2365  * This should be used rather than using heap_insert directly in most places
2366  * where we are modifying system catalogs.
2367  */
2368 void
2369 simple_heap_insert(Relation relation, HeapTuple tup)
2370 {
2371 	heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2372 }
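
#ifdef NOT_USED
/*
 * Illustrative sketch (not compiled): forming a tuple from datums and
 * inserting it the way catalog-manipulation code typically would.  The
 * helper name and the values/isnull arrays are assumptions; real catalog
 * callers normally use CatalogTupleInsert() so that catalog indexes are
 * updated as well.
 */
static void
simple_insert_example(Relation relation, Datum *values, bool *isnull)
{
	HeapTuple	tup;

	tup = heap_form_tuple(RelationGetDescr(relation), values, isnull);
	simple_heap_insert(relation, tup);
	heap_freetuple(tup);
}
#endif							/* NOT_USED */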
2373 
2374 /*
2375  * Given infomask/infomask2, compute the bits that must be saved in the
2376  * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2377  * xl_heap_lock_updated WAL records.
2378  *
2379  * See fix_infomask_from_infobits.
2380  */
2381 static uint8
2382 compute_infobits(uint16 infomask, uint16 infomask2)
2383 {
2384 	return
2385 		((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
2386 		((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
2387 		((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
2388 	/* note we ignore HEAP_XMAX_SHR_LOCK here */
2389 		((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
2390 		((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2391 		 XLHL_KEYS_UPDATED : 0);
2392 }
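
#ifdef NOT_USED
/*
 * Illustrative sketch (not compiled): one concrete case of the mapping
 * compute_infobits() performs, for a tuple that is key-share locked but not
 * updated.  The flag names are the real ones; only the helper itself is
 * made up for the example.
 */
static void
compute_infobits_example(void)
{
	uint8		bits;

	bits = compute_infobits(HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY, 0);
	Assert(bits == (XLHL_XMAX_KEYSHR_LOCK | XLHL_XMAX_LOCK_ONLY));
}
#endif							/* NOT_USED */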
2393 
2394 /*
2395  * Given two versions of the same t_infomask for a tuple, compare them and
2396  * return whether the relevant status for a tuple Xmax has changed.  This is
2397  * used after a buffer lock has been released and reacquired: we want to ensure
2398  * that the tuple state continues to be the same it was when we previously
2399  * examined it.
2400  *
2401  * Note the Xmax field itself must be compared separately.
2402  */
2403 static inline bool
2404 xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
2405 {
2406 	const uint16 interesting =
2407 	HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK;
2408 
2409 	if ((new_infomask & interesting) != (old_infomask & interesting))
2410 		return true;
2411 
2412 	return false;
2413 }
2414 
2415 /*
2416  *	heap_delete - delete a tuple
2417  *
2418  * See table_tuple_delete() for an explanation of the parameters, except that
2419  * this routine directly takes a tuple rather than a slot.
2420  *
2421  * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2422  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2423  * only for TM_SelfModified, since we cannot obtain cmax from a combocid
2424  * generated by another transaction).
2425  */
2426 TM_Result
2427 heap_delete(Relation relation, ItemPointer tid,
2428 			CommandId cid, Snapshot crosscheck, bool wait,
2429 			TM_FailureData *tmfd, bool changingPart)
2430 {
2431 	TM_Result	result;
2432 	TransactionId xid = GetCurrentTransactionId();
2433 	ItemId		lp;
2434 	HeapTupleData tp;
2435 	Page		page;
2436 	BlockNumber block;
2437 	Buffer		buffer;
2438 	Buffer		vmbuffer = InvalidBuffer;
2439 	TransactionId new_xmax;
2440 	uint16		new_infomask,
2441 				new_infomask2;
2442 	bool		have_tuple_lock = false;
2443 	bool		iscombo;
2444 	bool		all_visible_cleared = false;
2445 	HeapTuple	old_key_tuple = NULL;	/* replica identity of the tuple */
2446 	bool		old_key_copied = false;
2447 
2448 	Assert(ItemPointerIsValid(tid));
2449 
2450 	/*
2451 	 * Forbid this during a parallel operation, lest it allocate a combocid.
2452 	 * Other workers might need that combocid for visibility checks, and we
2453 	 * have no provision for broadcasting it to them.
2454 	 */
2455 	if (IsInParallelMode())
2456 		ereport(ERROR,
2457 				(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2458 				 errmsg("cannot delete tuples during a parallel operation")));
2459 
2460 	block = ItemPointerGetBlockNumber(tid);
2461 	buffer = ReadBuffer(relation, block);
2462 	page = BufferGetPage(buffer);
2463 
2464 	/*
2465 	 * Before locking the buffer, pin the visibility map page if it appears to
2466 	 * be necessary.  Since we haven't got the lock yet, someone else might be
2467 	 * in the middle of changing this, so we'll need to recheck after we have
2468 	 * the lock.
2469 	 */
2470 	if (PageIsAllVisible(page))
2471 		visibilitymap_pin(relation, block, &vmbuffer);
2472 
2473 	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2474 
2475 	/*
2476 	 * If we didn't pin the visibility map page and the page has become all
2477 	 * visible while we were busy locking the buffer, we'll have to unlock and
2478 	 * re-lock, to avoid holding the buffer lock across an I/O.  That's a bit
2479 	 * unfortunate, but hopefully shouldn't happen often.
2480 	 */
2481 	if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
2482 	{
2483 		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2484 		visibilitymap_pin(relation, block, &vmbuffer);
2485 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2486 	}
2487 
2488 	lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
2489 	Assert(ItemIdIsNormal(lp));
2490 
2491 	tp.t_tableOid = RelationGetRelid(relation);
2492 	tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2493 	tp.t_len = ItemIdGetLength(lp);
2494 	tp.t_self = *tid;
2495 
2496 l1:
2497 	result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
2498 
2499 	if (result == TM_Invisible)
2500 	{
2501 		UnlockReleaseBuffer(buffer);
2502 		ereport(ERROR,
2503 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2504 				 errmsg("attempted to delete invisible tuple")));
2505 	}
2506 	else if (result == TM_BeingModified && wait)
2507 	{
2508 		TransactionId xwait;
2509 		uint16		infomask;
2510 
2511 		/* must copy state data before unlocking buffer */
2512 		xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
2513 		infomask = tp.t_data->t_infomask;
2514 
2515 		/*
2516 		 * Sleep until concurrent transaction ends -- except when there's a
2517 		 * single locker and it's our own transaction.  Note we don't care
2518 		 * which lock mode the locker has, because we need the strongest one.
2519 		 *
2520 		 * Before sleeping, we need to acquire tuple lock to establish our
2521 		 * priority for the tuple (see heap_lock_tuple).  LockTuple will
2522 		 * release us when we are next-in-line for the tuple.
2523 		 *
2524 		 * If we are forced to "start over" below, we keep the tuple lock;
2525 		 * this arranges that we stay at the head of the line while rechecking
2526 		 * tuple state.
2527 		 */
2528 		if (infomask & HEAP_XMAX_IS_MULTI)
2529 		{
2530 			bool		current_is_member = false;
2531 
2532 			if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
2533 										LockTupleExclusive, &current_is_member))
2534 			{
2535 				LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2536 
2537 				/*
2538 				 * Acquire the lock, if necessary (but skip it when we're
2539 				 * requesting a lock and already have one; avoids deadlock).
2540 				 */
2541 				if (!current_is_member)
2542 					heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2543 										 LockWaitBlock, &have_tuple_lock);
2544 
2545 				/* wait for multixact */
2546 				MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask,
2547 								relation, &(tp.t_self), XLTW_Delete,
2548 								NULL);
2549 				LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2550 
2551 				/*
2552 				 * If xwait had just locked the tuple then some other xact
2553 				 * could update this tuple before we get to this point.  Check
2554 				 * for xmax change, and start over if so.
2555 				 */
2556 				if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2557 					!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2558 										 xwait))
2559 					goto l1;
2560 			}
2561 
2562 			/*
2563 			 * You might think the multixact is necessarily done here, but not
2564 			 * so: it could have surviving members, namely our own xact or
2565 			 * other subxacts of this backend.  It is legal for us to delete
2566 			 * the tuple in either case, however (the latter case is
2567 			 * essentially a situation of upgrading our former shared lock to
2568 			 * exclusive).  We don't bother changing the on-disk hint bits
2569 			 * since we are about to overwrite the xmax altogether.
2570 			 */
2571 		}
2572 		else if (!TransactionIdIsCurrentTransactionId(xwait))
2573 		{
2574 			/*
2575 			 * Wait for regular transaction to end; but first, acquire tuple
2576 			 * lock.
2577 			 */
2578 			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2579 			heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2580 								 LockWaitBlock, &have_tuple_lock);
2581 			XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
2582 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2583 
2584 			/*
2585 			 * xwait is done, but if xwait had just locked the tuple then some
2586 			 * other xact could update this tuple before we get to this point.
2587 			 * Check for xmax change, and start over if so.
2588 			 */
2589 			if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2590 				!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2591 									 xwait))
2592 				goto l1;
2593 
2594 			/* Otherwise check if it committed or aborted */
2595 			UpdateXmaxHintBits(tp.t_data, buffer, xwait);
2596 		}
2597 
2598 		/*
2599 		 * We may overwrite if previous xmax aborted, or if it committed but
2600 		 * only locked the tuple without updating it.
2601 		 */
2602 		if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2603 			HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) ||
2604 			HeapTupleHeaderIsOnlyLocked(tp.t_data))
2605 			result = TM_Ok;
2606 		else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid) ||
2607 				 HeapTupleHeaderIndicatesMovedPartitions(tp.t_data))
2608 			result = TM_Updated;
2609 		else
2610 			result = TM_Deleted;
2611 	}
2612 
2613 	if (crosscheck != InvalidSnapshot && result == TM_Ok)
2614 	{
2615 		/* Perform additional check for transaction-snapshot mode RI updates */
2616 		if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
2617 			result = TM_Updated;
2618 	}
2619 
2620 	if (result != TM_Ok)
2621 	{
2622 		Assert(result == TM_SelfModified ||
2623 			   result == TM_Updated ||
2624 			   result == TM_Deleted ||
2625 			   result == TM_BeingModified);
2626 		Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
2627 		Assert(result != TM_Updated ||
2628 			   !ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid));
2629 		tmfd->ctid = tp.t_data->t_ctid;
2630 		tmfd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
2631 		if (result == TM_SelfModified)
2632 			tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
2633 		else
2634 			tmfd->cmax = InvalidCommandId;
2635 		UnlockReleaseBuffer(buffer);
2636 		if (have_tuple_lock)
2637 			UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
2638 		if (vmbuffer != InvalidBuffer)
2639 			ReleaseBuffer(vmbuffer);
2640 		return result;
2641 	}
2642 
2643 	/*
2644 	 * We're about to do the actual delete -- check for conflict first, to
2645 	 * avoid possibly having to roll back work we've just done.
2646 	 *
2647 	 * This is safe without a recheck as long as there is no possibility of
2648 	 * another process scanning the page between this check and the delete
2649 	 * being visible to the scan (i.e., an exclusive buffer content lock is
2650 	 * continuously held from this point until the tuple delete is visible).
2651 	 */
2652 	CheckForSerializableConflictIn(relation, tid, BufferGetBlockNumber(buffer));
2653 
2654 	/* replace cid with a combo cid if necessary */
2655 	HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
2656 
2657 	/*
2658 	 * Compute replica identity tuple before entering the critical section so
2659 	 * we don't PANIC upon a memory allocation failure.
2660 	 */
2661 	old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
2662 
2663 	/*
2664 	 * If this is the first possibly-multixact-able operation in the current
2665 	 * transaction, set my per-backend OldestMemberMXactId setting. We can be
2666 	 * certain that the transaction will never become a member of any older
2667 	 * MultiXactIds than that.  (We have to do this even if we end up just
2668 	 * using our own TransactionId below, since some other backend could
2669 	 * incorporate our XID into a MultiXact immediately afterwards.)
2670 	 */
2671 	MultiXactIdSetOldestMember();
2672 
2673 	compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data),
2674 							  tp.t_data->t_infomask, tp.t_data->t_infomask2,
2675 							  xid, LockTupleExclusive, true,
2676 							  &new_xmax, &new_infomask, &new_infomask2);
2677 
2678 	START_CRIT_SECTION();
2679 
2680 	/*
2681 	 * If this transaction commits, the tuple will become DEAD sooner or
2682 	 * later.  Set flag that this page is a candidate for pruning once our xid
2683 	 * falls below the OldestXmin horizon.  If the transaction finally aborts,
2684 	 * the subsequent page pruning will be a no-op and the hint will be
2685 	 * cleared.
2686 	 */
2687 	PageSetPrunable(page, xid);
2688 
2689 	if (PageIsAllVisible(page))
2690 	{
2691 		all_visible_cleared = true;
2692 		PageClearAllVisible(page);
2693 		visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
2694 							vmbuffer, VISIBILITYMAP_VALID_BITS);
2695 	}
2696 
2697 	/* store transaction information of xact deleting the tuple */
2698 	tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
2699 	tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
2700 	tp.t_data->t_infomask |= new_infomask;
2701 	tp.t_data->t_infomask2 |= new_infomask2;
2702 	HeapTupleHeaderClearHotUpdated(tp.t_data);
2703 	HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
2704 	HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
2705 	/* Make sure there is no forward chain link in t_ctid */
2706 	tp.t_data->t_ctid = tp.t_self;
2707 
2708 	/* Signal that this is actually a move into another partition */
2709 	if (changingPart)
2710 		HeapTupleHeaderSetMovedPartitions(tp.t_data);
2711 
2712 	MarkBufferDirty(buffer);
2713 
2714 	/*
2715 	 * XLOG stuff
2716 	 *
2717 	 * NB: heap_abort_speculative() uses the same xlog record and replay
2718 	 * routines.
2719 	 */
2720 	if (RelationNeedsWAL(relation))
2721 	{
2722 		xl_heap_delete xlrec;
2723 		xl_heap_header xlhdr;
2724 		XLogRecPtr	recptr;
2725 
2726 		/* For logical decode we need combocids to properly decode the catalog */
2727 		if (RelationIsAccessibleInLogicalDecoding(relation))
2728 			log_heap_new_cid(relation, &tp);
2729 
2730 		xlrec.flags = 0;
2731 		if (all_visible_cleared)
2732 			xlrec.flags |= XLH_DELETE_ALL_VISIBLE_CLEARED;
2733 		if (changingPart)
2734 			xlrec.flags |= XLH_DELETE_IS_PARTITION_MOVE;
2735 		xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
2736 											  tp.t_data->t_infomask2);
2737 		xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
2738 		xlrec.xmax = new_xmax;
2739 
2740 		if (old_key_tuple != NULL)
2741 		{
2742 			if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
2743 				xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE;
2744 			else
2745 				xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY;
2746 		}
2747 
2748 		XLogBeginInsert();
2749 		XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
2750 
2751 		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
2752 
2753 		/*
2754 		 * Log replica identity of the deleted tuple if there is one
2755 		 */
2756 		if (old_key_tuple != NULL)
2757 		{
2758 			xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
2759 			xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
2760 			xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
2761 
2762 			XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
2763 			XLogRegisterData((char *) old_key_tuple->t_data
2764 							 + SizeofHeapTupleHeader,
2765 							 old_key_tuple->t_len
2766 							 - SizeofHeapTupleHeader);
2767 		}
2768 
2769 		/* filtering by origin on a row level is much more efficient */
2770 		XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2771 
2772 		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
2773 
2774 		PageSetLSN(page, recptr);
2775 	}
2776 
2777 	END_CRIT_SECTION();
2778 
2779 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2780 
2781 	if (vmbuffer != InvalidBuffer)
2782 		ReleaseBuffer(vmbuffer);
2783 
2784 	/*
2785 	 * If the tuple has toasted out-of-line attributes, we need to delete
2786 	 * those items too.  We have to do this before releasing the buffer
2787 	 * because we need to look at the contents of the tuple, but it's OK to
2788 	 * release the content lock on the buffer first.
2789 	 */
2790 	if (relation->rd_rel->relkind != RELKIND_RELATION &&
2791 		relation->rd_rel->relkind != RELKIND_MATVIEW)
2792 	{
2793 		/* toast table entries should never be recursively toasted */
2794 		Assert(!HeapTupleHasExternal(&tp));
2795 	}
2796 	else if (HeapTupleHasExternal(&tp))
2797 		heap_toast_delete(relation, &tp, false);
2798 
2799 	/*
2800 	 * Mark tuple for invalidation from system caches at next command
2801 	 * boundary. We have to do this before releasing the buffer because we
2802 	 * need to look at the contents of the tuple.
2803 	 */
2804 	CacheInvalidateHeapTuple(relation, &tp, NULL);
2805 
2806 	/* Now we can release the buffer */
2807 	ReleaseBuffer(buffer);
2808 
2809 	/*
2810 	 * Release the lmgr tuple lock, if we had it.
2811 	 */
2812 	if (have_tuple_lock)
2813 		UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
2814 
2815 	pgstat_count_heap_delete(relation);
2816 
2817 	if (old_key_tuple != NULL && old_key_copied)
2818 		heap_freetuple(old_key_tuple);
2819 
2820 	return TM_Ok;
2821 }
2822 
2823 /*
2824  *	simple_heap_delete - delete a tuple
2825  *
2826  * This routine may be used to delete a tuple when concurrent updates of
2827  * the target tuple are not expected (for example, because we have a lock
2828  * on the relation associated with the tuple).  Any failure is reported
2829  * via ereport().
2830  */
2831 void
2832 simple_heap_delete(Relation relation, ItemPointer tid)
2833 {
2834 	TM_Result	result;
2835 	TM_FailureData tmfd;
2836 
2837 	result = heap_delete(relation, tid,
2838 						 GetCurrentCommandId(true), InvalidSnapshot,
2839 						 true /* wait for commit */ ,
2840 						 &tmfd, false /* changingPart */ );
2841 	switch (result)
2842 	{
2843 		case TM_SelfModified:
2844 			/* Tuple was already updated in current command? */
2845 			elog(ERROR, "tuple already updated by self");
2846 			break;
2847 
2848 		case TM_Ok:
2849 			/* done successfully */
2850 			break;
2851 
2852 		case TM_Updated:
2853 			elog(ERROR, "tuple concurrently updated");
2854 			break;
2855 
2856 		case TM_Deleted:
2857 			elog(ERROR, "tuple concurrently deleted");
2858 			break;
2859 
2860 		default:
2861 			elog(ERROR, "unrecognized heap_delete status: %u", result);
2862 			break;
2863 	}
2864 }
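
#ifdef NOT_USED
/*
 * Illustrative sketch (not compiled): deleting every tuple a sequential scan
 * returns, combining a heap scan with simple_heap_delete().  The helper name
 * is an assumption; the caller is assumed to hold a lock strong enough that
 * concurrent updates are impossible, which is what makes the "simple"
 * variant appropriate.
 */
static void
simple_delete_example(Relation relation)
{
	TableScanDesc scan;
	HeapTuple	tuple;

	scan = table_beginscan(relation, GetActiveSnapshot(), 0, NULL);

	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
		simple_heap_delete(relation, &tuple->t_self);

	table_endscan(scan);
}
#endif							/* NOT_USED */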
2865 
2866 /*
2867  *	heap_update - replace a tuple
2868  *
2869  * See table_tuple_update() for an explanation of the parameters, except that
2870  * this routine directly takes a tuple rather than a slot.
2871  *
2872  * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2873  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2874  * only for TM_SelfModified, since we cannot obtain cmax from a combocid
2875  * generated by another transaction).
2876  */
2877 TM_Result
2878 heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
2879 			CommandId cid, Snapshot crosscheck, bool wait,
2880 			TM_FailureData *tmfd, LockTupleMode *lockmode)
2881 {
2882 	TM_Result	result;
2883 	TransactionId xid = GetCurrentTransactionId();
2884 	Bitmapset  *hot_attrs;
2885 	Bitmapset  *key_attrs;
2886 	Bitmapset  *id_attrs;
2887 	Bitmapset  *interesting_attrs;
2888 	Bitmapset  *modified_attrs;
2889 	ItemId		lp;
2890 	HeapTupleData oldtup;
2891 	HeapTuple	heaptup;
2892 	HeapTuple	old_key_tuple = NULL;
2893 	bool		old_key_copied = false;
2894 	Page		page;
2895 	BlockNumber block;
2896 	MultiXactStatus mxact_status;
2897 	Buffer		buffer,
2898 				newbuf,
2899 				vmbuffer = InvalidBuffer,
2900 				vmbuffer_new = InvalidBuffer;
2901 	bool		need_toast;
2902 	Size		newtupsize,
2903 				pagefree;
2904 	bool		have_tuple_lock = false;
2905 	bool		iscombo;
2906 	bool		use_hot_update = false;
2907 	bool		hot_attrs_checked = false;
2908 	bool		key_intact;
2909 	bool		all_visible_cleared = false;
2910 	bool		all_visible_cleared_new = false;
2911 	bool		checked_lockers;
2912 	bool		locker_remains;
2913 	TransactionId xmax_new_tuple,
2914 				xmax_old_tuple;
2915 	uint16		infomask_old_tuple,
2916 				infomask2_old_tuple,
2917 				infomask_new_tuple,
2918 				infomask2_new_tuple;
2919 
2920 	Assert(ItemPointerIsValid(otid));
2921 
2922 	/* Cheap, simplistic check that the tuple matches the rel's rowtype. */
2923 	Assert(HeapTupleHeaderGetNatts(newtup->t_data) <=
2924 		   RelationGetNumberOfAttributes(relation));
2925 
2926 	/*
2927 	 * Forbid this during a parallel operation, lest it allocate a combocid.
2928 	 * Other workers might need that combocid for visibility checks, and we
2929 	 * have no provision for broadcasting it to them.
2930 	 */
2931 	if (IsInParallelMode())
2932 		ereport(ERROR,
2933 				(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2934 				 errmsg("cannot update tuples during a parallel operation")));
2935 
2936 	/*
2937 	 * Fetch the list of attributes to be checked for various operations.
2938 	 *
2939 	 * For HOT considerations, this is wasted effort if we fail to update or
2940 	 * have to put the new tuple on a different page.  But we must compute the
2941 	 * list before obtaining buffer lock --- in the worst case, if we are
2942 	 * doing an update on one of the relevant system catalogs, we could
2943 	 * deadlock if we try to fetch the list later.  In any case, the relcache
2944 	 * caches the data so this is usually pretty cheap.
2945 	 *
2946 	 * We also need columns used by the replica identity and columns that are
2947 	 * considered the "key" of rows in the table.
2948 	 *
2949 	 * Note that we get copies of each bitmap, so we need not worry about
2950 	 * relcache flush happening midway through.
2951 	 */
2952 	hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_ALL);
2953 	key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
2954 	id_attrs = RelationGetIndexAttrBitmap(relation,
2955 										  INDEX_ATTR_BITMAP_IDENTITY_KEY);
2956 
2957 
2958 	block = ItemPointerGetBlockNumber(otid);
2959 	buffer = ReadBuffer(relation, block);
2960 	page = BufferGetPage(buffer);
2961 
2962 	interesting_attrs = NULL;
2963 
2964 	/*
2965 	 * If the page is already full, there is hardly any chance of doing a HOT
2966 	 * update on this page. It might be wasteful effort to look for index
2967 	 * column updates only to later reject HOT updates for lack of space in
2968 	 * the same page.  So we are conservative and only fetch hot_attrs if the
2969 	 * page is not already full. Since we are already holding a pin on the
2970 	 * buffer, there is no chance that the buffer can get cleaned up
2971 	 * concurrently and even if that was possible, in the worst case we lose a
2972 	 * chance to do a HOT update.
2973 	 */
2974 	if (!PageIsFull(page))
2975 	{
2976 		interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
2977 		hot_attrs_checked = true;
2978 	}
2979 	interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
2980 	interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
2981 
2982 	/*
2983 	 * Before locking the buffer, pin the visibility map page if it appears to
2984 	 * be necessary.  Since we haven't got the lock yet, someone else might be
2985 	 * in the middle of changing this, so we'll need to recheck after we have
2986 	 * the lock.
2987 	 */
2988 	if (PageIsAllVisible(page))
2989 		visibilitymap_pin(relation, block, &vmbuffer);
2990 
2991 	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2992 
2993 	lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
2994 	Assert(ItemIdIsNormal(lp));
2995 
2996 	/*
2997 	 * Fill in enough data in oldtup for HeapDetermineModifiedColumns to work
2998 	 * properly.
2999 	 */
3000 	oldtup.t_tableOid = RelationGetRelid(relation);
3001 	oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3002 	oldtup.t_len = ItemIdGetLength(lp);
3003 	oldtup.t_self = *otid;
3004 
3005 	/* the new tuple is ready, except for this: */
3006 	newtup->t_tableOid = RelationGetRelid(relation);
3007 
3008 	/* Determine columns modified by the update. */
3009 	modified_attrs = HeapDetermineModifiedColumns(relation, interesting_attrs,
3010 												  &oldtup, newtup);
3011 
3012 	/*
3013 	 * If we're not updating any "key" column, we can grab a weaker lock type.
3014 	 * This allows for more concurrency when we are running simultaneously
3015 	 * with foreign key checks.
3016 	 *
3017 	 * Note that if a column gets detoasted while executing the update, but
3018 	 * the value ends up being the same, this test will fail and we will use
3019 	 * the stronger lock.  This is acceptable; the important case to optimize
3020 	 * is updates that don't manipulate key columns, not those that
3021 	 * serendipitously arrive at the same key values.
3022 	 */
3023 	if (!bms_overlap(modified_attrs, key_attrs))
3024 	{
3025 		*lockmode = LockTupleNoKeyExclusive;
3026 		mxact_status = MultiXactStatusNoKeyUpdate;
3027 		key_intact = true;
3028 
3029 		/*
3030 		 * If this is the first possibly-multixact-able operation in the
3031 		 * current transaction, set my per-backend OldestMemberMXactId
3032 		 * setting. We can be certain that the transaction will never become a
3033 		 * member of any older MultiXactIds than that.  (We have to do this
3034 		 * even if we end up just using our own TransactionId below, since
3035 		 * some other backend could incorporate our XID into a MultiXact
3036 		 * immediately afterwards.)
3037 		 */
3038 		MultiXactIdSetOldestMember();
3039 	}
3040 	else
3041 	{
3042 		*lockmode = LockTupleExclusive;
3043 		mxact_status = MultiXactStatusUpdate;
3044 		key_intact = false;
3045 	}
3046 
3047 	/*
3048 	 * Note: beyond this point, use oldtup not otid to refer to old tuple.
3049 	 * otid may very well point at newtup->t_self, which we will overwrite
3050 	 * with the new tuple's location, so there's great risk of confusion if we
3051 	 * use otid anymore.
3052 	 */
3053 
3054 l2:
3055 	checked_lockers = false;
3056 	locker_remains = false;
3057 	result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3058 
3059 	/* see below about the "no wait" case */
3060 	Assert(result != TM_BeingModified || wait);
3061 
3062 	if (result == TM_Invisible)
3063 	{
3064 		UnlockReleaseBuffer(buffer);
3065 		ereport(ERROR,
3066 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3067 				 errmsg("attempted to update invisible tuple")));
3068 	}
3069 	else if (result == TM_BeingModified && wait)
3070 	{
3071 		TransactionId xwait;
3072 		uint16		infomask;
3073 		bool		can_continue = false;
3074 
3075 		/*
3076 		 * XXX note that we don't consider the "no wait" case here.  This
3077 		 * isn't a problem currently because no caller uses that case, but it
3078 		 * should be fixed if such a caller is introduced.  It wasn't a
3079 		 * problem previously because this code would always wait, but now
3080 		 * that some tuple locks do not conflict with one of the lock modes we
3081 		 * use, it is possible that this case is interesting to handle
3082 		 * specially.
3083 		 *
3084 		 * This may cause failures with third-party code that calls
3085 		 * heap_update directly.
3086 		 */
3087 
3088 		/* must copy state data before unlocking buffer */
3089 		xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3090 		infomask = oldtup.t_data->t_infomask;
3091 
3092 		/*
3093 		 * Now we have to do something about the existing locker.  If it's a
3094 		 * multi, sleep on it; we might be awakened before it is completely
3095 		 * gone (or even not sleep at all in some cases); we need to preserve
3096 		 * it as locker, unless it is gone completely.
3097 		 *
3098 		 * If it's not a multi, we need to check for sleeping conditions
3099 		 * before actually going to sleep.  If the update doesn't conflict
3100 		 * with the locks, we just continue without sleeping (but making sure
3101 		 * it is preserved).
3102 		 *
3103 		 * Before sleeping, we need to acquire tuple lock to establish our
3104 		 * priority for the tuple (see heap_lock_tuple).  LockTuple will
3105 		 * release us when we are next-in-line for the tuple.  Note we must
3106 		 * not acquire the tuple lock until we're sure we're going to sleep;
3107 		 * otherwise we're open for race conditions with other transactions
3108 		 * holding the tuple lock which sleep on us.
3109 		 *
3110 		 * If we are forced to "start over" below, we keep the tuple lock;
3111 		 * this arranges that we stay at the head of the line while rechecking
3112 		 * tuple state.
3113 		 */
3114 		if (infomask & HEAP_XMAX_IS_MULTI)
3115 		{
3116 			TransactionId update_xact;
3117 			int			remain;
3118 			bool		current_is_member = false;
3119 
3120 			if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3121 										*lockmode, &current_is_member))
3122 			{
3123 				LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3124 
3125 				/*
3126 				 * Acquire the lock, if necessary (but skip it when we're
3127 				 * requesting a lock and already have one; avoids deadlock).
3128 				 */
3129 				if (!current_is_member)
3130 					heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3131 										 LockWaitBlock, &have_tuple_lock);
3132 
3133 				/* wait for multixact */
3134 				MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
3135 								relation, &oldtup.t_self, XLTW_Update,
3136 								&remain);
3137 				checked_lockers = true;
3138 				locker_remains = remain != 0;
3139 				LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3140 
3141 				/*
3142 				 * If xwait had just locked the tuple then some other xact
3143 				 * could update this tuple before we get to this point.  Check
3144 				 * for xmax change, and start over if so.
3145 				 */
3146 				if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3147 										  infomask) ||
3148 					!TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3149 										 xwait))
3150 					goto l2;
3151 			}
3152 
3153 			/*
3154 			 * Note that the multixact may not be done by now.  It could have
3155 			 * surviving members; our own xact or other subxacts of this
3156 			 * backend, and also any other concurrent transaction that locked
3157 			 * the tuple with LockTupleKeyShare if we only got
3158 			 * LockTupleNoKeyExclusive.  If this is the case, we have to be
3159 			 * careful to mark the updated tuple with the surviving members in
3160 			 * Xmax.
3161 			 *
3162 			 * Note that there could have been another update in the
3163 			 * MultiXact. In that case, we need to check whether it committed
3164 			 * or aborted. If it aborted we are safe to update it again;
3165 			 * otherwise there is an update conflict, and we have to return
3166 			 * TM_{Deleted, Updated} below.
3167 			 *
3168 			 * In the LockTupleExclusive case, we still need to preserve the
3169 			 * surviving members: those would include the tuple locks we had
3170 			 * before this one, which are important to keep in case this
3171 			 * subxact aborts.
3172 			 */
3173 			if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3174 				update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
3175 			else
3176 				update_xact = InvalidTransactionId;
3177 
3178 			/*
3179 			 * There was no UPDATE in the MultiXact; or it aborted. No
3180 			 * TransactionIdIsInProgress() call needed here, since we called
3181 			 * MultiXactIdWait() above.
3182 			 */
3183 			if (!TransactionIdIsValid(update_xact) ||
3184 				TransactionIdDidAbort(update_xact))
3185 				can_continue = true;
3186 		}
3187 		else if (TransactionIdIsCurrentTransactionId(xwait))
3188 		{
3189 			/*
3190 			 * The only locker is ourselves; we can avoid grabbing the tuple
3191 			 * lock here, but must preserve our locking information.
3192 			 */
3193 			checked_lockers = true;
3194 			locker_remains = true;
3195 			can_continue = true;
3196 		}
3197 		else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
3198 		{
3199 			/*
3200 			 * If it's just a key-share locker, and we're not changing the key
3201 			 * columns, we don't need to wait for it to end; but we need to
3202 			 * preserve it as locker.
3203 			 */
3204 			checked_lockers = true;
3205 			locker_remains = true;
3206 			can_continue = true;
3207 		}
3208 		else
3209 		{
3210 			/*
3211 			 * Wait for regular transaction to end; but first, acquire tuple
3212 			 * lock.
3213 			 */
3214 			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3215 			heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3216 								 LockWaitBlock, &have_tuple_lock);
3217 			XactLockTableWait(xwait, relation, &oldtup.t_self,
3218 							  XLTW_Update);
3219 			checked_lockers = true;
3220 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3221 
3222 			/*
3223 			 * xwait is done, but if xwait had just locked the tuple then some
3224 			 * other xact could update this tuple before we get to this point.
3225 			 * Check for xmax change, and start over if so.
3226 			 */
3227 			if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3228 				!TransactionIdEquals(xwait,
3229 									 HeapTupleHeaderGetRawXmax(oldtup.t_data)))
3230 				goto l2;
3231 
3232 			/* Otherwise check if it committed or aborted */
3233 			UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3234 			if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3235 				can_continue = true;
3236 		}
3237 
3238 		if (can_continue)
3239 			result = TM_Ok;
3240 		else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid) ||
3241 				 HeapTupleHeaderIndicatesMovedPartitions(oldtup.t_data))
3242 			result = TM_Updated;
3243 		else
3244 			result = TM_Deleted;
3245 	}
3246 
3247 	if (crosscheck != InvalidSnapshot && result == TM_Ok)
3248 	{
3249 		/* Perform additional check for transaction-snapshot mode RI updates */
3250 		if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
3251 		{
3252 			result = TM_Updated;
3253 			Assert(!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3254 		}
3255 	}
3256 
3257 	if (result != TM_Ok)
3258 	{
3259 		Assert(result == TM_SelfModified ||
3260 			   result == TM_Updated ||
3261 			   result == TM_Deleted ||
3262 			   result == TM_BeingModified);
3263 		Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3264 		Assert(result != TM_Updated ||
3265 			   !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3266 		tmfd->ctid = oldtup.t_data->t_ctid;
3267 		tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3268 		if (result == TM_SelfModified)
3269 			tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3270 		else
3271 			tmfd->cmax = InvalidCommandId;
3272 		UnlockReleaseBuffer(buffer);
3273 		if (have_tuple_lock)
3274 			UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3275 		if (vmbuffer != InvalidBuffer)
3276 			ReleaseBuffer(vmbuffer);
3277 		bms_free(hot_attrs);
3278 		bms_free(key_attrs);
3279 		bms_free(id_attrs);
3280 		bms_free(modified_attrs);
3281 		bms_free(interesting_attrs);
3282 		return result;
3283 	}
3284 
3285 	/*
3286 	 * If we didn't pin the visibility map page and the page has become all
3287 	 * visible while we were busy locking the buffer, or during some
3288 	 * subsequent window during which we had it unlocked, we'll have to unlock
3289 	 * and re-lock, to avoid holding the buffer lock across an I/O.  That's a
3290 	 * bit unfortunate, especially since we'll now have to recheck whether the
3291 	 * tuple has been locked or updated under us, but hopefully it won't
3292 	 * happen very often.
3293 	 */
3294 	if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3295 	{
3296 		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3297 		visibilitymap_pin(relation, block, &vmbuffer);
3298 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3299 		goto l2;
3300 	}
3301 
3302 	/* Fill in transaction status data */
3303 
3304 	/*
3305 	 * If the tuple we're updating is locked, we need to preserve the locking
3306 	 * info in the old tuple's Xmax.  Prepare a new Xmax value for this.
3307 	 */
3308 	compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3309 							  oldtup.t_data->t_infomask,
3310 							  oldtup.t_data->t_infomask2,
3311 							  xid, *lockmode, true,
3312 							  &xmax_old_tuple, &infomask_old_tuple,
3313 							  &infomask2_old_tuple);
3314 
3315 	/*
3316 	 * And also prepare an Xmax value for the new copy of the tuple.  If there
3317 	 * was no xmax previously, or there was one but all lockers are now gone,
3318 	 * then use InvalidXid; otherwise, get the xmax from the old tuple.  (In
3319 	 * rare cases that might also be InvalidXid and yet not have the
3320 	 * HEAP_XMAX_INVALID bit set; that's fine.)
3321 	 */
3322 	if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3323 		HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
3324 		(checked_lockers && !locker_remains))
3325 		xmax_new_tuple = InvalidTransactionId;
3326 	else
3327 		xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3328 
3329 	if (!TransactionIdIsValid(xmax_new_tuple))
3330 	{
3331 		infomask_new_tuple = HEAP_XMAX_INVALID;
3332 		infomask2_new_tuple = 0;
3333 	}
3334 	else
3335 	{
3336 		/*
3337 		 * If we found a valid Xmax for the new tuple, then the infomask bits
3338 		 * to use on the new tuple depend on what was there on the old one.
3339 		 * Note that since we're doing an update, the only possibility is that
3340 		 * the lockers had FOR KEY SHARE lock.
3341 		 */
3342 		if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3343 		{
3344 			GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
3345 								   &infomask2_new_tuple);
3346 		}
3347 		else
3348 		{
3349 			infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
3350 			infomask2_new_tuple = 0;
3351 		}
3352 	}
3353 
3354 	/*
3355 	 * Prepare the new tuple with the appropriate initial values of Xmin and
3356 	 * Xmax, as well as initial infomask bits as computed above.
3357 	 */
3358 	newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3359 	newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3360 	HeapTupleHeaderSetXmin(newtup->t_data, xid);
3361 	HeapTupleHeaderSetCmin(newtup->t_data, cid);
3362 	newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3363 	newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3364 	HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
3365 
3366 	/*
3367 	 * Replace cid with a combo cid if necessary.  Note that we already put
3368 	 * the plain cid into the new tuple.
3369 	 */
3370 	HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
3371 
3372 	/*
3373 	 * If the toaster needs to be activated, OR if the new tuple will not fit
3374 	 * on the same page as the old, then we need to release the content lock
3375 	 * (but not the pin!) on the old tuple's buffer while we are off doing
3376 	 * TOAST and/or table-file-extension work.  We must mark the old tuple to
3377 	 * show that it's locked, else other processes may try to update it
3378 	 * themselves.
3379 	 *
3380 	 * We need to invoke the toaster if there are already any out-of-line
3381 	 * toasted values present, or if the new tuple is over-threshold.
3382 	 */
3383 	if (relation->rd_rel->relkind != RELKIND_RELATION &&
3384 		relation->rd_rel->relkind != RELKIND_MATVIEW)
3385 	{
3386 		/* toast table entries should never be recursively toasted */
3387 		Assert(!HeapTupleHasExternal(&oldtup));
3388 		Assert(!HeapTupleHasExternal(newtup));
3389 		need_toast = false;
3390 	}
3391 	else
3392 		need_toast = (HeapTupleHasExternal(&oldtup) ||
3393 					  HeapTupleHasExternal(newtup) ||
3394 					  newtup->t_len > TOAST_TUPLE_THRESHOLD);
3395 
3396 	pagefree = PageGetHeapFreeSpace(page);
3397 
3398 	newtupsize = MAXALIGN(newtup->t_len);
3399 
3400 	if (need_toast || newtupsize > pagefree)
3401 	{
3402 		TransactionId xmax_lock_old_tuple;
3403 		uint16		infomask_lock_old_tuple,
3404 					infomask2_lock_old_tuple;
3405 		bool		cleared_all_frozen = false;
3406 
3407 		/*
3408 		 * To prevent concurrent sessions from updating the tuple, we have to
3409 		 * temporarily mark it locked, while we release the page-level lock.
3410 		 *
3411 		 * To satisfy the rule that any xid potentially appearing in a buffer
3412 		 * written out to disk must first be covered by WAL, we unfortunately
3413 		 * have to WAL-log this temporary modification.  We can reuse
3414 		 * xl_heap_lock for this purpose.  If we crash or error out before
3415 		 * following through with the actual update, xmax will be that of an
3416 		 * aborted transaction, allowing other sessions to proceed.
3417 		 */
3418 
3419 		/*
3420 		 * Compute xmax / infomask appropriate for locking the tuple.  This
3421 		 * has to be done separately from the xmax/infomask combination used
3422 		 * for the update itself, because the multixact we might create here
3423 		 * would otherwise be wrong.
3424 		 */
3425 		compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3426 								  oldtup.t_data->t_infomask,
3427 								  oldtup.t_data->t_infomask2,
3428 								  xid, *lockmode, false,
3429 								  &xmax_lock_old_tuple, &infomask_lock_old_tuple,
3430 								  &infomask2_lock_old_tuple);
3431 
3432 		Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
3433 
3434 		START_CRIT_SECTION();
3435 
3436 		/* Clear obsolete visibility flags ... */
3437 		oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3438 		oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3439 		HeapTupleClearHotUpdated(&oldtup);
3440 		/* ... and store info about transaction updating this tuple */
3441 		Assert(TransactionIdIsValid(xmax_lock_old_tuple));
3442 		HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
3443 		oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
3444 		oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
3445 		HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3446 
3447 		/* temporarily make it look not-updated, but locked */
3448 		oldtup.t_data->t_ctid = oldtup.t_self;
3449 
3450 		/*
3451 		 * Clear all-frozen bit on visibility map if needed. We could
3452 		 * immediately reset ALL_VISIBLE, but given that the WAL logging
3453 		 * overhead would be unchanged, that doesn't necessarily seem
3454 		 * worthwhile.
3455 		 */
3456 		if (PageIsAllVisible(page) &&
3457 			visibilitymap_clear(relation, block, vmbuffer,
3458 								VISIBILITYMAP_ALL_FROZEN))
3459 			cleared_all_frozen = true;
3460 
3461 		MarkBufferDirty(buffer);
3462 
3463 		if (RelationNeedsWAL(relation))
3464 		{
3465 			xl_heap_lock xlrec;
3466 			XLogRecPtr	recptr;
3467 
3468 			XLogBeginInsert();
3469 			XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3470 
3471 			xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
3472 			xlrec.locking_xid = xmax_lock_old_tuple;
3473 			xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
3474 												  oldtup.t_data->t_infomask2);
3475 			xlrec.flags =
3476 				cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
3477 			XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
3478 			recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
3479 			PageSetLSN(page, recptr);
3480 		}
3481 
3482 		END_CRIT_SECTION();
3483 
3484 		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3485 
3486 		/*
3487 		 * Let the toaster do its thing, if needed.
3488 		 *
3489 		 * Note: below this point, heaptup is the data we actually intend to
3490 		 * store into the relation; newtup is the caller's original untoasted
3491 		 * data.
3492 		 */
3493 		if (need_toast)
3494 		{
3495 			/* Note we always use WAL and FSM during updates */
3496 			heaptup = heap_toast_insert_or_update(relation, newtup, &oldtup, 0);
3497 			newtupsize = MAXALIGN(heaptup->t_len);
3498 		}
3499 		else
3500 			heaptup = newtup;
3501 
3502 		/*
3503 		 * Now, do we need a new page for the tuple, or not?  This is a bit
3504 		 * tricky since someone else could have added tuples to the page while
3505 		 * we weren't looking.  We have to recheck the available space after
3506 		 * reacquiring the buffer lock.  But don't bother to do that if the
3507 		 * former amount of free space is still not enough; it's unlikely
3508 		 * there's more free now than before.
3509 		 *
3510 		 * What's more, if we need to get a new page, we will need to acquire
3511 		 * buffer locks on both old and new pages.  To avoid deadlock against
3512 		 * some other backend trying to get the same two locks in the other
3513 		 * order, we must be consistent about the order we get the locks in.
3514 		 * We use the rule "lock the lower-numbered page of the relation
3515 		 * first".  To implement this, we must do RelationGetBufferForTuple
3516 		 * while not holding the lock on the old page, and we must rely on it
3517 		 * to get the locks on both pages in the correct order.
3518 		 *
3519 		 * Another consideration is that we need visibility map page pin(s) if
3520 		 * we will have to clear the all-visible flag on either page.  If we
3521 		 * call RelationGetBufferForTuple, we rely on it to acquire any such
3522 		 * pins; but if we don't, we have to handle that here.  Hence we need
3523 		 * a loop.
3524 		 */
3525 		for (;;)
3526 		{
3527 			if (newtupsize > pagefree)
3528 			{
3529 				/* It doesn't fit, must use RelationGetBufferForTuple. */
3530 				newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
3531 												   buffer, 0, NULL,
3532 												   &vmbuffer_new, &vmbuffer);
3533 				/* We're all done. */
3534 				break;
3535 			}
3536 			/* Acquire VM page pin if needed and we don't have it. */
3537 			if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3538 				visibilitymap_pin(relation, block, &vmbuffer);
3539 			/* Re-acquire the lock on the old tuple's page. */
3540 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3541 			/* Re-check using the up-to-date free space */
3542 			pagefree = PageGetHeapFreeSpace(page);
3543 			if (newtupsize > pagefree ||
3544 				(vmbuffer == InvalidBuffer && PageIsAllVisible(page)))
3545 			{
3546 				/*
3547 				 * Rats, it doesn't fit anymore, or somebody just now set the
3548 				 * all-visible flag.  We must now unlock and loop to avoid
3549 				 * deadlock.  Fortunately, this path should seldom be taken.
3550 				 */
3551 				LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3552 			}
3553 			else
3554 			{
3555 				/* We're all done. */
3556 				newbuf = buffer;
3557 				break;
3558 			}
3559 		}
3560 	}
3561 	else
3562 	{
3563 		/* No TOAST work needed, and it'll fit on same page */
3564 		newbuf = buffer;
3565 		heaptup = newtup;
3566 	}
3567 
3568 	/*
3569 	 * We're about to do the actual update -- check for conflict first, to
3570 	 * avoid possibly having to roll back work we've just done.
3571 	 *
3572 	 * This is safe without a recheck as long as there is no possibility of
3573 	 * another process scanning the pages between this check and the update
3574 	 * being visible to the scan (i.e., exclusive buffer content lock(s) are
3575 	 * continuously held from this point until the tuple update is visible).
3576 	 *
3577 	 * For the new tuple the only check needed is at the relation level, but
3578 	 * since both tuples are in the same relation and the check for oldtup
3579 	 * will include checking the relation level, there is no benefit to a
3580 	 * separate check for the new tuple.
3581 	 */
3582 	CheckForSerializableConflictIn(relation, &oldtup.t_self,
3583 								   BufferGetBlockNumber(buffer));
3584 
3585 	/*
3586 	 * At this point newbuf and buffer are both pinned and locked, and newbuf
3587 	 * has enough space for the new tuple.  If they are the same buffer, only
3588 	 * one pin is held.
3589 	 */
3590 
3591 	if (newbuf == buffer)
3592 	{
3593 		/*
3594 		 * Since the new tuple is going into the same page, we might be able
3595 		 * to do a HOT update.  Check if any of the index columns have been
3596 		 * changed. If the page was already full, we may have skipped checking
3597 		 * for index columns, and also can't do a HOT update.
3598 		 */
3599 		if (hot_attrs_checked && !bms_overlap(modified_attrs, hot_attrs))
3600 			use_hot_update = true;
3601 	}
3602 	else
3603 	{
3604 		/* Set a hint that the old page could use prune/defrag */
3605 		PageSetFull(page);
3606 	}
3607 
3608 	/*
3609 	 * Compute replica identity tuple before entering the critical section so
3610 	 * we don't PANIC upon a memory allocation failure.
3611 	 * ExtractReplicaIdentity() will return NULL if nothing needs to be
3612 	 * logged.
3613 	 */
3614 	old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
3615 										   bms_overlap(modified_attrs, id_attrs),
3616 										   &old_key_copied);
3617 
3618 	/* NO EREPORT(ERROR) from here till changes are logged */
3619 	START_CRIT_SECTION();
3620 
3621 	/*
3622 	 * If this transaction commits, the old tuple will become DEAD sooner or
3623 	 * later.  Set flag that this page is a candidate for pruning once our xid
3624 	 * falls below the OldestXmin horizon.  If the transaction finally aborts,
3625 	 * the subsequent page pruning will be a no-op and the hint will be
3626 	 * cleared.
3627 	 *
3628 	 * XXX Should we set hint on newbuf as well?  If the transaction aborts,
3629 	 * there would be a prunable tuple in the newbuf; but for now we choose
3630 	 * not to optimize for aborts.  Note that heap_xlog_update must be kept in
3631 	 * sync if this decision changes.
3632 	 */
3633 	PageSetPrunable(page, xid);
3634 
3635 	if (use_hot_update)
3636 	{
3637 		/* Mark the old tuple as HOT-updated */
3638 		HeapTupleSetHotUpdated(&oldtup);
3639 		/* And mark the new tuple as heap-only */
3640 		HeapTupleSetHeapOnly(heaptup);
3641 		/* Mark the caller's copy too, in case different from heaptup */
3642 		HeapTupleSetHeapOnly(newtup);
3643 	}
3644 	else
3645 	{
3646 		/* Make sure tuples are correctly marked as not-HOT */
3647 		HeapTupleClearHotUpdated(&oldtup);
3648 		HeapTupleClearHeapOnly(heaptup);
3649 		HeapTupleClearHeapOnly(newtup);
3650 	}
3651 
3652 	RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
3653 
3654 
3655 	/* Clear obsolete visibility flags, possibly set by ourselves above... */
3656 	oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3657 	oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3658 	/* ... and store info about transaction updating this tuple */
3659 	Assert(TransactionIdIsValid(xmax_old_tuple));
3660 	HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
3661 	oldtup.t_data->t_infomask |= infomask_old_tuple;
3662 	oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
3663 	HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3664 
3665 	/* record address of new tuple in t_ctid of old one */
3666 	oldtup.t_data->t_ctid = heaptup->t_self;
3667 
3668 	/* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
3669 	if (PageIsAllVisible(BufferGetPage(buffer)))
3670 	{
3671 		all_visible_cleared = true;
3672 		PageClearAllVisible(BufferGetPage(buffer));
3673 		visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3674 							vmbuffer, VISIBILITYMAP_VALID_BITS);
3675 	}
3676 	if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
3677 	{
3678 		all_visible_cleared_new = true;
3679 		PageClearAllVisible(BufferGetPage(newbuf));
3680 		visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
3681 							vmbuffer_new, VISIBILITYMAP_VALID_BITS);
3682 	}
3683 
3684 	if (newbuf != buffer)
3685 		MarkBufferDirty(newbuf);
3686 	MarkBufferDirty(buffer);
3687 
3688 	/* XLOG stuff */
3689 	if (RelationNeedsWAL(relation))
3690 	{
3691 		XLogRecPtr	recptr;
3692 
3693 		/*
3694 		 * For logical decoding we need combocids to properly decode the
3695 		 * catalog.
3696 		 */
3697 		if (RelationIsAccessibleInLogicalDecoding(relation))
3698 		{
3699 			log_heap_new_cid(relation, &oldtup);
3700 			log_heap_new_cid(relation, heaptup);
3701 		}
3702 
3703 		recptr = log_heap_update(relation, buffer,
3704 								 newbuf, &oldtup, heaptup,
3705 								 old_key_tuple,
3706 								 all_visible_cleared,
3707 								 all_visible_cleared_new);
3708 		if (newbuf != buffer)
3709 		{
3710 			PageSetLSN(BufferGetPage(newbuf), recptr);
3711 		}
3712 		PageSetLSN(BufferGetPage(buffer), recptr);
3713 	}
3714 
3715 	END_CRIT_SECTION();
3716 
3717 	if (newbuf != buffer)
3718 		LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
3719 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3720 
3721 	/*
3722 	 * Mark old tuple for invalidation from system caches at next command
3723 	 * boundary, and mark the new tuple for invalidation in case we abort. We
3724 	 * have to do this before releasing the buffer because oldtup is in the
3725 	 * buffer.  (heaptup is all in local memory, but it's necessary to process
3726 	 * both tuple versions in one call to inval.c so we can avoid redundant
3727 	 * sinval messages.)
3728 	 */
3729 	CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
3730 
3731 	/* Now we can release the buffer(s) */
3732 	if (newbuf != buffer)
3733 		ReleaseBuffer(newbuf);
3734 	ReleaseBuffer(buffer);
3735 	if (BufferIsValid(vmbuffer_new))
3736 		ReleaseBuffer(vmbuffer_new);
3737 	if (BufferIsValid(vmbuffer))
3738 		ReleaseBuffer(vmbuffer);
3739 
3740 	/*
3741 	 * Release the lmgr tuple lock, if we had it.
3742 	 */
3743 	if (have_tuple_lock)
3744 		UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3745 
3746 	pgstat_count_heap_update(relation, use_hot_update);
3747 
3748 	/*
3749 	 * If heaptup is a private copy, release it.  Don't forget to copy t_self
3750 	 * back to the caller's image, too.
3751 	 */
3752 	if (heaptup != newtup)
3753 	{
3754 		newtup->t_self = heaptup->t_self;
3755 		heap_freetuple(heaptup);
3756 	}
3757 
3758 	if (old_key_tuple != NULL && old_key_copied)
3759 		heap_freetuple(old_key_tuple);
3760 
3761 	bms_free(hot_attrs);
3762 	bms_free(key_attrs);
3763 	bms_free(id_attrs);
3764 	bms_free(modified_attrs);
3765 	bms_free(interesting_attrs);
3766 
3767 	return TM_Ok;
3768 }
3769 
3770 /*
3771  * Check if the specified attribute's value is the same in both given tuples.
3772  * Subroutine for HeapDetermineModifiedColumns.
3773  */
3774 static bool
3775 heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
3776 					   HeapTuple tup1, HeapTuple tup2)
3777 {
3778 	Datum		value1,
3779 				value2;
3780 	bool		isnull1,
3781 				isnull2;
3782 	Form_pg_attribute att;
3783 
3784 	/*
3785 	 * If it's a whole-tuple reference, say "not equal".  It's not really
3786 	 * worth supporting this case, since it could only succeed after a no-op
3787 	 * update, which is hardly a case worth optimizing for.
3788 	 */
3789 	if (attrnum == 0)
3790 		return false;
3791 
3792 	/*
3793 	 * Likewise, automatically say "not equal" for any system attribute other
3794 	 * than tableOID; we cannot expect these to be consistent in a HOT chain,
3795 	 * or even to be set correctly yet in the new tuple.
3796 	 */
3797 	if (attrnum < 0)
3798 	{
3799 		if (attrnum != TableOidAttributeNumber)
3800 			return false;
3801 	}
3802 
3803 	/*
3804 	 * Extract the corresponding values.  XXX this is pretty inefficient if
3805 	 * there are many indexed columns.  Should HeapDetermineModifiedColumns do
3806 	 * a single heap_deform_tuple call on each tuple, instead?	But that
3807 	 * doesn't work for system columns ...
3808 	 */
3809 	value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
3810 	value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);
3811 
3812 	/*
3813 	 * If one value is NULL and the other is not, then they are certainly
3814 	 * not equal.
3815 	 */
3816 	if (isnull1 != isnull2)
3817 		return false;
3818 
3819 	/*
3820 	 * If both are NULL, they can be considered equal.
3821 	 */
3822 	if (isnull1)
3823 		return true;
3824 
3825 	/*
3826 	 * We do simple binary comparison of the two datums.  This may be overly
3827 	 * strict because there can be multiple binary representations for the
3828 	 * same logical value.  But we should be OK as long as there are no false
3829 	 * positives.  Using a type-specific equality operator is messy because
3830 	 * there could be multiple notions of equality in different operator
3831 	 * classes; furthermore, we cannot safely invoke user-defined functions
3832 	 * while holding exclusive buffer lock.
3833 	 */
3834 	if (attrnum <= 0)
3835 	{
3836 		/* The only allowed system columns are OIDs, so do this */
3837 		return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
3838 	}
3839 	else
3840 	{
3841 		Assert(attrnum <= tupdesc->natts);
3842 		att = TupleDescAttr(tupdesc, attrnum - 1);
3843 		return datumIsEqual(value1, value2, att->attbyval, att->attlen);
3844 	}
3845 }
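
/*
 * Illustrative note (sketch, not part of the original comments): because the
 * comparison above is purely binary, two datums that are logically equal can
 * still compare as "not equal".  For instance, for a varlena attribute an
 * out-of-line (toasted) value and its detoasted copy have different byte
 * representations, so a hypothetical caller would see
 *
 *		datumIsEqual(toasted_datum, detoasted_datum, false, -1) == false
 *
 * even though the logical values match.  That direction of error is safe
 * here: it can only cause a column to be reported as "modified" when it
 * wasn't, never the other way around.
 */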
3846 
3847 /*
3848  * Check which columns are being updated.
3849  *
3850  * Given the old and new versions of a tuple, determine which of the columns
3851  * listed as interesting actually changed, and return them as a bitmapset.
3852  *
3853  * The input bitmapset is destructively modified; that is OK since this is
3854  * invoked at most once in heap_update.
3855  */
3856 static Bitmapset *
3857 HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols,
3858 							 HeapTuple oldtup, HeapTuple newtup)
3859 {
3860 	int			attnum;
3861 	Bitmapset  *modified = NULL;
3862 
3863 	while ((attnum = bms_first_member(interesting_cols)) >= 0)
3864 	{
3865 		attnum += FirstLowInvalidHeapAttributeNumber;
3866 
3867 		if (!heap_tuple_attr_equals(RelationGetDescr(relation),
3868 									attnum, oldtup, newtup))
3869 			modified = bms_add_member(modified,
3870 									  attnum - FirstLowInvalidHeapAttributeNumber);
3871 	}
3872 
3873 	return modified;
3874 }
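
/*
 * Illustrative sketch (not part of the original comments): both
 * interesting_cols and the returned set store attribute numbers offset by
 * FirstLowInvalidHeapAttributeNumber, so that system attributes (which have
 * negative attnums) map onto valid bitmapset members.  A hypothetical caller
 * wanting to know whether user column 2 changed would therefore test
 *
 *		bms_is_member(2 - FirstLowInvalidHeapAttributeNumber, modified)
 *
 * rather than bms_is_member(2, modified).
 */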
3875 
3876 /*
3877  *	simple_heap_update - replace a tuple
3878  *
3879  * This routine may be used to update a tuple when concurrent updates of
3880  * the target tuple are not expected (for example, because we have a lock
3881  * on the relation associated with the tuple).  Any failure is reported
3882  * via ereport().
3883  */
3884 void
3885 simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
3886 {
3887 	TM_Result	result;
3888 	TM_FailureData tmfd;
3889 	LockTupleMode lockmode;
3890 
3891 	result = heap_update(relation, otid, tup,
3892 						 GetCurrentCommandId(true), InvalidSnapshot,
3893 						 true /* wait for commit */ ,
3894 						 &tmfd, &lockmode);
3895 	switch (result)
3896 	{
3897 		case TM_SelfModified:
3898 			/* Tuple was already updated in current command? */
3899 			elog(ERROR, "tuple already updated by self");
3900 			break;
3901 
3902 		case TM_Ok:
3903 			/* done successfully */
3904 			break;
3905 
3906 		case TM_Updated:
3907 			elog(ERROR, "tuple concurrently updated");
3908 			break;
3909 
3910 		case TM_Deleted:
3911 			elog(ERROR, "tuple concurrently deleted");
3912 			break;
3913 
3914 		default:
3915 			elog(ERROR, "unrecognized heap_update status: %u", result);
3916 			break;
3917 	}
3918 }
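
/*
 * Usage sketch (illustrative only; the caller shown here is hypothetical):
 * simple_heap_update is meant for code paths that already hold a lock strong
 * enough to rule out concurrent updates of the target tuple, e.g.
 *
 *		HeapTuple	copy = heap_copytuple(orig_tuple);
 *
 *		... modify copy's user data in place ...
 *		simple_heap_update(relation, &copy->t_self, copy);
 *
 * Any concurrent-update outcome is turned into an error above, so the caller
 * need not handle TM_Updated / TM_Deleted itself; maintaining index entries,
 * if any, remains the caller's responsibility.
 */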
3919 
3920 
3921 /*
3922  * Return the MultiXactStatus corresponding to the given tuple lock mode.
3923  */
3924 static MultiXactStatus
3925 get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
3926 {
3927 	int			retval;
3928 
3929 	if (is_update)
3930 		retval = tupleLockExtraInfo[mode].updstatus;
3931 	else
3932 		retval = tupleLockExtraInfo[mode].lockstatus;
3933 
3934 	if (retval == -1)
3935 		elog(ERROR, "invalid lock tuple mode %d/%s", mode,
3936 			 is_update ? "true" : "false");
3937 
3938 	return (MultiXactStatus) retval;
3939 }
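
/*
 * Illustrative examples (per the tupleLockExtraInfo table defined earlier in
 * this file; shown here only as a sketch):
 *
 *		get_mxact_status_for_lock(LockTupleKeyShare, false)
 *			== MultiXactStatusForKeyShare
 *		get_mxact_status_for_lock(LockTupleExclusive, true)
 *			== MultiXactStatusUpdate
 *
 * Passing is_update = true with one of the share-lock modes hits the
 * retval == -1 check above, since share locks cannot be recorded as updates
 * in a MultiXact.
 */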
3940 
3941 /*
3942  *	heap_lock_tuple - lock a tuple in shared or exclusive mode
3943  *
3944  * Note that this acquires a buffer pin, which the caller must release.
3945  *
3946  * Input parameters:
3947  *	relation: relation containing tuple (caller must hold suitable lock)
3948  *	tid: TID of tuple to lock
3949  *	cid: current command ID (used for visibility test, and stored into
3950  *		tuple's cmax if lock is successful)
3951  *	mode: indicates if shared or exclusive tuple lock is desired
3952  *	wait_policy: what to do if tuple lock is not available
3953  *	follow_updates: if true, follow the update chain to also lock descendant
3954  *		tuples.
3955  *
3956  * Output parameters:
3957  *	*tuple: all fields filled in
3958  *	*buffer: set to buffer holding tuple (pinned but not locked at exit)
3959  *	*tmfd: filled in failure cases (see below)
3960  *
3961  * Function results are the same as the ones for table_tuple_lock().
3962  *
3963  * In the failure cases other than TM_Invisible, the routine fills
3964  * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
3965  * if necessary), and t_cmax (the last only for TM_SelfModified,
3966  * since we cannot obtain cmax from a combocid generated by another
3967  * transaction).
3968  * See comments for struct TM_FailureData for additional info.
3969  *
3970  * See README.tuplock for a thorough explanation of this mechanism.
3971  */
3972 TM_Result
3973 heap_lock_tuple(Relation relation, HeapTuple tuple,
3974 				CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
3975 				bool follow_updates,
3976 				Buffer *buffer, TM_FailureData *tmfd)
3977 {
3978 	TM_Result	result;
3979 	ItemPointer tid = &(tuple->t_self);
3980 	ItemId		lp;
3981 	Page		page;
3982 	Buffer		vmbuffer = InvalidBuffer;
3983 	BlockNumber block;
3984 	TransactionId xid,
3985 				xmax;
3986 	uint16		old_infomask,
3987 				new_infomask,
3988 				new_infomask2;
3989 	bool		first_time = true;
3990 	bool		skip_tuple_lock = false;
3991 	bool		have_tuple_lock = false;
3992 	bool		cleared_all_frozen = false;
3993 
3994 	*buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
3995 	block = ItemPointerGetBlockNumber(tid);
3996 
3997 	/*
3998 	 * Before locking the buffer, pin the visibility map page if it appears to
3999 	 * be necessary.  Since we haven't got the lock yet, someone else might be
4000 	 * in the middle of changing this, so we'll need to recheck after we have
4001 	 * the lock.
4002 	 */
4003 	if (PageIsAllVisible(BufferGetPage(*buffer)))
4004 		visibilitymap_pin(relation, block, &vmbuffer);
4005 
4006 	LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4007 
4008 	page = BufferGetPage(*buffer);
4009 	lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
4010 	Assert(ItemIdIsNormal(lp));
4011 
4012 	tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4013 	tuple->t_len = ItemIdGetLength(lp);
4014 	tuple->t_tableOid = RelationGetRelid(relation);
4015 
4016 l3:
4017 	result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4018 
4019 	if (result == TM_Invisible)
4020 	{
4021 		/*
4022 		 * This is possible, but only when locking a tuple for ON CONFLICT
4023 		 * UPDATE.  We return this value here rather than throwing an error in
4024 		 * order to give that case the opportunity to throw a more specific
4025 		 * error.
4026 		 */
4027 		result = TM_Invisible;
4028 		goto out_locked;
4029 	}
4030 	else if (result == TM_BeingModified ||
4031 			 result == TM_Updated ||
4032 			 result == TM_Deleted)
4033 	{
4034 		TransactionId xwait;
4035 		uint16		infomask;
4036 		uint16		infomask2;
4037 		bool		require_sleep;
4038 		ItemPointerData t_ctid;
4039 
4040 		/* must copy state data before unlocking buffer */
4041 		xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
4042 		infomask = tuple->t_data->t_infomask;
4043 		infomask2 = tuple->t_data->t_infomask2;
4044 		ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4045 
4046 		LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4047 
4048 		/*
4049 		 * If any subtransaction of the current top transaction already holds
4050 		 * a lock as strong as or stronger than what we're requesting, we
4051 		 * effectively hold the desired lock already.  We *must* succeed
4052 		 * without trying to take the tuple lock, else we will deadlock
4053 		 * against anyone wanting to acquire a stronger lock.
4054 		 *
4055 		 * Note we only do this the first time we loop on the HTSU result;
4056 		 * there is no point in testing in subsequent passes, because
4057 		 * evidently our own transaction cannot have acquired a new lock after
4058 		 * the first time we checked.
4059 		 */
4060 		if (first_time)
4061 		{
4062 			first_time = false;
4063 
4064 			if (infomask & HEAP_XMAX_IS_MULTI)
4065 			{
4066 				int			i;
4067 				int			nmembers;
4068 				MultiXactMember *members;
4069 
4070 				/*
4071 				 * We don't need to allow old multixacts here; if that had
4072 				 * been the case, HeapTupleSatisfiesUpdate would have returned
4073 				 * TM_Ok and we wouldn't be here.
4074 				 */
4075 				nmembers =
4076 					GetMultiXactIdMembers(xwait, &members, false,
4077 										  HEAP_XMAX_IS_LOCKED_ONLY(infomask));
4078 
4079 				for (i = 0; i < nmembers; i++)
4080 				{
4081 					/* only consider members of our own transaction */
4082 					if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4083 						continue;
4084 
4085 					if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4086 					{
4087 						pfree(members);
4088 						result = TM_Ok;
4089 						goto out_unlocked;
4090 					}
4091 					else
4092 					{
4093 						/*
4094 						 * Disable acquisition of the heavyweight tuple lock.
4095 						 * Otherwise, when promoting a weaker lock, we might
4096 						 * deadlock with another locker that has acquired the
4097 						 * heavyweight tuple lock and is waiting for our
4098 						 * transaction to finish.
4099 						 *
4100 						 * Note that in this case we still need to wait for
4101 						 * the multixact if required, to avoid acquiring
4102 						 * conflicting locks.
4103 						 */
4104 						skip_tuple_lock = true;
4105 					}
4106 				}
4107 
4108 				if (members)
4109 					pfree(members);
4110 			}
4111 			else if (TransactionIdIsCurrentTransactionId(xwait))
4112 			{
4113 				switch (mode)
4114 				{
4115 					case LockTupleKeyShare:
4116 						Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
4117 							   HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4118 							   HEAP_XMAX_IS_EXCL_LOCKED(infomask));
4119 						result = TM_Ok;
4120 						goto out_unlocked;
4121 					case LockTupleShare:
4122 						if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4123 							HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4124 						{
4125 							result = TM_Ok;
4126 							goto out_unlocked;
4127 						}
4128 						break;
4129 					case LockTupleNoKeyExclusive:
4130 						if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4131 						{
4132 							result = TM_Ok;
4133 							goto out_unlocked;
4134 						}
4135 						break;
4136 					case LockTupleExclusive:
4137 						if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
4138 							infomask2 & HEAP_KEYS_UPDATED)
4139 						{
4140 							result = TM_Ok;
4141 							goto out_unlocked;
4142 						}
4143 						break;
4144 				}
4145 			}
4146 		}
4147 
4148 		/*
4149 		 * Initially assume that we will have to wait for the locking
4150 		 * transaction(s) to finish.  We check various cases below in which
4151 		 * this can be turned off.
4152 		 */
4153 		require_sleep = true;
4154 		if (mode == LockTupleKeyShare)
4155 		{
4156 			/*
4157 			 * If we're requesting KeyShare, and there's no update present, we
4158 			 * don't need to wait.  Even if there is an update, we can still
4159 			 * continue if the key hasn't been modified.
4160 			 *
4161 			 * However, if there are updates, we need to walk the update chain
4162 			 * to mark future versions of the row as locked, too.  That way,
4163 			 * if somebody deletes that future version, we're protected
4164 			 * against the key going away.  This locking of future versions
4165 			 * could block momentarily, if a concurrent transaction is
4166 			 * deleting a key; or it could return a value to the effect that
4167 			 * the transaction deleting the key has already committed.  So we
4168 			 * do this before re-locking the buffer; otherwise this would be
4169 			 * prone to deadlocks.
4170 			 *
4171 			 * Note that the TID we're locking was grabbed before we unlocked
4172 			 * the buffer.  For it to change while we're not looking, the
4173 			 * other properties we're testing for below after re-locking the
4174 			 * buffer would also change, in which case we would restart this
4175 			 * loop above.
4176 			 */
4177 			if (!(infomask2 & HEAP_KEYS_UPDATED))
4178 			{
4179 				bool		updated;
4180 
4181 				updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
4182 
4183 				/*
4184 				 * If there are updates, follow the update chain; bail out if
4185 				 * that cannot be done.
4186 				 */
4187 				if (follow_updates && updated)
4188 				{
4189 					TM_Result	res;
4190 
4191 					res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4192 												  GetCurrentTransactionId(),
4193 												  mode);
4194 					if (res != TM_Ok)
4195 					{
4196 						result = res;
4197 						/* recovery code expects to have buffer lock held */
4198 						LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4199 						goto failed;
4200 					}
4201 				}
4202 
4203 				LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4204 
4205 				/*
4206 				 * Make sure it's still an appropriate lock, else start over.
4207 				 * Also, if it wasn't updated before we released the lock, but
4208 				 * is updated now, we start over too; the reason is that we
4209 				 * now need to follow the update chain to lock the new
4210 				 * versions.
4211 				 */
4212 				if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4213 					((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4214 					 !updated))
4215 					goto l3;
4216 
4217 				/* Things look okay, so we can skip sleeping */
4218 				require_sleep = false;
4219 
4220 				/*
4221 				 * Note we allow Xmax to change here; other updaters/lockers
4222 				 * could have modified it before we grabbed the buffer lock.
4223 				 * However, this is not a problem, because with the recheck we
4224 				 * just did we ensure that they still don't conflict with the
4225 				 * lock we want.
4226 				 */
4227 			}
4228 		}
4229 		else if (mode == LockTupleShare)
4230 		{
4231 			/*
4232 			 * If we're requesting Share, we can similarly avoid sleeping if
4233 			 * there's no update and no exclusive lock present.
4234 			 */
4235 			if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
4236 				!HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4237 			{
4238 				LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4239 
4240 				/*
4241 				 * Make sure it's still an appropriate lock, else start over.
4242 				 * See above about allowing xmax to change.
4243 				 */
4244 				if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4245 					HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
4246 					goto l3;
4247 				require_sleep = false;
4248 			}
4249 		}
4250 		else if (mode == LockTupleNoKeyExclusive)
4251 		{
4252 			/*
4253 			 * If we're requesting NoKeyExclusive, we might also be able to
4254 			 * avoid sleeping; just ensure that there is no conflicting lock
4255 			 * already acquired.
4256 			 */
4257 			if (infomask & HEAP_XMAX_IS_MULTI)
4258 			{
4259 				if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
4260 											 mode, NULL))
4261 				{
4262 					/*
4263 					 * No conflict, but if the xmax changed under us in the
4264 					 * meantime, start over.
4265 					 */
4266 					LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4267 					if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4268 						!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4269 											 xwait))
4270 						goto l3;
4271 
4272 					/* otherwise, we're good */
4273 					require_sleep = false;
4274 				}
4275 			}
4276 			else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
4277 			{
4278 				LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4279 
4280 				/* if the xmax changed in the meantime, start over */
4281 				if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4282 					!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4283 										 xwait))
4284 					goto l3;
4285 				/* otherwise, we're good */
4286 				require_sleep = false;
4287 			}
4288 		}
4289 
4290 		/*
4291 		 * As a check independent from those above, we can also avoid sleeping
4292 		 * if the current transaction is the sole locker of the tuple.  Note
4293 		 * that the strength of the lock already held is irrelevant; this is
4294 		 * not about recording the lock in Xmax (which will be done regardless
4295 		 * of this optimization, below).  Also, note that the cases where we
4296 		 * hold a lock stronger than we are requesting are already handled
4297 		 * above by not doing anything.
4298 		 *
4299 		 * Note we only deal with the non-multixact case here; MultiXactIdWait
4300 		 * is well equipped to deal with this situation on its own.
4301 		 */
4302 		if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
4303 			TransactionIdIsCurrentTransactionId(xwait))
4304 		{
4305 			/* ... but if the xmax changed in the meantime, start over */
4306 			LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4307 			if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4308 				!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4309 									 xwait))
4310 				goto l3;
4311 			Assert(HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask));
4312 			require_sleep = false;
4313 		}
4314 
4315 		/*
4316 		 * Time to sleep on the other transaction/multixact, if necessary.
4317 		 *
4318 		 * If the other transaction is an update/delete that's already
4319 		 * committed, then sleeping cannot possibly do any good: if we're
4320 		 * required to sleep, get out to raise an error instead.
4321 		 *
4322 		 * By here, we either have already acquired the buffer exclusive lock,
4323 		 * or we must wait for the locking transaction or multixact; so below
4324 		 * we ensure that we grab buffer lock after the sleep.
4325 		 */
4326 		if (require_sleep && (result == TM_Updated || result == TM_Deleted))
4327 		{
4328 			LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4329 			goto failed;
4330 		}
4331 		else if (require_sleep)
4332 		{
4333 			/*
4334 			 * Acquire tuple lock to establish our priority for the tuple, or
4335 			 * die trying.  LockTuple will release us when we are next-in-line
4336 			 * for the tuple.  We must do this even if we are share-locking,
4337 			 * but not if we already have a weaker lock on the tuple.
4338 			 *
4339 			 * If we are forced to "start over" below, we keep the tuple lock;
4340 			 * this arranges that we stay at the head of the line while
4341 			 * rechecking tuple state.
4342 			 */
4343 			if (!skip_tuple_lock &&
4344 				!heap_acquire_tuplock(relation, tid, mode, wait_policy,
4345 									  &have_tuple_lock))
4346 			{
4347 				/*
4348 				 * This can only happen if wait_policy is Skip and the lock
4349 				 * couldn't be obtained.
4350 				 */
4351 				result = TM_WouldBlock;
4352 				/* recovery code expects to have buffer lock held */
4353 				LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4354 				goto failed;
4355 			}
4356 
4357 			if (infomask & HEAP_XMAX_IS_MULTI)
4358 			{
4359 				MultiXactStatus status = get_mxact_status_for_lock(mode, false);
4360 
4361 				/* We only ever lock tuples, never update them */
4362 				if (status >= MultiXactStatusNoKeyUpdate)
4363 					elog(ERROR, "invalid lock mode in heap_lock_tuple");
4364 
4365 				/* wait for multixact to end, or die trying  */
4366 				switch (wait_policy)
4367 				{
4368 					case LockWaitBlock:
4369 						MultiXactIdWait((MultiXactId) xwait, status, infomask,
4370 										relation, &tuple->t_self, XLTW_Lock, NULL);
4371 						break;
4372 					case LockWaitSkip:
4373 						if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4374 														status, infomask, relation,
4375 														NULL))
4376 						{
4377 							result = TM_WouldBlock;
4378 							/* recovery code expects to have buffer lock held */
4379 							LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4380 							goto failed;
4381 						}
4382 						break;
4383 					case LockWaitError:
4384 						if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4385 														status, infomask, relation,
4386 														NULL))
4387 							ereport(ERROR,
4388 									(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4389 									 errmsg("could not obtain lock on row in relation \"%s\"",
4390 											RelationGetRelationName(relation))));
4391 
4392 						break;
4393 				}
4394 
4395 				/*
4396 				 * Of course, the multixact might not be done here: if we're
4397 				 * requesting a light lock mode, other transactions with light
4398 				 * locks could still be alive, as well as locks owned by our
4399 				 * own xact or other subxacts of this backend.  We need to
4400 				 * preserve the surviving MultiXact members.  Note that it
4401 				 * isn't absolutely necessary in the latter case, but doing so
4402 				 * is simpler.
4403 				 */
4404 			}
4405 			else
4406 			{
4407 				/* wait for regular transaction to end, or die trying */
4408 				switch (wait_policy)
4409 				{
4410 					case LockWaitBlock:
4411 						XactLockTableWait(xwait, relation, &tuple->t_self,
4412 										  XLTW_Lock);
4413 						break;
4414 					case LockWaitSkip:
4415 						if (!ConditionalXactLockTableWait(xwait))
4416 						{
4417 							result = TM_WouldBlock;
4418 							/* recovery code expects to have buffer lock held */
4419 							LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4420 							goto failed;
4421 						}
4422 						break;
4423 					case LockWaitError:
4424 						if (!ConditionalXactLockTableWait(xwait))
4425 							ereport(ERROR,
4426 									(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4427 									 errmsg("could not obtain lock on row in relation \"%s\"",
4428 											RelationGetRelationName(relation))));
4429 						break;
4430 				}
4431 			}
4432 
4433 			/* if there are updates, follow the update chain */
4434 			if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
4435 			{
4436 				TM_Result	res;
4437 
4438 				res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4439 											  GetCurrentTransactionId(),
4440 											  mode);
4441 				if (res != TM_Ok)
4442 				{
4443 					result = res;
4444 					/* recovery code expects to have buffer lock held */
4445 					LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4446 					goto failed;
4447 				}
4448 			}
4449 
4450 			LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4451 
4452 			/*
4453 			 * xwait is done, but if xwait had just locked the tuple then some
4454 			 * other xact could update this tuple before we get to this point.
4455 			 * Check for xmax change, and start over if so.
4456 			 */
4457 			if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4458 				!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4459 									 xwait))
4460 				goto l3;
4461 
4462 			if (!(infomask & HEAP_XMAX_IS_MULTI))
4463 			{
4464 				/*
4465 				 * Otherwise check if it committed or aborted.  Note we cannot
4466 				 * be here if the tuple was only locked by somebody who didn't
4467 				 * conflict with us; that would have been handled above.  So
4468 				 * that transaction must necessarily be gone by now.  But
4469 				 * don't check for this in the multixact case, because some
4470 				 * locker transactions might still be running.
4471 				 */
4472 				UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
4473 			}
4474 		}
4475 
4476 		/* By here, we're certain that we hold buffer exclusive lock again */
4477 
4478 		/*
4479 		 * We may lock if previous xmax aborted, or if it committed but only
4480 		 * locked the tuple without updating it; or if we didn't have to wait
4481 		 * at all for whatever reason.
4482 		 */
4483 		if (!require_sleep ||
4484 			(tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
4485 			HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4486 			HeapTupleHeaderIsOnlyLocked(tuple->t_data))
4487 			result = TM_Ok;
4488 		else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid) ||
4489 				 HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data))
4490 			result = TM_Updated;
4491 		else
4492 			result = TM_Deleted;
4493 	}
4494 
4495 failed:
4496 	if (result != TM_Ok)
4497 	{
4498 		Assert(result == TM_SelfModified || result == TM_Updated ||
4499 			   result == TM_Deleted || result == TM_WouldBlock);
4500 		Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
4501 		Assert(result != TM_Updated ||
4502 			   !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
4503 		tmfd->ctid = tuple->t_data->t_ctid;
4504 		tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
4505 		if (result == TM_SelfModified)
4506 			tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
4507 		else
4508 			tmfd->cmax = InvalidCommandId;
4509 		goto out_locked;
4510 	}
4511 
4512 	/*
4513 	 * If we didn't pin the visibility map page and the page has become all
4514 	 * visible while we were busy locking the buffer, or during some
4515 	 * subsequent window during which we had it unlocked, we'll have to unlock
4516 	 * and re-lock, to avoid holding the buffer lock across I/O.  That's a bit
4517 	 * unfortunate, especially since we'll now have to recheck whether the
4518 	 * tuple has been locked or updated under us, but hopefully it won't
4519 	 * happen very often.
4520 	 */
4521 	if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
4522 	{
4523 		LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4524 		visibilitymap_pin(relation, block, &vmbuffer);
4525 		LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4526 		goto l3;
4527 	}
4528 
4529 	xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
4530 	old_infomask = tuple->t_data->t_infomask;
4531 
4532 	/*
4533 	 * If this is the first possibly-multixact-able operation in the current
4534 	 * transaction, set my per-backend OldestMemberMXactId setting. We can be
4535 	 * certain that the transaction will never become a member of any older
4536 	 * MultiXactIds than that.  (We have to do this even if we end up just
4537 	 * using our own TransactionId below, since some other backend could
4538 	 * incorporate our XID into a MultiXact immediately afterwards.)
4539 	 */
4540 	MultiXactIdSetOldestMember();
4541 
4542 	/*
4543 	 * Compute the new xmax and infomask to store into the tuple.  Note we do
4544 	 * not modify the tuple just yet, because that would leave it in the wrong
4545 	 * state if multixact.c elogs.
4546 	 */
4547 	compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
4548 							  GetCurrentTransactionId(), mode, false,
4549 							  &xid, &new_infomask, &new_infomask2);
4550 
4551 	START_CRIT_SECTION();
4552 
4553 	/*
4554 	 * Store transaction information of xact locking the tuple.
4555 	 *
4556 	 * Note: Cmax is meaningless in this context, so don't set it; this avoids
4557 	 * possibly generating a useless combo CID.  Moreover, if we're locking a
4558 	 * previously updated tuple, it's important to preserve the Cmax.
4559 	 *
4560 	 * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
4561 	 * we would break the HOT chain.
4562 	 */
4563 	tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
4564 	tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4565 	tuple->t_data->t_infomask |= new_infomask;
4566 	tuple->t_data->t_infomask2 |= new_infomask2;
4567 	if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
4568 		HeapTupleHeaderClearHotUpdated(tuple->t_data);
4569 	HeapTupleHeaderSetXmax(tuple->t_data, xid);
4570 
4571 	/*
4572 	 * Make sure there is no forward chain link in t_ctid.  Note that in the
4573 	 * cases where the tuple has been updated, we must not overwrite t_ctid,
4574 	 * because it was set by the updater.  Moreover, if the tuple has been
4575 	 * updated, we need to follow the update chain to lock the new versions of
4576 	 * the tuple as well.
4577 	 */
4578 	if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
4579 		tuple->t_data->t_ctid = *tid;
4580 
4581 	/* Clear only the all-frozen bit on visibility map if needed */
4582 	if (PageIsAllVisible(page) &&
4583 		visibilitymap_clear(relation, block, vmbuffer,
4584 							VISIBILITYMAP_ALL_FROZEN))
4585 		cleared_all_frozen = true;
4586 
4587 
4588 	MarkBufferDirty(*buffer);
4589 
4590 	/*
4591 	 * XLOG stuff.  You might think that we don't need an XLOG record because
4592 	 * there is no state change worth restoring after a crash.  You would be
4593 	 * wrong however: we have just written either a TransactionId or a
4594 	 * MultiXactId that may never have been seen on disk before, and we need
4595 	 * to make sure that there are XLOG entries covering those ID numbers.
4596 	 * Else the same IDs might be re-used after a crash, which would be
4597 	 * disastrous if this page made it to disk before the crash.  Essentially
4598 	 * we have to enforce the WAL log-before-data rule even in this case.
4599 	 * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
4600 	 * entries for everything anyway.)
4601 	 */
4602 	if (RelationNeedsWAL(relation))
4603 	{
4604 		xl_heap_lock xlrec;
4605 		XLogRecPtr	recptr;
4606 
4607 		XLogBeginInsert();
4608 		XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
4609 
4610 		xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
4611 		xlrec.locking_xid = xid;
4612 		xlrec.infobits_set = compute_infobits(new_infomask,
4613 											  tuple->t_data->t_infomask2);
4614 		xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
4615 		XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
4616 
4617 		/* we don't decode row locks at the moment, so no need to log the origin */
4618 
4619 		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
4620 
4621 		PageSetLSN(page, recptr);
4622 	}
4623 
4624 	END_CRIT_SECTION();
4625 
4626 	result = TM_Ok;
4627 
4628 out_locked:
4629 	LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4630 
4631 out_unlocked:
4632 	if (BufferIsValid(vmbuffer))
4633 		ReleaseBuffer(vmbuffer);
4634 
4635 	/*
4636 	 * Don't update the visibility map here. Locking a tuple doesn't change
4637 	 * visibility info.
4638 	 */
4639 
4640 	/*
4641 	 * Now that we have successfully marked the tuple as locked, we can
4642 	 * release the lmgr tuple lock, if we had it.
4643 	 */
4644 	if (have_tuple_lock)
4645 		UnlockTupleTuplock(relation, tid, mode);
4646 
4647 	return result;
4648 }
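
/*
 * Caller sketch (illustrative only; every name other than heap_lock_tuple
 * and the symbols it already uses is hypothetical): a typical row-locking
 * path fills a HeapTupleData with the target TID, calls heap_lock_tuple, and
 * then drops the buffer pin this function leaves behind:
 *
 *		HeapTupleData	tuple;
 *		Buffer			buf;
 *		TM_FailureData	tmfd;
 *		TM_Result		res;
 *
 *		tuple.t_self = *target_tid;
 *		res = heap_lock_tuple(rel, &tuple, GetCurrentCommandId(true),
 *							  LockTupleExclusive, LockWaitBlock,
 *							  true, &buf, &tmfd);
 *		ReleaseBuffer(buf);
 *		if (res != TM_Ok)
 *			... inspect tmfd and retry, or report the conflict ...
 */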
4649 
4650 /*
4651  * Acquire heavyweight lock on the given tuple, in preparation for acquiring
4652  * its normal, Xmax-based tuple lock.
4653  *
4654  * have_tuple_lock is an input and output parameter: on input, it indicates
4655  * whether the lock has previously been acquired (and this function does
4656  * nothing in that case).  If this function returns success, have_tuple_lock
4657  * has been flipped to true.
4658  *
4659  * Returns false if it was unable to obtain the lock; this can only happen if
4660  * wait_policy is Skip.
4661  */
4662 static bool
4663 heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
4664 					 LockWaitPolicy wait_policy, bool *have_tuple_lock)
4665 {
4666 	if (*have_tuple_lock)
4667 		return true;
4668 
4669 	switch (wait_policy)
4670 	{
4671 		case LockWaitBlock:
4672 			LockTupleTuplock(relation, tid, mode);
4673 			break;
4674 
4675 		case LockWaitSkip:
4676 			if (!ConditionalLockTupleTuplock(relation, tid, mode))
4677 				return false;
4678 			break;
4679 
4680 		case LockWaitError:
4681 			if (!ConditionalLockTupleTuplock(relation, tid, mode))
4682 				ereport(ERROR,
4683 						(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4684 						 errmsg("could not obtain lock on row in relation \"%s\"",
4685 								RelationGetRelationName(relation))));
4686 			break;
4687 	}
4688 	*have_tuple_lock = true;
4689 
4690 	return true;
4691 }
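
/*
 * Illustrative caller pattern for heap_acquire_tuplock (a sketch only, not a
 * verbatim excerpt; real callers such as heap_lock_tuple wrap this in retry
 * loops and map a "false" result onto their own return codes):
 *
 *		bool	have_tuple_lock = false;
 *
 *		if (!heap_acquire_tuplock(relation, tid, mode, LockWaitSkip,
 *								  &have_tuple_lock))
 *			... react to "lock not available" in a caller-specific way ...
 *		... wait for, or re-examine, the conflicting transaction ...
 *		if (have_tuple_lock)
 *			UnlockTupleTuplock(relation, tid, mode);
 */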
4692 
4693 /*
4694  * Given an original set of Xmax and infomask, and a transaction (identified by
4695  * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
4696  * corresponding infomasks to use on the tuple.
4697  *
4698  * Note that this might have side effects such as creating a new MultiXactId.
4699  *
4700  * Most callers will have called HeapTupleSatisfiesUpdate before this function;
4701  * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
4702  * but it was not running anymore. There is a race condition, which is that the
4703  * MultiXactId may have finished since then, but that uncommon case is handled
4704  * either here, or within MultiXactIdExpand.
4705  *
4706  * There is a similar race condition possible when the old xmax was a regular
4707  * TransactionId.  We test TransactionIdIsInProgress again just to narrow the
4708  * window, but it's still possible to end up creating an unnecessary
4709  * MultiXactId.  Fortunately this is harmless.
4710  */
4711 static void
4712 compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
4713 						  uint16 old_infomask2, TransactionId add_to_xmax,
4714 						  LockTupleMode mode, bool is_update,
4715 						  TransactionId *result_xmax, uint16 *result_infomask,
4716 						  uint16 *result_infomask2)
4717 {
4718 	TransactionId new_xmax;
4719 	uint16		new_infomask,
4720 				new_infomask2;
4721 
4722 	Assert(TransactionIdIsCurrentTransactionId(add_to_xmax));
4723 
4724 l5:
4725 	new_infomask = 0;
4726 	new_infomask2 = 0;
4727 	if (old_infomask & HEAP_XMAX_INVALID)
4728 	{
4729 		/*
4730 		 * No previous locker; we just insert our own TransactionId.
4731 		 *
4732 		 * Note that it's critical that this case be the first one checked,
4733 		 * because there are several blocks below that come back to this one
4734 		 * to implement certain optimizations; old_infomask might contain
4735 		 * other dirty bits in those cases, but we don't really care.
4736 		 */
4737 		if (is_update)
4738 		{
4739 			new_xmax = add_to_xmax;
4740 			if (mode == LockTupleExclusive)
4741 				new_infomask2 |= HEAP_KEYS_UPDATED;
4742 		}
4743 		else
4744 		{
4745 			new_infomask |= HEAP_XMAX_LOCK_ONLY;
4746 			switch (mode)
4747 			{
4748 				case LockTupleKeyShare:
4749 					new_xmax = add_to_xmax;
4750 					new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
4751 					break;
4752 				case LockTupleShare:
4753 					new_xmax = add_to_xmax;
4754 					new_infomask |= HEAP_XMAX_SHR_LOCK;
4755 					break;
4756 				case LockTupleNoKeyExclusive:
4757 					new_xmax = add_to_xmax;
4758 					new_infomask |= HEAP_XMAX_EXCL_LOCK;
4759 					break;
4760 				case LockTupleExclusive:
4761 					new_xmax = add_to_xmax;
4762 					new_infomask |= HEAP_XMAX_EXCL_LOCK;
4763 					new_infomask2 |= HEAP_KEYS_UPDATED;
4764 					break;
4765 				default:
4766 					new_xmax = InvalidTransactionId;	/* silence compiler */
4767 					elog(ERROR, "invalid lock mode");
4768 			}
4769 		}
4770 	}
4771 	else if (old_infomask & HEAP_XMAX_IS_MULTI)
4772 	{
4773 		MultiXactStatus new_status;
4774 
4775 		/*
4776 		 * Currently we don't allow XMAX_COMMITTED to be set for multis, so
4777 		 * cross-check.
4778 		 */
4779 		Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
4780 
4781 		/*
4782 		 * A multixact together with LOCK_ONLY set but neither lock bit set
4783 		 * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
4784 		 * anymore.  This check is critical for databases upgraded by
4785 		 * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
4786 		 * that such multis are never passed.
4787 		 */
4788 		if (HEAP_LOCKED_UPGRADED(old_infomask))
4789 		{
4790 			old_infomask &= ~HEAP_XMAX_IS_MULTI;
4791 			old_infomask |= HEAP_XMAX_INVALID;
4792 			goto l5;
4793 		}
4794 
4795 		/*
4796 		 * If the XMAX is already a MultiXactId, then we need to expand it to
4797 		 * include add_to_xmax; but if all the members were lockers and are
4798 		 * all gone, we can do away with the IS_MULTI bit and just set
4799 		 * add_to_xmax as the only locker/updater.  If all lockers are gone
4800 		 * and we have an updater that aborted, we can also do without a
4801 		 * multi.
4802 		 *
4803 		 * The cost of doing GetMultiXactIdMembers would be paid by
4804 		 * MultiXactIdExpand if we weren't to do this, so this check is not
4805 		 * incurring extra work anyhow.
4806 		 */
4807 		if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
4808 		{
4809 			if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
4810 				!TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax,
4811 																old_infomask)))
4812 			{
4813 				/*
4814 				 * Reset these bits and restart; otherwise fall through to
4815 				 * create a new multi below.
4816 				 */
4817 				old_infomask &= ~HEAP_XMAX_IS_MULTI;
4818 				old_infomask |= HEAP_XMAX_INVALID;
4819 				goto l5;
4820 			}
4821 		}
4822 
4823 		new_status = get_mxact_status_for_lock(mode, is_update);
4824 
4825 		new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
4826 									 new_status);
4827 		GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4828 	}
4829 	else if (old_infomask & HEAP_XMAX_COMMITTED)
4830 	{
4831 		/*
4832 		 * It's a committed update, so we need to preserve it as the updater
4833 		 * of the tuple.
4834 		 */
4835 		MultiXactStatus status;
4836 		MultiXactStatus new_status;
4837 
4838 		if (old_infomask2 & HEAP_KEYS_UPDATED)
4839 			status = MultiXactStatusUpdate;
4840 		else
4841 			status = MultiXactStatusNoKeyUpdate;
4842 
4843 		new_status = get_mxact_status_for_lock(mode, is_update);
4844 
4845 		/*
4846 		 * since it's not running, it's obviously impossible for the old
4847 		 * updater to be identical to the current one, so we need not check
4848 		 * for that case as we do in the block above.
4849 		 */
4850 		new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
4851 		GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4852 	}
4853 	else if (TransactionIdIsInProgress(xmax))
4854 	{
4855 		/*
4856 		 * If the XMAX is a valid, in-progress TransactionId, then we need to
4857 		 * create a new MultiXactId that includes both the old locker or
4858 		 * updater and our own TransactionId.
4859 		 */
4860 		MultiXactStatus new_status;
4861 		MultiXactStatus old_status;
4862 		LockTupleMode old_mode;
4863 
4864 		if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
4865 		{
4866 			if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
4867 				old_status = MultiXactStatusForKeyShare;
4868 			else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
4869 				old_status = MultiXactStatusForShare;
4870 			else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
4871 			{
4872 				if (old_infomask2 & HEAP_KEYS_UPDATED)
4873 					old_status = MultiXactStatusForUpdate;
4874 				else
4875 					old_status = MultiXactStatusForNoKeyUpdate;
4876 			}
4877 			else
4878 			{
4879 				/*
4880 				 * LOCK_ONLY can be present alone only when a page has been
4881 				 * upgraded by pg_upgrade.  But in that case,
4882 				 * TransactionIdIsInProgress() should have returned false.  We
4883 				 * assume it's no longer locked in this case.
4884 				 */
4885 				elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
4886 				old_infomask |= HEAP_XMAX_INVALID;
4887 				old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
4888 				goto l5;
4889 			}
4890 		}
4891 		else
4892 		{
4893 			/* it's an update, but which kind? */
4894 			if (old_infomask2 & HEAP_KEYS_UPDATED)
4895 				old_status = MultiXactStatusUpdate;
4896 			else
4897 				old_status = MultiXactStatusNoKeyUpdate;
4898 		}
4899 
4900 		old_mode = TUPLOCK_from_mxstatus(old_status);
4901 
4902 		/*
4903 		 * If the lock to be acquired is for the same TransactionId as the
4904 		 * existing lock, there's an optimization possible: consider only the
4905 		 * strongest of both locks as the only one present, and restart.
4906 		 */
4907 		if (xmax == add_to_xmax)
4908 		{
4909 			/*
4910 			 * Note that it's not possible for the original tuple to be
4911 			 * updated: we wouldn't be here because the tuple would have been
4912 			 * invisible and we wouldn't try to update it.  As a subtlety,
4913 			 * this code can also run when traversing an update chain to lock
4914 			 * future versions of a tuple.  But we wouldn't be here either,
4915 			 * because the add_to_xmax would be different from the original
4916 			 * updater.
4917 			 */
4918 			Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
4919 
4920 			/* acquire the strongest of both */
4921 			if (mode < old_mode)
4922 				mode = old_mode;
4923 			/* mustn't touch is_update */
4924 
4925 			old_infomask |= HEAP_XMAX_INVALID;
4926 			goto l5;
4927 		}
4928 
4929 		/* otherwise, just fall back to creating a new multixact */
4930 		new_status = get_mxact_status_for_lock(mode, is_update);
4931 		new_xmax = MultiXactIdCreate(xmax, old_status,
4932 									 add_to_xmax, new_status);
4933 		GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4934 	}
4935 	else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
4936 			 TransactionIdDidCommit(xmax))
4937 	{
4938 		/*
4939 		 * It's a committed update, so we must preserve it as the updater of
4940 		 * the tuple.
4941 		 */
4942 		MultiXactStatus status;
4943 		MultiXactStatus new_status;
4944 
4945 		if (old_infomask2 & HEAP_KEYS_UPDATED)
4946 			status = MultiXactStatusUpdate;
4947 		else
4948 			status = MultiXactStatusNoKeyUpdate;
4949 
4950 		new_status = get_mxact_status_for_lock(mode, is_update);
4951 
4952 		/*
4953 		 * since it's not running, it's obviously impossible for the old
4954 		 * updater to be identical to the current one, so we need not check
4955 		 * for that case as we do in the block above.
4956 		 */
4957 		new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
4958 		GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
4959 	}
4960 	else
4961 	{
4962 		/*
4963 		 * Can get here iff the locking/updating transaction was running when
4964 		 * the infomask was extracted from the tuple, but finished before
4965 		 * TransactionIdIsInProgress got to run.  Deal with it as if there was
4966 		 * no locker at all in the first place.
4967 		 */
4968 		old_infomask |= HEAP_XMAX_INVALID;
4969 		goto l5;
4970 	}
4971 
4972 	*result_infomask = new_infomask;
4973 	*result_infomask2 = new_infomask2;
4974 	*result_xmax = new_xmax;
4975 }
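
/*
 * Illustrative outcomes of compute_new_xmax_infomask (a summary sketch of the
 * cases handled above, not an exhaustive list):
 *
 * - No prior locker (HEAP_XMAX_INVALID) and a LockTupleExclusive lock request
 *   that is not an update: the new xmax is add_to_xmax, with
 *   HEAP_XMAX_LOCK_ONLY | HEAP_XMAX_EXCL_LOCK in infomask and
 *   HEAP_KEYS_UPDATED in infomask2.
 * - Prior xmax is a single TransactionId that is still in progress and
 *   different from add_to_xmax: MultiXactIdCreate builds a new MultiXactId
 *   containing both transactions.
 * - Prior xmax is a MultiXactId with members still running: MultiXactIdExpand
 *   adds add_to_xmax to the existing multi.
 */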
4976 
4977 /*
4978  * Subroutine for heap_lock_updated_tuple_rec.
4979  *
4980  * Given a hypothetical multixact status held by the transaction identified
4981  * with the given xid, does the current transaction need to wait, fail, or can
4982  * it continue if it wanted to acquire a lock of the given mode?  "needwait"
4983  * is set to true if waiting is necessary; if it can continue, then TM_Ok is
4984  * returned.  If the lock is already held by the current transaction, return
4985  * TM_SelfModified.  In case of a conflict with another transaction, a
4986  * different HeapTupleSatisfiesUpdate return code is returned.
4987  *
4988  * The held status is said to be hypothetical because it might correspond to a
4989  * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
4990  * way for simplicity of API.
4991  */
4992 static TM_Result
4993 test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
4994 						   LockTupleMode mode, HeapTuple tup,
4995 						   bool *needwait)
4996 {
4997 	MultiXactStatus wantedstatus;
4998 
4999 	*needwait = false;
5000 	wantedstatus = get_mxact_status_for_lock(mode, false);
5001 
5002 	/*
5003 	 * Note: we *must* check TransactionIdIsInProgress before
5004 	 * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c
5005 	 * for an explanation.
5006 	 */
5007 	if (TransactionIdIsCurrentTransactionId(xid))
5008 	{
5009 		/*
5010 		 * The tuple has already been locked by our own transaction.  This is
5011 		 * very rare but can happen if multiple transactions are trying to
5012 		 * lock an ancient version of the same tuple.
5013 		 */
5014 		return TM_SelfModified;
5015 	}
5016 	else if (TransactionIdIsInProgress(xid))
5017 	{
5018 		/*
5019 		 * If the locking transaction is running, what we do depends on
5020 		 * whether the lock modes conflict: if they do, then we must wait for
5021 		 * it to finish; otherwise we can fall through to lock this tuple
5022 		 * version without waiting.
5023 		 */
5024 		if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5025 								LOCKMODE_from_mxstatus(wantedstatus)))
5026 		{
5027 			*needwait = true;
5028 		}
5029 
5030 		/*
5031 		 * If we set needwait above, then this value doesn't matter;
5032 		 * otherwise, this value signals to caller that it's okay to proceed.
5033 		 */
5034 		return TM_Ok;
5035 	}
5036 	else if (TransactionIdDidAbort(xid))
5037 		return TM_Ok;
5038 	else if (TransactionIdDidCommit(xid))
5039 	{
5040 		/*
5041 		 * The other transaction committed.  If it was only a locker, then the
5042 		 * lock is completely gone now and we can return success; but if it
5043 		 * was an update, then what we do depends on whether the two lock
5044 		 * modes conflict.  If they conflict, then we must report error to
5045 		 * caller. But if they don't, we can fall through to allow the current
5046 		 * transaction to lock the tuple.
5047 		 *
5048 		 * Note: the reason we worry about ISUPDATE here is because as soon as
5049 		 * a transaction ends, all its locks are gone and meaningless, and
5050 		 * thus we can ignore them; whereas its updates persist.  In the
5051 		 * TransactionIdIsInProgress case, above, we don't need to check
5052 		 * because we know the lock is still "alive" and thus a conflict needs
5053 		 * always be checked.
5054 		 */
5055 		if (!ISUPDATE_from_mxstatus(status))
5056 			return TM_Ok;
5057 
5058 		if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5059 								LOCKMODE_from_mxstatus(wantedstatus)))
5060 		{
5061 			/* bummer */
5062 			if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid) ||
5063 				HeapTupleHeaderIndicatesMovedPartitions(tup->t_data))
5064 				return TM_Updated;
5065 			else
5066 				return TM_Deleted;
5067 		}
5068 
5069 		return TM_Ok;
5070 	}
5071 
5072 	/* Not in progress, not aborted, not committed -- must have crashed */
5073 	return TM_Ok;
5074 }
5075 
5076 
5077 /*
5078  * Recursive part of heap_lock_updated_tuple
5079  *
5080  * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5081  * xid with the given mode; if this tuple is updated, recurse to lock the new
5082  * version as well.
5083  */
5084 static TM_Result
5085 heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
5086 							LockTupleMode mode)
5087 {
5088 	TM_Result	result;
5089 	ItemPointerData tupid;
5090 	HeapTupleData mytup;
5091 	Buffer		buf;
5092 	uint16		new_infomask,
5093 				new_infomask2,
5094 				old_infomask,
5095 				old_infomask2;
5096 	TransactionId xmax,
5097 				new_xmax;
5098 	TransactionId priorXmax = InvalidTransactionId;
5099 	bool		cleared_all_frozen = false;
5100 	bool		pinned_desired_page;
5101 	Buffer		vmbuffer = InvalidBuffer;
5102 	BlockNumber block;
5103 
5104 	ItemPointerCopy(tid, &tupid);
5105 
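	/*
	 * Each iteration of this loop locks one tuple version: fetch the version
	 * at tupid, wait out or fail on conflicting lockers/updaters, set the new
	 * xmax and infomask, and then follow t_ctid to the next version until the
	 * end of the update chain is reached.
	 */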
5106 	for (;;)
5107 	{
5108 		new_infomask = 0;
5109 		new_xmax = InvalidTransactionId;
5110 		block = ItemPointerGetBlockNumber(&tupid);
5111 		ItemPointerCopy(&tupid, &(mytup.t_self));
5112 
5113 		if (!heap_fetch(rel, SnapshotAny, &mytup, &buf))
5114 		{
5115 			/*
5116 			 * if we fail to find the updated version of the tuple, it's
5117 			 * because it was vacuumed/pruned away after its creator
5118 			 * transaction aborted.  So behave as if we got to the end of the
5119 			 * chain, and there's no further tuple to lock: return success to
5120 			 * caller.
5121 			 */
5122 			result = TM_Ok;
5123 			goto out_unlocked;
5124 		}
5125 
5126 l4:
5127 		CHECK_FOR_INTERRUPTS();
5128 
5129 		/*
5130 		 * Before locking the buffer, pin the visibility map page if it
5131 		 * appears to be necessary.  Since we haven't got the lock yet,
5132 		 * someone else might be in the middle of changing this, so we'll need
5133 		 * to recheck after we have the lock.
5134 		 */
5135 		if (PageIsAllVisible(BufferGetPage(buf)))
5136 		{
5137 			visibilitymap_pin(rel, block, &vmbuffer);
5138 			pinned_desired_page = true;
5139 		}
5140 		else
5141 			pinned_desired_page = false;
5142 
5143 		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5144 
5145 		/*
5146 		 * If we didn't pin the visibility map page and the page has become
5147 		 * all visible while we were busy locking the buffer, we'll have to
5148 		 * unlock and re-lock, to avoid holding the buffer lock across I/O.
5149 		 * That's a bit unfortunate, but hopefully shouldn't happen often.
5150 		 *
5151 		 * Note: in some paths through this function, we will reach here
5152 		 * holding a pin on a vm page that may or may not be the one matching
5153 		 * this page.  If this page isn't all-visible, we won't use the vm
5154 		 * page, but we hold onto such a pin till the end of the function.
5155 		 */
5156 		if (!pinned_desired_page && PageIsAllVisible(BufferGetPage(buf)))
5157 		{
5158 			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5159 			visibilitymap_pin(rel, block, &vmbuffer);
5160 			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5161 		}
5162 
5163 		/*
5164 		 * Check the tuple XMIN against prior XMAX, if any.  If we reached the
5165 		 * end of the chain, we're done, so return success.
5166 		 */
5167 		if (TransactionIdIsValid(priorXmax) &&
5168 			!TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data),
5169 								 priorXmax))
5170 		{
5171 			result = TM_Ok;
5172 			goto out_locked;
5173 		}
5174 
5175 		/*
5176 		 * Also check Xmin: if this tuple was created by an aborted
5177 		 * (sub)transaction, then we already locked the last live one in the
5178 		 * chain, thus we're done, so return success.
5179 		 */
5180 		if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data)))
5181 		{
5182 			result = TM_Ok;
5183 			goto out_locked;
5184 		}
5185 
5186 		old_infomask = mytup.t_data->t_infomask;
5187 		old_infomask2 = mytup.t_data->t_infomask2;
5188 		xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5189 
5190 		/*
5191 		 * If this tuple version has been updated or locked by some concurrent
5192 		 * transaction(s), what we do depends on whether our lock mode
5193 		 * conflicts with what those other transactions hold, and also on the
5194 		 * status of them.
5195 		 */
5196 		if (!(old_infomask & HEAP_XMAX_INVALID))
5197 		{
5198 			TransactionId rawxmax;
5199 			bool		needwait;
5200 
5201 			rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5202 			if (old_infomask & HEAP_XMAX_IS_MULTI)
5203 			{
5204 				int			nmembers;
5205 				int			i;
5206 				MultiXactMember *members;
5207 
5208 				/*
5209 				 * We don't need a test for pg_upgrade'd tuples: this is only
5210 				 * applied to tuples after the first in an update chain.  Said
5211 				 * first tuple in the chain may well be locked-in-9.2-and-
5212 				 * pg_upgraded, but that one was already locked by our caller,
5213 				 * not us; and any subsequent ones cannot be because our
5214 				 * caller must necessarily have obtained a snapshot later than
5215 				 * the pg_upgrade itself.
5216 				 */
5217 				Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5218 
5219 				nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5220 												 HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5221 				for (i = 0; i < nmembers; i++)
5222 				{
5223 					result = test_lockmode_for_conflict(members[i].status,
5224 														members[i].xid,
5225 														mode,
5226 														&mytup,
5227 														&needwait);
5228 
5229 					/*
5230 					 * If the tuple was already locked by ourselves in a
5231 					 * previous iteration of this (say heap_lock_tuple was
5232 					 * forced to restart the locking loop because of a change
5233 					 * in xmax), then we hold the lock already on this tuple
5234 					 * version and we don't need to do anything; and this is
5235 					 * not an error condition either.  We just need to skip
5236 					 * this tuple and continue locking the next version in the
5237 					 * update chain.
5238 					 */
5239 					if (result == TM_SelfModified)
5240 					{
5241 						pfree(members);
5242 						goto next;
5243 					}
5244 
5245 					if (needwait)
5246 					{
5247 						LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5248 						XactLockTableWait(members[i].xid, rel,
5249 										  &mytup.t_self,
5250 										  XLTW_LockUpdated);
5251 						pfree(members);
5252 						goto l4;
5253 					}
5254 					if (result != TM_Ok)
5255 					{
5256 						pfree(members);
5257 						goto out_locked;
5258 					}
5259 				}
5260 				if (members)
5261 					pfree(members);
5262 			}
5263 			else
5264 			{
5265 				MultiXactStatus status;
5266 
5267 				/*
5268 				 * For a non-multi Xmax, we first need to compute the
5269 				 * corresponding MultiXactStatus by using the infomask bits.
5270 				 */
5271 				if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5272 				{
5273 					if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5274 						status = MultiXactStatusForKeyShare;
5275 					else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5276 						status = MultiXactStatusForShare;
5277 					else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5278 					{
5279 						if (old_infomask2 & HEAP_KEYS_UPDATED)
5280 							status = MultiXactStatusForUpdate;
5281 						else
5282 							status = MultiXactStatusForNoKeyUpdate;
5283 					}
5284 					else
5285 					{
5286 						/*
5287 						 * LOCK_ONLY present alone (a pg_upgraded tuple marked
5288 						 * as share-locked in the old cluster) shouldn't be
5289 						 * seen in the middle of an update chain.
5290 						 */
5291 						elog(ERROR, "invalid lock status in tuple");
5292 					}
5293 				}
5294 				else
5295 				{
5296 					/* it's an update, but which kind? */
5297 					if (old_infomask2 & HEAP_KEYS_UPDATED)
5298 						status = MultiXactStatusUpdate;
5299 					else
5300 						status = MultiXactStatusNoKeyUpdate;
5301 				}
5302 
5303 				result = test_lockmode_for_conflict(status, rawxmax, mode,
5304 													&mytup, &needwait);
5305 
5306 				/*
5307 				 * If the tuple was already locked by ourselves in a previous
5308 				 * iteration of this (say heap_lock_tuple was forced to
5309 				 * restart the locking loop because of a change in xmax), then
5310 				 * we hold the lock already on this tuple version and we don't
5311 				 * need to do anything; and this is not an error condition
5312 				 * either.  We just need to skip this tuple and continue
5313 				 * locking the next version in the update chain.
5314 				 */
5315 				if (result == TM_SelfModified)
5316 					goto next;
5317 
5318 				if (needwait)
5319 				{
5320 					LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5321 					XactLockTableWait(rawxmax, rel, &mytup.t_self,
5322 									  XLTW_LockUpdated);
5323 					goto l4;
5324 				}
5325 				if (result != TM_Ok)
5326 				{
5327 					goto out_locked;
5328 				}
5329 			}
5330 		}
5331 
5332 		/* compute the new Xmax and infomask values for the tuple ... */
5333 		compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
5334 								  xid, mode, false,
5335 								  &new_xmax, &new_infomask, &new_infomask2);
5336 
5337 		if (PageIsAllVisible(BufferGetPage(buf)) &&
5338 			visibilitymap_clear(rel, block, vmbuffer,
5339 								VISIBILITYMAP_ALL_FROZEN))
5340 			cleared_all_frozen = true;
5341 
5342 		START_CRIT_SECTION();
5343 
5344 		/* ... and set them */
5345 		HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
5346 		mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
5347 		mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5348 		mytup.t_data->t_infomask |= new_infomask;
5349 		mytup.t_data->t_infomask2 |= new_infomask2;
5350 
5351 		MarkBufferDirty(buf);
5352 
5353 		/* XLOG stuff */
5354 		if (RelationNeedsWAL(rel))
5355 		{
5356 			xl_heap_lock_updated xlrec;
5357 			XLogRecPtr	recptr;
5358 			Page		page = BufferGetPage(buf);
5359 
5360 			XLogBeginInsert();
5361 			XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
5362 
5363 			xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
5364 			xlrec.xmax = new_xmax;
5365 			xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2);
5366 			xlrec.flags =
5367 				cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5368 
5369 			XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated);
5370 
5371 			recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED);
5372 
5373 			PageSetLSN(page, recptr);
5374 		}
5375 
5376 		END_CRIT_SECTION();
5377 
5378 next:
5379 		/* if we find the end of the update chain, we're done. */
5380 		if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
5381 			HeapTupleHeaderIndicatesMovedPartitions(mytup.t_data) ||
5382 			ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
5383 			HeapTupleHeaderIsOnlyLocked(mytup.t_data))
5384 		{
5385 			result = TM_Ok;
5386 			goto out_locked;
5387 		}
5388 
5389 		/* tail recursion */
5390 		priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data);
5391 		ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
5392 		UnlockReleaseBuffer(buf);
5393 	}
5394 
5395 	result = TM_Ok;
5396 
5397 out_locked:
5398 	UnlockReleaseBuffer(buf);
5399 
5400 out_unlocked:
5401 	if (vmbuffer != InvalidBuffer)
5402 		ReleaseBuffer(vmbuffer);
5403 
5404 	return result;
5405 }
5406 
5407 /*
5408  * heap_lock_updated_tuple
5409  *		Follow update chain when locking an updated tuple, acquiring locks (row
5410  *		marks) on the updated versions.
5411  *
5412  * The initial tuple is assumed to be already locked.
5413  *
5414  * This function doesn't check visibility, it just unconditionally marks the
5415  * tuple(s) as locked.  If any tuple in the updated chain is being deleted
5416  * concurrently (or updated with the key being modified), sleep until the
5417  * transaction doing it is finished.
5418  *
5419  * Note that we don't acquire heavyweight tuple locks on the tuples we walk
5420  * when we have to wait for other transactions to release them, as opposed to
5421  * what heap_lock_tuple does.  The reason is that having more than one
5422  * transaction walking the chain is probably uncommon enough that starvation
5423  * is unlikely: one of the preconditions for being here is that
5424  * the snapshot in use predates the update that created this tuple (because we
5425  * started at an earlier version of the tuple), but at the same time such a
5426  * transaction cannot be using repeatable read or serializable isolation
5427  * levels, because that would lead to a serializability failure.
5428  */
5429 static TM_Result
5430 heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
5431 						TransactionId xid, LockTupleMode mode)
5432 {
5433 	/*
5434 	 * If the tuple has not been updated, or has moved into another partition
5435 	 * (effectively a delete), stop here.
5436 	 */
5437 	if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) &&
5438 		!ItemPointerEquals(&tuple->t_self, ctid))
5439 	{
5440 		/*
5441 		 * If this is the first possibly-multixact-able operation in the
5442 		 * current transaction, set my per-backend OldestMemberMXactId
5443 		 * setting. We can be certain that the transaction will never become a
5444 		 * member of any older MultiXactIds than that.  (We have to do this
5445 		 * even if we end up just using our own TransactionId below, since
5446 		 * some other backend could incorporate our XID into a MultiXact
5447 		 * immediately afterwards.)
5448 		 */
5449 		MultiXactIdSetOldestMember();
5450 
5451 		return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
5452 	}
5453 
5454 	/* nothing to lock */
5455 	return TM_Ok;
5456 }
5457 
5458 /*
5459  *	heap_finish_speculative - mark speculative insertion as successful
5460  *
5461  * To successfully finish a speculative insertion we have to clear the
5462  * speculative token from the tuple.  To do so, the t_ctid field, which
5463  * contains the speculative token value, is modified in place to point to the
5464  * tuple itself, which is characteristic of a newly inserted ordinary tuple.
5465  *
5466  * NB: It is not ok to commit without either finishing or aborting a
5467  * speculative insertion.  We could treat speculative tuples of committed
5468  * transactions implicitly as completed, but then we would have to be prepared
5469  * to deal with speculative tokens on committed tuples.  That wouldn't be
5470  * difficult - no-one looks at the ctid field of a tuple with invalid xmax -
5471  * but clearing the token at completion isn't very expensive either.
5472  * An explicit confirmation WAL record also makes logical decoding simpler.
5473  */
5474 void
5475 heap_finish_speculative(Relation relation, ItemPointer tid)
5476 {
5477 	Buffer		buffer;
5478 	Page		page;
5479 	OffsetNumber offnum;
5480 	ItemId		lp = NULL;
5481 	HeapTupleHeader htup;
5482 
5483 	buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
5484 	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5485 	page = (Page) BufferGetPage(buffer);
5486 
5487 	offnum = ItemPointerGetOffsetNumber(tid);
5488 	if (PageGetMaxOffsetNumber(page) >= offnum)
5489 		lp = PageGetItemId(page, offnum);
5490 
5491 	if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
5492 		elog(ERROR, "invalid lp");
5493 
5494 	htup = (HeapTupleHeader) PageGetItem(page, lp);
5495 
5496 	/* SpecTokenOffsetNumber should be distinguishable from any real offset */
5497 	StaticAssertStmt(MaxOffsetNumber < SpecTokenOffsetNumber,
5498 					 "invalid speculative token constant");
5499 
5500 	/* NO EREPORT(ERROR) from here till changes are logged */
5501 	START_CRIT_SECTION();
5502 
5503 	Assert(HeapTupleHeaderIsSpeculative(htup));
5504 
5505 	MarkBufferDirty(buffer);
5506 
5507 	/*
5508 	 * Replace the speculative insertion token with a real t_ctid, pointing to
5509 	 * itself like it does on regular tuples.
5510 	 */
5511 	htup->t_ctid = *tid;
5512 
5513 	/* XLOG stuff */
5514 	if (RelationNeedsWAL(relation))
5515 	{
5516 		xl_heap_confirm xlrec;
5517 		XLogRecPtr	recptr;
5518 
5519 		xlrec.offnum = ItemPointerGetOffsetNumber(tid);
5520 
5521 		XLogBeginInsert();
5522 
5523 		/* We want the same filtering on this as on a plain insert */
5524 		XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
5525 
5526 		XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm);
5527 		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5528 
5529 		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM);
5530 
5531 		PageSetLSN(page, recptr);
5532 	}
5533 
5534 	END_CRIT_SECTION();
5535 
5536 	UnlockReleaseBuffer(buffer);
5537 }
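
/*
 * Rough lifecycle of a speculative insertion, tying this function to
 * heap_abort_speculative below (a sketch only; the callers and exact flags
 * involved are outside this file and described here only loosely):
 *
 *		1. insert the tuple with a speculative token stored in t_ctid
 *		2. check for conflicts (e.g. a would-be duplicate key value)
 *		3. on success, call heap_finish_speculative() to replace the token
 *		   with a normal self-pointing t_ctid
 *		4. on conflict, call heap_abort_speculative() to kill the tuple
 */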
5538 
5539 /*
5540  *	heap_abort_speculative - kill a speculatively inserted tuple
5541  *
5542  * Marks a tuple that was speculatively inserted in the same command as dead,
5543  * by setting its xmin as invalid.  That makes it immediately appear as dead
5544  * to all transactions, including our own.  In particular, it makes
5545  * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend
5546  * inserting a duplicate key value won't unnecessarily wait for our whole
5547  * transaction to finish (it'll just wait for our speculative insertion to
5548  * finish).
5549  *
5550  * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks
5551  * that arise due to a mutual dependency that is not user visible.  By
5552  * definition, unprincipled deadlocks cannot be prevented by the user
5553  * reordering lock acquisition in client code, because the implementation level
5554  * lock acquisitions are not under the user's direct control.  If speculative
5555  * inserters did not take this precaution, then under high concurrency they
5556  * could deadlock with each other, which would not be acceptable.
5557  *
5558  * This is somewhat redundant with heap_delete, but we prefer to have a
5559  * dedicated routine with stripped down requirements.  Note that this is also
5560  * used to delete the TOAST tuples created during speculative insertion.
5561  *
5562  * This routine does not affect logical decoding as it only looks at
5563  * confirmation records.
5564  */
5565 void
5566 heap_abort_speculative(Relation relation, ItemPointer tid)
5567 {
5568 	TransactionId xid = GetCurrentTransactionId();
5569 	ItemId		lp;
5570 	HeapTupleData tp;
5571 	Page		page;
5572 	BlockNumber block;
5573 	Buffer		buffer;
5574 	TransactionId prune_xid;
5575 
5576 	Assert(ItemPointerIsValid(tid));
5577 
5578 	block = ItemPointerGetBlockNumber(tid);
5579 	buffer = ReadBuffer(relation, block);
5580 	page = BufferGetPage(buffer);
5581 
5582 	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5583 
5584 	/*
5585 	 * Page can't be all visible, we just inserted into it, and are still
5586 	 * running.
5587 	 */
5588 	Assert(!PageIsAllVisible(page));
5589 
5590 	lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
5591 	Assert(ItemIdIsNormal(lp));
5592 
5593 	tp.t_tableOid = RelationGetRelid(relation);
5594 	tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
5595 	tp.t_len = ItemIdGetLength(lp);
5596 	tp.t_self = *tid;
5597 
5598 	/*
5599 	 * Sanity check that the tuple really is a speculatively inserted tuple,
5600 	 * inserted by us.
5601 	 */
5602 	if (tp.t_data->t_choice.t_heap.t_xmin != xid)
5603 		elog(ERROR, "attempted to kill a tuple inserted by another transaction");
5604 	if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
5605 		elog(ERROR, "attempted to kill a non-speculative tuple");
5606 	Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data));
5607 
5608 	/*
5609 	 * No need to check for serializable conflicts here.  There is never a
5610 	 * need for a combocid, either.  No need to extract replica identity, or
5611 	 * do anything special with infomask bits.
5612 	 */
5613 
5614 	START_CRIT_SECTION();
5615 
5616 	/*
5617 	 * The tuple will become DEAD immediately.  Flag that this page is a
5618 	 * candidate for pruning by setting xmin to TransactionXmin. While not
5619 	 * immediately prunable, it is the oldest xid we can cheaply determine
5620 	 * that's safe against wraparound / being older than the table's
5621 	 * relfrozenxid.  To defend against the unlikely case of a new relation
5622 	 * having a newer relfrozenxid than our TransactionXmin, use relfrozenxid
5623 	 * if so (vacuum can't subsequently move relfrozenxid to beyond
5624 	 * TransactionXmin, so there's no race here).
5625 	 */
5626 	Assert(TransactionIdIsValid(TransactionXmin));
5627 	if (TransactionIdPrecedes(TransactionXmin, relation->rd_rel->relfrozenxid))
5628 		prune_xid = relation->rd_rel->relfrozenxid;
5629 	else
5630 		prune_xid = TransactionXmin;
5631 	PageSetPrunable(page, prune_xid);
5632 
5633 	/* store transaction information of xact deleting the tuple */
5634 	tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
5635 	tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5636 
5637 	/*
5638 	 * Set the tuple header xmin to InvalidTransactionId.  This makes the
5639 	 * tuple immediately invisible to everyone.  (In particular, to any
5640 	 * transactions waiting on the speculative token, woken up later.)
5641 	 */
5642 	HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId);
5643 
5644 	/* Clear the speculative insertion token too */
5645 	tp.t_data->t_ctid = tp.t_self;
5646 
5647 	MarkBufferDirty(buffer);
5648 
5649 	/*
5650 	 * XLOG stuff
5651 	 *
5652 	 * The WAL records generated here match heap_delete().  The same recovery
5653 	 * routines are used.
5654 	 */
5655 	if (RelationNeedsWAL(relation))
5656 	{
5657 		xl_heap_delete xlrec;
5658 		XLogRecPtr	recptr;
5659 
5660 		xlrec.flags = XLH_DELETE_IS_SUPER;
5661 		xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
5662 											  tp.t_data->t_infomask2);
5663 		xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
5664 		xlrec.xmax = xid;
5665 
5666 		XLogBeginInsert();
5667 		XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
5668 		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5669 
5670 		/* No replica identity & replication origin logged */
5671 
5672 		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
5673 
5674 		PageSetLSN(page, recptr);
5675 	}
5676 
5677 	END_CRIT_SECTION();
5678 
5679 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5680 
5681 	if (HeapTupleHasExternal(&tp))
5682 	{
5683 		Assert(!IsToastRelation(relation));
5684 		heap_toast_delete(relation, &tp, true);
5685 	}
5686 
5687 	/*
5688 	 * Never need to mark tuple for invalidation, since catalogs don't support
5689 	 * speculative insertion
5690 	 */
5691 
5692 	/* Now we can release the buffer */
5693 	ReleaseBuffer(buffer);
5694 
5695 	/* count deletion, as we counted the insertion too */
5696 	pgstat_count_heap_delete(relation);
5697 }
5698 
5699 /*
5700  * heap_inplace_update - update a tuple "in place" (ie, overwrite it)
5701  *
5702  * Overwriting violates both MVCC and transactional safety, so the uses
5703  * of this function in Postgres are extremely limited.  Nonetheless we
5704  * find some places to use it.
5705  *
5706  * The tuple cannot change size, and therefore it's reasonable to assume
5707  * that its null bitmap (if any) doesn't change either.  So we just
5708  * overwrite the data portion of the tuple without touching the null
5709  * bitmap or any of the header fields.
5710  *
5711  * tuple is an in-memory tuple structure containing the data to be written
5712  * over the target tuple.  Also, tuple->t_self identifies the target tuple.
5713  */
5714 void
5715 heap_inplace_update(Relation relation, HeapTuple tuple)
5716 {
5717 	Buffer		buffer;
5718 	Page		page;
5719 	OffsetNumber offnum;
5720 	ItemId		lp = NULL;
5721 	HeapTupleHeader htup;
5722 	uint32		oldlen;
5723 	uint32		newlen;
5724 
5725 	/*
5726 	 * For now, we don't allow parallel updates.  Unlike a regular update,
5727 	 * this should never create a combo CID, so it might be possible to relax
5728 	 * this restriction, but not without more thought and testing.  It's not
5729 	 * clear that it would be useful, anyway.
5730 	 */
5731 	if (IsInParallelMode())
5732 		ereport(ERROR,
5733 				(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
5734 				 errmsg("cannot update tuples during a parallel operation")));
5735 
5736 	buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
5737 	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5738 	page = (Page) BufferGetPage(buffer);
5739 
5740 	offnum = ItemPointerGetOffsetNumber(&(tuple->t_self));
5741 	if (PageGetMaxOffsetNumber(page) >= offnum)
5742 		lp = PageGetItemId(page, offnum);
5743 
5744 	if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
5745 		elog(ERROR, "invalid lp");
5746 
5747 	htup = (HeapTupleHeader) PageGetItem(page, lp);
5748 
5749 	oldlen = ItemIdGetLength(lp) - htup->t_hoff;
5750 	newlen = tuple->t_len - tuple->t_data->t_hoff;
5751 	if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
5752 		elog(ERROR, "wrong tuple length");
5753 
5754 	/* NO EREPORT(ERROR) from here till changes are logged */
5755 	START_CRIT_SECTION();
5756 
5757 	memcpy((char *) htup + htup->t_hoff,
5758 		   (char *) tuple->t_data + tuple->t_data->t_hoff,
5759 		   newlen);
5760 
5761 	MarkBufferDirty(buffer);
5762 
5763 	/* XLOG stuff */
5764 	if (RelationNeedsWAL(relation))
5765 	{
5766 		xl_heap_inplace xlrec;
5767 		XLogRecPtr	recptr;
5768 
5769 		xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5770 
5771 		XLogBeginInsert();
5772 		XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
5773 
5774 		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5775 		XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
5776 
5777 		/* inplace updates aren't decoded atm, don't log the origin */
5778 
5779 		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
5780 
5781 		PageSetLSN(page, recptr);
5782 	}
5783 
5784 	END_CRIT_SECTION();
5785 
5786 	UnlockReleaseBuffer(buffer);
5787 
5788 	/*
5789 	 * Send out shared cache inval if necessary.  Note that because we only
5790 	 * pass the new version of the tuple, this mustn't be used for any
5791 	 * operations that could change catcache lookup keys.  But we aren't
5792 	 * bothering with index updates either, so that's true a fortiori.
5793 	 */
5794 	if (!IsBootstrapProcessingMode())
5795 		CacheInvalidateHeapTuple(relation, tuple, NULL);
5796 }
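
/*
 * In practice this facility is used to overwrite fixed-width fields of
 * catalog tuples; VACUUM's updates of pg_class statistics are one example.
 * That usage pattern is what makes the "tuple cannot change size"
 * restriction above workable.
 */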
5797 
5798 #define		FRM_NOOP				0x0001
5799 #define		FRM_INVALIDATE_XMAX		0x0002
5800 #define		FRM_RETURN_IS_XID		0x0004
5801 #define		FRM_RETURN_IS_MULTI		0x0008
5802 #define		FRM_MARK_COMMITTED		0x0010
5803 
5804 /*
5805  * FreezeMultiXactId
5806  *		Determine what to do during freezing when a tuple is marked by a
5807  *		MultiXactId.
5808  *
5809  * NB -- this might have the side-effect of creating a new MultiXactId!
5810  *
5811  * "flags" is an output value; it's used to tell caller what to do on return.
5812  * Possible flags are:
5813  * FRM_NOOP
5814  *		don't do anything -- keep existing Xmax
5815  * FRM_INVALIDATE_XMAX
5816  *		mark Xmax as InvalidTransactionId and set XMAX_INVALID flag.
5817  * FRM_RETURN_IS_XID
5818  *		The Xid return value is a single update Xid to set as xmax.
5819  * FRM_MARK_COMMITTED
5820  *		Xmax can be marked as HEAP_XMAX_COMMITTED
5821  * FRM_RETURN_IS_MULTI
5822  *		The return value is a new MultiXactId to set as new Xmax.
5823  *		(caller must obtain proper infomask bits using GetMultiXactIdHintBits)
5824  */
5825 static TransactionId
5826 FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
5827 				  TransactionId relfrozenxid, TransactionId relminmxid,
5828 				  TransactionId cutoff_xid, MultiXactId cutoff_multi,
5829 				  uint16 *flags)
5830 {
5831 	TransactionId xid = InvalidTransactionId;
5832 	int			i;
5833 	MultiXactMember *members;
5834 	int			nmembers;
5835 	bool		need_replace;
5836 	int			nnewmembers;
5837 	MultiXactMember *newmembers;
5838 	bool		has_lockers;
5839 	TransactionId update_xid;
5840 	bool		update_committed;
5841 
5842 	*flags = 0;
5843 
5844 	/* We should only be called in Multis */
5845 	Assert(t_infomask & HEAP_XMAX_IS_MULTI);
5846 
5847 	if (!MultiXactIdIsValid(multi) ||
5848 		HEAP_LOCKED_UPGRADED(t_infomask))
5849 	{
5850 		/* Ensure infomask bits are appropriately set/reset */
5851 		*flags |= FRM_INVALIDATE_XMAX;
5852 		return InvalidTransactionId;
5853 	}
5854 	else if (MultiXactIdPrecedes(multi, relminmxid))
5855 		ereport(ERROR,
5856 				(errcode(ERRCODE_DATA_CORRUPTED),
5857 				 errmsg_internal("found multixact %u from before relminmxid %u",
5858 								 multi, relminmxid)));
5859 	else if (MultiXactIdPrecedes(multi, cutoff_multi))
5860 	{
5861 		/*
5862 		 * This old multi cannot possibly have members still running, but
5863 		 * verify just in case.  If it was a locker only, it can be removed
5864 		 * without any further consideration; but if it contained an update,
5865 		 * we might need to preserve it.
5866 		 */
5867 		if (MultiXactIdIsRunning(multi,
5868 								 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)))
5869 			ereport(ERROR,
5870 					(errcode(ERRCODE_DATA_CORRUPTED),
5871 					 errmsg_internal("multixact %u from before cutoff %u found to be still running",
5872 									 multi, cutoff_multi)));
5873 
5874 		if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
5875 		{
5876 			*flags |= FRM_INVALIDATE_XMAX;
5877 			xid = InvalidTransactionId; /* not strictly necessary */
5878 		}
5879 		else
5880 		{
5881 			/* replace multi by update xid */
5882 			xid = MultiXactIdGetUpdateXid(multi, t_infomask);
5883 
5884 			/* wasn't only a lock, xid needs to be valid */
5885 			Assert(TransactionIdIsValid(xid));
5886 
5887 			if (TransactionIdPrecedes(xid, relfrozenxid))
5888 				ereport(ERROR,
5889 						(errcode(ERRCODE_DATA_CORRUPTED),
5890 						 errmsg_internal("found update xid %u from before relfrozenxid %u",
5891 										 xid, relfrozenxid)));
5892 
5893 			/*
5894 			 * If the xid is older than the cutoff, it has to have aborted,
5895 			 * otherwise the tuple would have gotten pruned away.
5896 			 */
5897 			if (TransactionIdPrecedes(xid, cutoff_xid))
5898 			{
5899 				if (TransactionIdDidCommit(xid))
5900 					ereport(ERROR,
5901 							(errcode(ERRCODE_DATA_CORRUPTED),
5902 							 errmsg_internal("cannot freeze committed update xid %u", xid)));
5903 				*flags |= FRM_INVALIDATE_XMAX;
5904 				xid = InvalidTransactionId; /* not strictly necessary */
5905 			}
5906 			else
5907 			{
5908 				*flags |= FRM_RETURN_IS_XID;
5909 			}
5910 		}
5911 
5912 		return xid;
5913 	}
5914 
5915 	/*
5916 	 * This multixact might have or might not have members still running, but
5917 	 * we know it's valid and is newer than the cutoff point for multis.
5918 	 * However, some member(s) of it may be below the cutoff for Xids, so we
5919 	 * need to walk the whole members array to figure out what to do, if
5920 	 * anything.
5921 	 */
5922 
5923 	nmembers =
5924 		GetMultiXactIdMembers(multi, &members, false,
5925 							  HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
5926 	if (nmembers <= 0)
5927 	{
5928 		/* Nothing worth keeping */
5929 		*flags |= FRM_INVALIDATE_XMAX;
5930 		return InvalidTransactionId;
5931 	}
5932 
5933 	/* is there anything older than the cutoff? */
5934 	need_replace = false;
5935 	for (i = 0; i < nmembers; i++)
5936 	{
5937 		if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
5938 		{
5939 			need_replace = true;
5940 			break;
5941 		}
5942 	}
5943 
5944 	/*
5945 	 * In the simplest case, there is no member older than the cutoff; we can
5946 	 * keep the existing MultiXactId as is.
5947 	 */
5948 	if (!need_replace)
5949 	{
5950 		*flags |= FRM_NOOP;
5951 		pfree(members);
5952 		return InvalidTransactionId;
5953 	}
5954 
5955 	/*
5956 	 * If the multi needs to be updated, figure out which members we need to
5957 	 * keep.
5958 	 */
5959 	nnewmembers = 0;
5960 	newmembers = palloc(sizeof(MultiXactMember) * nmembers);
5961 	has_lockers = false;
5962 	update_xid = InvalidTransactionId;
5963 	update_committed = false;
5964 
5965 	for (i = 0; i < nmembers; i++)
5966 	{
5967 		/*
5968 		 * Determine whether to keep this member or ignore it.
5969 		 */
5970 		if (ISUPDATE_from_mxstatus(members[i].status))
5971 		{
5972 			TransactionId xid = members[i].xid;
5973 
5974 			Assert(TransactionIdIsValid(xid));
5975 			if (TransactionIdPrecedes(xid, relfrozenxid))
5976 				ereport(ERROR,
5977 						(errcode(ERRCODE_DATA_CORRUPTED),
5978 						 errmsg_internal("found update xid %u from before relfrozenxid %u",
5979 										 xid, relfrozenxid)));
5980 
5981 			/*
5982 			 * It's an update; should we keep it?  If the transaction is known
5983 			 * aborted or crashed then it's okay to ignore it, otherwise not.
5984 			 * Note that an updater older than cutoff_xid cannot possibly be
5985 			 * committed, because HeapTupleSatisfiesVacuum would have returned
5986 			 * HEAPTUPLE_DEAD and we would not be trying to freeze the tuple.
5987 			 *
5988 			 * As with all tuple visibility routines, it's critical to test
5989 			 * TransactionIdIsInProgress before TransactionIdDidCommit,
5990 			 * because of race conditions explained in detail in
5991 			 * heapam_visibility.c.
5992 			 */
5993 			if (TransactionIdIsCurrentTransactionId(xid) ||
5994 				TransactionIdIsInProgress(xid))
5995 			{
5996 				Assert(!TransactionIdIsValid(update_xid));
5997 				update_xid = xid;
5998 			}
5999 			else if (TransactionIdDidCommit(xid))
6000 			{
6001 				/*
6002 				 * The transaction committed, so we can tell caller to set
6003 				 * HEAP_XMAX_COMMITTED.  (We can only do this because we know
6004 				 * the transaction is not running.)
6005 				 */
6006 				Assert(!TransactionIdIsValid(update_xid));
6007 				update_committed = true;
6008 				update_xid = xid;
6009 			}
6010 			else
6011 			{
6012 				/*
6013 				 * Not in progress, not committed -- must be aborted or
6014 				 * crashed; we can ignore it.
6015 				 */
6016 			}
6017 
6018 			/*
6019 			 * Since the tuple wasn't marked HEAPTUPLE_DEAD by vacuum, the
6020 			 * update Xid cannot possibly be older than the xid cutoff. The
6021 			 * presence of such a tuple would cause corruption, so be paranoid
6022 			 * and check.
6023 			 */
6024 			if (TransactionIdIsValid(update_xid) &&
6025 				TransactionIdPrecedes(update_xid, cutoff_xid))
6026 				ereport(ERROR,
6027 						(errcode(ERRCODE_DATA_CORRUPTED),
6028 						 errmsg_internal("found update xid %u from before xid cutoff %u",
6029 										 update_xid, cutoff_xid)));
6030 
6031 			/*
6032 			 * If we determined that it's an Xid corresponding to an update
6033 			 * that must be retained, additionally add it to the list of
6034 			 * members of the new Multi, in case we end up using that.  (We
6035 			 * might still decide to use only an update Xid and not a multi,
6036 			 * but it's easier to maintain the list as we walk the old members
6037 			 * list.)
6038 			 */
6039 			if (TransactionIdIsValid(update_xid))
6040 				newmembers[nnewmembers++] = members[i];
6041 		}
6042 		else
6043 		{
6044 			/* We only keep lockers if they are still running */
6045 			if (TransactionIdIsCurrentTransactionId(members[i].xid) ||
6046 				TransactionIdIsInProgress(members[i].xid))
6047 			{
6048 				/* running locker cannot possibly be older than the cutoff */
6049 				Assert(!TransactionIdPrecedes(members[i].xid, cutoff_xid));
6050 				newmembers[nnewmembers++] = members[i];
6051 				has_lockers = true;
6052 			}
6053 		}
6054 	}
6055 
6056 	pfree(members);
6057 
6058 	if (nnewmembers == 0)
6059 	{
6060 		/* nothing worth keeping!? Tell caller to remove the whole thing */
6061 		*flags |= FRM_INVALIDATE_XMAX;
6062 		xid = InvalidTransactionId;
6063 	}
6064 	else if (TransactionIdIsValid(update_xid) && !has_lockers)
6065 	{
6066 		/*
6067 		 * If there's a single member and it's an update, pass it back alone
6068 		 * without creating a new Multi.  (XXX we could do this when there's a
6069 		 * single remaining locker, too, but that would complicate the API too
6070 		 * much; moreover, the case with the single updater is more
6071 		 * interesting, because those are longer-lived.)
6072 		 */
6073 		Assert(nnewmembers == 1);
6074 		*flags |= FRM_RETURN_IS_XID;
6075 		if (update_committed)
6076 			*flags |= FRM_MARK_COMMITTED;
6077 		xid = update_xid;
6078 	}
6079 	else
6080 	{
6081 		/*
6082 		 * Create a new multixact with the surviving members of the previous
6083 		 * one, to set as new Xmax in the tuple.
6084 		 */
6085 		xid = MultiXactIdCreateFromMembers(nnewmembers, newmembers);
6086 		*flags |= FRM_RETURN_IS_MULTI;
6087 	}
6088 
6089 	pfree(newmembers);
6090 
6091 	return xid;
6092 }
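
/*
 * Some illustrative FreezeMultiXactId outcomes (a summary sketch of the cases
 * handled above, not additional behavior):
 *
 * - no member precedes cutoff_xid: FRM_NOOP, keep the existing multi as is
 * - nothing worth keeping (lockers all gone, updater aborted or crashed):
 *   FRM_INVALIDATE_XMAX
 * - a surviving updater and no live lockers: FRM_RETURN_IS_XID, plus
 *   FRM_MARK_COMMITTED if that updater committed, so the bare update Xid
 *   becomes the new xmax
 * - otherwise: FRM_RETURN_IS_MULTI, with a new MultiXactId built from the
 *   surviving members
 */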
6093 
6094 /*
6095  * heap_prepare_freeze_tuple
6096  *
6097  * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
6098  * are older than the specified cutoff XID and cutoff MultiXactId.  If so,
6099  * set up enough state (in the *frz output argument) to later execute and
6100  * WAL-log what we would need to do, and return true.  Return false if nothing
6101  * is to be changed.  In addition, set *totally_frozen_p to true if the tuple
6102  * will be totally frozen after these operations are performed and false if
6103  * more freezing will eventually be required.
6104  *
6105  * Caller is responsible for setting the offset field, if appropriate.
6106  *
6107  * It is assumed that the caller has checked the tuple with
6108  * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
6109  * (else we should be removing the tuple, not freezing it).
6110  *
6111  * NB: cutoff_xid *must* be <= the current global xmin, to ensure that any
6112  * XID older than it could neither be running nor seen as running by any
6113  * open transaction.  This ensures that the replacement will not change
6114  * anyone's idea of the tuple state.
6115  * Similarly, cutoff_multi must be less than or equal to the smallest
6116  * MultiXactId used by any transaction currently open.
6117  *
6118  * If the tuple is in a shared buffer, caller must hold an exclusive lock on
6119  * that buffer.
6120  *
6121  * NB: It is not enough to set hint bits to indicate something is
6122  * committed/invalid -- they might not be set on a standby, or after crash
6123  * recovery.  We really need to remove old xids.
6124  */
6125 bool
6126 heap_prepare_freeze_tuple(HeapTupleHeader tuple,
6127 						  TransactionId relfrozenxid, TransactionId relminmxid,
6128 						  TransactionId cutoff_xid, TransactionId cutoff_multi,
6129 						  xl_heap_freeze_tuple *frz, bool *totally_frozen_p)
6130 {
6131 	bool		changed = false;
6132 	bool		xmax_already_frozen = false;
6133 	bool		xmin_frozen;
6134 	bool		freeze_xmax;
6135 	TransactionId xid;
6136 
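	/*
	 * Start the freeze plan from the tuple's current header state; the fields
	 * copied here are adjusted below as we decide what, if anything, needs to
	 * change.
	 */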
6137 	frz->frzflags = 0;
6138 	frz->t_infomask2 = tuple->t_infomask2;
6139 	frz->t_infomask = tuple->t_infomask;
6140 	frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
6141 
6142 	/*
6143 	 * Process xmin.  xmin_frozen has two slightly different meanings: in the
6144 	 * !XidIsNormal case, it means "the xmin doesn't need any freezing" (it's
6145 	 * already a permanent value), while in the block below it is set true to
6146 	 * mean "xmin won't need freezing after what we do to it here" (false
6147 	 * otherwise).  In both cases we're allowed to set totally_frozen, as far
6148 	 * as xmin is concerned.
6149 	 */
6150 	xid = HeapTupleHeaderGetXmin(tuple);
6151 	if (!TransactionIdIsNormal(xid))
6152 		xmin_frozen = true;
6153 	else
6154 	{
6155 		if (TransactionIdPrecedes(xid, relfrozenxid))
6156 			ereport(ERROR,
6157 					(errcode(ERRCODE_DATA_CORRUPTED),
6158 					 errmsg_internal("found xmin %u from before relfrozenxid %u",
6159 									 xid, relfrozenxid)));
6160 
6161 		xmin_frozen = TransactionIdPrecedes(xid, cutoff_xid);
6162 		if (xmin_frozen)
6163 		{
6164 			if (!TransactionIdDidCommit(xid))
6165 				ereport(ERROR,
6166 						(errcode(ERRCODE_DATA_CORRUPTED),
6167 						 errmsg_internal("uncommitted xmin %u from before xid cutoff %u needs to be frozen",
6168 										 xid, cutoff_xid)));
6169 
6170 			frz->t_infomask |= HEAP_XMIN_FROZEN;
6171 			changed = true;
6172 		}
6173 	}
6174 
6175 	/*
6176 	 * Process xmax.  To thoroughly examine the current Xmax value we need to
6177 	 * resolve a MultiXactId to its member Xids, in case some of them are
6178 	 * below the given cutoff for Xids.  In that case, those values might need
6179 	 * freezing, too.  Also, if a multi needs freezing, we cannot simply take
6180 	 * it out --- if there's a live updater Xid, it needs to be kept.
6181 	 *
6182 	 * Make sure to keep heap_tuple_needs_freeze in sync with this.
6183 	 */
6184 	xid = HeapTupleHeaderGetRawXmax(tuple);
6185 
6186 	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
6187 	{
6188 		TransactionId newxmax;
6189 		uint16		flags;
6190 
6191 		newxmax = FreezeMultiXactId(xid, tuple->t_infomask,
6192 									relfrozenxid, relminmxid,
6193 									cutoff_xid, cutoff_multi, &flags);
6194 
6195 		freeze_xmax = (flags & FRM_INVALIDATE_XMAX);
6196 
6197 		if (flags & FRM_RETURN_IS_XID)
6198 		{
6199 			/*
6200 			 * NB -- some of these transformations are only valid because we
6201 			 * know the return Xid is a tuple updater (i.e. not merely a
6202 			 * locker.) Also note that the only reason we don't explicitly
6203 			 * worry about HEAP_KEYS_UPDATED is because it lives in
6204 			 * t_infomask2 rather than t_infomask.
6205 			 */
6206 			frz->t_infomask &= ~HEAP_XMAX_BITS;
6207 			frz->xmax = newxmax;
6208 			if (flags & FRM_MARK_COMMITTED)
6209 				frz->t_infomask |= HEAP_XMAX_COMMITTED;
6210 			changed = true;
6211 		}
6212 		else if (flags & FRM_RETURN_IS_MULTI)
6213 		{
6214 			uint16		newbits;
6215 			uint16		newbits2;
6216 
6217 			/*
6218 			 * We can't use GetMultiXactIdHintBits directly on the new multi
6219 			 * here; that routine initializes the masks to all zeroes, which
6220 			 * would lose other bits we need.  Doing it this way ensures all
6221 			 * unrelated bits remain untouched.
6222 			 */
6223 			frz->t_infomask &= ~HEAP_XMAX_BITS;
6224 			frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6225 			GetMultiXactIdHintBits(newxmax, &newbits, &newbits2);
6226 			frz->t_infomask |= newbits;
6227 			frz->t_infomask2 |= newbits2;
6228 
6229 			frz->xmax = newxmax;
6230 
6231 			changed = true;
6232 		}
6233 	}
6234 	else if (TransactionIdIsNormal(xid))
6235 	{
6236 		if (TransactionIdPrecedes(xid, relfrozenxid))
6237 			ereport(ERROR,
6238 					(errcode(ERRCODE_DATA_CORRUPTED),
6239 					 errmsg_internal("found xmax %u from before relfrozenxid %u",
6240 									 xid, relfrozenxid)));
6241 
6242 		if (TransactionIdPrecedes(xid, cutoff_xid))
6243 		{
6244 			/*
6245 			 * If we freeze xmax, make absolutely sure that it's not an XID
6246 			 * that is important.  (Note, a lock-only xmax can be removed
6247 			 * independent of committedness, since a committed lock holder has
6248 			 * released the lock).
6249 			 */
6250 			if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
6251 				TransactionIdDidCommit(xid))
6252 				ereport(ERROR,
6253 						(errcode(ERRCODE_DATA_CORRUPTED),
6254 						 errmsg_internal("cannot freeze committed xmax %u",
6255 										 xid)));
6256 			freeze_xmax = true;
6257 		}
6258 		else
6259 			freeze_xmax = false;
6260 	}
6261 	else if ((tuple->t_infomask & HEAP_XMAX_INVALID) ||
6262 			 !TransactionIdIsValid(HeapTupleHeaderGetRawXmax(tuple)))
6263 	{
6264 		freeze_xmax = false;
6265 		xmax_already_frozen = true;
6266 	}
6267 	else
6268 		ereport(ERROR,
6269 				(errcode(ERRCODE_DATA_CORRUPTED),
6270 				 errmsg_internal("found xmax %u (infomask 0x%04x) not frozen, not multi, not normal",
6271 								 xid, tuple->t_infomask)));
6272 
6273 	if (freeze_xmax)
6274 	{
6275 		Assert(!xmax_already_frozen);
6276 
6277 		frz->xmax = InvalidTransactionId;
6278 
6279 		/*
6280 		 * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
6281 		 * LOCKED.  Normalize to INVALID just to be sure no one gets confused.
6282 		 * Also get rid of the HEAP_KEYS_UPDATED bit.
6283 		 */
6284 		frz->t_infomask &= ~HEAP_XMAX_BITS;
6285 		frz->t_infomask |= HEAP_XMAX_INVALID;
6286 		frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
6287 		frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6288 		changed = true;
6289 	}
6290 
6291 	/*
6292 	 * Old-style VACUUM FULL is gone, but we have to keep this code as long as
6293 	 * we support having MOVED_OFF/MOVED_IN tuples in the database.
6294 	 */
6295 	if (tuple->t_infomask & HEAP_MOVED)
6296 	{
6297 		xid = HeapTupleHeaderGetXvac(tuple);
6298 
6299 		/*
6300 		 * For Xvac, we ignore the cutoff_xid and just always perform the
6301 		 * freeze operation.  The oldest release in which such a value can
6302 		 * actually be set is PostgreSQL 8.4, because old-style VACUUM FULL
6303 		 * was removed in PostgreSQL 9.0.  Note that if we were to respect
6304 		 * cutoff_xid here, we'd need to make sure to clear totally_frozen
6305 		 * when we skipped freezing on that basis.
6306 		 */
6307 		if (TransactionIdIsNormal(xid))
6308 		{
6309 			/*
6310 			 * If a MOVED_OFF tuple is not dead, the xvac transaction must
6311 			 * have failed; whereas a non-dead MOVED_IN tuple must mean the
6312 			 * xvac transaction succeeded.
6313 			 */
6314 			if (tuple->t_infomask & HEAP_MOVED_OFF)
6315 				frz->frzflags |= XLH_INVALID_XVAC;
6316 			else
6317 				frz->frzflags |= XLH_FREEZE_XVAC;
6318 
6319 			/*
6320 			 * Might as well fix the hint bits too; usually XMIN_COMMITTED
6321 			 * will already be set here, but there's a small chance not.
6322 			 */
6323 			Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
6324 			frz->t_infomask |= HEAP_XMIN_COMMITTED;
6325 			changed = true;
6326 		}
6327 	}
6328 
6329 	*totally_frozen_p = (xmin_frozen &&
6330 						 (freeze_xmax || xmax_already_frozen));
6331 	return changed;
6332 }
6333 
6334 /*
6335  * heap_execute_freeze_tuple
6336  *		Execute the prepared freezing of a tuple.
6337  *
6338  * Caller is responsible for ensuring that no other backend can access the
6339  * storage underlying this tuple, either by holding an exclusive lock on the
6340  * buffer containing it (which is what lazy VACUUM does), or by having it be
6341  * in private storage (which is what CLUSTER and friends do).
6342  *
6343  * Note: it might seem we could make the changes without exclusive lock, since
6344  * TransactionId read/write is assumed atomic anyway.  However there is a race
6345  * condition: someone who just fetched an old XID that we overwrite here could
6346  * conceivably not finish checking the XID against pg_xact before we finish
6347  * the VACUUM and perhaps truncate off the part of pg_xact he needs.  Getting
6348  * exclusive lock ensures no other backend is in the process of checking the
6349  * tuple status.  Also, getting exclusive lock makes it safe to adjust the
6350  * infomask bits.
6351  *
6352  * NB: All code in here must be safe to execute during crash recovery!
6353  */
6354 void
6355 heap_execute_freeze_tuple(HeapTupleHeader tuple, xl_heap_freeze_tuple *frz)
6356 {
6357 	HeapTupleHeaderSetXmax(tuple, frz->xmax);
6358 
6359 	if (frz->frzflags & XLH_FREEZE_XVAC)
6360 		HeapTupleHeaderSetXvac(tuple, FrozenTransactionId);
6361 
6362 	if (frz->frzflags & XLH_INVALID_XVAC)
6363 		HeapTupleHeaderSetXvac(tuple, InvalidTransactionId);
6364 
6365 	tuple->t_infomask = frz->t_infomask;
6366 	tuple->t_infomask2 = frz->t_infomask2;
6367 }
6368 
6369 /*
6370  * heap_freeze_tuple
6371  *		Freeze tuple in place, without WAL logging.
6372  *
6373  * Useful for callers like CLUSTER that perform their own WAL logging.
6374  */
6375 bool
6376 heap_freeze_tuple(HeapTupleHeader tuple,
6377 				  TransactionId relfrozenxid, TransactionId relminmxid,
6378 				  TransactionId cutoff_xid, TransactionId cutoff_multi)
6379 {
6380 	xl_heap_freeze_tuple frz;
6381 	bool		do_freeze;
6382 	bool		tuple_totally_frozen;
6383 
6384 	do_freeze = heap_prepare_freeze_tuple(tuple,
6385 										  relfrozenxid, relminmxid,
6386 										  cutoff_xid, cutoff_multi,
6387 										  &frz, &tuple_totally_frozen);
6388 
6389 	/*
6390 	 * Note that because this is not a WAL-logged operation, we don't need to
6391 	 * fill in the offset in the freeze record.
6392 	 */
6393 
6394 	if (do_freeze)
6395 		heap_execute_freeze_tuple(tuple, &frz);
6396 	return do_freeze;
6397 }
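
/*
 * Editorial sketch (not part of the original heapam.c): roughly how a
 * VACUUM-style caller is expected to combine heap_prepare_freeze_tuple(),
 * heap_execute_freeze_tuple() and log_heap_freeze() (defined later in this
 * file).  The page-scanning details are simplified, and the function name
 * and guard macro below are invented, so the block is never compiled; the
 * real caller lives in vacuumlazy.c.
 */
#ifdef HEAP_FREEZE_USAGE_SKETCH
static void
freeze_page_sketch(Relation rel, Buffer buf,
				   TransactionId relfrozenxid, TransactionId relminmxid,
				   TransactionId cutoff_xid, TransactionId cutoff_multi)
{
	Page		page = BufferGetPage(buf);
	OffsetNumber offnum,
				maxoff = PageGetMaxOffsetNumber(page);
	xl_heap_freeze_tuple frozen[MaxHeapTuplesPerPage];
	int			nfrozen = 0;

	/* Phase 1: decide what to freeze, without touching the page. */
	for (offnum = FirstOffsetNumber; offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid = PageGetItemId(page, offnum);
		HeapTupleHeader htup;
		bool		totally_frozen;

		if (!ItemIdIsNormal(itemid))
			continue;
		htup = (HeapTupleHeader) PageGetItem(page, itemid);
		if (heap_prepare_freeze_tuple(htup, relfrozenxid, relminmxid,
									  cutoff_xid, cutoff_multi,
									  &frozen[nfrozen], &totally_frozen))
			frozen[nfrozen++].offset = offnum;	/* WAL replay needs the offset */
	}

	/* Phase 2: apply the prepared plans and WAL-log them atomically. */
	if (nfrozen > 0)
	{
		START_CRIT_SECTION();
		for (int i = 0; i < nfrozen; i++)
		{
			ItemId		itemid = PageGetItemId(page, frozen[i].offset);
			HeapTupleHeader htup = (HeapTupleHeader) PageGetItem(page, itemid);

			heap_execute_freeze_tuple(htup, &frozen[i]);
		}
		MarkBufferDirty(buf);
		if (RelationNeedsWAL(rel))
			PageSetLSN(page, log_heap_freeze(rel, buf, cutoff_xid,
											 frozen, nfrozen));
		END_CRIT_SECTION();
	}
}
#endif							/* HEAP_FREEZE_USAGE_SKETCH */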
6398 
6399 /*
6400  * For a given MultiXactId, return the hint bits that should be set in the
6401  * tuple's infomask.
6402  *
6403  * Normally this should be called for a multixact that was just created, and
6404  * so is on our local cache, so the GetMembers call is fast.
6405  */
6406 static void
6407 GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
6408 					   uint16 *new_infomask2)
6409 {
6410 	int			nmembers;
6411 	MultiXactMember *members;
6412 	int			i;
6413 	uint16		bits = HEAP_XMAX_IS_MULTI;
6414 	uint16		bits2 = 0;
6415 	bool		has_update = false;
6416 	LockTupleMode strongest = LockTupleKeyShare;
6417 
6418 	/*
6419 	 * We only use this in multis we just created, so they cannot be values
6420 	 * pre-pg_upgrade.
6421 	 */
6422 	nmembers = GetMultiXactIdMembers(multi, &members, false, false);
6423 
6424 	for (i = 0; i < nmembers; i++)
6425 	{
6426 		LockTupleMode mode;
6427 
6428 		/*
6429 		 * Remember the strongest lock mode held by any member of the
6430 		 * multixact.
6431 		 */
6432 		mode = TUPLOCK_from_mxstatus(members[i].status);
6433 		if (mode > strongest)
6434 			strongest = mode;
6435 
6436 		/* See what other bits we need */
6437 		switch (members[i].status)
6438 		{
6439 			case MultiXactStatusForKeyShare:
6440 			case MultiXactStatusForShare:
6441 			case MultiXactStatusForNoKeyUpdate:
6442 				break;
6443 
6444 			case MultiXactStatusForUpdate:
6445 				bits2 |= HEAP_KEYS_UPDATED;
6446 				break;
6447 
6448 			case MultiXactStatusNoKeyUpdate:
6449 				has_update = true;
6450 				break;
6451 
6452 			case MultiXactStatusUpdate:
6453 				bits2 |= HEAP_KEYS_UPDATED;
6454 				has_update = true;
6455 				break;
6456 		}
6457 	}
6458 
6459 	if (strongest == LockTupleExclusive ||
6460 		strongest == LockTupleNoKeyExclusive)
6461 		bits |= HEAP_XMAX_EXCL_LOCK;
6462 	else if (strongest == LockTupleShare)
6463 		bits |= HEAP_XMAX_SHR_LOCK;
6464 	else if (strongest == LockTupleKeyShare)
6465 		bits |= HEAP_XMAX_KEYSHR_LOCK;
6466 
6467 	if (!has_update)
6468 		bits |= HEAP_XMAX_LOCK_ONLY;
6469 
6470 	if (nmembers > 0)
6471 		pfree(members);
6472 
6473 	*new_infomask = bits;
6474 	*new_infomask2 = bits2;
6475 }
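
/*
 * Worked example (editorial addition): for a multixact whose members are
 * {xid 100, MultiXactStatusForKeyShare} and {xid 101, MultiXactStatusNoKeyUpdate},
 * the strongest lock mode is LockTupleNoKeyExclusive and there is an updater,
 * so *new_infomask becomes HEAP_XMAX_IS_MULTI | HEAP_XMAX_EXCL_LOCK (without
 * HEAP_XMAX_LOCK_ONLY), while *new_infomask2 stays 0 because only ForUpdate
 * and Update members set HEAP_KEYS_UPDATED.
 */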
6476 
6477 /*
6478  * MultiXactIdGetUpdateXid
6479  *
6480  * Given a multixact Xmax and corresponding infomask, which does not have the
6481  * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating
6482  * transaction.
6483  *
6484  * Caller is expected to check the status of the updating transaction, if
6485  * necessary.
6486  */
6487 static TransactionId
6488 MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask)
6489 {
6490 	TransactionId update_xact = InvalidTransactionId;
6491 	MultiXactMember *members;
6492 	int			nmembers;
6493 
6494 	Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
6495 	Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6496 
6497 	/*
6498 	 * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from
6499 	 * pre-pg_upgrade.
6500 	 */
6501 	nmembers = GetMultiXactIdMembers(xmax, &members, false, false);
6502 
6503 	if (nmembers > 0)
6504 	{
6505 		int			i;
6506 
6507 		for (i = 0; i < nmembers; i++)
6508 		{
6509 			/* Ignore lockers */
6510 			if (!ISUPDATE_from_mxstatus(members[i].status))
6511 				continue;
6512 
6513 			/* there can be at most one updater */
6514 			Assert(update_xact == InvalidTransactionId);
6515 			update_xact = members[i].xid;
6516 #ifndef USE_ASSERT_CHECKING
6517 
6518 			/*
6519 			 * in an assert-enabled build, walk the whole array to ensure
6520 			 * there's no other updater.
6521 			 */
6522 			break;
6523 #endif
6524 		}
6525 
6526 		pfree(members);
6527 	}
6528 
6529 	return update_xact;
6530 }
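
/*
 * Example (editorial addition): for a non-LOCK_ONLY multixact whose members
 * are {xid 100, MultiXactStatusForKeyShare} and {xid 102, MultiXactStatusUpdate},
 * the locker (100) is skipped and the updater's xid, 102, is returned.
 */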
6531 
6532 /*
6533  * HeapTupleGetUpdateXid
6534  *		As above, but use a HeapTupleHeader
6535  *
6536  * See also HeapTupleHeaderGetUpdateXid, which can be used without previously
6537  * checking the hint bits.
6538  */
6539 TransactionId
6540 HeapTupleGetUpdateXid(HeapTupleHeader tuple)
6541 {
6542 	return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(tuple),
6543 								   tuple->t_infomask);
6544 }
6545 
6546 /*
6547  * Does the given multixact conflict with the current transaction grabbing a
6548  * tuple lock of the given strength?
6549  *
6550  * The passed infomask pairs up with the given multixact in the tuple header.
6551  *
6552  * If current_is_member is not NULL, it is set to 'true' if the current
6553  * transaction is a member of the given multixact.
6554  */
6555 static bool
6556 DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
6557 						LockTupleMode lockmode, bool *current_is_member)
6558 {
6559 	int			nmembers;
6560 	MultiXactMember *members;
6561 	bool		result = false;
6562 	LOCKMODE	wanted = tupleLockExtraInfo[lockmode].hwlock;
6563 
6564 	if (HEAP_LOCKED_UPGRADED(infomask))
6565 		return false;
6566 
6567 	nmembers = GetMultiXactIdMembers(multi, &members, false,
6568 									 HEAP_XMAX_IS_LOCKED_ONLY(infomask));
6569 	if (nmembers >= 0)
6570 	{
6571 		int			i;
6572 
6573 		for (i = 0; i < nmembers; i++)
6574 		{
6575 			TransactionId memxid;
6576 			LOCKMODE	memlockmode;
6577 
6578 			if (result && (current_is_member == NULL || *current_is_member))
6579 				break;
6580 
6581 			memlockmode = LOCKMODE_from_mxstatus(members[i].status);
6582 
6583 			/* ignore members from current xact (but track their presence) */
6584 			memxid = members[i].xid;
6585 			if (TransactionIdIsCurrentTransactionId(memxid))
6586 			{
6587 				if (current_is_member != NULL)
6588 					*current_is_member = true;
6589 				continue;
6590 			}
6591 			else if (result)
6592 				continue;
6593 
6594 			/* ignore members that don't conflict with the lock we want */
6595 			if (!DoLockModesConflict(memlockmode, wanted))
6596 				continue;
6597 
6598 			if (ISUPDATE_from_mxstatus(members[i].status))
6599 			{
6600 				/* ignore aborted updaters */
6601 				if (TransactionIdDidAbort(memxid))
6602 					continue;
6603 			}
6604 			else
6605 			{
6606 				/* ignore lockers-only that are no longer in progress */
6607 				if (!TransactionIdIsInProgress(memxid))
6608 					continue;
6609 			}
6610 
6611 			/*
6612 			 * Whatever remains are either live lockers that conflict with our
6613 			 * wanted lock, or updaters that are not aborted.  Either way, those
6614 			 * conflict with what we want.  Set up to return true, but keep going to
6615 			 * look for the current transaction among the multixact members,
6616 			 * if needed.
6617 			 */
6618 			result = true;
6619 		}
6620 		pfree(members);
6621 	}
6622 
6623 	return result;
6624 }
6625 
6626 /*
6627  * Do_MultiXactIdWait
6628  *		Actual implementation for the two functions below.
6629  *
6630  * 'multi', 'status' and 'infomask' indicate what to sleep on (the status is
6631  * needed to ensure we only sleep on conflicting members, and the infomask is
6632  * used to optimize multixact access in case it's a lock-only multi); 'nowait'
6633  * indicates whether to use conditional lock acquisition, to allow callers to
6634  * fail if lock is unavailable.  'rel', 'ctid' and 'oper' are used to set up
6635  * context information for error messages.  'remaining', if not NULL, receives
6636  * the number of members that are still running, including any (non-aborted)
6637  * subtransactions of our own transaction.
6638  *
6639  * We do this by sleeping on each member using XactLockTableWait.  Any
6640  * members that belong to the current backend are *not* waited for, however;
6641  * this would not merely be useless but would lead to Assert failure inside
6642  * XactLockTableWait.  By the time this returns, it is certain that all
6643  * transactions *of other backends* that were members of the MultiXactId
6644  * that conflict with the requested status are dead (and no new ones can have
6645  * been added, since it is not legal to add members to an existing
6646  * MultiXactId).
6647  *
6648  * But by the time we finish sleeping, someone else may have changed the Xmax
6649  * of the containing tuple, so the caller needs to iterate on us somehow.
6650  *
6651  * Note that in case we return false, the number of remaining members is
6652  * not to be trusted.
6653  */
6654 static bool
6655 Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
6656 				   uint16 infomask, bool nowait,
6657 				   Relation rel, ItemPointer ctid, XLTW_Oper oper,
6658 				   int *remaining)
6659 {
6660 	bool		result = true;
6661 	MultiXactMember *members;
6662 	int			nmembers;
6663 	int			remain = 0;
6664 
6665 	/* for pre-pg_upgrade tuples, no need to sleep at all */
6666 	nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
6667 		GetMultiXactIdMembers(multi, &members, false,
6668 							  HEAP_XMAX_IS_LOCKED_ONLY(infomask));
6669 
6670 	if (nmembers >= 0)
6671 	{
6672 		int			i;
6673 
6674 		for (i = 0; i < nmembers; i++)
6675 		{
6676 			TransactionId memxid = members[i].xid;
6677 			MultiXactStatus memstatus = members[i].status;
6678 
6679 			if (TransactionIdIsCurrentTransactionId(memxid))
6680 			{
6681 				remain++;
6682 				continue;
6683 			}
6684 
6685 			if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus),
6686 									 LOCKMODE_from_mxstatus(status)))
6687 			{
6688 				if (remaining && TransactionIdIsInProgress(memxid))
6689 					remain++;
6690 				continue;
6691 			}
6692 
6693 			/*
6694 			 * This member conflicts with our multi, so we have to sleep (or
6695 			 * return failure, if asked to avoid waiting.)
6696 			 *
6697 			 * Note that we don't set up an error context callback ourselves,
6698 			 * but instead we pass the info down to XactLockTableWait.  This
6699 			 * might seem a bit wasteful because the context is set up and
6700 			 * torn down for each member of the multixact, but in reality it
6701 			 * should be barely noticeable, and it avoids duplicate code.
6702 			 */
6703 			if (nowait)
6704 			{
6705 				result = ConditionalXactLockTableWait(memxid);
6706 				if (!result)
6707 					break;
6708 			}
6709 			else
6710 				XactLockTableWait(memxid, rel, ctid, oper);
6711 		}
6712 
6713 		pfree(members);
6714 	}
6715 
6716 	if (remaining)
6717 		*remaining = remain;
6718 
6719 	return result;
6720 }
6721 
6722 /*
6723  * MultiXactIdWait
6724  *		Sleep on a MultiXactId.
6725  *
6726  * By the time we finish sleeping, someone else may have changed the Xmax
6727  * of the containing tuple, so the caller needs to iterate on us somehow.
6728  *
6729  * We return (in *remaining, if not NULL) the number of members that are still
6730  * running, including any (non-aborted) subtransactions of our own transaction.
6731  */
6732 static void
6733 MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
6734 				Relation rel, ItemPointer ctid, XLTW_Oper oper,
6735 				int *remaining)
6736 {
6737 	(void) Do_MultiXactIdWait(multi, status, infomask, false,
6738 							  rel, ctid, oper, remaining);
6739 }
6740 
6741 /*
6742  * ConditionalMultiXactIdWait
6743  *		As above, but only lock if we can get the lock without blocking.
6744  *
6745  * By the time we finish sleeping, someone else may have changed the Xmax
6746  * of the containing tuple, so the caller needs to iterate on us somehow.
6747  *
6748  * Returns true if the multixact is now all gone; returns false if some
6749  * transactions might still be running.
6750  *
6751  * We return (in *remaining, if not NULL) the number of members that are still
6752  * running, including any (non-aborted) subtransactions of our own transaction.
6753  */
6754 static bool
6755 ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
6756 						   uint16 infomask, Relation rel, int *remaining)
6757 {
6758 	return Do_MultiXactIdWait(multi, status, infomask, true,
6759 							  rel, NULL, XLTW_None, remaining);
6760 }
6761 
6762 /*
6763  * heap_tuple_needs_eventual_freeze
6764  *
6765  * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
6766  * will eventually require freezing.  Similar to heap_tuple_needs_freeze,
6767  * but there's no cutoff, since we're trying to figure out whether freezing
6768  * will ever be needed, not whether it's needed now.
6769  */
6770 bool
6771 heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
6772 {
6773 	TransactionId xid;
6774 
6775 	/*
6776 	 * If xmin is a normal transaction ID, this tuple is definitely not
6777 	 * frozen.
6778 	 */
6779 	xid = HeapTupleHeaderGetXmin(tuple);
6780 	if (TransactionIdIsNormal(xid))
6781 		return true;
6782 
6783 	/*
6784 	 * If xmax is a valid xact or multixact, this tuple is also not frozen.
6785 	 */
6786 	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
6787 	{
6788 		MultiXactId multi;
6789 
6790 		multi = HeapTupleHeaderGetRawXmax(tuple);
6791 		if (MultiXactIdIsValid(multi))
6792 			return true;
6793 	}
6794 	else
6795 	{
6796 		xid = HeapTupleHeaderGetRawXmax(tuple);
6797 		if (TransactionIdIsNormal(xid))
6798 			return true;
6799 	}
6800 
6801 	if (tuple->t_infomask & HEAP_MOVED)
6802 	{
6803 		xid = HeapTupleHeaderGetXvac(tuple);
6804 		if (TransactionIdIsNormal(xid))
6805 			return true;
6806 	}
6807 
6808 	return false;
6809 }
6810 
6811 /*
6812  * heap_tuple_needs_freeze
6813  *
6814  * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
6815  * are older than the specified cutoff XID or MultiXactId.  If so, return true.
6816  *
6817  * It doesn't matter whether the tuple is alive or dead, we are checking
6818  * to see if a tuple needs to be removed or frozen to avoid wraparound.
6819  *
6820  * NB: Cannot rely on hint bits here, they might not be set after a crash or
6821  * on a standby.
6822  */
6823 bool
6824 heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid,
6825 						MultiXactId cutoff_multi, Buffer buf)
6826 {
6827 	TransactionId xid;
6828 
6829 	xid = HeapTupleHeaderGetXmin(tuple);
6830 	if (TransactionIdIsNormal(xid) &&
6831 		TransactionIdPrecedes(xid, cutoff_xid))
6832 		return true;
6833 
6834 	/*
6835 	 * The considerations for multixacts are complicated; look at
6836 	 * heap_prepare_freeze_tuple for justifications.  This routine had better
6837 	 * be in sync with that one!
6838 	 */
6839 	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
6840 	{
6841 		MultiXactId multi;
6842 
6843 		multi = HeapTupleHeaderGetRawXmax(tuple);
6844 		if (!MultiXactIdIsValid(multi))
6845 		{
6846 			/* no xmax set, ignore */
6847 			;
6848 		}
6849 		else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
6850 			return true;
6851 		else if (MultiXactIdPrecedes(multi, cutoff_multi))
6852 			return true;
6853 		else
6854 		{
6855 			MultiXactMember *members;
6856 			int			nmembers;
6857 			int			i;
6858 
6859 			/* need to check whether any member of the mxact is too old */
6860 
6861 			nmembers = GetMultiXactIdMembers(multi, &members, false,
6862 											 HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask));
6863 
6864 			for (i = 0; i < nmembers; i++)
6865 			{
6866 				if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
6867 				{
6868 					pfree(members);
6869 					return true;
6870 				}
6871 			}
6872 			if (nmembers > 0)
6873 				pfree(members);
6874 		}
6875 	}
6876 	else
6877 	{
6878 		xid = HeapTupleHeaderGetRawXmax(tuple);
6879 		if (TransactionIdIsNormal(xid) &&
6880 			TransactionIdPrecedes(xid, cutoff_xid))
6881 			return true;
6882 	}
6883 
6884 	if (tuple->t_infomask & HEAP_MOVED)
6885 	{
6886 		xid = HeapTupleHeaderGetXvac(tuple);
6887 		if (TransactionIdIsNormal(xid) &&
6888 			TransactionIdPrecedes(xid, cutoff_xid))
6889 			return true;
6890 	}
6891 
6892 	return false;
6893 }
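
/*
 * Example (editorial addition): with cutoff_xid = 500, a tuple whose xmin is
 * the normal xid 450 returns true (it will need freezing), while a tuple
 * whose xmin and xmax are both >= 500 (or not normal xids) returns false,
 * assuming no old multixact members and no xvac are involved.
 */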
6894 
6895 /*
6896  * If 'tuple' contains any visible XID greater than latestRemovedXid,
6897  * ratchet forwards latestRemovedXid to the greatest one found.
6898  * This is used as the basis for generating Hot Standby conflicts, so
6899  * if a tuple was never visible then removing it should not conflict
6900  * with queries.
6901  */
6902 void
6903 HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
6904 									   TransactionId *latestRemovedXid)
6905 {
6906 	TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
6907 	TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple);
6908 	TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
6909 
6910 	if (tuple->t_infomask & HEAP_MOVED)
6911 	{
6912 		if (TransactionIdPrecedes(*latestRemovedXid, xvac))
6913 			*latestRemovedXid = xvac;
6914 	}
6915 
6916 	/*
6917 	 * Ignore tuples inserted by an aborted transaction or if the tuple was
6918 	 * updated/deleted by the inserting transaction.
6919 	 *
6920 	 * Look for a committed hint bit, or if no xmin bit is set, check clog.
6921 	 * This needs to work on both master and standby, where it is used to
6922 	 * assess btree delete records.
6923 	 */
6924 	if (HeapTupleHeaderXminCommitted(tuple) ||
6925 		(!HeapTupleHeaderXminInvalid(tuple) && TransactionIdDidCommit(xmin)))
6926 	{
6927 		if (xmax != xmin &&
6928 			TransactionIdFollows(xmax, *latestRemovedXid))
6929 			*latestRemovedXid = xmax;
6930 	}
6931 
6932 	/* *latestRemovedXid may still be invalid at end */
6933 }
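
/*
 * Example (editorial addition): for a tuple whose xmin (say 90) is known
 * committed and whose updating xmax is 120, a caller-supplied
 * *latestRemovedXid of 110 is ratcheted forward to 120, while a value of 130
 * is left alone.  Tuples whose xmin never committed are ignored and cannot
 * advance the horizon.
 */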
6934 
6935 #ifdef USE_PREFETCH
6936 /*
6937  * Helper function for heap_compute_xid_horizon_for_tuples.  Issue prefetch
6938  * requests for the number of buffers indicated by prefetch_count.  The
6939  * prefetch_state keeps track of all the buffers that we can prefetch and
6940  * which ones have already been prefetched; each call to this function picks
6941  * up where the previous call left off.
6942  */
6943 static void
6944 xid_horizon_prefetch_buffer(Relation rel,
6945 							XidHorizonPrefetchState *prefetch_state,
6946 							int prefetch_count)
6947 {
6948 	BlockNumber cur_hblkno = prefetch_state->cur_hblkno;
6949 	int			count = 0;
6950 	int			i;
6951 	int			nitems = prefetch_state->nitems;
6952 	ItemPointerData *tids = prefetch_state->tids;
6953 
6954 	for (i = prefetch_state->next_item;
6955 		 i < nitems && count < prefetch_count;
6956 		 i++)
6957 	{
6958 		ItemPointer htid = &tids[i];
6959 
6960 		if (cur_hblkno == InvalidBlockNumber ||
6961 			ItemPointerGetBlockNumber(htid) != cur_hblkno)
6962 		{
6963 			cur_hblkno = ItemPointerGetBlockNumber(htid);
6964 			PrefetchBuffer(rel, MAIN_FORKNUM, cur_hblkno);
6965 			count++;
6966 		}
6967 	}
6968 
6969 	/*
6970 	 * Save the prefetch position so that next time we can continue from that
6971 	 * position.
6972 	 */
6973 	prefetch_state->next_item = i;
6974 	prefetch_state->cur_hblkno = cur_hblkno;
6975 }
6976 #endif
6977 
6978 /*
6979  * Get the latestRemovedXid from the heap pages pointed at by the index
6980  * tuples being deleted.
6981  *
6982  * We used to do this during recovery rather than on the primary, but that
6983  * approach now appears inferior.  It meant that the master could generate
6984  * a lot of work for the standby without any back-pressure to slow down the
6985  * master, and it required the standby to have reached consistency, whereas
6986  * we want to have correct information available even before that point.
6987  *
6988  * It's possible for this to generate a fair amount of I/O, since we may be
6989  * deleting hundreds of tuples from a single index block.  To amortize that
6990  * cost to some degree, this uses prefetching and combines repeat accesses to
6991  * the same block.
6992  */
6993 TransactionId
6994 heap_compute_xid_horizon_for_tuples(Relation rel,
6995 									ItemPointerData *tids,
6996 									int nitems)
6997 {
6998 	/* Initial assumption is that earlier pruning took care of conflict */
6999 	TransactionId latestRemovedXid = InvalidTransactionId;
7000 	BlockNumber blkno = InvalidBlockNumber;
7001 	Buffer		buf = InvalidBuffer;
7002 	Page		page = NULL;
7003 	OffsetNumber maxoff = InvalidOffsetNumber;
7004 	TransactionId priorXmax;
7005 #ifdef USE_PREFETCH
7006 	XidHorizonPrefetchState prefetch_state;
7007 	int			prefetch_distance;
7008 #endif
7009 
7010 	/*
7011 	 * Sort to avoid repeated lookups for the same page, and to make it more
7012 	 * likely to access items in an efficient order. In particular, this
7013 	 * ensures that if there are multiple pointers to the same page, they all
7014 	 * get processed looking up and locking the page just once.
7015 	 */
7016 	qsort((void *) tids, nitems, sizeof(ItemPointerData),
7017 		  (int (*) (const void *, const void *)) ItemPointerCompare);
7018 
7019 #ifdef USE_PREFETCH
7020 	/* Initialize prefetch state. */
7021 	prefetch_state.cur_hblkno = InvalidBlockNumber;
7022 	prefetch_state.next_item = 0;
7023 	prefetch_state.nitems = nitems;
7024 	prefetch_state.tids = tids;
7025 
7026 	/*
7027 	 * Compute the prefetch distance that we will attempt to maintain.
7028 	 *
7029 	 * Since the caller holds a buffer lock somewhere in rel, we'd better make
7030 	 * sure that isn't a catalog relation before we call code that does
7031 	 * syscache lookups, to avoid risk of deadlock.
7032 	 */
7033 	if (IsCatalogRelation(rel))
7034 		prefetch_distance = maintenance_io_concurrency;
7035 	else
7036 		prefetch_distance =
7037 			get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace);
7038 
7039 	/* Start prefetching. */
7040 	xid_horizon_prefetch_buffer(rel, &prefetch_state, prefetch_distance);
7041 #endif
7042 
7043 	/* Iterate over all tids, and check their horizon */
7044 	for (int i = 0; i < nitems; i++)
7045 	{
7046 		ItemPointer htid = &tids[i];
7047 		OffsetNumber offnum;
7048 
7049 		/*
7050 		 * Read heap buffer, but avoid refetching if it's the same block as
7051 		 * required for the last tid.
7052 		 */
7053 		if (blkno == InvalidBlockNumber ||
7054 			ItemPointerGetBlockNumber(htid) != blkno)
7055 		{
7056 			/* release old buffer */
7057 			if (BufferIsValid(buf))
7058 			{
7059 				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
7060 				ReleaseBuffer(buf);
7061 			}
7062 
7063 			blkno = ItemPointerGetBlockNumber(htid);
7064 
7065 			buf = ReadBuffer(rel, blkno);
7066 
7067 #ifdef USE_PREFETCH
7068 
7069 			/*
7070 			 * To maintain the prefetch distance, prefetch one more page for
7071 			 * each page we read.
7072 			 */
7073 			xid_horizon_prefetch_buffer(rel, &prefetch_state, 1);
7074 #endif
7075 
7076 			LockBuffer(buf, BUFFER_LOCK_SHARE);
7077 
7078 			page = BufferGetPage(buf);
7079 			maxoff = PageGetMaxOffsetNumber(page);
7080 		}
7081 
7082 		/*
7083 		 * Maintain latestRemovedXid value for deletion operation as a whole
7084 		 * by advancing current value using heap tuple headers.  This is
7085 		 * loosely based on the logic for pruning a HOT chain.
7086 		 */
7087 		offnum = ItemPointerGetOffsetNumber(htid);
7088 		priorXmax = InvalidTransactionId;	/* cannot check first XMIN */
7089 		for (;;)
7090 		{
7091 			ItemId		lp;
7092 			HeapTupleHeader htup;
7093 
7094 			/* Some sanity checks */
7095 			if (offnum < FirstOffsetNumber || offnum > maxoff)
7096 				break;
7097 
7098 			lp = PageGetItemId(page, offnum);
7099 			if (ItemIdIsRedirected(lp))
7100 			{
7101 				offnum = ItemIdGetRedirect(lp);
7102 				continue;
7103 			}
7104 
7105 			/*
7106 			 * We'll often encounter LP_DEAD line pointers.  No need to do
7107 			 * anything more with htid when that happens.  This is okay
7108 			 * because the earlier pruning operation that made the line
7109 			 * pointer LP_DEAD in the first place must have considered the
7110 			 * tuple header as part of generating its own latestRemovedXid
7111 			 * value.
7112 			 *
7113 			 * Relying on XLOG_HEAP2_CLEANUP_INFO records like this is the
7114 			 * same strategy that index vacuuming uses in all cases.  Index
7115 			 * VACUUM WAL records don't even have a latestRemovedXid field of
7116 			 * their own for this reason.
7117 			 */
7118 			if (!ItemIdIsNormal(lp))
7119 				break;
7120 
7121 			htup = (HeapTupleHeader) PageGetItem(page, lp);
7122 
7123 			/*
7124 			 * Check the tuple XMIN against prior XMAX, if any
7125 			 */
7126 			if (TransactionIdIsValid(priorXmax) &&
7127 				!TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
7128 				break;
7129 
7130 			HeapTupleHeaderAdvanceLatestRemovedXid(htup, &latestRemovedXid);
7131 
7132 			/*
7133 			 * If the tuple is not HOT-updated, then we are at the end of this
7134 			 * HOT-chain.  No need to visit later tuples from the same update
7135 			 * chain (they get their own index entries) -- just move on to
7136 			 * next htid from index AM caller.
7137 			 */
7138 			if (!HeapTupleHeaderIsHotUpdated(htup))
7139 				break;
7140 
7141 			/* Advance to next HOT chain member */
7142 			Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno);
7143 			offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
7144 			priorXmax = HeapTupleHeaderGetUpdateXid(htup);
7145 		}
7146 	}
7147 
7148 	if (BufferIsValid(buf))
7149 	{
7150 		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
7151 		ReleaseBuffer(buf);
7152 	}
7153 
7154 	return latestRemovedXid;
7155 }
7156 
7157 /*
7158  * Perform XLogInsert to register a heap cleanup info message. These
7159  * messages are sent once per VACUUM and are required because
7160  * of the phasing of removal operations during a lazy VACUUM.
7161  * See comments for vacuum_log_cleanup_info().
7162  */
7163 XLogRecPtr
7164 log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid)
7165 {
7166 	xl_heap_cleanup_info xlrec;
7167 	XLogRecPtr	recptr;
7168 
7169 	xlrec.node = rnode;
7170 	xlrec.latestRemovedXid = latestRemovedXid;
7171 
7172 	XLogBeginInsert();
7173 	XLogRegisterData((char *) &xlrec, SizeOfHeapCleanupInfo);
7174 
7175 	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEANUP_INFO);
7176 
7177 	return recptr;
7178 }
7179 
7180 /*
7181  * Perform XLogInsert for a heap-clean operation.  Caller must already
7182  * have modified the buffer and marked it dirty.
7183  *
7184  * Note: prior to Postgres 8.3, the entries in the nowunused[] array were
7185  * zero-based tuple indexes.  Now they are one-based like other uses
7186  * of OffsetNumber.
7187  *
7188  * We also include latestRemovedXid, which is the greatest XID present in
7189  * the removed tuples. That allows recovery processing to cancel or wait
7190  * for long standby queries that can still see these tuples.
7191  */
7192 XLogRecPtr
7193 log_heap_clean(Relation reln, Buffer buffer,
7194 			   OffsetNumber *redirected, int nredirected,
7195 			   OffsetNumber *nowdead, int ndead,
7196 			   OffsetNumber *nowunused, int nunused,
7197 			   TransactionId latestRemovedXid)
7198 {
7199 	xl_heap_clean xlrec;
7200 	XLogRecPtr	recptr;
7201 
7202 	/* Caller should not call me on a non-WAL-logged relation */
7203 	Assert(RelationNeedsWAL(reln));
7204 
7205 	xlrec.latestRemovedXid = latestRemovedXid;
7206 	xlrec.nredirected = nredirected;
7207 	xlrec.ndead = ndead;
7208 
7209 	XLogBeginInsert();
7210 	XLogRegisterData((char *) &xlrec, SizeOfHeapClean);
7211 
7212 	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
7213 
7214 	/*
7215 	 * The OffsetNumber arrays are not actually in the buffer, but we pretend
7216 	 * that they are.  When XLogInsert stores the whole buffer, the offset
7217 	 * arrays need not be stored too.  Note that even if all three arrays are
7218 	 * empty, we want to expose the buffer as a candidate for whole-page
7219 	 * storage, since this record type implies a defragmentation operation
7220 	 * even if no line pointers changed state.
7221 	 */
7222 	if (nredirected > 0)
7223 		XLogRegisterBufData(0, (char *) redirected,
7224 							nredirected * sizeof(OffsetNumber) * 2);
7225 
7226 	if (ndead > 0)
7227 		XLogRegisterBufData(0, (char *) nowdead,
7228 							ndead * sizeof(OffsetNumber));
7229 
7230 	if (nunused > 0)
7231 		XLogRegisterBufData(0, (char *) nowunused,
7232 							nunused * sizeof(OffsetNumber));
7233 
7234 	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEAN);
7235 
7236 	return recptr;
7237 }
7238 
7239 /*
7240  * Perform XLogInsert for a heap-freeze operation.  Caller must have already
7241  * modified the buffer and marked it dirty.
7242  */
7243 XLogRecPtr
7244 log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid,
7245 				xl_heap_freeze_tuple *tuples, int ntuples)
7246 {
7247 	xl_heap_freeze_page xlrec;
7248 	XLogRecPtr	recptr;
7249 
7250 	/* Caller should not call me on a non-WAL-logged relation */
7251 	Assert(RelationNeedsWAL(reln));
7252 	/* nor when there are no tuples to freeze */
7253 	Assert(ntuples > 0);
7254 
7255 	xlrec.cutoff_xid = cutoff_xid;
7256 	xlrec.ntuples = ntuples;
7257 
7258 	XLogBeginInsert();
7259 	XLogRegisterData((char *) &xlrec, SizeOfHeapFreezePage);
7260 
7261 	/*
7262 	 * The freeze plan array is not actually in the buffer, but pretend that
7263 	 * it is.  When XLogInsert stores the whole buffer, the freeze plan need
7264 	 * not be stored too.
7265 	 */
7266 	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
7267 	XLogRegisterBufData(0, (char *) tuples,
7268 						ntuples * sizeof(xl_heap_freeze_tuple));
7269 
7270 	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE);
7271 
7272 	return recptr;
7273 }
7274 
7275 /*
7276  * Perform XLogInsert for a heap-visible operation.  'block' is the block
7277  * being marked all-visible, and vm_buffer is the buffer containing the
7278  * corresponding visibility map block.  Both should have already been modified
7279  * and dirtied.
7280  *
7281  * If checksums are enabled, we also generate a full-page image of
7282  * heap_buffer, if necessary.
7283  */
7284 XLogRecPtr
7285 log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer,
7286 				 TransactionId cutoff_xid, uint8 vmflags)
7287 {
7288 	xl_heap_visible xlrec;
7289 	XLogRecPtr	recptr;
7290 	uint8		flags;
7291 
7292 	Assert(BufferIsValid(heap_buffer));
7293 	Assert(BufferIsValid(vm_buffer));
7294 
7295 	xlrec.cutoff_xid = cutoff_xid;
7296 	xlrec.flags = vmflags;
7297 	XLogBeginInsert();
7298 	XLogRegisterData((char *) &xlrec, SizeOfHeapVisible);
7299 
7300 	XLogRegisterBuffer(0, vm_buffer, 0);
7301 
7302 	flags = REGBUF_STANDARD;
7303 	if (!XLogHintBitIsNeeded())
7304 		flags |= REGBUF_NO_IMAGE;
7305 	XLogRegisterBuffer(1, heap_buffer, flags);
7306 
7307 	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE);
7308 
7309 	return recptr;
7310 }
7311 
7312 /*
7313  * Perform XLogInsert for a heap-update operation.  Caller must already
7314  * have modified the buffer(s) and marked them dirty.
7315  */
7316 static XLogRecPtr
7317 log_heap_update(Relation reln, Buffer oldbuf,
7318 				Buffer newbuf, HeapTuple oldtup, HeapTuple newtup,
7319 				HeapTuple old_key_tuple,
7320 				bool all_visible_cleared, bool new_all_visible_cleared)
7321 {
7322 	xl_heap_update xlrec;
7323 	xl_heap_header xlhdr;
7324 	xl_heap_header xlhdr_idx;
7325 	uint8		info;
7326 	uint16		prefix_suffix[2];
7327 	uint16		prefixlen = 0,
7328 				suffixlen = 0;
7329 	XLogRecPtr	recptr;
7330 	Page		page = BufferGetPage(newbuf);
7331 	bool		need_tuple_data = RelationIsLogicallyLogged(reln);
7332 	bool		init;
7333 	int			bufflags;
7334 
7335 	/* Caller should not call me on a non-WAL-logged relation */
7336 	Assert(RelationNeedsWAL(reln));
7337 
7338 	XLogBeginInsert();
7339 
7340 	if (HeapTupleIsHeapOnly(newtup))
7341 		info = XLOG_HEAP_HOT_UPDATE;
7342 	else
7343 		info = XLOG_HEAP_UPDATE;
7344 
7345 	/*
7346 	 * If the old and new tuple are on the same page, we only need to log the
7347 	 * parts of the new tuple that were changed.  That saves on the amount of
7348 	 * WAL we need to write.  Currently, we just count any unchanged bytes in
7349 	 * the beginning and end of the tuple.  That's quick to check, and
7350 	 * perfectly covers the common case that only one field is updated.
7351 	 *
7352 	 * We could do this even if the old and new tuple are on different pages,
7353 	 * but only if we don't make a full-page image of the old page, which is
7354 	 * difficult to know in advance.  Also, if the old tuple is corrupt for
7355 	 * some reason, it would allow the corruption to propagate to the new page,
7356 	 * so it seems best to avoid.  Under the general assumption that most
7357 	 * updates tend to create the new tuple version on the same page, there
7358 	 * isn't much to be gained by doing this across pages anyway.
7359 	 *
7360 	 * Skip this if we're taking a full-page image of the new page, as we
7361 	 * don't include the new tuple in the WAL record in that case.  Also
7362 	 * disable if wal_level='logical', as logical decoding needs to be able to
7363 	 * read the new tuple in whole from the WAL record alone.
7364 	 */
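	/*
	 * Worked example (editorial addition): if the old tuple data past t_hoff
	 * is "AAAAABBBBBCCCCC" and the new data is "AAAAAXXXXXCCCCC", the loops
	 * below find prefixlen = 5 and suffixlen = 5 (both at least 3, so both
	 * are kept), and only the middle "XXXXX" plus the two uint16 length
	 * words need to go into the WAL record for the new tuple.
	 */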
7365 	if (oldbuf == newbuf && !need_tuple_data &&
7366 		!XLogCheckBufferNeedsBackup(newbuf))
7367 	{
7368 		char	   *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
7369 		char	   *newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
7370 		int			oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
7371 		int			newlen = newtup->t_len - newtup->t_data->t_hoff;
7372 
7373 		/* Check for common prefix between old and new tuple */
7374 		for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++)
7375 		{
7376 			if (newp[prefixlen] != oldp[prefixlen])
7377 				break;
7378 		}
7379 
7380 		/*
7381 		 * Storing the length of the prefix takes 2 bytes, so we need to save
7382 		 * at least 3 bytes or there's no point.
7383 		 */
7384 		if (prefixlen < 3)
7385 			prefixlen = 0;
7386 
7387 		/* Same for suffix */
7388 		for (suffixlen = 0; suffixlen < Min(oldlen, newlen) - prefixlen; suffixlen++)
7389 		{
7390 			if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1])
7391 				break;
7392 		}
7393 		if (suffixlen < 3)
7394 			suffixlen = 0;
7395 	}
7396 
7397 	/* Prepare main WAL data chain */
7398 	xlrec.flags = 0;
7399 	if (all_visible_cleared)
7400 		xlrec.flags |= XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED;
7401 	if (new_all_visible_cleared)
7402 		xlrec.flags |= XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED;
7403 	if (prefixlen > 0)
7404 		xlrec.flags |= XLH_UPDATE_PREFIX_FROM_OLD;
7405 	if (suffixlen > 0)
7406 		xlrec.flags |= XLH_UPDATE_SUFFIX_FROM_OLD;
7407 	if (need_tuple_data)
7408 	{
7409 		xlrec.flags |= XLH_UPDATE_CONTAINS_NEW_TUPLE;
7410 		if (old_key_tuple)
7411 		{
7412 			if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
7413 				xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_TUPLE;
7414 			else
7415 				xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_KEY;
7416 		}
7417 	}
7418 
7419 	/* If new tuple is the single and first tuple on page... */
7420 	if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber &&
7421 		PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
7422 	{
7423 		info |= XLOG_HEAP_INIT_PAGE;
7424 		init = true;
7425 	}
7426 	else
7427 		init = false;
7428 
7429 	/* Prepare WAL data for the old page */
7430 	xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self);
7431 	xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
7432 	xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
7433 											  oldtup->t_data->t_infomask2);
7434 
7435 	/* Prepare WAL data for the new page */
7436 	xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
7437 	xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
7438 
7439 	bufflags = REGBUF_STANDARD;
7440 	if (init)
7441 		bufflags |= REGBUF_WILL_INIT;
7442 	if (need_tuple_data)
7443 		bufflags |= REGBUF_KEEP_DATA;
7444 
7445 	XLogRegisterBuffer(0, newbuf, bufflags);
7446 	if (oldbuf != newbuf)
7447 		XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD);
7448 
7449 	XLogRegisterData((char *) &xlrec, SizeOfHeapUpdate);
7450 
7451 	/*
7452 	 * Prepare WAL data for the new tuple.
7453 	 */
7454 	if (prefixlen > 0 || suffixlen > 0)
7455 	{
7456 		if (prefixlen > 0 && suffixlen > 0)
7457 		{
7458 			prefix_suffix[0] = prefixlen;
7459 			prefix_suffix[1] = suffixlen;
7460 			XLogRegisterBufData(0, (char *) &prefix_suffix, sizeof(uint16) * 2);
7461 		}
7462 		else if (prefixlen > 0)
7463 		{
7464 			XLogRegisterBufData(0, (char *) &prefixlen, sizeof(uint16));
7465 		}
7466 		else
7467 		{
7468 			XLogRegisterBufData(0, (char *) &suffixlen, sizeof(uint16));
7469 		}
7470 	}
7471 
7472 	xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
7473 	xlhdr.t_infomask = newtup->t_data->t_infomask;
7474 	xlhdr.t_hoff = newtup->t_data->t_hoff;
7475 	Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len);
7476 
7477 	/*
7478 	 * PG73FORMAT: write bitmap [+ padding] [+ oid] + data
7479 	 *
7480 	 * The 'data' doesn't include the common prefix or suffix.
7481 	 */
7482 	XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
7483 	if (prefixlen == 0)
7484 	{
7485 		XLogRegisterBufData(0,
7486 							((char *) newtup->t_data) + SizeofHeapTupleHeader,
7487 							newtup->t_len - SizeofHeapTupleHeader - suffixlen);
7488 	}
7489 	else
7490 	{
7491 		/*
7492 		 * Have to write the null bitmap and data after the common prefix as
7493 		 * two separate rdata entries.
7494 		 */
7495 		/* bitmap [+ padding] [+ oid] */
7496 		if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0)
7497 		{
7498 			XLogRegisterBufData(0,
7499 								((char *) newtup->t_data) + SizeofHeapTupleHeader,
7500 								newtup->t_data->t_hoff - SizeofHeapTupleHeader);
7501 		}
7502 
7503 		/* data after common prefix */
7504 		XLogRegisterBufData(0,
7505 							((char *) newtup->t_data) + newtup->t_data->t_hoff + prefixlen,
7506 							newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen);
7507 	}
7508 
7509 	/* We need to log a tuple identity */
7510 	if (need_tuple_data && old_key_tuple)
7511 	{
7512 		/* don't really need this, but it's more convenient to decode */
7513 		xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
7514 		xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
7515 		xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
7516 
7517 		XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader);
7518 
7519 		/* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
7520 		XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader,
7521 						 old_key_tuple->t_len - SizeofHeapTupleHeader);
7522 	}
7523 
7524 	/* filtering by origin on a row level is much more efficient */
7525 	XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
7526 
7527 	recptr = XLogInsert(RM_HEAP_ID, info);
7528 
7529 	return recptr;
7530 }
7531 
7532 /*
7533  * Perform XLogInsert of an XLOG_HEAP2_NEW_CID record
7534  *
7535  * This is only used in wal_level >= WAL_LEVEL_LOGICAL, and only for catalog
7536  * tuples.
7537  */
7538 static XLogRecPtr
7539 log_heap_new_cid(Relation relation, HeapTuple tup)
7540 {
7541 	xl_heap_new_cid xlrec;
7542 
7543 	XLogRecPtr	recptr;
7544 	HeapTupleHeader hdr = tup->t_data;
7545 
7546 	Assert(ItemPointerIsValid(&tup->t_self));
7547 	Assert(tup->t_tableOid != InvalidOid);
7548 
7549 	xlrec.top_xid = GetTopTransactionId();
7550 	xlrec.target_node = relation->rd_node;
7551 	xlrec.target_tid = tup->t_self;
7552 
7553 	/*
7554 	 * If the tuple got inserted & deleted in the same TX we definitely have a
7555 	 * combocid, set cmin and cmax.
7556 	 */
7557 	if (hdr->t_infomask & HEAP_COMBOCID)
7558 	{
7559 		Assert(!(hdr->t_infomask & HEAP_XMAX_INVALID));
7560 		Assert(!HeapTupleHeaderXminInvalid(hdr));
7561 		xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
7562 		xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
7563 		xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr);
7564 	}
7565 	/* No combocid, so only cmin or cmax can be set by this TX */
7566 	else
7567 	{
7568 		/*
7569 		 * Tuple inserted.
7570 		 *
7571 		 * We need to check for LOCK ONLY because multixacts might be
7572 		 * transferred to the new tuple in case of FOR KEY SHARE updates in
7573 		 * which case there will be an xmax, although the tuple just got
7574 		 * inserted.
7575 		 */
7576 		if (hdr->t_infomask & HEAP_XMAX_INVALID ||
7577 			HEAP_XMAX_IS_LOCKED_ONLY(hdr->t_infomask))
7578 		{
7579 			xlrec.cmin = HeapTupleHeaderGetRawCommandId(hdr);
7580 			xlrec.cmax = InvalidCommandId;
7581 		}
7582 		/* Tuple from a different tx updated or deleted. */
7583 		else
7584 		{
7585 			xlrec.cmin = InvalidCommandId;
7586 			xlrec.cmax = HeapTupleHeaderGetRawCommandId(hdr);
7587 
7588 		}
7589 		xlrec.combocid = InvalidCommandId;
7590 	}
7591 
7592 	/*
7593 	 * Note that we don't need to register the buffer here, because this
7594 	 * operation does not modify the page. The insert/update/delete that
7595 	 * called us certainly did, but that's WAL-logged separately.
7596 	 */
7597 	XLogBeginInsert();
7598 	XLogRegisterData((char *) &xlrec, SizeOfHeapNewCid);
7599 
7600 	/* will be looked at irrespective of origin */
7601 
7602 	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_NEW_CID);
7603 
7604 	return recptr;
7605 }
7606 
7607 /*
7608  * Build a heap tuple representing the configured REPLICA IDENTITY to represent
7609  * the old tuple in an UPDATE or DELETE.
7610  *
7611  * Returns NULL if there's no need to log an identity or if there's no suitable
7612  * key defined.
7613  *
7614  * key_changed should be false if caller knows that no replica identity
7615  * columns changed value.  It's always true in the DELETE case.
7616  *
7617  * *copy is set to true if the returned tuple is a modified copy rather than
7618  * the same tuple that was passed in.
7619  */
7620 static HeapTuple
7621 ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_changed,
7622 					   bool *copy)
7623 {
7624 	TupleDesc	desc = RelationGetDescr(relation);
7625 	char		replident = relation->rd_rel->relreplident;
7626 	Bitmapset  *idattrs;
7627 	HeapTuple	key_tuple;
7628 	bool		nulls[MaxHeapAttributeNumber];
7629 	Datum		values[MaxHeapAttributeNumber];
7630 
7631 	*copy = false;
7632 
7633 	if (!RelationIsLogicallyLogged(relation))
7634 		return NULL;
7635 
7636 	if (replident == REPLICA_IDENTITY_NOTHING)
7637 		return NULL;
7638 
7639 	if (replident == REPLICA_IDENTITY_FULL)
7640 	{
7641 		/*
7642 		 * When logging the entire old tuple, it very well could contain
7643 		 * toasted columns. If so, force them to be inlined.
7644 		 */
7645 		if (HeapTupleHasExternal(tp))
7646 		{
7647 			*copy = true;
7648 			tp = toast_flatten_tuple(tp, desc);
7649 		}
7650 		return tp;
7651 	}
7652 
7653 	/* if the key hasn't changed and we're only logging the key, we're done */
7654 	if (!key_changed)
7655 		return NULL;
7656 
7657 	/* find out the replica identity columns */
7658 	idattrs = RelationGetIndexAttrBitmap(relation,
7659 										 INDEX_ATTR_BITMAP_IDENTITY_KEY);
7660 
7661 	/*
7662 	 * If there's no defined replica identity columns, treat as !key_changed.
7663 	 * (This case should not be reachable from heap_update, since that should
7664 	 * calculate key_changed accurately.  But heap_delete just passes constant
7665 	 * true for key_changed, so we can hit this case in deletes.)
7666 	 */
7667 	if (bms_is_empty(idattrs))
7668 		return NULL;
7669 
7670 	/*
7671 	 * Construct a new tuple containing only the replica identity columns,
7672 	 * with nulls elsewhere.  While we're at it, assert that the replica
7673 	 * identity columns aren't null.
7674 	 */
7675 	heap_deform_tuple(tp, desc, values, nulls);
7676 
7677 	for (int i = 0; i < desc->natts; i++)
7678 	{
7679 		if (bms_is_member(i + 1 - FirstLowInvalidHeapAttributeNumber,
7680 						  idattrs))
7681 			Assert(!nulls[i]);
7682 		else
7683 			nulls[i] = true;
7684 	}
7685 
7686 	key_tuple = heap_form_tuple(desc, values, nulls);
7687 	*copy = true;
7688 
7689 	bms_free(idattrs);
7690 
7691 	/*
7692 	 * If the tuple, which by here only contains indexed columns, still has
7693 	 * toasted columns, force them to be inlined. This is somewhat unlikely
7694 	 * since there are limits on the size of indexed columns, so we don't
7695 	 * duplicate toast_flatten_tuple()'s functionality in the above loop over
7696 	 * the indexed columns, even if it would be more efficient.
7697 	 */
7698 	if (HeapTupleHasExternal(key_tuple))
7699 	{
7700 		HeapTuple	oldtup = key_tuple;
7701 
7702 		key_tuple = toast_flatten_tuple(oldtup, desc);
7703 		heap_freetuple(oldtup);
7704 	}
7705 
7706 	return key_tuple;
7707 }
7708 
7709 /*
7710  * Handles CLEANUP_INFO
7711  */
7712 static void
7713 heap_xlog_cleanup_info(XLogReaderState *record)
7714 {
7715 	xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) XLogRecGetData(record);
7716 
7717 	if (InHotStandby)
7718 		ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
7719 
7720 	/*
7721 	 * Actual operation is a no-op. Record type exists to provide a means for
7722 	 * conflict processing to occur before we begin index vacuum actions.  See
7723 	 * vacuumlazy.c and also comments in btvacuumpage()
7724 	 */
7725 
7726 	/* Backup blocks are not used in cleanup_info records */
7727 	Assert(!XLogRecHasAnyBlockRefs(record));
7728 }
7729 
7730 /*
7731  * Handles XLOG_HEAP2_CLEAN record type
7732  */
7733 static void
7734 heap_xlog_clean(XLogReaderState *record)
7735 {
7736 	XLogRecPtr	lsn = record->EndRecPtr;
7737 	xl_heap_clean *xlrec = (xl_heap_clean *) XLogRecGetData(record);
7738 	Buffer		buffer;
7739 	RelFileNode rnode;
7740 	BlockNumber blkno;
7741 	XLogRedoAction action;
7742 
7743 	XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
7744 
7745 	/*
7746 	 * We're about to remove tuples. In Hot Standby mode, ensure that there's
7747 	 * no queries running for which the removed tuples are still visible.
7748 	 *
7749 	 * Not all HEAP2_CLEAN records remove tuples with xids, so we only want to
7750 	 * conflict on the records that cause MVCC failures for user queries. If
7751 	 * latestRemovedXid is invalid, skip conflict processing.
7752 	 */
7753 	if (InHotStandby && TransactionIdIsValid(xlrec->latestRemovedXid))
7754 		ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, rnode);
7755 
7756 	/*
7757 	 * If we have a full-page image, restore it (using a cleanup lock) and
7758 	 * we're done.
7759 	 */
7760 	action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true,
7761 										   &buffer);
7762 	if (action == BLK_NEEDS_REDO)
7763 	{
7764 		Page		page = (Page) BufferGetPage(buffer);
7765 		OffsetNumber *end;
7766 		OffsetNumber *redirected;
7767 		OffsetNumber *nowdead;
7768 		OffsetNumber *nowunused;
7769 		int			nredirected;
7770 		int			ndead;
7771 		int			nunused;
7772 		Size		datalen;
7773 
7774 		redirected = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen);
7775 
7776 		nredirected = xlrec->nredirected;
7777 		ndead = xlrec->ndead;
7778 		end = (OffsetNumber *) ((char *) redirected + datalen);
7779 		nowdead = redirected + (nredirected * 2);
7780 		nowunused = nowdead + ndead;
7781 		nunused = (end - nowunused);
7782 		Assert(nunused >= 0);
7783 
7784 		/* Update all line pointers per the record, and repair fragmentation */
7785 		heap_page_prune_execute(buffer,
7786 								redirected, nredirected,
7787 								nowdead, ndead,
7788 								nowunused, nunused);
7789 
7790 		/*
7791 		 * Note: we don't worry about updating the page's prunability hints.
7792 		 * At worst this will cause an extra prune cycle to occur soon.
7793 		 */
7794 
7795 		PageSetLSN(page, lsn);
7796 		MarkBufferDirty(buffer);
7797 	}
7798 
7799 	if (BufferIsValid(buffer))
7800 	{
7801 		Size		freespace = PageGetHeapFreeSpace(BufferGetPage(buffer));
7802 
7803 		UnlockReleaseBuffer(buffer);
7804 
7805 		/*
7806 		 * After cleaning records from a page, it's useful to update the FSM
7807 		 * about it, as that may cause the page to become a target for insertions
7808 		 * later even if vacuum decides not to visit it (which is possible if it
7809 		 * gets marked all-visible).
7810 		 *
7811 		 * Do this regardless of a full-page image being applied, since the
7812 		 * FSM data is not in the page anyway.
7813 		 */
7814 		XLogRecordPageWithFreeSpace(rnode, blkno, freespace);
7815 	}
7816 }
7817 
7818 /*
7819  * Replay XLOG_HEAP2_VISIBLE record.
7820  *
7821  * The critical integrity requirement here is that we must never end up with
7822  * a situation where the visibility map bit is set, and the page-level
7823  * PD_ALL_VISIBLE bit is clear.  If that were to occur, then a subsequent
7824  * page modification would fail to clear the visibility map bit.
7825  */
7826 static void
7827 heap_xlog_visible(XLogReaderState *record)
7828 {
7829 	XLogRecPtr	lsn = record->EndRecPtr;
7830 	xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record);
7831 	Buffer		vmbuffer = InvalidBuffer;
7832 	Buffer		buffer;
7833 	Page		page;
7834 	RelFileNode rnode;
7835 	BlockNumber blkno;
7836 	XLogRedoAction action;
7837 
7838 	XLogRecGetBlockTag(record, 1, &rnode, NULL, &blkno);
7839 
7840 	/*
7841 	 * If there are any Hot Standby transactions running that have an xmin
7842 	 * horizon old enough that this page isn't all-visible for them, they
7843 	 * might incorrectly decide that an index-only scan can skip a heap fetch.
7844 	 *
7845 	 * NB: It might be better to throw some kind of "soft" conflict here that
7846 	 * forces any index-only scan that is in flight to perform heap fetches,
7847 	 * rather than killing the transaction outright.
7848 	 */
7849 	if (InHotStandby)
7850 		ResolveRecoveryConflictWithSnapshot(xlrec->cutoff_xid, rnode);
7851 
7852 	/*
7853 	 * Read the heap page, if it still exists. If the heap file has been
7854 	 * dropped or truncated later in recovery, we don't need to update the
7855 	 * page, but we'd better still update the visibility map.
7856 	 */
7857 	action = XLogReadBufferForRedo(record, 1, &buffer);
7858 	if (action == BLK_NEEDS_REDO)
7859 	{
7860 		/*
7861 		 * We don't bump the LSN of the heap page when setting the visibility
7862 		 * map bit (unless checksums or wal_log_hints is enabled, in which
7863 		 * case we must), because that would generate an unworkable volume of
7864 		 * full-page writes.  This exposes us to torn page hazards, but since
7865 		 * we're not inspecting the existing page contents in any way, we
7866 		 * don't care.
7867 		 *
7868 		 * However, all operations that clear the visibility map bit *do* bump
7869 		 * the LSN, and those operations will only be replayed if the XLOG LSN
7870 		 * follows the page LSN.  Thus, if the page LSN has advanced past our
7871 		 * XLOG record's LSN, we mustn't mark the page all-visible, because
7872 		 * the subsequent update won't be replayed to clear the flag.
7873 		 */
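		/*
		 * That LSN interlock is enforced for us by XLogReadBufferForRedo: it
		 * returns BLK_NEEDS_REDO (and so we reach this branch) only when this
		 * record's LSN is ahead of the page LSN.
		 */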
7874 		page = BufferGetPage(buffer);
7875 
7876 		PageSetAllVisible(page);
7877 
7878 		MarkBufferDirty(buffer);
7879 	}
7880 	else if (action == BLK_RESTORED)
7881 	{
7882 		/*
7883 		 * If heap block was backed up, we already restored it and there's
7884 		 * nothing more to do. (This can only happen with checksums or
7885 		 * wal_log_hints enabled.)
7886 		 */
7887 	}
7888 
7889 	if (BufferIsValid(buffer))
7890 	{
7891 		Size		space = PageGetFreeSpace(BufferGetPage(buffer));
7892 
7893 		UnlockReleaseBuffer(buffer);
7894 
7895 		/*
7896 		 * Since FSM is not WAL-logged and only updated heuristically, it
7897 		 * easily becomes stale in standbys.  If the standby is later promoted
7898 		 * and runs VACUUM, it will skip updating individual free space
7899 		 * figures for pages that became all-visible (or all-frozen, depending
7900 		 * on the vacuum mode), which is troublesome when FreeSpaceMapVacuum
7901 		 * propagates overly optimistic free space values to upper FSM layers;
7902 		 * later inserters try to use such pages only to find out that they
7903 		 * are unusable.  This can cause long stalls when there are many such
7904 		 * pages.
7905 		 *
7906 		 * Forestall those problems by updating FSM's idea about a page that
7907 		 * is becoming all-visible or all-frozen.
7908 		 *
7909 		 * Do this regardless of a full-page image being applied, since the
7910 		 * FSM data is not in the page anyway.
7911 		 */
7912 		if (xlrec->flags & VISIBILITYMAP_VALID_BITS)
7913 			XLogRecordPageWithFreeSpace(rnode, blkno, space);
7914 	}
7915 
7916 	/*
7917 	 * Even if we skipped the heap page update due to the LSN interlock, it's
7918 	 * still safe to update the visibility map.  Any WAL record that clears
7919 	 * the visibility map bit does so before checking the page LSN, so any
7920 	 * bits that need to be cleared will still be cleared.
7921 	 */
7922 	if (XLogReadBufferForRedoExtended(record, 0, RBM_ZERO_ON_ERROR, false,
7923 									  &vmbuffer) == BLK_NEEDS_REDO)
7924 	{
7925 		Page		vmpage = BufferGetPage(vmbuffer);
7926 		Relation	reln;
7927 
7928 		/* initialize the page if it was read as zeros */
7929 		if (PageIsNew(vmpage))
7930 			PageInit(vmpage, BLCKSZ, 0);
7931 
7932 		/*
7933 		 * XLogReadBufferForRedoExtended locked the buffer. But
7934 		 * visibilitymap_set will handle locking itself.
7935 		 */
7936 		LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);
7937 
7938 		reln = CreateFakeRelcacheEntry(rnode);
7939 		visibilitymap_pin(reln, blkno, &vmbuffer);
7940 
7941 		/*
7942 		 * Don't set the bit if replay has already passed this point.
7943 		 *
7944 		 * It might be safe to do this unconditionally; if replay has passed
7945 		 * this point, we'll replay at least as far this time as we did before,
7946 		 * and if this bit needs to be cleared, the record responsible for
7947 		 * doing so should be replayed again and will clear it.  For right
7948 		 * now, out of an abundance of conservatism, we use the same test here
7949 		 * we did for the heap page.  If this results in a dropped bit, no
7950 		 * real harm is done; and the next VACUUM will fix it.
7951 		 */
7952 		if (lsn > PageGetLSN(vmpage))
7953 			visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer,
7954 							  xlrec->cutoff_xid, xlrec->flags);
7955 
7956 		ReleaseBuffer(vmbuffer);
7957 		FreeFakeRelcacheEntry(reln);
7958 	}
7959 	else if (BufferIsValid(vmbuffer))
7960 		UnlockReleaseBuffer(vmbuffer);
7961 }
7962 
7963 /*
7964  * Replay XLOG_HEAP2_FREEZE_PAGE records
7965  */
7966 static void
7967 heap_xlog_freeze_page(XLogReaderState *record)
7968 {
7969 	XLogRecPtr	lsn = record->EndRecPtr;
7970 	xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) XLogRecGetData(record);
7971 	TransactionId cutoff_xid = xlrec->cutoff_xid;
7972 	Buffer		buffer;
7973 	int			ntup;
7974 
7975 	/*
7976 	 * In Hot Standby mode, ensure that there are no queries running which still
7977 	 * consider the frozen xids as running.
7978 	 */
7979 	if (InHotStandby)
7980 	{
7981 		RelFileNode rnode;
7982 		TransactionId latestRemovedXid = cutoff_xid;
7983 
7984 		TransactionIdRetreat(latestRemovedXid);
7985 
7986 		XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
7987 		ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
7988 	}
7989 
7990 	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
7991 	{
7992 		Page		page = BufferGetPage(buffer);
7993 		xl_heap_freeze_tuple *tuples;
7994 
7995 		tuples = (xl_heap_freeze_tuple *) XLogRecGetBlockData(record, 0, NULL);
7996 
7997 		/* now execute freeze plan for each frozen tuple */
7998 		for (ntup = 0; ntup < xlrec->ntuples; ntup++)
7999 		{
8000 			xl_heap_freeze_tuple *xlrec_tp;
8001 			ItemId		lp;
8002 			HeapTupleHeader tuple;
8003 
8004 			xlrec_tp = &tuples[ntup];
8005 			lp = PageGetItemId(page, xlrec_tp->offset); /* offsets are one-based */
8006 			tuple = (HeapTupleHeader) PageGetItem(page, lp);
8007 
8008 			heap_execute_freeze_tuple(tuple, xlrec_tp);
8009 		}
8010 
8011 		PageSetLSN(page, lsn);
8012 		MarkBufferDirty(buffer);
8013 	}
8014 	if (BufferIsValid(buffer))
8015 		UnlockReleaseBuffer(buffer);
8016 }
8017 
8018 /*
8019  * Given an "infobits" field from an XLog record, set the correct bits in the
8020  * given infomask and infomask2 for the tuple touched by the record.
8021  *
8022  * (This is the reverse of compute_infobits).
8023  */
8024 static void
8025 fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
8026 {
8027 	*infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY |
8028 				   HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK);
8029 	*infomask2 &= ~HEAP_KEYS_UPDATED;
8030 
8031 	if (infobits & XLHL_XMAX_IS_MULTI)
8032 		*infomask |= HEAP_XMAX_IS_MULTI;
8033 	if (infobits & XLHL_XMAX_LOCK_ONLY)
8034 		*infomask |= HEAP_XMAX_LOCK_ONLY;
8035 	if (infobits & XLHL_XMAX_EXCL_LOCK)
8036 		*infomask |= HEAP_XMAX_EXCL_LOCK;
8037 	/* note HEAP_XMAX_SHR_LOCK isn't considered here */
8038 	if (infobits & XLHL_XMAX_KEYSHR_LOCK)
8039 		*infomask |= HEAP_XMAX_KEYSHR_LOCK;
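	/*
	 * A plain share lock still round-trips correctly even so:
	 * HEAP_XMAX_SHR_LOCK is the combination of the EXCL and KEYSHR bits, both
	 * of which are transferred individually above.
	 */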
8040 
8041 	if (infobits & XLHL_KEYS_UPDATED)
8042 		*infomask2 |= HEAP_KEYS_UPDATED;
8043 }
8044 
8045 static void
8046 heap_xlog_delete(XLogReaderState *record)
8047 {
8048 	XLogRecPtr	lsn = record->EndRecPtr;
8049 	xl_heap_delete *xlrec = (xl_heap_delete *) XLogRecGetData(record);
8050 	Buffer		buffer;
8051 	Page		page;
8052 	ItemId		lp = NULL;
8053 	HeapTupleHeader htup;
8054 	BlockNumber blkno;
8055 	RelFileNode target_node;
8056 	ItemPointerData target_tid;
8057 
8058 	XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno);
8059 	ItemPointerSetBlockNumber(&target_tid, blkno);
8060 	ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
8061 
8062 	/*
8063 	 * The visibility map may need to be fixed even if the heap page is
8064 	 * already up-to-date.
8065 	 */
8066 	if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
8067 	{
8068 		Relation	reln = CreateFakeRelcacheEntry(target_node);
8069 		Buffer		vmbuffer = InvalidBuffer;
8070 
8071 		visibilitymap_pin(reln, blkno, &vmbuffer);
8072 		visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8073 		ReleaseBuffer(vmbuffer);
8074 		FreeFakeRelcacheEntry(reln);
8075 	}
8076 
8077 	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8078 	{
8079 		page = BufferGetPage(buffer);
8080 
8081 		if (PageGetMaxOffsetNumber(page) >= xlrec->offnum)
8082 			lp = PageGetItemId(page, xlrec->offnum);
8083 
8084 		if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp))
8085 			elog(PANIC, "invalid lp");
8086 
8087 		htup = (HeapTupleHeader) PageGetItem(page, lp);
8088 
8089 		htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8090 		htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8091 		HeapTupleHeaderClearHotUpdated(htup);
8092 		fix_infomask_from_infobits(xlrec->infobits_set,
8093 								   &htup->t_infomask, &htup->t_infomask2);
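		/*
		 * XLH_DELETE_IS_SUPER marks the super-deletion of a speculatively
		 * inserted tuple (see heap_abort_speculative): rather than setting
		 * xmax, we invalidate xmin so the tuple is immediately dead.
		 */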
8094 		if (!(xlrec->flags & XLH_DELETE_IS_SUPER))
8095 			HeapTupleHeaderSetXmax(htup, xlrec->xmax);
8096 		else
8097 			HeapTupleHeaderSetXmin(htup, InvalidTransactionId);
8098 		HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
8099 
8100 		/* Mark the page as a candidate for pruning */
8101 		PageSetPrunable(page, XLogRecGetXid(record));
8102 
8103 		if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
8104 			PageClearAllVisible(page);
8105 
8106 		/* Make sure t_ctid is set correctly */
8107 		if (xlrec->flags & XLH_DELETE_IS_PARTITION_MOVE)
8108 			HeapTupleHeaderSetMovedPartitions(htup);
8109 		else
8110 			htup->t_ctid = target_tid;
8111 		PageSetLSN(page, lsn);
8112 		MarkBufferDirty(buffer);
8113 	}
8114 	if (BufferIsValid(buffer))
8115 		UnlockReleaseBuffer(buffer);
8116 }
8117 
8118 static void
8119 heap_xlog_insert(XLogReaderState *record)
8120 {
8121 	XLogRecPtr	lsn = record->EndRecPtr;
8122 	xl_heap_insert *xlrec = (xl_heap_insert *) XLogRecGetData(record);
8123 	Buffer		buffer;
8124 	Page		page;
8125 	union
8126 	{
8127 		HeapTupleHeaderData hdr;
8128 		char		data[MaxHeapTupleSize];
8129 	}			tbuf;
8130 	HeapTupleHeader htup;
8131 	xl_heap_header xlhdr;
8132 	uint32		newlen;
8133 	Size		freespace = 0;
8134 	RelFileNode target_node;
8135 	BlockNumber blkno;
8136 	ItemPointerData target_tid;
8137 	XLogRedoAction action;
8138 
8139 	XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno);
8140 	ItemPointerSetBlockNumber(&target_tid, blkno);
8141 	ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
8142 
8143 	/*
8144 	 * The visibility map may need to be fixed even if the heap page is
8145 	 * already up-to-date.
8146 	 */
8147 	if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8148 	{
8149 		Relation	reln = CreateFakeRelcacheEntry(target_node);
8150 		Buffer		vmbuffer = InvalidBuffer;
8151 
8152 		visibilitymap_pin(reln, blkno, &vmbuffer);
8153 		visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8154 		ReleaseBuffer(vmbuffer);
8155 		FreeFakeRelcacheEntry(reln);
8156 	}
8157 
8158 	/*
8159 	 * If we inserted the first and only tuple on the page, re-initialize the
8160 	 * page from scratch.
8161 	 */
8162 	if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
8163 	{
8164 		buffer = XLogInitBufferForRedo(record, 0);
8165 		page = BufferGetPage(buffer);
8166 		PageInit(page, BufferGetPageSize(buffer), 0);
8167 		action = BLK_NEEDS_REDO;
8168 	}
8169 	else
8170 		action = XLogReadBufferForRedo(record, 0, &buffer);
8171 	if (action == BLK_NEEDS_REDO)
8172 	{
8173 		Size		datalen;
8174 		char	   *data;
8175 
8176 		page = BufferGetPage(buffer);
8177 
8178 		if (PageGetMaxOffsetNumber(page) + 1 < xlrec->offnum)
8179 			elog(PANIC, "invalid max offset number");
8180 
8181 		data = XLogRecGetBlockData(record, 0, &datalen);
8182 
8183 		newlen = datalen - SizeOfHeapHeader;
8184 		Assert(datalen > SizeOfHeapHeader && newlen <= MaxHeapTupleSize);
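		/*
		 * The block 0 data consists of an xl_heap_header (the handful of
		 * header fields that cannot be reconstructed during replay) followed
		 * directly by the tuple's null bitmap, padding and user data; the
		 * fixed-size HeapTupleHeaderData portion is rebuilt from scratch
		 * below.
		 */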
8185 		memcpy((char *) &xlhdr, data, SizeOfHeapHeader);
8186 		data += SizeOfHeapHeader;
8187 
8188 		htup = &tbuf.hdr;
8189 		MemSet((char *) htup, 0, SizeofHeapTupleHeader);
8190 		/* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
8191 		memcpy((char *) htup + SizeofHeapTupleHeader,
8192 			   data,
8193 			   newlen);
8194 		newlen += SizeofHeapTupleHeader;
8195 		htup->t_infomask2 = xlhdr.t_infomask2;
8196 		htup->t_infomask = xlhdr.t_infomask;
8197 		htup->t_hoff = xlhdr.t_hoff;
8198 		HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
8199 		HeapTupleHeaderSetCmin(htup, FirstCommandId);
8200 		htup->t_ctid = target_tid;
8201 
8202 		if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum,
8203 						true, true) == InvalidOffsetNumber)
8204 			elog(PANIC, "failed to add tuple");
8205 
8206 		freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
8207 
8208 		PageSetLSN(page, lsn);
8209 
8210 		if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8211 			PageClearAllVisible(page);
8212 
8213 		MarkBufferDirty(buffer);
8214 	}
8215 	if (BufferIsValid(buffer))
8216 		UnlockReleaseBuffer(buffer);
8217 
8218 	/*
8219 	 * If the page is running low on free space, update the FSM as well.
8220 	 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
8221 	 * better than that without knowing the fill-factor for the table.
8222 	 *
8223 	 * XXX: Don't do this if the page was restored from full page image. We
8224 	 * don't bother to update the FSM in that case, it doesn't need to be
8225 	 * totally accurate anyway.
8226 	 */
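	/* (With the default 8 kB block size, BLCKSZ / 5 works out to 1638 bytes.) */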
8227 	if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
8228 		XLogRecordPageWithFreeSpace(target_node, blkno, freespace);
8229 }
8230 
8231 /*
8232  * Handles MULTI_INSERT record type.
8233  */
8234 static void
8235 heap_xlog_multi_insert(XLogReaderState *record)
8236 {
8237 	XLogRecPtr	lsn = record->EndRecPtr;
8238 	xl_heap_multi_insert *xlrec;
8239 	RelFileNode rnode;
8240 	BlockNumber blkno;
8241 	Buffer		buffer;
8242 	Page		page;
8243 	union
8244 	{
8245 		HeapTupleHeaderData hdr;
8246 		char		data[MaxHeapTupleSize];
8247 	}			tbuf;
8248 	HeapTupleHeader htup;
8249 	uint32		newlen;
8250 	Size		freespace = 0;
8251 	int			i;
8252 	bool		isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0;
8253 	XLogRedoAction action;
8254 
8255 	/*
8256 	 * Insertion doesn't overwrite MVCC data, so no conflict processing is
8257 	 * required.
8258 	 */
8259 	xlrec = (xl_heap_multi_insert *) XLogRecGetData(record);
8260 
8261 	XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
8262 
8263 	/*
8264 	 * The visibility map may need to be fixed even if the heap page is
8265 	 * already up-to-date.
8266 	 */
8267 	if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8268 	{
8269 		Relation	reln = CreateFakeRelcacheEntry(rnode);
8270 		Buffer		vmbuffer = InvalidBuffer;
8271 
8272 		visibilitymap_pin(reln, blkno, &vmbuffer);
8273 		visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
8274 		ReleaseBuffer(vmbuffer);
8275 		FreeFakeRelcacheEntry(reln);
8276 	}
8277 
8278 	if (isinit)
8279 	{
8280 		buffer = XLogInitBufferForRedo(record, 0);
8281 		page = BufferGetPage(buffer);
8282 		PageInit(page, BufferGetPageSize(buffer), 0);
8283 		action = BLK_NEEDS_REDO;
8284 	}
8285 	else
8286 		action = XLogReadBufferForRedo(record, 0, &buffer);
8287 	if (action == BLK_NEEDS_REDO)
8288 	{
8289 		char	   *tupdata;
8290 		char	   *endptr;
8291 		Size		len;
8292 
8293 		/* Tuples are stored as block data */
8294 		tupdata = XLogRecGetBlockData(record, 0, &len);
8295 		endptr = tupdata + len;
8296 
8297 		page = (Page) BufferGetPage(buffer);
8298 
8299 		for (i = 0; i < xlrec->ntuples; i++)
8300 		{
8301 			OffsetNumber offnum;
8302 			xl_multi_insert_tuple *xlhdr;
8303 
8304 			/*
8305 			 * If we're reinitializing the page, the tuples are stored in
8306 			 * order from FirstOffsetNumber. Otherwise there's an array of
8307 			 * offsets in the WAL record, and the tuples come after that.
8308 			 */
8309 			if (isinit)
8310 				offnum = FirstOffsetNumber + i;
8311 			else
8312 				offnum = xlrec->offsets[i];
8313 			if (PageGetMaxOffsetNumber(page) + 1 < offnum)
8314 				elog(PANIC, "invalid max offset number");
8315 
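			/*
			 * heap_multi_insert stored each xl_multi_insert_tuple header on a
			 * 2-byte boundary, so re-align tupdata the same way before
			 * reading it back.
			 */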
8316 			xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(tupdata);
8317 			tupdata = ((char *) xlhdr) + SizeOfMultiInsertTuple;
8318 
8319 			newlen = xlhdr->datalen;
8320 			Assert(newlen <= MaxHeapTupleSize);
8321 			htup = &tbuf.hdr;
8322 			MemSet((char *) htup, 0, SizeofHeapTupleHeader);
8323 			/* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
8324 			memcpy((char *) htup + SizeofHeapTupleHeader,
8325 				   (char *) tupdata,
8326 				   newlen);
8327 			tupdata += newlen;
8328 
8329 			newlen += SizeofHeapTupleHeader;
8330 			htup->t_infomask2 = xlhdr->t_infomask2;
8331 			htup->t_infomask = xlhdr->t_infomask;
8332 			htup->t_hoff = xlhdr->t_hoff;
8333 			HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
8334 			HeapTupleHeaderSetCmin(htup, FirstCommandId);
8335 			ItemPointerSetBlockNumber(&htup->t_ctid, blkno);
8336 			ItemPointerSetOffsetNumber(&htup->t_ctid, offnum);
8337 
8338 			offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
8339 			if (offnum == InvalidOffsetNumber)
8340 				elog(PANIC, "failed to add tuple");
8341 		}
8342 		if (tupdata != endptr)
8343 			elog(PANIC, "total tuple length mismatch");
8344 
8345 		freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
8346 
8347 		PageSetLSN(page, lsn);
8348 
8349 		if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
8350 			PageClearAllVisible(page);
8351 
8352 		MarkBufferDirty(buffer);
8353 	}
8354 	if (BufferIsValid(buffer))
8355 		UnlockReleaseBuffer(buffer);
8356 
8357 	/*
8358 	 * If the page is running low on free space, update the FSM as well.
8359 	 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
8360 	 * better than that without knowing the fill-factor for the table.
8361 	 *
8362 	 * XXX: Don't do this if the page was restored from full page image. We
8363 	 * don't bother to update the FSM in that case, it doesn't need to be
8364 	 * totally accurate anyway.
8365 	 */
8366 	if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
8367 		XLogRecordPageWithFreeSpace(rnode, blkno, freespace);
8368 }
8369 
8370 /*
8371  * Handles UPDATE and HOT_UPDATE
8372  */
8373 static void
8374 heap_xlog_update(XLogReaderState *record, bool hot_update)
8375 {
8376 	XLogRecPtr	lsn = record->EndRecPtr;
8377 	xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record);
8378 	RelFileNode rnode;
8379 	BlockNumber oldblk;
8380 	BlockNumber newblk;
8381 	ItemPointerData newtid;
8382 	Buffer		obuffer,
8383 				nbuffer;
8384 	Page		page;
8385 	OffsetNumber offnum;
8386 	ItemId		lp = NULL;
8387 	HeapTupleData oldtup;
8388 	HeapTupleHeader htup;
8389 	uint16		prefixlen = 0,
8390 				suffixlen = 0;
8391 	char	   *newp;
8392 	union
8393 	{
8394 		HeapTupleHeaderData hdr;
8395 		char		data[MaxHeapTupleSize];
8396 	}			tbuf;
8397 	xl_heap_header xlhdr;
8398 	uint32		newlen;
8399 	Size		freespace = 0;
8400 	XLogRedoAction oldaction;
8401 	XLogRedoAction newaction;
8402 
8403 	/* initialize to keep the compiler quiet */
8404 	oldtup.t_data = NULL;
8405 	oldtup.t_len = 0;
8406 
8407 	XLogRecGetBlockTag(record, 0, &rnode, NULL, &newblk);
8408 	if (XLogRecGetBlockTag(record, 1, NULL, NULL, &oldblk))
8409 	{
8410 		/* HOT updates are never done across pages */
8411 		Assert(!hot_update);
8412 	}
8413 	else
8414 		oldblk = newblk;
8415 
8416 	ItemPointerSet(&newtid, newblk, xlrec->new_offnum);
8417 
8418 	/*
8419 	 * The visibility map may need to be fixed even if the heap page is
8420 	 * already up-to-date.
8421 	 */
8422 	if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
8423 	{
8424 		Relation	reln = CreateFakeRelcacheEntry(rnode);
8425 		Buffer		vmbuffer = InvalidBuffer;
8426 
8427 		visibilitymap_pin(reln, oldblk, &vmbuffer);
8428 		visibilitymap_clear(reln, oldblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
8429 		ReleaseBuffer(vmbuffer);
8430 		FreeFakeRelcacheEntry(reln);
8431 	}
8432 
8433 	/*
8434 	 * In normal operation, it is important to lock the two pages in
8435 	 * page-number order, to avoid possible deadlocks against other update
8436 	 * operations going the other way.  However, during WAL replay there can
8437 	 * be no other update happening, so we don't need to worry about that. But
8438 	 * we *do* need to worry that we don't expose an inconsistent state to Hot
8439 	 * Standby queries --- so the original page can't be unlocked before we've
8440 	 * added the new tuple to the new page.
8441 	 */
8442 
8443 	/* Deal with old tuple version */
8444 	oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 0 : 1,
8445 									  &obuffer);
8446 	if (oldaction == BLK_NEEDS_REDO)
8447 	{
8448 		page = BufferGetPage(obuffer);
8449 		offnum = xlrec->old_offnum;
8450 		if (PageGetMaxOffsetNumber(page) >= offnum)
8451 			lp = PageGetItemId(page, offnum);
8452 
8453 		if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
8454 			elog(PANIC, "invalid lp");
8455 
8456 		htup = (HeapTupleHeader) PageGetItem(page, lp);
8457 
8458 		oldtup.t_data = htup;
8459 		oldtup.t_len = ItemIdGetLength(lp);
8460 
8461 		htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8462 		htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8463 		if (hot_update)
8464 			HeapTupleHeaderSetHotUpdated(htup);
8465 		else
8466 			HeapTupleHeaderClearHotUpdated(htup);
8467 		fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask,
8468 								   &htup->t_infomask2);
8469 		HeapTupleHeaderSetXmax(htup, xlrec->old_xmax);
8470 		HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
8471 		/* Set forward chain link in t_ctid */
8472 		htup->t_ctid = newtid;
8473 
8474 		/* Mark the page as a candidate for pruning */
8475 		PageSetPrunable(page, XLogRecGetXid(record));
8476 
8477 		if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
8478 			PageClearAllVisible(page);
8479 
8480 		PageSetLSN(page, lsn);
8481 		MarkBufferDirty(obuffer);
8482 	}
8483 
8484 	/*
8485 	 * Read the page the new tuple goes into, if different from old.
8486 	 */
8487 	if (oldblk == newblk)
8488 	{
8489 		nbuffer = obuffer;
8490 		newaction = oldaction;
8491 	}
8492 	else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
8493 	{
8494 		nbuffer = XLogInitBufferForRedo(record, 0);
8495 		page = (Page) BufferGetPage(nbuffer);
8496 		PageInit(page, BufferGetPageSize(nbuffer), 0);
8497 		newaction = BLK_NEEDS_REDO;
8498 	}
8499 	else
8500 		newaction = XLogReadBufferForRedo(record, 0, &nbuffer);
8501 
8502 	/*
8503 	 * The visibility map may need to be fixed even if the heap page is
8504 	 * already up-to-date.
8505 	 */
8506 	if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
8507 	{
8508 		Relation	reln = CreateFakeRelcacheEntry(rnode);
8509 		Buffer		vmbuffer = InvalidBuffer;
8510 
8511 		visibilitymap_pin(reln, newblk, &vmbuffer);
8512 		visibilitymap_clear(reln, newblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
8513 		ReleaseBuffer(vmbuffer);
8514 		FreeFakeRelcacheEntry(reln);
8515 	}
8516 
8517 	/* Deal with new tuple */
8518 	if (newaction == BLK_NEEDS_REDO)
8519 	{
8520 		char	   *recdata;
8521 		char	   *recdata_end;
8522 		Size		datalen;
8523 		Size		tuplen;
8524 
8525 		recdata = XLogRecGetBlockData(record, 0, &datalen);
8526 		recdata_end = recdata + datalen;
8527 
8528 		page = BufferGetPage(nbuffer);
8529 
8530 		offnum = xlrec->new_offnum;
8531 		if (PageGetMaxOffsetNumber(page) + 1 < offnum)
8532 			elog(PANIC, "invalid max offset number");
8533 
8534 		if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD)
8535 		{
8536 			Assert(newblk == oldblk);
8537 			memcpy(&prefixlen, recdata, sizeof(uint16));
8538 			recdata += sizeof(uint16);
8539 		}
8540 		if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD)
8541 		{
8542 			Assert(newblk == oldblk);
8543 			memcpy(&suffixlen, recdata, sizeof(uint16));
8544 			recdata += sizeof(uint16);
8545 		}
8546 
8547 		memcpy((char *) &xlhdr, recdata, SizeOfHeapHeader);
8548 		recdata += SizeOfHeapHeader;
8549 
8550 		tuplen = recdata_end - recdata;
8551 		Assert(tuplen <= MaxHeapTupleSize);
8552 
8553 		htup = &tbuf.hdr;
8554 		MemSet((char *) htup, 0, SizeofHeapTupleHeader);
8555 
8556 		/*
8557 		 * Reconstruct the new tuple using the prefix and/or suffix from the
8558 		 * old tuple, and the data stored in the WAL record.
8559 		 */
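		/*
		 * Sketch of the reconstruction when prefix/suffix compression was
		 * used (prefixlen and/or suffixlen greater than zero):
		 *
		 *   WAL block data:  [bitmap + padding] [changed middle part]
		 *   new tuple body:  [bitmap + padding] [prefix from old tuple]
		 *                    [changed middle part] [suffix from old tuple]
		 *
		 * When prefixlen is zero, the bitmap and the new data are copied from
		 * the record in one go; a nonzero suffix is still taken from the old
		 * tuple afterwards.
		 */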
8560 		newp = (char *) htup + SizeofHeapTupleHeader;
8561 		if (prefixlen > 0)
8562 		{
8563 			int			len;
8564 
8565 			/* copy bitmap [+ padding] [+ oid] from WAL record */
8566 			len = xlhdr.t_hoff - SizeofHeapTupleHeader;
8567 			memcpy(newp, recdata, len);
8568 			recdata += len;
8569 			newp += len;
8570 
8571 			/* copy prefix from old tuple */
8572 			memcpy(newp, (char *) oldtup.t_data + oldtup.t_data->t_hoff, prefixlen);
8573 			newp += prefixlen;
8574 
8575 			/* copy new tuple data from WAL record */
8576 			len = tuplen - (xlhdr.t_hoff - SizeofHeapTupleHeader);
8577 			memcpy(newp, recdata, len);
8578 			recdata += len;
8579 			newp += len;
8580 		}
8581 		else
8582 		{
8583 			/*
8584 			 * copy bitmap [+ padding] [+ oid] + data from record, all in one
8585 			 * go
8586 			 */
8587 			memcpy(newp, recdata, tuplen);
8588 			recdata += tuplen;
8589 			newp += tuplen;
8590 		}
8591 		Assert(recdata == recdata_end);
8592 
8593 		/* copy suffix from old tuple */
8594 		if (suffixlen > 0)
8595 			memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen);
8596 
8597 		newlen = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen;
8598 		htup->t_infomask2 = xlhdr.t_infomask2;
8599 		htup->t_infomask = xlhdr.t_infomask;
8600 		htup->t_hoff = xlhdr.t_hoff;
8601 
8602 		HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
8603 		HeapTupleHeaderSetCmin(htup, FirstCommandId);
8604 		HeapTupleHeaderSetXmax(htup, xlrec->new_xmax);
8605 		/* Make sure there is no forward chain link in t_ctid */
8606 		htup->t_ctid = newtid;
8607 
8608 		offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
8609 		if (offnum == InvalidOffsetNumber)
8610 			elog(PANIC, "failed to add tuple");
8611 
8612 		if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
8613 			PageClearAllVisible(page);
8614 
8615 		freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
8616 
8617 		PageSetLSN(page, lsn);
8618 		MarkBufferDirty(nbuffer);
8619 	}
8620 
8621 	if (BufferIsValid(nbuffer) && nbuffer != obuffer)
8622 		UnlockReleaseBuffer(nbuffer);
8623 	if (BufferIsValid(obuffer))
8624 		UnlockReleaseBuffer(obuffer);
8625 
8626 	/*
8627 	 * If the new page is running low on free space, update the FSM as well.
8628 	 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
8629 	 * better than that without knowing the fill-factor for the table.
8630 	 *
8631 	 * However, don't update the FSM on HOT updates, because after crash
8632 	 * recovery, either the old or the new tuple will certainly be dead and
8633 	 * prunable. After pruning, the page will have roughly as much free space
8634 	 * as it did before the update, assuming the new tuple is about the same
8635 	 * size as the old one.
8636 	 *
8637 	 * XXX: Don't do this if the page was restored from full page image. We
8638 	 * don't bother to update the FSM in that case, it doesn't need to be
8639 	 * totally accurate anyway.
8640 	 */
8641 	if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5)
8642 		XLogRecordPageWithFreeSpace(rnode, newblk, freespace);
8643 }
8644 
8645 static void
8646 heap_xlog_confirm(XLogReaderState *record)
8647 {
8648 	XLogRecPtr	lsn = record->EndRecPtr;
8649 	xl_heap_confirm *xlrec = (xl_heap_confirm *) XLogRecGetData(record);
8650 	Buffer		buffer;
8651 	Page		page;
8652 	OffsetNumber offnum;
8653 	ItemId		lp = NULL;
8654 	HeapTupleHeader htup;
8655 
8656 	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8657 	{
8658 		page = BufferGetPage(buffer);
8659 
8660 		offnum = xlrec->offnum;
8661 		if (PageGetMaxOffsetNumber(page) >= offnum)
8662 			lp = PageGetItemId(page, offnum);
8663 
8664 		if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
8665 			elog(PANIC, "invalid lp");
8666 
8667 		htup = (HeapTupleHeader) PageGetItem(page, lp);
8668 
8669 		/*
8670 		 * Confirm tuple as actually inserted
8671 		 */
8672 		ItemPointerSet(&htup->t_ctid, BufferGetBlockNumber(buffer), offnum);
8673 
8674 		PageSetLSN(page, lsn);
8675 		MarkBufferDirty(buffer);
8676 	}
8677 	if (BufferIsValid(buffer))
8678 		UnlockReleaseBuffer(buffer);
8679 }
8680 
8681 static void
8682 heap_xlog_lock(XLogReaderState *record)
8683 {
8684 	XLogRecPtr	lsn = record->EndRecPtr;
8685 	xl_heap_lock *xlrec = (xl_heap_lock *) XLogRecGetData(record);
8686 	Buffer		buffer;
8687 	Page		page;
8688 	OffsetNumber offnum;
8689 	ItemId		lp = NULL;
8690 	HeapTupleHeader htup;
8691 
8692 	/*
8693 	 * The visibility map may need to be fixed even if the heap page is
8694 	 * already up-to-date.
8695 	 */
8696 	if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
8697 	{
8698 		RelFileNode rnode;
8699 		Buffer		vmbuffer = InvalidBuffer;
8700 		BlockNumber block;
8701 		Relation	reln;
8702 
8703 		XLogRecGetBlockTag(record, 0, &rnode, NULL, &block);
8704 		reln = CreateFakeRelcacheEntry(rnode);
8705 
8706 		visibilitymap_pin(reln, block, &vmbuffer);
8707 		visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);
8708 
8709 		ReleaseBuffer(vmbuffer);
8710 		FreeFakeRelcacheEntry(reln);
8711 	}
8712 
8713 	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8714 	{
8715 		page = (Page) BufferGetPage(buffer);
8716 
8717 		offnum = xlrec->offnum;
8718 		if (PageGetMaxOffsetNumber(page) >= offnum)
8719 			lp = PageGetItemId(page, offnum);
8720 
8721 		if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
8722 			elog(PANIC, "invalid lp");
8723 
8724 		htup = (HeapTupleHeader) PageGetItem(page, lp);
8725 
8726 		htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8727 		htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8728 		fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
8729 								   &htup->t_infomask2);
8730 
8731 		/*
8732 		 * Clear relevant update flags, but only if the modified infomask says
8733 		 * there's no update.
8734 		 */
8735 		if (HEAP_XMAX_IS_LOCKED_ONLY(htup->t_infomask))
8736 		{
8737 			HeapTupleHeaderClearHotUpdated(htup);
8738 			/* Make sure there is no forward chain link in t_ctid */
8739 			ItemPointerSet(&htup->t_ctid,
8740 						   BufferGetBlockNumber(buffer),
8741 						   offnum);
8742 		}
8743 		HeapTupleHeaderSetXmax(htup, xlrec->locking_xid);
8744 		HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
8745 		PageSetLSN(page, lsn);
8746 		MarkBufferDirty(buffer);
8747 	}
8748 	if (BufferIsValid(buffer))
8749 		UnlockReleaseBuffer(buffer);
8750 }
8751 
8752 static void
8753 heap_xlog_lock_updated(XLogReaderState *record)
8754 {
8755 	XLogRecPtr	lsn = record->EndRecPtr;
8756 	xl_heap_lock_updated *xlrec;
8757 	Buffer		buffer;
8758 	Page		page;
8759 	OffsetNumber offnum;
8760 	ItemId		lp = NULL;
8761 	HeapTupleHeader htup;
8762 
8763 	xlrec = (xl_heap_lock_updated *) XLogRecGetData(record);
8764 
8765 	/*
8766 	 * The visibility map may need to be fixed even if the heap page is
8767 	 * already up-to-date.
8768 	 */
8769 	if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
8770 	{
8771 		RelFileNode rnode;
8772 		Buffer		vmbuffer = InvalidBuffer;
8773 		BlockNumber block;
8774 		Relation	reln;
8775 
8776 		XLogRecGetBlockTag(record, 0, &rnode, NULL, &block);
8777 		reln = CreateFakeRelcacheEntry(rnode);
8778 
8779 		visibilitymap_pin(reln, block, &vmbuffer);
8780 		visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);
8781 
8782 		ReleaseBuffer(vmbuffer);
8783 		FreeFakeRelcacheEntry(reln);
8784 	}
8785 
8786 	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8787 	{
8788 		page = BufferGetPage(buffer);
8789 
8790 		offnum = xlrec->offnum;
8791 		if (PageGetMaxOffsetNumber(page) >= offnum)
8792 			lp = PageGetItemId(page, offnum);
8793 
8794 		if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
8795 			elog(PANIC, "invalid lp");
8796 
8797 		htup = (HeapTupleHeader) PageGetItem(page, lp);
8798 
8799 		htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
8800 		htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
8801 		fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
8802 								   &htup->t_infomask2);
8803 		HeapTupleHeaderSetXmax(htup, xlrec->xmax);
8804 
8805 		PageSetLSN(page, lsn);
8806 		MarkBufferDirty(buffer);
8807 	}
8808 	if (BufferIsValid(buffer))
8809 		UnlockReleaseBuffer(buffer);
8810 }
8811 
8812 static void
8813 heap_xlog_inplace(XLogReaderState *record)
8814 {
8815 	XLogRecPtr	lsn = record->EndRecPtr;
8816 	xl_heap_inplace *xlrec = (xl_heap_inplace *) XLogRecGetData(record);
8817 	Buffer		buffer;
8818 	Page		page;
8819 	OffsetNumber offnum;
8820 	ItemId		lp = NULL;
8821 	HeapTupleHeader htup;
8822 	uint32		oldlen;
8823 	Size		newlen;
8824 
8825 	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
8826 	{
8827 		char	   *newtup = XLogRecGetBlockData(record, 0, &newlen);
8828 
8829 		page = BufferGetPage(buffer);
8830 
8831 		offnum = xlrec->offnum;
8832 		if (PageGetMaxOffsetNumber(page) >= offnum)
8833 			lp = PageGetItemId(page, offnum);
8834 
8835 		if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
8836 			elog(PANIC, "invalid lp");
8837 
8838 		htup = (HeapTupleHeader) PageGetItem(page, lp);
8839 
8840 		oldlen = ItemIdGetLength(lp) - htup->t_hoff;
8841 		if (oldlen != newlen)
8842 			elog(PANIC, "wrong tuple length");
8843 
8844 		memcpy((char *) htup + htup->t_hoff, newtup, newlen);
8845 
8846 		PageSetLSN(page, lsn);
8847 		MarkBufferDirty(buffer);
8848 	}
8849 	if (BufferIsValid(buffer))
8850 		UnlockReleaseBuffer(buffer);
8851 }
8852 
8853 void
8854 heap_redo(XLogReaderState *record)
8855 {
8856 	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
8857 
8858 	/*
8859 	 * These operations don't overwrite MVCC data so no conflict processing is
8860 	 * required. The ones in heap2 rmgr do.
8861 	 */
8862 
8863 	switch (info & XLOG_HEAP_OPMASK)
8864 	{
8865 		case XLOG_HEAP_INSERT:
8866 			heap_xlog_insert(record);
8867 			break;
8868 		case XLOG_HEAP_DELETE:
8869 			heap_xlog_delete(record);
8870 			break;
8871 		case XLOG_HEAP_UPDATE:
8872 			heap_xlog_update(record, false);
8873 			break;
8874 		case XLOG_HEAP_TRUNCATE:
8875 
8876 			/*
8877 			 * TRUNCATE is a no-op because the actions are already logged as
8878 			 * SMGR WAL records.  The TRUNCATE WAL record only exists for logical
8879 			 * decoding.
8880 			 */
8881 			break;
8882 		case XLOG_HEAP_HOT_UPDATE:
8883 			heap_xlog_update(record, true);
8884 			break;
8885 		case XLOG_HEAP_CONFIRM:
8886 			heap_xlog_confirm(record);
8887 			break;
8888 		case XLOG_HEAP_LOCK:
8889 			heap_xlog_lock(record);
8890 			break;
8891 		case XLOG_HEAP_INPLACE:
8892 			heap_xlog_inplace(record);
8893 			break;
8894 		default:
8895 			elog(PANIC, "heap_redo: unknown op code %u", info);
8896 	}
8897 }
8898 
8899 void
8900 heap2_redo(XLogReaderState *record)
8901 {
8902 	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
8903 
8904 	switch (info & XLOG_HEAP_OPMASK)
8905 	{
8906 		case XLOG_HEAP2_CLEAN:
8907 			heap_xlog_clean(record);
8908 			break;
8909 		case XLOG_HEAP2_FREEZE_PAGE:
8910 			heap_xlog_freeze_page(record);
8911 			break;
8912 		case XLOG_HEAP2_CLEANUP_INFO:
8913 			heap_xlog_cleanup_info(record);
8914 			break;
8915 		case XLOG_HEAP2_VISIBLE:
8916 			heap_xlog_visible(record);
8917 			break;
8918 		case XLOG_HEAP2_MULTI_INSERT:
8919 			heap_xlog_multi_insert(record);
8920 			break;
8921 		case XLOG_HEAP2_LOCK_UPDATED:
8922 			heap_xlog_lock_updated(record);
8923 			break;
8924 		case XLOG_HEAP2_NEW_CID:
8925 
8926 			/*
8927 			 * Nothing to do on a real replay, only used during logical
8928 			 * decoding.
8929 			 */
8930 			break;
8931 		case XLOG_HEAP2_REWRITE:
8932 			heap_xlog_logical_rewrite(record);
8933 			break;
8934 		default:
8935 			elog(PANIC, "heap2_redo: unknown op code %u", info);
8936 	}
8937 }
8938 
8939 /*
8940  * Mask a heap page before performing consistency checks on it.
8941  */
8942 void
8943 heap_mask(char *pagedata, BlockNumber blkno)
8944 {
8945 	Page		page = (Page) pagedata;
8946 	OffsetNumber off;
8947 
8948 	mask_page_lsn_and_checksum(page);
8949 
8950 	mask_page_hint_bits(page);
8951 	mask_unused_space(page);
8952 
8953 	for (off = 1; off <= PageGetMaxOffsetNumber(page); off++)
8954 	{
8955 		ItemId		iid = PageGetItemId(page, off);
8956 		char	   *page_item;
8957 
8958 		page_item = (char *) (page + ItemIdGetOffset(iid));
8959 
8960 		if (ItemIdIsNormal(iid))
8961 		{
8962 			HeapTupleHeader page_htup = (HeapTupleHeader) page_item;
8963 
8964 			/*
8965 			 * If xmin of a tuple is not yet frozen, we should ignore
8966 			 * differences in hint bits, since they can be set without
8967 			 * emitting WAL.
8968 			 */
8969 			if (!HeapTupleHeaderXminFrozen(page_htup))
8970 				page_htup->t_infomask &= ~HEAP_XACT_MASK;
8971 			else
8972 			{
8973 				/* Still we need to mask xmax hint bits. */
8974 				page_htup->t_infomask &= ~HEAP_XMAX_INVALID;
8975 				page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED;
8976 			}
8977 
8978 			/*
8979 			 * During replay, we set Command Id to FirstCommandId. Hence, mask
8980 			 * it. See heap_xlog_insert() for details.
8981 			 */
8982 			page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER;
8983 
8984 			/*
8985 			 * For a speculative tuple, heap_insert() does not set ctid in the
8986 			 * caller-passed heap tuple itself, leaving the ctid field to
8987 			 * contain a speculative token value - a per-backend monotonically
8988 			 * increasing identifier. Besides, it does not WAL-log ctid under
8989 			 * any circumstances.
8990 			 *
8991 			 * During redo, heap_xlog_insert() sets t_ctid to current block
8992 			 * number and self offset number. It doesn't care about any
8993 			 * speculative insertions in master. Hence, we set t_ctid to
8994 			 * current block number and self offset number to ignore any
8995 			 * inconsistency.
8996 			 */
8997 			if (HeapTupleHeaderIsSpeculative(page_htup))
8998 				ItemPointerSet(&page_htup->t_ctid, blkno, off);
8999 
9000 			/*
9001 			 * NB: Not ignoring ctid changes due to the tuple having moved
9002 			 * (i.e. HeapTupleHeaderIndicatesMovedPartitions), because that's
9003 			 * important information that needs to be in-sync between primary
9004 			 * and standby, and thus is WAL logged.
9005 			 */
9006 		}
9007 
9008 		/*
9009 		 * Ignore any padding bytes after the tuple, when the length of the
9010 		 * item is not MAXALIGNed.
9011 		 */
9012 		if (ItemIdHasStorage(iid))
9013 		{
9014 			int			len = ItemIdGetLength(iid);
9015 			int			padlen = MAXALIGN(len) - len;
9016 
9017 			if (padlen > 0)
9018 				memset(page_item + len, MASK_MARKER, padlen);
9019 		}
9020 	}
9021 }
9022 
9023 /*
9024  * HeapCheckForSerializableConflictOut
9025  *		We are reading a tuple.  If it's not visible, there may be a
9026  *		rw-conflict out with the inserter.  Otherwise, if it is visible to us
9027  *		but has been deleted, there may be a rw-conflict out with the deleter.
9028  *
9029  * We will determine the top level xid of the writing transaction with which
9030  * we may be in conflict, and ask CheckForSerializableConflictOut() to check
9031  * for overlap with our own transaction.
9032  *
9033  * This function should be called just about anywhere in heapam.c where a
9034  * tuple has been read. The caller must hold at least a shared lock on the
9035  * buffer, because this function might set hint bits on the tuple. There is
9036  * currently no known reason to call this function from an index AM.
9037  */
9038 void
9039 HeapCheckForSerializableConflictOut(bool visible, Relation relation,
9040 									HeapTuple tuple, Buffer buffer,
9041 									Snapshot snapshot)
9042 {
9043 	TransactionId xid;
9044 	HTSV_Result htsvResult;
9045 
9046 	if (!CheckForSerializableConflictOutNeeded(relation, snapshot))
9047 		return;
9048 
9049 	/*
9050 	 * Check to see whether the tuple has been written to by a concurrent
9051 	 * transaction, either to create it not visible to us, or to delete it
9052 	 * while it is visible to us.  The "visible" bool indicates whether the
9053 	 * tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else
9054 	 * is going on with it.
9055 	 *
9056 	 * In the event of a concurrently inserted tuple that also happens to have
9057 	 * been concurrently updated (by a separate transaction), the xmin of the
9058 	 * tuple will be used -- not the updater's xid.
9059 	 */
9060 	htsvResult = HeapTupleSatisfiesVacuum(tuple, TransactionXmin, buffer);
9061 	switch (htsvResult)
9062 	{
9063 		case HEAPTUPLE_LIVE:
9064 			if (visible)
9065 				return;
9066 			xid = HeapTupleHeaderGetXmin(tuple->t_data);
9067 			break;
9068 		case HEAPTUPLE_RECENTLY_DEAD:
9069 		case HEAPTUPLE_DELETE_IN_PROGRESS:
9070 			if (visible)
9071 				xid = HeapTupleHeaderGetUpdateXid(tuple->t_data);
9072 			else
9073 				xid = HeapTupleHeaderGetXmin(tuple->t_data);
9074 
9075 			if (TransactionIdPrecedes(xid, TransactionXmin))
9076 			{
9077 				/* This is like the HEAPTUPLE_DEAD case */
9078 				Assert(!visible);
9079 				return;
9080 			}
9081 			break;
9082 		case HEAPTUPLE_INSERT_IN_PROGRESS:
9083 			xid = HeapTupleHeaderGetXmin(tuple->t_data);
9084 			break;
9085 		case HEAPTUPLE_DEAD:
9086 			Assert(!visible);
9087 			return;
9088 		default:
9089 
9090 			/*
9091 			 * The only way to get to this default clause is if a new value is
9092 			 * added to the enum type without adding it to this switch
9093 			 * statement.  That's a bug, so elog.
9094 			 */
9095 			elog(ERROR, "unrecognized return value from HeapTupleSatisfiesVacuum: %u", htsvResult);
9096 
9097 			/*
9098 			 * In spite of having all enum values covered and calling elog on
9099 			 * this default, some compilers think this is a code path which
9100 			 * allows xid to be used below without initialization. Silence
9101 			 * that warning.
9102 			 */
9103 			xid = InvalidTransactionId;
9104 	}
9105 
9106 	Assert(TransactionIdIsValid(xid));
9107 	Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin));
9108 
9109 	/*
9110 	 * Find top level xid.  Bail out if xid is too early to be a conflict, or
9111 	 * if it's our own xid.
9112 	 */
9113 	if (TransactionIdEquals(xid, GetTopTransactionIdIfAny()))
9114 		return;
9115 	xid = SubTransGetTopmostTransaction(xid);
9116 	if (TransactionIdPrecedes(xid, TransactionXmin))
9117 		return;
9118 
9119 	CheckForSerializableConflictOut(relation, xid, snapshot);
9120 }
9121